In [1]:
from itertools import permutations

import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the training frame from the Feather file (path is relative to the
# notebook's working directory).
data = pd.read_feather(path="train.f")

# Block CV (1-block)

In [3]:
# Bucket eras into blocks of ten (truncate era * 0.1), then merge the trailing
# block 12 into block 11 so the labels run 0..11.
data['block'] = (
    np.trunc(data['era'] * .1)
    .astype(int)
    .replace(12, 11)
)

In [4]:
# Number of rows in each block, ordered by block label.
data['block'].value_counts().sort_index()

0     24515
1     34600
2     37444
3     41101
4     43439
5     48186
6     46831
7     40403
8     43971
9     45609
10    46107
11    49602
Name: block, dtype: int64

In [5]:
# Leave-one-block-out cross-validation: each of the 12 blocks is held out once
# for validation while the model is fit on all remaining blocks.
lgbm_params = dict(
    max_depth=5,
    num_leaves=2**5,
    learning_rate=0.01,
    n_estimators=2000,
    colsample_bytree=0.1,
    random_state=0,
)

results_val = []

for block in range(12):
    print("Validation Block {}".format(block))

    # One boolean mask drives both sides of the split.
    is_val = data['block'] == block
    train, val = data[~is_val], data[is_val]

    # Feature columns are the ones whose name matches the regex 'feature'.
    X_train, y_train = train.filter(regex=r'feature'), train['target_kazutsugi']
    X_val, y_val = val.filter(regex=r'feature'), val['target_kazutsugi']

    mdl = LGBMRegressor(**lgbm_params)
    mdl.fit(X_train, y_train)

    # Fold score: Pearson correlation between the target and the percentile
    # rank of the predictions (ties ranked in order of appearance).
    predictions = pd.Series(mdl.predict(X_val))
    ranked_predictions = predictions.rank(pct=True, method="first")
    correlation = np.corrcoef(y_val, ranked_predictions)[0, 1]

    results_val.append(correlation)
    print("Correlation {}".format(correlation))
    print()
   

Validation Block 0
Correlation 0.06711982432645597

Validation Block 1
Correlation 0.07207312167119041

Validation Block 2
Correlation 0.0737811751573178

Validation Block 3
Correlation 0.06260542085119478

Validation Block 4
Correlation 0.05524753270154888

Validation Block 5
Correlation 0.0406986702554227

Validation Block 6
Correlation 0.02701121633777277

Validation Block 7
Correlation 0.06469652449034577

Validation Block 8
Correlation 0.04383883097679592

Validation Block 9
Correlation 0.06805487272587811

Validation Block 10
Correlation 0.045569279898783954

Validation Block 11
Correlation 0.04476211397290722



In [6]:
# Median of the per-fold correlations collected by the block CV loop.
np.median(results_val)

0.05892647677637183

In [7]:
# Lowest per-fold correlation (worst validation block).
np.min(results_val)

0.02701121633777277

In [8]:
# Highest per-fold correlation (best validation block).
np.max(results_val)

0.0737811751573178

In [9]:
# Mean of the per-fold correlations.
np.mean(results_val)

0.05545488194713452

In [10]:
# Number of folds that were scored.
len(results_val)

12

# hv-Block CV (1-block)
- "We call it h-block cross-validation, because the idea is to reduce the training set by removing the h observations preceding and following the observation in the test set."
- gap original - remover apenas era mais próxima do treino

In [11]:
# hv-block cross-validation: for each validation block, drop the block itself
# and its immediate neighbours (the gap) and train on every remaining block.
N_BLOCKS = 12  # blocks are labelled 0..11
# NOTE(review): validation starts at block 2, so blocks 0 and 1 are never
# scored and this loop yields 10 folds. Keep that in mind when comparing its
# summary statistics with a CV scheme that scores all 12 blocks.
FIRST_VAL_BLOCK = 2

lgbm_params = dict(
    max_depth=5,
    num_leaves=2**5,
    learning_rate=0.01,
    n_estimators=2000,
    colsample_bytree=0.1,
    random_state=0,
)

results_val = []

for block in range(FIRST_VAL_BLOCK, N_BLOCKS):
    # Neighbouring blocks that actually exist (the last block has only one).
    gap_blocks = [b for b in (block - 1, block + 1) if 0 <= b < N_BLOCKS]
    # The log line describes the real split: everything except the validation
    # block and its gap neighbours is used for training.
    print("Validation Block {} - Gap Blocks {} - Train on all other blocks".format(block, gap_blocks))

    # Distance (in blocks) of every row from the validation block. Rows more
    # than one block away form the training set; this single mask also handles
    # the edge blocks, so no special-casing is needed.
    block_distance = (data['block'] - block).abs()
    train = data[block_distance > 1]
    val = data[block_distance == 0]

    # Feature columns are the ones whose name matches the regex 'feature'.
    X_train = train.filter(regex=r'feature')
    X_val = val.filter(regex=r'feature')

    y_train = train['target_kazutsugi']
    y_val = val['target_kazutsugi']

    mdl = LGBMRegressor(**lgbm_params)
    mdl.fit(X_train, y_train)

    # Fold score: Pearson correlation between the target and the percentile
    # rank of the predictions (ties ranked in order of appearance).
    predictions = pd.Series(mdl.predict(X_val))
    ranked_predictions = predictions.rank(pct=True, method="first")
    correlation = np.corrcoef(y_val, ranked_predictions)[0, 1]

    results_val.append(correlation)
    print("Correlation {}".format(correlation))
    print()
   

Train block 0 - Gap Block 1 - Validation Block 2
Correlation 0.06891527161934806

Train block 1 - Gap Block 2 - Validation Block 3
Correlation 0.05608920907352532

Train block 2 - Gap Block 3 - Validation Block 4
Correlation 0.050683289487780864

Train block 3 - Gap Block 4 - Validation Block 5
Correlation 0.03800523049303772

Train block 4 - Gap Block 5 - Validation Block 6
Correlation 0.02052873989528275

Train block 5 - Gap Block 6 - Validation Block 7
Correlation 0.060454668169158655

Train block 6 - Gap Block 7 - Validation Block 8
Correlation 0.04014577185049007

Train block 7 - Gap Block 8 - Validation Block 9
Correlation 0.06519420956831873

Train block 8 - Gap Block 9 - Validation Block 10
Correlation 0.04113250552613722

Train block 9 - Gap Block 10 - Validation Block 11
Correlation 0.04157351090598695



In [12]:
# Median of the per-fold correlations collected by the hv-block CV loop.
np.median(results_val)

0.04612840019688391

In [13]:
# Lowest per-fold correlation (worst validation block).
np.min(results_val)

0.02052873989528275

In [14]:
# Highest per-fold correlation (best validation block).
np.max(results_val)

0.06891527161934806

In [15]:
# Mean of the per-fold correlations.
np.mean(results_val)

0.04827224065890663

In [16]:
# Number of folds that were scored.
len(results_val)

10

# Combinatorial Purged (Gap) (Block)-CV
- purging original - remover apenas era mais próxima do treino
- "Advances in Financial Machine Learning" - Marcos López de Prado

In [30]:
# Pairwise purged CV: train on a single block and validate on a single other
# block, skipping pairs whose blocks are adjacent.
# `permutations` is imported in the notebook's top import cell.
block_combos = list(permutations(range(12), 2))  # ordered pairs; could use more than 2 blocks

# Purge step, done once so the reported count and the loop below always agree.
purged_pairs = [
    (train_block, val_block)
    for train_block, val_block in block_combos
    if abs(train_block - val_block) != 1
]
total_purged_pairs = len(purged_pairs)
print("Total de pares com gap {}".format(total_purged_pairs))

lgbm_params = dict(
    max_depth=5,
    num_leaves=2**5,
    learning_rate=0.01,
    n_estimators=2000,
    colsample_bytree=0.1,
    random_state=0,
)

results_val = []
for train_block, val_block in purged_pairs:
    print("Train block {} - Validation Block {}".format(train_block, val_block))

    train = data[data['block'] == train_block]
    val = data[data['block'] == val_block]

    # Feature columns are the ones whose name matches the regex 'feature'.
    X_train = train.filter(regex=r'feature')
    X_val = val.filter(regex=r'feature')

    y_train = train['target_kazutsugi']
    y_val = val['target_kazutsugi']

    mdl = LGBMRegressor(**lgbm_params)
    mdl.fit(X_train, y_train)

    # Pair score: Pearson correlation between the target and the percentile
    # rank of the predictions (ties ranked in order of appearance).
    predictions = pd.Series(mdl.predict(X_val))
    ranked_predictions = predictions.rank(pct=True, method="first")
    correlation = np.corrcoef(y_val, ranked_predictions)[0, 1]

    results_val.append(correlation)
    print("Correlation {}".format(correlation))
    print()
   

Total de pares com gap 110
Train block 0 - Validation Block 2
Correlation 0.05345227304313937

Train block 0 - Validation Block 3
Correlation 0.04537896326256358

Train block 0 - Validation Block 4
Correlation 0.028638840809714506

Train block 0 - Validation Block 5
Correlation 0.012266404939832694

Train block 0 - Validation Block 6
Correlation 0.006774445659950066

Train block 0 - Validation Block 7
Correlation 0.04735954213583829

Train block 0 - Validation Block 8
Correlation 0.035712999834871376

Train block 0 - Validation Block 9
Correlation 0.04633006527379678

Train block 0 - Validation Block 10
Correlation 0.02171331752472719

Train block 0 - Validation Block 11
Correlation 0.016752894694547108

Train block 1 - Validation Block 3
Correlation 0.04509813993002888

Train block 1 - Validation Block 4
Correlation 0.03552693549724267

Train block 1 - Validation Block 5
Correlation 0.01210165993262393

Train block 1 - Validation Block 6
Correlation 0.015728398473927964

Train block 1

In [31]:
# Median of the per-pair correlations collected by the purged pairwise loop.
np.median(results_val)

0.02928063155056037

In [32]:
# Lowest per-pair correlation.
np.min(results_val)

0.006774445659950066

In [33]:
# Highest per-pair correlation.
np.max(results_val)

0.06416408511024498

In [34]:
# Mean of the per-pair correlations.
np.mean(results_val)

0.0307766272715858

In [35]:
# Number of (train block, validation block) pairs that were scored.
len(results_val)

110