# Capítulo 5 - Residualização

### Parte 1 - Carregar Bibliotecas e Dataset

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
#%reload_ext autoreload
#make_dataset.new_func()

In [7]:
#basics
import sys,os
sys.path.insert(1, os.path.dirname(os.getcwd()))

#utils
import paths

#main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option("display.precision", 4)
%matplotlib inline

#model Libraries
from sklearn import utils
from sklearn import preprocessing

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GroupKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error


#skopt
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from tune_sklearn import TuneSearchCV

In [8]:
#private modules 

#utils
from src.utils import memory_usage
from src.utils import quick_setup
quick_setup.get_libs(local=True)

#data
from src.data import make_dataset

#models
from src.models import train_model
from src.models import evaluation
from src.models import rank_cv
from src.models import meta_model
from src.models import neutralize


#validation
from src.validation import combinatorial_split
from src.validation import group_ts_split
from src.validation import metrics_era
from src.validation import metrics
from src.validation import metrics_description
from src.validation import dsr
from src.validation import models_dict

#visualization
from src.visualization import visualize


#sklearn
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import MinMaxScaler

#min cols
min_cols = ["Feat_exp_max", 'Validation_SD', 'corr_with_example_preds']


In [None]:
#checando versoes para evitar conflitos
import xgboost as xgb
print("xgb version:", xgb.__version__)

import lightgbm as lgbm
print("lgbm version:", lgbm.__version__)

import sklearn
print("sklearn version:", sklearn.__version__)

import catboost
print("catboost version:", catboost.__version__)

In [4]:
#private class (tentar treinar com a classe da pasta [no colab] a o inves de colar local)
from sklearn.base import BaseEstimator, RegressorMixin

from xgboost import XGBRanker
class MyXGBRanker(XGBRanker, BaseEstimator, RegressorMixin):
    """XGBRanker wrapper that derives the ranking groups from the ``era`` column.

    Both methods select the model inputs through the module-level ``features``
    name, so ``features`` must be defined before ``fit``/``predict`` are called.
    """

    def fit(self, x, y):
        """Fit the ranker using one group per era.

        Parameters
        ----------
        x : DataFrame with an ``era`` column plus the ``features`` columns.
            Rows of the same era are assumed to be contiguous -- TODO confirm
            against the caller.
        y : targets aligned row-by-row with ``x``.

        Returns
        -------
        Whatever ``XGBRanker.fit`` returns.
        """
        # The group sizes must follow the row order of ``x``. ``sort=False``
        # keeps eras in order of first appearance instead of sorted key order,
        # and ``size()`` counts rows rather than the non-null values of
        # whichever column happens to come first.
        group = x.groupby('era', sort=False).size().values
        return super().fit(x[features], y, group=group)

    def predict(self, x):
        """Predict using only the ``features`` columns of ``x``."""
        return super().predict(x[features])

#### Training

In [62]:
%%time
# Load the training split. Judging by the variable names, get_data also returns
# the feature column list and the target name -- TODO confirm in make_dataset.
df_training, features, target = make_dataset.get_data(
    nrows=None,
    low_memory=False,
    dataset="training",
    feather=True,  # False on AWS/colab
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501808 entries, 0 to 501807
Columns: 314 entries, id to target
dtypes: float32(311), int32(1), object(2)
memory usage: 604.9+ MB
None
CPU times: user 1.17 s, sys: 1.92 s, total: 3.09 s
Wall time: 9.03 s


#### Validation

In [9]:
%%time
# Load the validation split; note that this rebinds `features` and `target`.
df_validation, features, target = make_dataset.get_data(
    nrows=None,
    low_memory=False,
    dataset="validation",
    feather=True,
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137779 entries, 0 to 137778
Columns: 314 entries, id to target
dtypes: float32(311), int32(1), object(2)
memory usage: 166.1+ MB
None
CPU times: user 251 ms, sys: 538 ms, total: 789 ms
Wall time: 2.22 s


# Continuação

## Revisando Métricas

Quais métricas otimizamos direta ou indiretamente?

<img src="https://github.com/nicholasrichers/dissertacao/blob/master/references/figures/cap3/diag_rio.png?raw=true" width=200 />


### (MAX) Feature exposure (por era)

* Feature mais correlacionada com as predições $(fe\_max)$ gerada pelo modelo (por era)

$$fe\_max = \frac{\sum_{era=1}^{120} fe\_max_{era}}{120}$$

#### Relembrando Cap3 (Performance de uma feature)

<img src="https://github.com/nicholasrichers/dissertacao/blob/master/references/figures/cap3/mm10_strenght34.png?raw=true" width=300 />




#### Regularização

* Podemos controlar a exposição às features a partir do parâmetro de regularização ($\alpha$), que é encontrado no modelo **Ridge** e também nos modelos de árvore.
* Existem outras formas de regularização como o **Dropout** em Redes Neurais.

### Feature Neutral Mean (Neutralization/Residualization)

#### A residualização é uma aplicação do Teorema de Frisch-Waugh-Lovell (1933)


* O Teorema FWL fornece uma expressão fechada para obter um subconjunto do vetor de coeficientes de uma regressão múltipla (Greene 2003). 


* Para dois conjuntos de variáveis, X1 e X2 o subconjunto de coeficientes ($b_2$) é obtido quando os resíduos da regressão de y em X1 são **regredidos no conjunto de resíduos** obtidos quando cada coluna de X2 é regredida em X1 (Greene 2003).

#### Regressão em 2 passos

1. Regressão de y em $X_1$: $M_{X1}y$ é o resíduo
2. Regressão de $X_2$ em $X_1$: $M_{X1}X_2$ é o resíduo

- Temos que:

$$M_{X1}y = M_{X1}X_2 \cdot \widehat{\beta_2} + \widehat{\mu}$$



* $M_{X1}y$ : Tudo de y que não foi explicado por $X_1$, mas pode ser explicado por $X_2$
* $M_{X1}X_2$ : Tudo de $X_2$ que não está sendo explicado por $X_1$

* O Teorema FWL prova que $\widehat{\beta_2}$ é igual a $b_2$


*Fonte: GREENE, W.H. Econometric Analysis. 5th  ed. New York, NY, PearsonEducation, 2003*

### Aplicando ao nosso problema


* A Neutralização é um método de subtrair a contribuição de outro vetor numérico, enquanto mantém a **informação original**. 

* Iremos neutralizar a própria previsão feita pelo modelo, por um modelo linear treinado sobre as previsões. O resíduo resultante será ortogonal a um OLS.


$$score\_neutro = scores - prop \cdot exp \cdot (exp^{\dagger } \cdot scores)$$



* $exp^{\dagger }$ é a pseudo inversa de $exp$
* $ exp^{\dagger } \cdot scores = \beta$ (OLS)
* $prop$ entre $[0,1]$


In [66]:
def neutralize_series(series, by, proportion=1.0):
    """Remove from ``series`` the component linearly explained by ``by``.

    Fits a least-squares regression of ``series`` on ``by`` plus an intercept
    and subtracts ``proportion`` times the fitted values.

    Parameters
    ----------
    series : pd.Series
        Values to neutralize.
    by : pd.Series
        Values to neutralize against. Paired with ``series`` by position
        (``.values``), not by index label.
    proportion : float, default 1.0
        Fraction of the fitted component to subtract.

    Returns
    -------
    pd.Series
        Neutralized values carrying the index of ``series``.
    """
    scores = series.values.reshape(-1, 1)
    exposures = by.values.reshape(-1, 1)
    # Intercept column. A column of ones spans the same space as a constant
    # column filled with mean(series), but it does not collapse to all zeros
    # (silently dropping the intercept) when that mean is exactly 0.
    exposures = np.hstack((exposures, np.ones_like(exposures)))
    beta = np.linalg.lstsq(exposures, scores, rcond=None)[0]
    corrected_scores = scores - proportion * exposures.dot(beta)
    return pd.Series(corrected_scores.ravel(), index=series.index)

#### Neutralizando feature_strength34 em relação ao target

* Note que, após a neutralização, a feature se tornou ortogonal ao target
* Contudo, ela mantém correlação próxima de 1 com a feature original (antes da neutralização)

In [68]:
# Original feature, and the same feature neutralized against the target
# with the default proportion.
f1 = df_training["feature_strength34"]
f1_nt = neutralize_series(f1, by=df_training["target"])

In [72]:
# Pairwise Pearson correlations: feature vs target, neutralized feature vs
# target, and feature vs its neutralized version.
for label, left, right in (
    ("Corr fs_34 & target:             ", f1, df_training["target"]),
    ("Corr fs_34 100% neutr. & target: ", f1_nt, df_training["target"]),
    ("Corr fs_34 & fs_34 100% neutr.:  ", f1, f1_nt),
):
    print(label, np.corrcoef(left, right)[0, 1])

Corr fs_34 & target:              0.012310288017607295
Corr fs_34 100% neutr. & target:  1.7673754128672442e-08
Corr fs_34 & fs_34 100% neutr.:   0.999924225751144


#### A neutralização ajuda a mitigar o feature exposure, mas elas não são a mesma coisa

* Feature exposure é uma métrica que mede a dissimilaridade das correlações entre as features

* Feature Neutralization é uma operação que remove a correlação entre as features

#### Sobre as métricas

* Feature neutral mean (neutralizado pelo OLS das predições)

* MMC Mean  (neutralizado pelo meta modelo da numerai) (proxy ex_preds)

## Comparativo com e sem FN

* Atualizamos os modelos Medellin (lgbm_exp20) e lgbm_slider20 de forma a considerar todas as eras

#### Sem FN (0 %)

In [233]:
%%time
from sklearn.linear_model import LinearRegression

# Folder the per-model validation prediction CSVs are read from.
path = '../../reports/predicoes_validacao/raw/'
models_nr = ['ex_preds','nr__rio', 'nr__sao_paulo', 'lgbm_exp20', 'lgbm_slider20']

# Result containers, all keyed by model name.
preds_nr, feat_corrs_nr = {}, {}
era_scores_nr, df_metrics_nr = {}, {}

# Linear model handed to the neutralization step.
ml_model = LinearRegression(fit_intercept=False)

# Neutralization factor applied to every model in this run.
FN = 0.0
print("FN = ", FN)

for model in models_nr:
    # predictions for val1 & val2
    print("creating predictions to:", model)
    df_validation['preds'] = (
        pd.read_csv(path + model + '_preds_test.csv', index_col='id')
        .values
        .reshape(1, -1)[0]
    )

    # neutralize the predictions that were just loaded
    preds_nr[model] = neutralize.preds_neutralized(df_validation, ['preds'], features, ml_model, FN)

    # save the metrics; the last argument is False only for 'ex_preds'
    mmc = model != 'ex_preds'
    (era_scores_nr[model],
     df_metrics_nr[model],
     feat_corrs_nr[model],
     ex_preds) = metrics.submission_metrics(df_validation, preds_nr[model], model, mmc)

# dicts to DataFrames
df_preds_nr = pd.DataFrame.from_dict(preds_nr)
df_era_scores_nr = pd.DataFrame.from_dict(era_scores_nr)
df_feat_corrs_nr = pd.DataFrame.from_dict(feat_corrs_nr)
df_metrics_cons_nr = metrics.metrics_consolidated(df_metrics_nr)

FN =  0.0
creating predictions to: nr__rio
creating predictions to: nr__sao_paulo
creating predictions to: lgbm_exp20
creating predictions to: lgbm_slider20
creating predictions to: ex_preds
CPU times: user 4min 17s, sys: 30.5 s, total: 4min 47s
Wall time: 4min 20s


In [17]:
# print("Lower is better:", min_cols)
# Rows: metrics whose Categoria is Performance, Risk or MMC; columns: the models in models_nr.
leaderboard_nr = df_metrics_cons_nr.loc[
    df_metrics_cons_nr.Categoria.isin(["Performance", "Risk", "MMC"]),
    models_nr[0:],
]
(
    leaderboard_nr.astype(float)
    .style
    .apply(visualize.diagnostic_colors)
    .apply(visualize.highlight_max, axis=1)
)

Unnamed: 0_level_0,ex_preds,NR_Rio,NR_Sao_Paulo,lgbm_exp20,lgbm_slider20
Metrica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Validation_Sharpe,0.9757,0.95,0.8757,0.9209,0.9678
Validation_Mean,0.0266,0.0264,0.0245,0.0244,0.0256
Feat_neutral_mean,0.0215,0.0206,0.0183,0.0182,0.0187
Validation_SD,0.0272,0.0278,0.028,0.0265,0.0264
Feat_exp_max,0.2694,0.2601,0.2928,0.2642,0.2651
Max_Drawdown,-0.0651,-0.0867,-0.0859,-0.0995,-0.0703
corr_plus_mmc_sharpe,0.9757,0.8973,0.7949,0.8359,0.9121
val_mmc_mean,0.0,0.0017,-0.0008,0.001,0.0014
corr_with_example_preds,1.0,0.9088,0.963,0.8687,0.8942


In [205]:
visualize.plot_feat_cors(df_feat_corrs_nr, models_nr)

#### Com FN (100 %)

* Os modelos são mais estáveis

In [241]:
%%time
from sklearn.linear_model import LinearRegression

# Folder the per-model validation prediction CSVs are read from.
path = '../../reports/predicoes_validacao/raw/'
models_nr = ['ex_preds','nr__rio', 'nr__sao_paulo', 'lgbm_exp20', 'lgbm_slider20']

# Result containers, all keyed by model name.
preds_nr, feat_corrs_nr = {}, {}
era_scores_nr, df_metrics_nr = {}, {}

# Linear model handed to the neutralization step.
ml_model = LinearRegression(fit_intercept=False)

# Neutralization factor applied to every model in this run.
FN = 1.0
print("FN = ", FN)

for model in models_nr:
    # predictions for val1 & val2
    print("creating predictions to:", model)
    df_validation['preds'] = (
        pd.read_csv(path + model + '_preds_test.csv', index_col='id')
        .values
        .reshape(1, -1)[0]
    )

    # neutralize the predictions that were just loaded
    preds_nr[model] = neutralize.preds_neutralized(df_validation, ['preds'], features, ml_model, FN)

    # save the metrics; the last argument is False only for 'ex_preds'
    mmc = model != 'ex_preds'
    (era_scores_nr[model],
     df_metrics_nr[model],
     feat_corrs_nr[model],
     ex_preds) = metrics.submission_metrics(df_validation, preds_nr[model], model, mmc)

# dicts to DataFrames
df_preds_nr = pd.DataFrame.from_dict(preds_nr)
df_era_scores_nr = pd.DataFrame.from_dict(era_scores_nr)
df_feat_corrs_nr = pd.DataFrame.from_dict(feat_corrs_nr)
df_metrics_cons_nr = metrics.metrics_consolidated(df_metrics_nr)

FN =  1.0
creating predictions to: ex_preds
creating predictions to: nr__rio
creating predictions to: nr__sao_paulo
creating predictions to: lgbm_exp20
creating predictions to: lgbm_slider20
CPU times: user 4min 13s, sys: 28.2 s, total: 4min 41s
Wall time: 3min 51s


In [235]:
# print("Lower is better:", min_cols)
# Rows: metrics whose Categoria is Performance, Risk or MMC; columns: the models in models_nr.
leaderboard_nr = df_metrics_cons_nr.loc[
    df_metrics_cons_nr.Categoria.isin(["Performance", "Risk", "MMC"]),
    models_nr,
]
(
    leaderboard_nr.astype(float)
    .style
    .apply(visualize.diagnostic_colors)
    .apply(visualize.highlight_max, axis=1)
)

Unnamed: 0_level_0,ex_preds,nr__rio,nr__sao_paulo,lgbm_exp20,lgbm_slider20
Metrica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Validation_Sharpe,0.9757,1.2246,1.1207,1.0191,1.3628
Validation_Mean,0.0266,0.0226,0.021,0.0195,0.0212
Feat_neutral_mean,0.0215,0.0217,0.0202,0.0186,0.0205
Validation_SD,0.0272,0.0185,0.0188,0.0191,0.0155
Feat_exp_max,0.2694,0.0108,0.0167,0.0089,0.0111
Max_Drawdown,-0.0651,-0.0352,-0.0353,-0.0462,-0.0202
corr_plus_mmc_sharpe,0.9757,1.0213,0.8512,0.8088,1.0958
val_mmc_mean,0.0,0.007,0.0046,0.0053,0.0062
corr_with_example_preds,1.0,0.5009,0.5617,0.4696,0.4874


## Testando Múltiplos FN

In [None]:
%%time
# NOTE(review): this cell has no execution count and repeats the FN = 1.0
# evaluation from the "Com FN (100 %)" section -- confirm whether it is still
# needed under "Testando Múltiplos FN".
from sklearn.linear_model import LinearRegression

# Folder the per-model validation prediction CSVs are read from.
path = '../../reports/predicoes_validacao/raw/'
models_nr = ['ex_preds','nr__rio', 'nr__sao_paulo', 'lgbm_exp20', 'lgbm_slider20']

# Result containers, all keyed by model name.
preds_nr, feat_corrs_nr = {}, {}
era_scores_nr, df_metrics_nr = {}, {}

# Linear model handed to the neutralization step.
ml_model = LinearRegression(fit_intercept=False)

# Neutralization factor applied to every model in this run.
FN = 1.0
print("FN = ", FN)

for model in models_nr:
    # predictions for val1 & val2
    print("creating predictions to:", model)
    df_validation['preds'] = (
        pd.read_csv(path + model + '_preds_test.csv', index_col='id')
        .values
        .reshape(1, -1)[0]
    )

    # neutralize the predictions that were just loaded
    preds_nr[model] = neutralize.preds_neutralized(df_validation, ['preds'], features, ml_model, FN)

    # save the metrics; the last argument is False only for 'ex_preds'
    mmc = model != 'ex_preds'
    (era_scores_nr[model],
     df_metrics_nr[model],
     feat_corrs_nr[model],
     ex_preds) = metrics.submission_metrics(df_validation, preds_nr[model], model, mmc)

# dicts to DataFrames
df_preds_nr = pd.DataFrame.from_dict(preds_nr)
df_era_scores_nr = pd.DataFrame.from_dict(era_scores_nr)
df_feat_corrs_nr = pd.DataFrame.from_dict(feat_corrs_nr)
df_metrics_cons_nr = metrics.metrics_consolidated(df_metrics_nr)

In [29]:
%%time
path = '../../reports/predicoes_validacao/raw/'
models_nr = ['ex_preds', 'NR_Rio', 'NR_Sao_Paulo', 'lgbm_exp20', 'lgbm_slider20']

# Neutralization proportions to sweep over.
FN=[0, .25, .4, .5, .6, .75, .9, 1.0]

from sklearn.linear_model import LinearRegression
ml_model = LinearRegression(fit_intercept=False)

# Result containers, keyed by "<model>_<proportion>" (plus 'ex_preds').
preds_nr, feat_corrs_nr = dict(), dict()
era_scores_nr, df_metrics_nr = dict(), dict()

model = 'NR_Rio'
# The stored predictions do not depend on the proportion: read the CSV once.
raw_preds = pd.read_csv(path+model+'_preds_test.csv', index_col='id').values.reshape(1,-1)[0]

for fnv in FN:

    # One entry per proportion ("NR_Rio_0", "NR_Rio_0.25", ...). Storing under
    # the bare model name overwrote the previous proportion on every pass, so
    # only the last one survived; the following leaderboard cell selects
    # columns named 'NR_Rio_' + str(fnv).
    model_fn = model + '_' + str(fnv)

    # predictions for val1 & val2
    print("creating predictions to:", model+str(fnv))
    # Fresh copy on every pass in case the neutralization step modifies the
    # column in place -- not verifiable from this notebook.
    df_validation['preds'] = raw_preds.copy()

    # neutralize the predictions with the current proportion
    preds_nr[model_fn] = neutralize.preds_neutralized(df_validation, ['preds'], features, ml_model, fnv)

    # save the metrics; the name passed matches the dict key, as in the other
    # evaluation cells of this notebook
    era_scores_nr[model_fn], df_metrics_nr[model_fn], feat_corrs_nr[model_fn], ex_preds = \
                        metrics.submission_metrics(df_validation, preds_nr[model_fn], model_fn, True)



# add ex_preds (taken from the last submission_metrics call above)
print("creating predictions to: ex_preds")
preds_nr['ex_preds'] = np.array(ex_preds)
era_scores_nr['ex_preds'], df_metrics_nr['ex_preds'], feat_corrs_nr['ex_preds'], ex_preds = \
                        metrics.submission_metrics(df_validation, preds_nr['ex_preds'], "ex_preds", False)



# dicts to DataFrames
df_preds_nr = pd.DataFrame.from_dict(preds_nr)
df_era_scores_nr = pd.DataFrame.from_dict(era_scores_nr)
df_feat_corrs_nr = pd.DataFrame.from_dict(feat_corrs_nr)
df_metrics_cons_nr = metrics.metrics_consolidated(df_metrics_nr)

creating predictions to: NR_Rio0
creating predictions to: NR_Rio0.25
creating predictions to: NR_Rio0.4
creating predictions to: NR_Rio0.5
creating predictions to: NR_Rio0.6
creating predictions to: NR_Rio0.75
creating predictions to: NR_Rio0.9
creating predictions to: NR_Rio1.0
creating predictions to: ex_preds
CPU times: user 7min 55s, sys: 45 s, total: 8min 40s
Wall time: 6min 33s


In [30]:
# Column labels of the sweep: 'ex_preds' plus one 'NR_Rio_<proportion>' per value in FN.
model = 'NR_Rio_'
models_fn = ['ex_preds', *(model + str(fnv) for fnv in FN)]

# Rows: metrics whose Categoria is Performance, Risk or MMC; columns: the labels above.
leaderboard_nr = df_metrics_cons_nr.loc[
    df_metrics_cons_nr.Categoria.isin(["Performance", "Risk", "MMC"]),
    models_fn[0:],
]
(
    leaderboard_nr.astype(float)
    .style
    .apply(visualize.diagnostic_colors)
    .apply(visualize.highlight_max, axis=1)
)

Unnamed: 0_level_0,ex_preds,NR_Rio_0,NR_Rio_0.25,NR_Rio_0.4,NR_Rio_0.5,NR_Rio_0.6,NR_Rio_0.75,NR_Rio_0.9,NR_Rio_1.0
Metrica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Validation_Sharpe,0.9757,0.95,1.0188,1.0704,1.1126,1.1559,1.2141,1.2465,1.2246
Validation_Mean,0.0266,0.0264,0.0268,0.0269,0.0268,0.0264,0.0255,0.024,0.0226
Feat_neutral_mean,0.0215,0.0206,0.0211,0.0213,0.0214,0.0214,0.0214,0.0215,0.0217
Validation_SD,0.0272,0.0278,0.0263,0.0251,0.0241,0.0229,0.021,0.0192,0.0185
Feat_exp_max,0.2694,0.2601,0.2221,0.191,0.1663,0.138,0.0893,0.0371,0.0108
Max_Drawdown,-0.0651,-0.0867,-0.0715,-0.0584,-0.0468,-0.041,-0.0401,-0.0373,-0.0352
corr_plus_mmc_sharpe,0.9757,0.8973,0.9866,1.0429,1.0819,1.1109,1.1231,1.0846,1.0213
val_mmc_mean,0.0,0.0017,0.0027,0.0034,0.0039,0.0045,0.0055,0.0064,0.007
corr_with_example_preds,1.0,0.9088,0.879,0.8447,0.8122,0.7703,0.6877,0.5818,0.5009


## Customizando a neutralização

* No lugar do OLS, podemos usar na verdade qualquer modelo
* Modelos lineares com regularização como o ridge ou lasso anularam o efeito da neutralização
* Modelos não lineares neutralizariam todo o sinal original encontrado pelo modelo

* COMENTAR MIXED
rio sgd - ridge 25%
sp sgp puro
med ols - ridge (1-prop)


#### SGD Regressor melhorou a validation_mean sem prejuízo às demais métricas

<img src="https://ichi.pro/assets/images/max/724/1*e88JKNWAFok3vpjeuPfHig.gif" width=300 />


## Atualizando os Modelos com FN

 * NR__Medellin passou para o *lgbm_slider20* com FN
 * Atualizamos também o **baseline** com FN=100%

In [48]:
%%time
# Folder the per-model validation prediction CSVs are read from.
path = '../../reports/predicoes_validacao/'
models_nr = ['ex_preds', 'ex_FN100', 'nr__rio', 'nr__sao_paulo', 'nr__medellin']

# Result containers, all keyed by model name.
preds_nr, feat_corrs_nr = {}, {}
era_scores_nr, df_metrics_nr = {}, {}

for model in models_nr:
    # predictions for val1 & val2, used exactly as stored in the CSV
    print("creating predictions to:", model)
    preds_nr[model] = (
        pd.read_csv(path + model + '_preds_test.csv', index_col='id')
        .values
        .reshape(1, -1)[0]
    )

    # save the metrics (last argument is True for every model in this cell)
    (era_scores_nr[model],
     df_metrics_nr[model],
     feat_corrs_nr[model],
     ex_preds) = metrics.submission_metrics(df_validation, preds_nr[model], model, True)

# dicts to DataFrames
df_preds_nr = pd.DataFrame.from_dict(preds_nr)
df_era_scores_nr = pd.DataFrame.from_dict(era_scores_nr)
df_feat_corrs_nr = pd.DataFrame.from_dict(feat_corrs_nr)
df_metrics_cons_nr = metrics.metrics_consolidated(df_metrics_nr)

creating predictions to: ex_preds
creating predictions to: ex_FN100
creating predictions to: nr__rio
creating predictions to: nr__sao_paulo
creating predictions to: nr__medellin
CPU times: user 3min 26s, sys: 19.4 s, total: 3min 46s
Wall time: 3min 46s


In [17]:
# print("Lower is better:", min_cols)
# Rows: metrics whose Categoria is Performance, Risk or MMC; columns: the models in models_nr.
leaderboard_nr = df_metrics_cons_nr.loc[
    df_metrics_cons_nr.Categoria.isin(["Performance", "Risk", "MMC"]),
    models_nr[0:],
]
(
    leaderboard_nr.astype(float)
    .style
    .apply(visualize.diagnostic_colors)
    .apply(visualize.highlight_max, axis=1)
)

Unnamed: 0_level_0,ex_preds,ex_FN100,nr__rio,nr__sao_paulo,nr__medellin
Metrica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Validation_Sharpe,0.9757,1.2648,1.2787,1.2194,1.3584
Validation_Mean,0.0266,0.0234,0.0235,0.0238,0.0211
Feat_neutral_mean,0.0215,0.0226,0.0214,0.0196,0.0204
Validation_SD,0.0272,0.0185,0.0184,0.0195,0.0156
Feat_exp_max,0.2708,0.0146,0.0299,0.0614,0.0084
Max_Drawdown,-0.0651,-0.027,-0.0373,-0.0417,-0.0203
corr_plus_mmc_sharpe,0.9757,0.991,1.0776,1.005,1.0919
val_mmc_mean,0.0,0.0055,0.0075,0.0041,0.0062
corr_with_example_preds,1.0,0.6078,0.5091,0.681,0.4869


In [49]:
# print("Lower is better:", min_cols)
# Keep the metric rows whose Categoria is one of the listed values and the
# columns of the models in models_nr. (A stray literal `1` that preceded the
# comment above was removed; it had no effect.)
leaderboard_nr = df_metrics_cons_nr[df_metrics_cons_nr.Categoria.isin(["Performance",
                                                                       "Risk", 
                                                                       "MMC", 
                                                                       "MMC_FN", 
                                                                       "Special"])].loc[:,models_nr[0:]]

# Last expression of the cell: the styled table is what gets displayed.
leaderboard_nr.astype(float).style.apply(visualize.highlight_max, axis = 1)

Unnamed: 0_level_0,ex_preds,ex_FN100,nr__rio,nr__sao_paulo,nr__medellin
Metrica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Validation_Sharpe,0.9757,1.2648,1.2787,1.2194,1.3584
Validation_Mean,0.0266,0.0234,0.0235,0.0238,0.0211
Feat_neutral_mean,0.0215,0.0226,0.0214,0.0196,0.0204
Validation_SD,0.0272,0.0185,0.0184,0.0195,0.0156
Feat_exp_max,0.2708,0.0146,0.0299,0.0614,0.0084
Max_Drawdown,-0.0651,-0.027,-0.0373,-0.0417,-0.0203
corr_plus_mmc_sharpe,0.9757,1.2648,1.0776,1.005,1.0919
val_mmc_mean,0.0,0.0,0.0075,0.0041,0.0062
corr_with_example_preds,1.0,1.0,0.5103,0.6815,0.4871
val_mmc_mean_FN,0.0,0.0,0.0036,0.0022,0.0021


In [36]:
visualize.plot_era_scores(df_era_scores_nr, models_nr)

# Modelo NR__Guadalajara


<img src="https://github.com/nicholasrichers/dissertacao/blob/master/references/figures/cidades/nr__guadalajara.jpg?raw=true" width=600 />


In [13]:
%%time
from sklearn.linear_model import LinearRegression

# Folder the per-model validation prediction CSVs are read from.
path = '../../reports/predicoes_validacao/raw/'
models_nr = ['ex_preds', 'ex_FN100', 'nr__rio', 'nr__sao_paulo', 'nr__medellin']

# Result containers, all keyed by model name.
preds_nr, feat_corrs_nr = {}, {}
era_scores_nr, df_metrics_nr = {}, {}

# Defined for parity with the other evaluation cells; the call below takes the
# neutralization model from neutralize.fn_strategy_dict instead.
ml_model = LinearRegression(fit_intercept=False)

for model in models_nr:
    # predictions for val1 & val2
    print("creating predictions to:", model)
    df_validation['preds'] = (
        pd.read_csv(path + model + '_preds_test.csv', index_col='id')
        .values
        .reshape(1, -1)[0]
    )

    # neutralize with the model and factor registered for this model name
    strategy = neutralize.fn_strategy_dict[model]
    preds_nr[model] = neutralize.preds_neutralized(
        df_validation,
        ['preds'],
        features,
        strategy['model'],
        strategy['factor'],
    )

    # save the metrics; the last argument is False only for 'ex_preds'
    mmc = model != 'ex_preds'
    (era_scores_nr[model],
     df_metrics_nr[model],
     feat_corrs_nr[model],
     ex_preds) = metrics.submission_metrics(df_validation, preds_nr[model], model, mmc)

# dicts to DataFrames
df_preds_nr = pd.DataFrame.from_dict(preds_nr)
df_era_scores_nr = pd.DataFrame.from_dict(era_scores_nr)
df_feat_corrs_nr = pd.DataFrame.from_dict(feat_corrs_nr)
df_metrics_cons_nr = metrics.metrics_consolidated(df_metrics_nr)

[autoreload of neutralize failed: Traceback (most recent call last):
  File "/Users/nicholasrichers/Documents/virtualenvs/numerai_env/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/Users/nicholasrichers/Documents/virtualenvs/numerai_env/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/importlib/__init__.py", line 168, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module 'neutralize'
]


creating predictions to: ex_preds
creating predictions to: ex_FN100
creating predictions to: nr__rio
creating predictions to: nr__sao_paulo
creating predictions to: nr__medellin
CPU times: user 4min 6s, sys: 23.3 s, total: 4min 30s
Wall time: 3min 27s


In [14]:
# print("Lower is better:", min_cols) #mixed
# Keep the metric rows whose Categoria is Performance, Risk or MMC and the
# columns of the models in models_nr. (A stray literal `1` that preceded the
# comment above was removed; it had no effect.)
leaderboard_nr = df_metrics_cons_nr[df_metrics_cons_nr.Categoria.isin(["Performance", "Risk", "MMC"])].loc[:,models_nr[:]]
# Last expression of the cell: the styled table is what gets displayed.
leaderboard_nr.astype(float).style.apply(visualize.diagnostic_colors).apply(visualize.highlight_max, axis = 1)

Unnamed: 0_level_0,ex_preds,ex_FN100,nr__rio,nr__sao_paulo,nr__medellin
Metrica,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Validation_Sharpe,0.9757,1.2648,1.2787,1.2194,1.3584
Validation_Mean,0.0266,0.0234,0.0235,0.0238,0.0211
Feat_neutral_mean,0.0215,0.0226,0.0214,0.0196,0.0204
Validation_SD,0.0272,0.0185,0.0184,0.0195,0.0156
Feat_exp_max,0.2708,0.0146,0.0299,0.0614,0.0084
Max_Drawdown,-0.0651,-0.027,-0.0373,-0.0417,-0.0203
corr_plus_mmc_sharpe,0.9757,0.991,1.0776,1.005,1.0919
val_mmc_mean,0.0,0.0055,0.0075,0.0041,0.0062
corr_with_example_preds,1.0,0.6078,0.5091,0.681,0.4869


## Live eras performance

In [34]:
live_dict =  models_dict.sp_dict

In [33]:
live_dict

{'models': {'integration_test': 'ex_preds',
  'nrichers': 'xgb_ranker_ts',
  'nick_richers': 'lgbm_forest',
  'nr__rio': 'nr__rio',
  'nr__sao_paulo': 'nr__sao_paulo'},
 'round': 234}

In [36]:
visualize.plot_live_scores(live_dict['models'], live_dict['round'], base="acum")

[autoreload of visualize failed: Traceback (most recent call last):
  File "/Users/nicholasrichers/Documents/virtualenvs/numerai_env/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/Users/nicholasrichers/Documents/virtualenvs/numerai_env/lib/python3.8/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/importlib/__init__.py", line 168, in reload
    raise ModuleNotFoundError(f"spec not found for the module {name!r}", name=name)
ModuleNotFoundError: spec not found for the module 'visualize'
]


In [37]:
visualize.plot_live_scores(live_dict['models'], live_dict['round'], base="weekly")

# Resumo do modelo (relatorio final)


* bla bla bla

### Para o proximo modelo 

* Diversidade

### Verificando modelos

In [15]:
#%%time
#
#from joblib import dump, load
#path = '../../reports/predicoes_validacao/'
#
#models, res_cv, preds = dict(), dict(), dict()
#models_nr = ['ex_preds', 'ex_FN100', 'nr__rio', 'nr__sao_paulo', 'nr__medellin']
#
#
#for model in models_nr:
#    #load model complete pipe
#    print("creating predictions to:", model)
#    #models[model] = load(file_path + model + '-cv.pkl')
#    #preds[model] = models[model].model.predict(df_validation[features])
#
#    # predictions must have an `id` column and a `prediction_kazutsugi` column
#    predictions_df = df_validation["id"].to_frame()
#    predictions_df[model] = preds_nr[model]
#    predictions_df.to_csv(path+model +"_preds_test.csv", index=False)
#

creating predictions to: ex_preds
creating predictions to: ex_FN100
creating predictions to: nr__rio
creating predictions to: nr__sao_paulo
creating predictions to: nr__medellin
CPU times: user 3.41 s, sys: 209 ms, total: 3.61 s
Wall time: 4.14 s


In [31]:
import glob
from joblib import dump, load

# Alternative location: '/content/dissertacao/models/sao_paulo/'
file_path = '../../models/guadalajara/'

# Every file under file_path whose name ends in "-cv.pkl".
files_path_list = glob.glob(file_path + '*-cv.pkl')

# Map each pickle path to the object stored in it.
# NOTE(review): joblib.load unpickles arbitrary objects -- only point this at trusted files.
models_verify = {}
for file_ in files_path_list:
    models_verify[file_] = load(file_)

In [32]:
models_verify

{'../../models/medellin/xgb_slider20-cv.pkl': <train_model.BuildModel at 0x7fa72ade0df0>,
 '../../models/medellin/lgbm_slider20-cv.pkl': <train_model.BuildModel at 0x7fa7e5867d30>,
 '../../models/medellin/xgb_exp20-cv.pkl': <train_model.TunedModel_Skopt at 0x7fa71cdc7fd0>,
 '../../models/medellin/lgbm_exp20-cv.pkl': <train_model.TunedModel_Skopt at 0x7fa72ae00940>,
 '../../models/medellin/omp_exp20-cv.pkl': <train_model.TunedModel_Skopt at 0x7fa72ae00ca0>}

# Create Slides

In [37]:
!jupyter nbconvert Cap5_Residualizacao.ipynb --to slides  --template output_toggle  --SlidesExporter.reveal_scroll=True --SlidesExporter.reveal_theme=white  #--reveal-prefix=reveal.js --post serve

[NbConvertApp] Converting notebook Cap5_Residualizacao.ipynb to slides
[NbConvertApp] Writing 380367 bytes to Cap5_Residualizacao.slides.html
