# Porto Seguro

Local CV

1. LGBM simples, todas as variáveis, sem otimização, np.nan: 0.6395581804921483
2. LGBM simples, todas as variáveis, setando is_unbalance, np.nan: 0.6594159675776389
3. LGBM simples, todas as variáveis, setando is_unbalance, nulos como -999: 0.6608989474619081

In [2]:
# Libs to deal with tabular data
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 2000)

# Statistics
from scipy.stats import chi2_contingency
from scipy.stats.contingency import expected_freq

# Plotting packages
import seaborn as sns
sns.axes_style("darkgrid")
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn')

# Machine Learning
from xverse.transformer import WOE
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import recall_score, f1_score
from boruta import BorutaPy
from imblearn.under_sampling import RandomUnderSampler
from lightgbm import LGBMClassifier

# Optimization
import optuna
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_param_importances, plot_slice

# To display stuff in notebook
from IPython.display import display, Markdown

# Misc 
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import time
import os
import glob


In [3]:
def cramers_v(var1, var2):
    """Cramér's V association between two categorical variables.

    Builds the contingency table of ``var1`` against ``var2`` with
    ``pd.crosstab`` and returns ``sqrt(chi2 / (n * (min(rows, cols) - 1)))``.
    ``chi2_contingency`` is called with its default arguments.

    Parameters
    ----------
    var1, var2 : array-like
        Equal-length 1-D collections of category values.

    Returns
    -------
    float
        The statistic, or ``np.nan`` when the contingency table has fewer
        than two rows or two columns (the statistic is undefined there).
    """
    cont_freq = pd.crosstab(var1, var2).values
    smallest_dim = min(cont_freq.shape)
    # A constant (or empty) variable makes the denominator zero: report NaN
    # explicitly instead of dividing by zero or failing inside scipy.
    if smallest_dim < 2:
        return np.nan
    n_obs = cont_freq.sum()
    chi2_stat = chi2_contingency(cont_freq)[0]
    return np.sqrt(chi2_stat / (n_obs * (smallest_dim - 1)))

In [4]:
# Read the three raw CSVs: training rows, test rows and the feature metadata table.
train, test, feats = map(
    pd.read_csv,
    ('../data/raw/train.csv', '../data/raw/test.csv', '../data/raw/metadata.csv'),
)

In [5]:
# Short aliases for the variable-type labels found in the metadata.
type_aliases = {
    'Qualitativo nominal': 'quali_nom',
    'Quantitativo discreto': 'quanti_dis',
    'Quantitativo continua': 'quanti_cont',
    'Qualitativo ordinal': 'quali_ord'
}

# Keep only real features (drop the 'id' and 'y' rows), then recode the type column.
is_feature = ~feats['Variavel cod'].isin(['id', 'y'])
feats = feats.loc[is_feature].copy()
feats['Variavel tipo'] = feats['Variavel tipo'].replace(type_aliases)

# Map each type alias to the array of feature codes of that type.
codes_by_type = feats.groupby('Variavel tipo')['Variavel cod'].unique()
feat_type_dict = codes_by_type.to_dict()
feat_type_dict

{'quali_nom': array(['var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8',
        'var9', 'var10', 'var11', 'var12', 'var13', 'var14', 'var15',
        'var16', 'var17', 'var18', 'var19', 'var20', 'var21', 'var22',
        'var23', 'var28', 'var29', 'var30', 'var31', 'var33', 'var34',
        'var35', 'var36', 'var37', 'var38', 'var39', 'var41'], dtype=object),
 'quali_ord': array(['var26', 'var32', 'var42', 'var43'], dtype=object),
 'quanti_cont': array(['var55', 'var56', 'var57', 'var58', 'var59', 'var60', 'var61',
        'var62', 'var63', 'var64', 'var65', 'var66'], dtype=object),
 'quanti_dis': array(['var24', 'var25', 'var27', 'var40', 'var44', 'var45', 'var46',
        'var47', 'var48', 'var49', 'var50', 'var51', 'var52', 'var53',
        'var54', 'var67', 'var68'], dtype=object)}

In [6]:
# (rows, columns) of the train and test frames, in that order.
display(train.shape, test.shape)

(14123, 70)

(21183, 69)

# Análise

- 68 variáveis 
- train 14123 linhas 
- test 21183 linhas
- 20% eventos
- Mesma quantidade de missing: (65,66), (26,58), (9,52), (15,16,17,18), (2,3)

In [None]:
# Turn the -999 sentinel into NaN in both splits so that isnull() sees it.
train, test = (frame.replace({-999: np.nan}) for frame in (train, test))

# Fraction of missing values per column in each split, plus the absolute train/test gap.
nulos = pd.concat([train.isnull().mean(), test.isnull().mean()], axis=1, keys=['train', 'test'])
nulos = nulos.assign(diff=(nulos['train'] - nulos['test']).abs())
nulos.sort_values('train', ascending=False)
# Alternative view: nulos.sort_values('diff', ascending=False)

## Quanti cont

- Algumas bem correlacionadas entre si, tanto positivo quanto negativo
- 8 tem nulos, 2 com a mesma quantidade de nulos
- vars não padronizadas, escaladas entre 0 e 1
- tem mesma média e desvio padrão no teste

In [None]:
# Ten random training rows restricted to the continuous features.
train.loc[:, feat_type_dict['quanti_cont']].sample(n=10)

In [None]:
# One histogram per continuous feature, split by the target.
for col in feat_type_dict['quanti_cont']:
    ax = sns.histplot(data=train, x=col, hue='y')
    ax.set_title(col)
    plt.show()

In [None]:
# Number of distinct values per continuous feature.
train.loc[:, feat_type_dict['quanti_cont']].nunique()

In [None]:
# Fraction of missing values per continuous feature.
train.loc[:, feat_type_dict['quanti_cont']].isna().mean()

In [None]:
# Pairwise correlation matrix of the continuous features, drawn as a heatmap.
quanti_cont_corr = train[feat_type_dict['quanti_cont']].corr()
sns.heatmap(quanti_cont_corr)

In [None]:
# Summary statistics of the continuous features, train stacked on top of test.
pd.concat(
    [frame[feat_type_dict['quanti_cont']].describe() for frame in (train, test)],
    axis=0,
    keys=['train', 'test'],
)

In [None]:
# Spearman correlation of each continuous feature with the target, ascending.
quanti_cont_vs_y = train[feat_type_dict['quanti_cont']].corrwith(train['y'], method='spearman')
quanti_cont_vs_y.sort_values()

## Quanti dis

- 15 são discretas, 2 parecem contínuas
- mesmas distribuições no teste
- o teste tem alguns valores máximos que não estão no treino

In [None]:
# Ten random training rows restricted to the discrete features.
train.loc[:, feat_type_dict['quanti_dis']].sample(n=10)

In [None]:
# Fraction of missing values per discrete feature.
train.loc[:, feat_type_dict['quanti_dis']].isna().mean()

In [None]:
# Number of distinct values per discrete feature.
train.loc[:, feat_type_dict['quanti_dis']].nunique()

In [None]:
# One count plot per discrete feature, split by the target.
for col in feat_type_dict['quanti_dis']:
    ax = sns.countplot(data=train, x=col, hue='y')
    ax.set_title(col)
    plt.show()

In [None]:
# Summary statistics of the discrete features, train stacked on top of test.
pd.concat(
    [frame[feat_type_dict['quanti_dis']].describe() for frame in (train, test)],
    axis=0,
    keys=['train', 'test'],
)

In [None]:
# Cramér's V of each discrete feature against the target, strongest first.
quanti_dis_vs_y = train[feat_type_dict['quanti_dis']].corrwith(train['y'], method=cramers_v)
quanti_dis_vs_y.sort_values(ascending=False)

## Quali nom

- treino e teste similares
- algumas variáveis não parecem qualitativas nominais.
- a % máxima de missing é de 15%
- a maioria das variáveis tem muitas categorias
- 2 variáveis têm pouquíssimos valores repetidos (muitos níveis)

In [None]:
# Ten random training rows restricted to the nominal features.
train.loc[:, feat_type_dict['quali_nom']].sample(n=10)

In [None]:
# Fraction of missing values per nominal feature, highest first.
quali_nom_missing = train[feat_type_dict['quali_nom']].isna().mean()
quali_nom_missing.sort_values(ascending=False)

In [None]:
# Number of distinct values per nominal feature.
train.loc[:, feat_type_dict['quali_nom']].nunique()

In [None]:
# Distinct values divided by non-null observations for each nominal feature,
# highest first. A ratio close to 1 means almost every row has its own level.
# (This cell previously repeated the plain nunique() of the cell above.)
quali_nom_frame = train[feat_type_dict['quali_nom']]
(quali_nom_frame.nunique() / quali_nom_frame.count()).sort_values(ascending=False)

In [None]:
# One histogram per nominal feature, split by the target.
for col in feat_type_dict['quali_nom']:
    ax = sns.histplot(data=train, x=col, hue='y')
    ax.set_title(col)
    plt.show()

In [None]:
# Summary statistics of the nominal features, train stacked on top of test.
pd.concat(
    [frame[feat_type_dict['quali_nom']].describe() for frame in (train, test)],
    axis=0,
    keys=['train', 'test'],
)

In [None]:
# Nominal codes have no order, so a rank correlation (Spearman) is not meaningful
# here. Use Cramér's V, as the other categorical groups in this notebook do.
quali_nom_vs_y = train[feat_type_dict['quali_nom']].corrwith(train['y'], method=cramers_v)
quali_nom_vs_y.sort_values(ascending=False)

## Quali ord

In [None]:
# Ten random training rows restricted to the ordinal features.
train.loc[:, feat_type_dict['quali_ord']].sample(n=10)

In [None]:
# Fraction of missing values per ordinal feature.
train.loc[:, feat_type_dict['quali_ord']].isna().mean()

In [None]:
# Number of distinct values per ordinal feature.
train.loc[:, feat_type_dict['quali_ord']].nunique()

In [None]:
# One count plot per ordinal feature, split by the target.
for col in feat_type_dict['quali_ord']:
    ax = sns.countplot(data=train, x=col, hue='y')
    ax.set_title(col)
    plt.show()

In [None]:
# Summary statistics of the ordinal features, train stacked on top of test.
pd.concat(
    [frame[feat_type_dict['quali_ord']].describe() for frame in (train, test)],
    axis=0,
    keys=['train', 'test'],
)

In [None]:
# Cramér's V of each ordinal feature against the target, strongest first.
quali_ord_vs_y = train[feat_type_dict['quali_ord']].corrwith(train['y'], method=cramers_v)
quali_ord_vs_y.sort_values(ascending=False)

# Feature selection

In [7]:
# Target vector and feature matrix (everything except the row id and the target).
y_train = train['y']
X_train = train.drop(columns=['id', 'y'])

In [None]:
# Fit the Weight-of-Evidence transformer on the full training set; its iv_df
# attribute is read in the following cells.
woe = WOE()
woe.fit(X_train, y_train)

In [None]:
# Feature names ordered from highest to lowest Information Value.
iv_ranked = woe.iv_df.sort_values('Information_Value', ascending=False)
iv_importance = iv_ranked['Variable_Name'].tolist()
woe.iv_df

In [None]:
# Forward selection by Information Value: for k = 1..n_features, train a default
# LGBM on the top-k IV features and record its mean 5-fold validation F1.
# The splitter is seeded, so every k is evaluated on the same folds.
cv = StratifiedKFold(5, random_state = 42, shuffle=True)
val_scores = []
for idx in tqdm(range(1, len(woe.iv_df)+1)):
    X_train_red = X_train.loc[:, iv_importance[:idx]]
    cv_scores = []
    for train_index, val_index in cv.split(X_train_red, y_train):
        # split() yields positional indices, so index with .iloc (not label-based .loc).
        X_train_kf, X_val = X_train_red.iloc[train_index], X_train_red.iloc[val_index]
        y_train_kf, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        clf = LGBMClassifier(random_state=42, n_jobs=-1).fit(X_train_kf, y_train_kf)
        cv_scores.append(f1_score(y_val, clf.predict(X_val)))
    # Average over however many folds actually ran instead of a hard-coded 5.
    val_scores.append(sum(cv_scores) / len(cv_scores))

In [None]:
# Mean validation F1 as a function of the number of top-IV features used.
n_feats_used = range(1, len(woe.iv_df)+1)
sns.lineplot(x=n_feats_used, y=val_scores)
plt.show()

# Hyperparameters optimization

In [None]:
def lgbm_f1_score(y_true, y_pred):
    """Custom evaluation metric for LightGBM: F1 of the rounded predictions.

    ``y_pred`` is rounded to the nearest integer before scoring
    (presumably it holds positive-class probabilities — confirm against the
    LightGBM version in use).

    Returns
    -------
    tuple
        ``('f1', score, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    hard_labels = np.round(y_pred)
    score = f1_score(y_true, hard_labels)
    return 'f1', score, True

def lgbm_recall_score(y_true, y_pred):
    """Custom evaluation metric for LightGBM: recall of the rounded predictions.

    ``y_pred`` is rounded to the nearest integer before scoring
    (presumably it holds positive-class probabilities — confirm against the
    LightGBM version in use).

    Returns
    -------
    tuple
        ``('recall', score, True)``; the final ``True`` marks the metric as
        higher-is-better.
    """
    hard_labels = np.round(y_pred)
    score = recall_score(y_true, hard_labels)
    return 'recall', score, True

class Light_GBM_CV:
    """Optuna objective: mean validation F1 of an LGBMClassifier under stratified K-fold CV.

    An instance is callable with an Optuna ``trial``; each call samples one
    hyper-parameter configuration, fits it on every fold with early stopping
    and returns the average of the per-fold best validation F1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.Series
        Binary target aligned with ``X``.
    folds : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed shared by the fold splitter and the classifier.
    """

    def __init__(self, X, y, folds=5, random_state=42):
        self.X = X
        self.y = y
        self.folds = folds
        self.random_state = random_state

    def __call__(self, trial):
        """Evaluate one sampled configuration and return its mean validation F1."""
        cv = StratifiedKFold(
            self.folds,
            random_state = self.random_state,
            shuffle=True
        )

        clf = LGBMClassifier(
            boosting_type = 'gbdt',
            objective = 'binary',
            random_state = self.random_state,
            num_leaves = trial.suggest_int('num_leaves', 2, 500),
            max_depth = trial.suggest_int('max_depth', 2, 500),
            # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
            learning_rate = trial.suggest_float('learning_rate', 1e-5, 1.0, log=True),
            min_child_samples = trial.suggest_int('min_child_samples', 5, 1000),
            n_estimators = trial.suggest_int('n_estimators', 10, 1000),
            lambda_l1 = trial.suggest_float('lambda_l1', 1e-5, 1.0, log=True),
            lambda_l2 = trial.suggest_float('lambda_l2', 1e-5, 1.0, log=True),
            max_bin = trial.suggest_int('max_bin', 10, 1000),
            feature_fraction = trial.suggest_float('feature_fraction', 0.1, 1),
            bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 1),
            # LightGBM only applies bagging_fraction when bagging_freq is non-zero,
            # so tune the frequency too; otherwise bagging_fraction has no effect.
            bagging_freq = trial.suggest_int('bagging_freq', 1, 10),
            is_unbalance = trial.suggest_categorical('is_unbalance', [False, True])
            # Not tuned: scale_pos_weight = trial.suggest_float('scale_pos_weight', 0.01, 1)
        )

        cv_scores = []

        for train_index, val_index in cv.split(self.X, self.y):
            # split() yields positional indices, so index with .iloc (not label-based .loc).
            X_train, X_val = self.X.iloc[train_index], self.X.iloc[val_index]
            y_train, y_val = self.y.iloc[train_index], self.y.iloc[val_index]

            clf.fit(
                X_train, y_train,
                # First entry is the validation fold ('valid_0', read below);
                # the second re-evaluates on this fold's own training rows.
                eval_set = [(X_val, y_val), (X_train, y_train)],
                eval_metric = [lgbm_f1_score, lgbm_recall_score, 'auc'],
                early_stopping_rounds = 10,
                verbose = False,
                categorical_feature = feat_type_dict['quali_nom'].tolist()
            )
            cv_scores.append(clf.best_score_['valid_0']['f1'])

        return sum(cv_scores) / len(cv_scores)

In [None]:
%%time
# Search the LightGBM hyper-parameters with a seeded TPE sampler, maximising
# the mean validation F1 returned by Light_GBM_CV.
lgbm_cv = Light_GBM_CV(X_train, y_train)
study = optuna.create_study(sampler=TPESampler(seed = 42), direction='maximize')
study.optimize(lgbm_cv, n_trials=50)

print('Best model')
print('Mean validation F1: ', study.best_value, '\n')
# A bare expression in the middle of a cell is discarded; display it explicitly.
display(study.best_params)

# This figure needs the optional plotly package to be installed.
plot_optimization_history(study)

In [None]:
%%time

# Refit the best Optuna configuration on each of 5 stratified folds, keeping
# every fitted model and its validation F1.
models_list = []
cv_scores = []

kf = StratifiedKFold(n_splits=5, random_state=24, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print("Fold :", fold+1)

    # Dataset creation: split() yields positional indices, so use .iloc for
    # both the frame and the target.
    X_train_kf, y_train_kf = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_valid, y_valid = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # Modelling
    model = LGBMClassifier(
        objective = "binary",
        boosting_type = "gbdt",
        importance_type = 'gain',
        random_state = 42,
        **study.best_params
    )

    model.fit(
        X_train_kf, y_train_kf,
        # Monitor the validation fold and this fold's training rows. The full
        # (X_train, y_train) used before also contained the validation rows.
        eval_set = [(X_valid, y_valid), (X_train_kf, y_train_kf)],
        eval_metric = [lgbm_f1_score, lgbm_recall_score, 'auc'],
        early_stopping_rounds = 10,
        verbose = 10,
        categorical_feature = feat_type_dict['quali_nom'].tolist()
    )

    # validation: best F1 recorded on the first eval set (the validation fold)
    f1_val = model.best_score_['valid_0']['f1']
    print(f'Performance fold #{fold+1}: {f1_val}')

    # keep scores and models
    cv_scores.append(f1_val)
    models_list.append(model)
    print("*" * 100)

In [None]:
# Mean of the per-fold scores, followed by the individual fold values.
mean_cv_score = pd.Series(cv_scores).mean()
print('CV score:', mean_cv_score)
cv_scores

In [None]:
# Average the feature importances over the fold models: one row per model,
# each row scaled to sum to 1, then the column-wise mean.
raw_imp_vetors = [np.reshape(model.feature_importances_, (1, -1)) for model in models_list]
raw_imp_matrix = np.vstack(raw_imp_vetors)
norm_imp = raw_imp_matrix / raw_imp_matrix.sum(axis=1, keepdims=True)
mean_imp = norm_imp.mean(axis=0)
imp_series = pd.Series(mean_imp, index=X_train.columns).sort_values(ascending=False)

In [None]:
imp_series

# Simple LGBM

In [8]:
# Baseline: default LGBM with is_unbalance=True, scored by F1 over 5 stratified folds.
models_list = []
cv_scores = []
cv = StratifiedKFold(5, random_state = 42, shuffle=True)
# split() is a generator with no length; give tqdm the fold count so the bar
# shows real progress.
for train_index, val_index in tqdm(cv.split(X_train, y_train), total=cv.get_n_splits()):
    # split() yields positional indices, so index with .iloc (not label-based .loc).
    X_train_kf, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_kf, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    clf = LGBMClassifier(random_state=42, n_jobs=-1, importance_type='gain', is_unbalance=True).fit(X_train_kf, y_train_kf)
    cv_scores.append(f1_score(y_val, clf.predict(X_val)))
    models_list.append(clf)
display(cv_scores)
print(pd.Series(cv_scores).mean())

0it [00:00, ?it/s]

[0.6252983293556087,
 0.6661143330571666,
 0.6753670473083198,
 0.6582278481012659,
 0.6794871794871795]

0.6608989474619081


In [None]:
# Average the feature importances over the fold models: one row per model,
# each row scaled to sum to 1, then the column-wise mean.
raw_imp_vetors = [np.reshape(model.feature_importances_, (1, -1)) for model in models_list]
raw_imp_matrix = np.vstack(raw_imp_vetors)
norm_imp = raw_imp_matrix / raw_imp_matrix.sum(axis=1, keepdims=True)
mean_imp = norm_imp.mean(axis=0)
imp_series = pd.Series(mean_imp, index=X_train.columns).sort_values(ascending=False)
imp_series

# Random undersampling

In [9]:
def rus_objective(trial):
    """Optuna objective: mean 5-fold F1 of a default LGBM trained on undersampled folds.

    For each stratified fold, the training part is resampled with
    ``RandomUnderSampler`` and the model is scored on the untouched
    validation part. Reads the notebook globals ``X_train`` and ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``sampling_strategy`` (float in [0.253, 1]) and
        ``is_unbalance`` (bool).

    Returns
    -------
    float
        Mean validation F1 across the folds.
    """
    # Sample the trial parameters once, not once per fold.
    # NOTE(review): the 0.253 lower bound presumably sits just above the
    # minority/majority ratio of the training folds — confirm.
    sampling_strategy = trial.suggest_float('sampling_strategy', 0.253, 1)
    is_unbalance = trial.suggest_categorical('is_unbalance', [False, True])

    cv_scores = []
    cv = StratifiedKFold(5, random_state = 42, shuffle=True)
    for train_index, val_index in cv.split(X_train, y_train):
        # split() yields positional indices, so index with .iloc (not label-based .loc).
        X_train_kf, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_kf, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        rus = RandomUnderSampler(
            sampling_strategy = sampling_strategy,
            random_state=42
        )
        X_train_res, y_train_res = rus.fit_resample(X_train_kf, y_train_kf)
        clf = LGBMClassifier(
            random_state=42,
            n_jobs=-1,
            importance_type='gain',
            is_unbalance=is_unbalance
        ).fit(X_train_res, y_train_res)
        cv_scores.append(f1_score(y_val, clf.predict(X_val)))
    return pd.Series(cv_scores).mean()

# Search the undersampling ratio and is_unbalance flag with a seeded TPE
# sampler, maximising the mean validation F1 returned by rus_objective.
study = optuna.create_study(sampler=TPESampler(seed = 42), direction='maximize')
study.optimize(rus_objective, n_trials=50)

print('Best model')
print('Mean validation F1: ', study.best_value, '\n')
# A bare expression in the middle of a cell is discarded; display it explicitly.
display(study.best_params)

# This figure needs the optional plotly package to be installed.
plot_optimization_history(study)

[32m[I 2021-08-29 20:30:26,846][0m A new study created in memory with name: no-name-98742bc5-c8c6-44c0-bb94-0a3a52122759[0m
[32m[I 2021-08-29 20:30:28,016][0m Trial 0 finished with value: 0.6588147055392923 and parameters: {'sampling_strategy': 0.5327814687789798, 'is_unbalance': False}. Best is trial 0 with value: 0.6588147055392923.[0m
[32m[I 2021-08-29 20:30:29,026][0m Trial 1 finished with value: 0.6510173190403088 and parameters: {'sampling_strategy': 0.7001978876951863, 'is_unbalance': False}. Best is trial 0 with value: 0.6588147055392923.[0m
[32m[I 2021-08-29 20:30:30,237][0m Trial 2 finished with value: 0.6469987836303699 and parameters: {'sampling_strategy': 0.296388458289645, 'is_unbalance': False}. Best is trial 0 with value: 0.6588147055392923.[0m
[32m[I 2021-08-29 20:30:31,127][0m Trial 3 finished with value: 0.6393078762395825 and parameters: {'sampling_strategy': 0.781930215613646, 'is_unbalance': True}. Best is trial 0 with value: 0.6588147055392923.[0m


[32m[I 2021-08-29 20:31:16,927][0m Trial 37 finished with value: 0.662642221334908 and parameters: {'sampling_strategy': 0.29809974740039974, 'is_unbalance': True}. Best is trial 17 with value: 0.6632726221373149.[0m
[32m[I 2021-08-29 20:31:18,347][0m Trial 38 finished with value: 0.6562757843577475 and parameters: {'sampling_strategy': 0.43211370276204886, 'is_unbalance': False}. Best is trial 17 with value: 0.6632726221373149.[0m
[32m[I 2021-08-29 20:31:19,548][0m Trial 39 finished with value: 0.6354453690233525 and parameters: {'sampling_strategy': 0.8235193040013951, 'is_unbalance': True}. Best is trial 17 with value: 0.6632726221373149.[0m
[32m[I 2021-08-29 20:31:20,994][0m Trial 40 finished with value: 0.6608583461050834 and parameters: {'sampling_strategy': 0.3467512134402687, 'is_unbalance': False}. Best is trial 17 with value: 0.6632726221373149.[0m
[32m[I 2021-08-29 20:31:22,528][0m Trial 41 finished with value: 0.6613614292998722 and parameters: {'sampling_stra

Best model
Mean validation F1:  0.6632726221373149 



ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [10]:
# Refit with the best undersampling settings found by the study: undersample
# each training fold, fit a default LGBM, score F1 on the untouched validation fold.
models_list = []
cv_scores = []
cv = StratifiedKFold(5, random_state = 42, shuffle=True)
# split() is a generator with no length; give tqdm the fold count so the bar
# shows real progress.
for train_index, val_index in tqdm(cv.split(X_train, y_train), total=cv.get_n_splits()):
    # split() yields positional indices, so index with .iloc (not label-based .loc).
    X_train_kf, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_kf, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    rus = RandomUnderSampler(sampling_strategy = study.best_params['sampling_strategy'], random_state=42)
    X_train_res, y_train_res = rus.fit_resample(X_train_kf, y_train_kf)
    clf = LGBMClassifier(
        random_state=42,
        n_jobs=-1,
        importance_type='gain',
        is_unbalance=study.best_params['is_unbalance']
    ).fit(X_train_res, y_train_res)
    cv_scores.append(f1_score(y_val, clf.predict(X_val)))
    models_list.append(clf)
display(cv_scores)
print(pd.Series(cv_scores).mean())

SyntaxError: invalid syntax (<ipython-input-10-98982f601c2b>, line 10)

In [None]:
# Average the feature importances over the fold models: one row per model,
# each row scaled to sum to 1, then the column-wise mean.
raw_imp_vetors = [np.reshape(model.feature_importances_, (1, -1)) for model in models_list]
raw_imp_matrix = np.vstack(raw_imp_vetors)
norm_imp = raw_imp_matrix / raw_imp_matrix.sum(axis=1, keepdims=True)
mean_imp = norm_imp.mean(axis=0)
imp_series = pd.Series(mean_imp, index=X_train.columns).sort_values(ascending=False)
imp_series

# Submission

In [None]:
# Select exactly the training features, in training order, so the models see
# the same column layout they were fitted on. A missing column raises KeyError
# instead of being scored silently in the wrong position.
X_test = test[X_train.columns]
submission = test[['id']]

# Scoring ensemble: each fold model casts a hard 0/1 vote and the votes are
# averaged; rounding the average gives the majority class.
target = np.zeros(len(X_test))
for model in models_list:
    pred = model.predict(X_test, num_iteration=model.best_iteration_)
    target += pred / len(models_list)

submission = submission.assign(predicted = np.round(target)).astype('int')

In [None]:
# Preview the first five submission rows.
submission.head(n=5)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='submission.csv', index=False)