In [1]:
import pandas as pd
import numpy as np
import gc

import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, auc, precision_score, recall_score
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler, RobustScaler

# Random seed shared by the CV splitter and the LightGBM model.
SEED = 1996
# Number of stratified cross-validation folds.
FOLDS = 5
# Name of the target column in the training data.
TARGET = "y"
# Tag embedded in the file names of the saved predictions.
VERSION = "lgbm-v5"

# Load Data

In [2]:
# Labelled training set: `id`, the var* features and the target column.
df_train = pd.read_csv("../data/train.csv")
df_train.head()

Unnamed: 0,id,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var60,var61,var62,var63,var64,var65,var66,var67,var68,y
0,1,18,19,2853,29442,1386,2435,35,-999,3,...,0.311441,0.142303,0.056146,0.632694,0.024054,0.253356,0.00603,0.132353,0.139706,1
1,8,4,110,1986,13684,7189,-999,-999,17,3,...,-999.0,-999.0,0.070991,0.773966,0.019315,-999.0,-999.0,0.147059,0.106618,0
2,30,0,39,1019,10232,678,791,16,-999,3,...,-999.0,0.200814,0.051046,0.980827,0.018536,-999.0,-999.0,0.382353,0.242647,0
3,43,20,39,1751,2689,8235,1042,13,10,1,...,-999.0,0.352379,0.044301,0.951564,0.023684,0.36337,0.00201,0.147059,0.132353,0
4,46,7,44,2262,29428,6031,304,16,-999,3,...,0.021226,0.226161,0.059125,0.906155,0.020733,-999.0,-999.0,0.455882,0.132353,1


In [3]:
# Unlabelled test set to be scored (same columns as train, without the target).
df_test = pd.read_csv("../data/test.csv")
df_test.head()

Unnamed: 0,id,var1,var2,var3,var4,var5,var6,var7,var8,var9,...,var59,var60,var61,var62,var63,var64,var65,var66,var67,var68
0,0,5,126,1353,28956,743,1289,27,-999,1,...,0.201839,0.353965,0.166641,0.049108,0.986882,0.016683,-999.0,-999.0,0.176471,0.253676
1,2,6,126,1446,7803,5151,935,35,-999,3,...,0.072127,0.074555,0.217009,0.144403,0.892028,0.038323,-999.0,-999.0,0.147059,0.099265
2,4,5,44,243,4325,1109,1903,33,24,1,...,0.32477,0.384992,0.33068,0.072864,0.930373,0.021052,-999.0,-999.0,0.294118,0.136029
3,7,4,53,419,743,7750,183,35,-999,3,...,0.13107,-999.0,0.244936,0.158088,0.986882,0.022649,-999.0,-999.0,0.294118,0.220588
4,15,4,126,1863,22693,5625,965,9,-999,3,...,0.225166,0.05994,0.252794,0.080405,0.944501,0.021806,-999.0,-999.0,0.352941,0.113971


In [4]:
# Sample submission; its `id` / `predicted` columns define the output layout.
df_sub = pd.read_csv("../data/submission_sample.csv")
df_sub.head()

Unnamed: 0,id,predicted
0,0,1
1,2,1
2,4,1
3,7,0
4,15,1


In [5]:
# Per-variable metadata: the variable code and its statistical type.
df_meta = pd.read_csv("../data/metadata.csv")
df_meta.head()

Unnamed: 0,Variavel cod,Variavel tipo
0,id,Qualitativo nominal
1,var1,Qualitativo nominal
2,var2,Qualitativo nominal
3,var3,Qualitativo nominal
4,var4,Qualitativo nominal


In [6]:
# How many variables fall into each statistical type.
df_meta["Variavel tipo"].value_counts()

Qualitativo nominal      36
Quantitativo discreto    18
Quantitativo continua    12
Qualitativo ordinal       4
Name: Variavel tipo, dtype: int64

## Vars type

- **Variável qualitativa nominal** = valores que expressam atributos, sem
nenhum tipo de ordem. Ex: cor dos olhos, sexo, estado civil, presença ou
ausência...


- **Variável qualitativa ordinal** = valores que expressam atributos, porém com
algum tipo de ordem, ou grau. Ex: grau de escolaridade (1º grau, 2º grau, 3º
grau, pós-graduação...); resposta de um paciente (nenhuma melhora, alguma
melhora, muita melhora); classe social (alta, média, baixa)...


- **Variável quantitativa discreta** = valores observados somente em pontos
isolados ao longo de uma escala de valores (contagem). Valores positivos
inteiros (incluindo o zero). Ex: No de filhos; No de faltas; alunos com notas abaixo de 5,0.


- **Variável quantitativa contínua** = valores em qualquer ponto fracionário ao
longo de um intervalo especificado de valores (medição). Ex: temperatura do
corpo; altura (em metros); índice do PIB...

In [7]:
# Map each variable type to the list of variable codes of that type.
# drop(0) removes the first metadata row, which describes `id` rather than a feature.
cols_type = df_meta.drop(0).groupby("Variavel tipo")["Variavel cod"].apply(list)
cols_type

Variavel tipo
Qualitativo nominal      [var1, var2, var3, var4, var5, var6, var7, var...
Qualitativo ordinal                           [var26, var32, var42, var43]
Quantitativo continua    [var55, var56, var57, var58, var59, var60, var...
Quantitativo discreto    [var24, var25, var27, var40, var44, var45, var...
Name: Variavel cod, dtype: object

# Features

## Features ideas

There are many approaches to feature engineering, which could be also used on anonymized data:

- transformation of continuous variables: log, power, normalization;
- aggregations: `df.groupby(['cat_column']).agg({'continuous_column': ['min', 'max', 'mean', 'std']})`;
- interactions of continuous variables: addition, subtraction, multiplication, division, and so on.

In [8]:
# df_temp = df_train.drop(["id", TARGET], axis=1).apply(np.log)
# df_temp.columns = [f"{col}_log" for col in df_temp.columns]
# df_train = pd.concat([df_train, df_temp], axis=1)

# df_temp = df_train.drop(["id", TARGET], axis=1).apply(lambda x: np.power(x, 2))
# df_temp.columns = [f"{col}_pow2" for col in df_temp.columns]
# df_train = pd.concat([df_train, df_temp], axis=1)

# print(df_train.shape)
# df_train.tail()

In [9]:
# df_temp = df_test.drop("id", axis=1).apply(np.log)
# df_temp.columns = [f"{col}_log" for col in df_temp.columns]
# df_test = pd.concat([df_test, df_temp], axis=1)

# df_temp = df_test.drop("id", axis=1).apply(lambda x: np.power(x, 2))
# df_temp.columns = [f"{col}_pow2" for col in df_temp.columns]
# df_test = pd.concat([df_test, df_temp], axis=1)

# print(df_test.shape)
# df_test.tail()

In [10]:
# max_abs_scl = MaxAbsScaler()
# min_max_scl = MinMaxScaler()
# std_scl = StandardScaler()
# robust_scaler = RobustScaler()

# scalers = [
#     (max_abs_scl, "max_abs"),
#     (min_max_scl, "min_max"),
#     (std_scl, "std_scl"),
#     (robust_scaler, "rob_scl")
# ]
# cols_to_drop = [ft for ft in df_train.columns if "_" in ft]
# # cols_to_drop.append("y")
# # cols_to_drop.append("id")

# for scl in scalers:
#     df_train_temp = df_train.drop(cols_to_drop + [TARGET], axis=1)
#     df_test_temp = df_test.drop(cols_to_drop, axis=1)
#     scl[0].fit(pd.concat([df_train_temp, df_test_temp], axis=0))
    
#     new_cols = [f"{col}_{scl[1]}" for col in df_train_temp.columns]
#     df_train[new_cols] = scl[0].transform(df_train_temp)
    
#     new_cols = [f"{col}_{scl[1]}" for col in df_test_temp.columns]
#     df_test[new_cols] = scl[0].transform(df_test_temp)

In [11]:
# Shapes of the raw frames, before any derived feature is added.
df_train.shape, df_test.shape

((14123, 70), (21183, 69))

## 'has one feat' features from kaggle

https://www.kaggle.com/c/santander-customer-transaction-prediction/discussion/89003
https://www.kaggle.com/fl2ooo/create-data

In [12]:
# Reminder of which variables belong to each statistical type.
cols_type

Variavel tipo
Qualitativo nominal      [var1, var2, var3, var4, var5, var6, var7, var...
Qualitativo ordinal                           [var26, var32, var42, var43]
Quantitativo continua    [var55, var56, var57, var58, var59, var60, var...
Quantitativo discreto    [var24, var25, var27, var40, var44, var45, var...
Name: Variavel cod, dtype: object

In [25]:
# Number of variables per type (the `id` row was already dropped from the metadata).
cols_type.apply(len)

Variavel tipo
Qualitativo nominal      35
Qualitativo ordinal       4
Quantitativo continua    12
Quantitativo discreto    18
Name: Variavel cod, dtype: int64

In [14]:
from tqdm import tqdm

# Snapshot of the raw test columns (this includes `id`); every derived feature
# in the following cells is built once per entry of `orig`.
orig = df_test.columns
has_one = [f'{col}_has_one' for col in orig]
has_zero = [f'{col}_has_zero' for col in orig]
# Fixed: interpolate the column name. The previous `{orig}` formatted the whole
# Index into every entry, so no entry matched a real `<col>_not_unique` column.
not_u = [f'{col}_not_unique' for col in orig]

# For each column, flag the test rows whose value occurs exactly once in the test set.
for f in tqdm(orig):
    counts = df_test[f].value_counts()
    singletons = counts.index[counts == 1]
    df_test[f + '_u'] = df_test[f].isin(singletons)

100%|██████████| 69/69 [00:00<00:00, 219.57it/s]


In [15]:
# A row counts as having a unique value when any of its feature columns holds a
# value that occurs only once in the test set.
# `id` is excluded: it is a row identifier whose values are all distinct, so its
# flag is True on every row and made `has_unique` trivially all-True.
flag_cols = [f + '_u' for f in orig if f != 'id']
df_test['has_unique'] = df_test[flag_cols].any(axis=1)
print(df_test['has_unique'].sum())

21183


In [16]:
# Reference pool used for value counts later on: every training row stacked on
# top of the test rows flagged by `has_unique` (original columns only).
real_samples = df_test.loc[df_test['has_unique']][orig]
ref = pd.concat((df_train, real_samples))
print(ref.shape)

(35306, 70)


In [17]:
# For every original column, encode whether a training row's value is also seen
# in the positive class (`_has_one`) and in the negative class (`_has_zero`).
# Within a row's own class the value must occur more than once, so the row's own
# occurrence does not count as evidence.
for f in tqdm(orig):
    df_train[f + '_has_one'] = 0
    df_train[f + '_has_zero'] = 0
    # Value counts of this column among the target == 1 rows.
    f_1 = df_train.loc[df_train[TARGET] == 1, f].value_counts()
    
    # Values occurring more than once among positives (applied to positive rows) ...
    f_1_1 = set(f_1.index[f_1 > 1])
    # ... and values occurring at least once among positives (applied to negative rows).
    f_0_1 = set(f_1.index[f_1 > 0])

    # Same two sets, built from the target == 0 rows.
    f_0 = df_train.loc[df_train[TARGET] == 0, f].value_counts()
    f_0_0 = set(f_0.index[f_0 > 1])
    f_1_0 = set(f_0.index[f_0 > 0])
    
    # `_has_one`: is the value seen among positives?
    df_train.loc[df_train[TARGET] == 1, f + '_has_one'] = df_train.loc[df_train[TARGET] == 1, f].isin(f_1_1).astype(int)
    df_train.loc[df_train[TARGET] == 0, f + '_has_one'] = df_train.loc[df_train[TARGET] == 0, f].isin(f_0_1).astype(int)

    # `_has_zero`: is the value seen among negatives?
    df_train.loc[df_train[TARGET] == 1, f + '_has_zero'] = df_train.loc[df_train[TARGET] == 1, f].isin(f_1_0).astype(int)
    df_train.loc[df_train[TARGET] == 0, f + '_has_zero'] = df_train.loc[df_train[TARGET] == 0, f].isin(f_0_0).astype(int)

# Pack both flags into `_has_one` as 2 * has_one + has_zero (values 0..3);
# the `_has_zero` columns are left unchanged.
df_train.loc[:, has_one] = 2*df_train.loc[:, has_one].values + df_train.loc[:, has_zero].values

100%|██████████| 69/69 [00:02<00:00, 25.47it/s]


In [18]:
# Test-side counterpart of the train encoding: a test value is marked when it
# appears anywhere among the positive / negative training rows.
pos_rows = df_train[TARGET] == 1
neg_rows = df_train[TARGET] == 0

for f in tqdm(orig):
    one_col, zero_col = f + '_has_one', f + '_has_zero'
    df_test[one_col] = 0
    df_test[zero_col] = 0
    seen_in_pos = df_train.loc[pos_rows, f].unique()
    seen_in_neg = df_train.loc[neg_rows, f].unique()
    df_test.loc[:, one_col] = df_test[f].isin(seen_in_pos).astype(int)
    df_test.loc[:, zero_col] = df_test[f].isin(seen_in_neg).astype(int)

# Pack both flags into `_has_one` as 2 * has_one + has_zero.
df_test.loc[:, has_one] = 2 * df_test[has_one].values + df_test[has_zero].values

100%|██████████| 69/69 [00:01<00:00, 41.52it/s]


In [19]:
# Build `<col>_not_unique`: keep the value when it is shared inside the reference
# pool, otherwise substitute the training mean of the column. Rows holding a
# value that is not shared additionally get `_has_one` set to the code 4.
for f in tqdm(orig):
    ref_counts = ref[f].value_counts()
    shared_values = ref_counts.index[ref_counts != 1]
    train_mean = df_train[f].mean()

    train_shared = df_train[f].isin(shared_values)
    test_shared = df_test[f].isin(shared_values)

    df_train[f + '_not_unique'] = train_shared * df_train[f] + (~train_shared) * train_mean
    df_test[f + '_not_unique'] = test_shared * df_test[f] + (~test_shared) * train_mean

    df_train.loc[~train_shared, f + '_has_one'] = 4
    df_test.loc[~test_shared, f + '_has_one'] = 4

100%|██████████| 69/69 [00:01<00:00, 46.88it/s]


## Dummies

In [20]:
# cols = cols_type['Qualitativo nominal']
# cols.extend(cols_type['Qualitativo ordinal'])

# Training LGB

In [21]:
# Shapes after feature engineering.
df_train.shape, df_test.shape

((14123, 277), (21183, 346))

In [22]:
# Columns present in train but missing from test (only the target is expected).
df_train.columns[~df_train.columns.isin(df_test.columns)]

Index(['y'], dtype='object')

In [23]:
# Columns present in test only: the `_u` helper flags and `has_unique`.
df_test.columns[~df_test.columns.isin(df_train.columns)]

Index(['id_u', 'var1_u', 'var2_u', 'var3_u', 'var4_u', 'var5_u', 'var6_u',
       'var7_u', 'var8_u', 'var9_u', 'var10_u', 'var11_u', 'var12_u',
       'var13_u', 'var14_u', 'var15_u', 'var16_u', 'var17_u', 'var18_u',
       'var19_u', 'var20_u', 'var21_u', 'var22_u', 'var23_u', 'var24_u',
       'var25_u', 'var26_u', 'var27_u', 'var28_u', 'var29_u', 'var30_u',
       'var31_u', 'var32_u', 'var33_u', 'var34_u', 'var35_u', 'var36_u',
       'var37_u', 'var38_u', 'var39_u', 'var40_u', 'var41_u', 'var42_u',
       'var43_u', 'var44_u', 'var45_u', 'var46_u', 'var47_u', 'var48_u',
       'var49_u', 'var50_u', 'var51_u', 'var52_u', 'var53_u', 'var54_u',
       'var55_u', 'var56_u', 'var57_u', 'var58_u', 'var59_u', 'var60_u',
       'var61_u', 'var62_u', 'var63_u', 'var64_u', 'var65_u', 'var66_u',
       'var67_u', 'var68_u', 'has_unique'],
      dtype='object')

In [24]:
import lightgbm as lgb

# Every column of df_train except the target is fed to the model.
# NOTE(review): this includes the raw `id` column - confirm that is intended.
features_to_drop = [TARGET]
features = [ft for ft in df_train.columns if ft not in features_to_drop]

# One row per feature; each fold adds a column (named by fold number) of importances.
# Built empty on purpose: Index.to_frame() seeds a column `0` holding the feature
# names, which only worked because fold 0 happened to overwrite that column.
importances = pd.DataFrame(index=features)

# Out-of-fold predictions, aligned row by row with df_train.
train_preds = df_train[TARGET].to_frame()
train_preds["preds"] = 0.0  # float from the start: it receives rank-normalised scores
preds_col = train_preds.columns.get_loc("preds")

df_sub["predicted"] = 0
df_sub[TARGET] = 0.0  # accumulates the fold-averaged test score

kfold = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold, (train_index, valid_index) in enumerate(kfold.split(df_train, df_train[TARGET])):
    # split() yields positional indices, so rows are selected with .iloc (not .loc);
    # this stays correct even if df_train no longer has a default RangeIndex.
    x_train, x_valid = df_train.iloc[train_index][features], df_train.iloc[valid_index][features]
    y_train, y_valid = df_train.iloc[train_index][TARGET], df_train.iloc[valid_index][TARGET]
    
    clf = lgb.LGBMClassifier(
        learning_rate=0.01,
        n_estimators=5000,
        random_state=SEED,
        boosting="gbdt",
        objective="binary",
        subsample=0.8,
        subsample_freq=10,
        colsample_bytree=0.8,
        max_depth=-1
    )
#     clf = lgb.LGBMClassifier(
#         bagging_freq=5,
#         bagging_fraction=1.0,
#         n_estimators=5000,
#         boost_from_average=False,
#         boost='gbdt',
#         feature_fraction= 1.0,
#         learning_rate= 0.005,
#         max_depth= -1,
#         metric='binary_logloss',
#         min_data_in_leaf= 30,
#         min_sum_hessian_in_leaf= 10.0,
#         num_leaves=64,
#         num_threads= -1,
#         tree_learner= 'serial',
#         objective='binary'
#     )
    
    # NOTE(review): the training log reports both auc and binary_logloss, and the
    # chosen best iteration is not always the AUC maximum - confirm which metric
    # should drive early stopping.
    clf.fit(
        x_train,
        y_train,
#         categorical_feature=cols,
        eval_set=[(x_valid, y_valid)],
        eval_metric="auc",
        early_stopping_rounds=500,
        verbose=500
    )
    
    # Validation scores are converted to ranks scaled by the fold size, so every
    # fold's out-of-fold scores share the same (0, 1] scale.
    y_pred = clf.predict_proba(x_valid, num_iteration=clf.best_iteration_)[:, 1]
    y_pred = (pd.Series(y_pred).rank()/len(x_valid)).values
    train_preds.iloc[valid_index, preds_col] = y_pred
    
    # Test scores get the same rank normalisation and are averaged over folds.
    test_preds = clf.predict_proba(df_test[features], num_iteration=clf.best_iteration_)[:, 1]
    test_preds = (pd.Series(test_preds).rank()/len(test_preds)).values
    df_sub[TARGET] += test_preds/FOLDS
    
    importances[fold] = clf.feature_importances_    
    
    print(f"Fold {fold+1}, AUC: {roc_auc_score(y_valid, y_pred)}")
    gc.collect()

print(f"CV AUC: {roc_auc_score(train_preds[TARGET], train_preds['preds'])}")
      
# fold 1 AUC 0.8970132649
# fold 2 AUC 0.8917944529
# fold 3 AUC 0.9014680826
# fold 4 AUC 0.8959658463
# fold 5 AUC 0.8873916157
# OOF AUC 0.8947271932
# OOF F1 0.6857142857

Training until validation scores don't improve for 500 rounds
[500]	valid_0's auc: 0.892549	valid_0's binary_logloss: 0.311497
[1000]	valid_0's auc: 0.894612	valid_0's binary_logloss: 0.307956
Early stopping, best iteration is:
[739]	valid_0's auc: 0.895568	valid_0's binary_logloss: 0.307517
Fold 1, AUC: 0.8955677441941885
Training until validation scores don't improve for 500 rounds
[500]	valid_0's auc: 0.886779	valid_0's binary_logloss: 0.30797
[1000]	valid_0's auc: 0.889032	valid_0's binary_logloss: 0.304937
[1500]	valid_0's auc: 0.889794	valid_0's binary_logloss: 0.306237
Early stopping, best iteration is:
[1053]	valid_0's auc: 0.889493	valid_0's binary_logloss: 0.304813
Fold 2, AUC: 0.8894931341657915
Training until validation scores don't improve for 500 rounds
[500]	valid_0's auc: 0.894636	valid_0's binary_logloss: 0.302878
[1000]	valid_0's auc: 0.897626	valid_0's binary_logloss: 0.297806
[1500]	valid_0's auc: 0.897817	valid_0's binary_logloss: 0.298879
Early stopping, best iter

In [None]:
# Scan thresholds 0.000 .. 0.999 over the out-of-fold scores and remember the
# one that gives the highest F1. The scan stops once precision reaches 1.0.
oof_truth = train_preds[TARGET]
f1, precision, recall = [], [], []
max_f1, max_f1_threshold = -1, -1

for step in tqdm(range(1000)):
    threshold = step / 1000.0
    y_pred = np.where(train_preds["preds"] > threshold, 1, 0)

    step_precision = precision_score(oof_truth, y_pred)
    if step_precision == 1.0:
        break

    step_f1 = f1_score(oof_truth, y_pred)
    if step_f1 > max_f1:
        max_f1, max_f1_threshold = step_f1, threshold

    f1.append(step_f1)
    precision.append(step_precision)
    recall.append(recall_score(oof_truth, y_pred))

# The x axis is the scan step (threshold * 1000), not the threshold itself.
steps = range(len(f1))
plt.figure(figsize=[10, 5])
for curve, curve_label in ((f1, "f1"), (precision, "precision"), (recall, "recall")):
    plt.plot(steps, curve, label=curve_label)
plt.legend()
plt.tight_layout()
plt.show()

print(f"max_f1 [{max_f1_threshold}]: {max_f1}")
# max_f1 [0.776]: 0.6822942643391521

  6%|▋         | 64/1000 [00:01<00:24, 37.59it/s]

## Feature importance

In [None]:
# Average the per-fold importances and plot the 100 largest, biggest first.
mean_importance = importances.mean(axis=1)
importances_sorted = mean_importance.sort_values(ascending=False).head(100)
plt.figure(figsize=(10, 15))
sns.barplot(x=importances_sorted, y=importances_sorted.index, orient="h")

## Clip and save sub

In [None]:
# Echo the run tag that will appear in the output file names.
print(f"VERSION ::: {VERSION}")

In [None]:
# Distribution of the fold-averaged, rank-normalised test scores.
df_sub[TARGET].hist()

In [None]:
# Binarise the averaged test scores with the F1-optimal out-of-fold threshold.
above_threshold = df_sub[TARGET] > max_f1_threshold
df_sub["predicted"] = np.where(above_threshold, 1, 0)
df_sub.head()

In [None]:
# Summary statistics of the binary labels and of the scores behind them.
df_sub[["predicted", "y"]].describe()

In [None]:
# Reshape the out-of-fold frame to the same layout as df_sub.
# Careful: the rename is positional. The former target column becomes "predicted"
# and the former "preds" score column becomes "y", so from here on
# train_preds["y"] holds the rank-normalised score, NOT the true label.
train_preds.columns = ["predicted", "y"]
# Overwrites the true labels carried over by the rename with thresholded scores.
train_preds["predicted"] = np.where(train_preds["y"] > max_f1_threshold, 1, 0)
train_preds.head()

In [None]:
# Attach the row identifier (aligned on the index) and put it first.
train_preds = train_preds.assign(id=df_train["id"])
train_preds = train_preds.loc[:, ["id", "predicted", "y"]]
train_preds.head()

In [None]:
# Persist the out-of-fold predictions, the full test predictions (label plus
# score) and the submission file, which carries the label without the score.
submission_only = df_sub.drop(columns="y")
train_preds.to_csv(f"../output/mario/train_preds_{VERSION}.csv", index=False)
df_sub.to_csv(f"../output/mario/test_preds_{VERSION}.csv", index=False)
submission_only.to_csv(f"../output/mario/sub_{VERSION}.csv", index=False)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# train_preds[TARGET] is thresholded here exactly like the test scores; this
# relies on that column holding out-of-fold scores at this point in the notebook.
oof_labels = np.where(train_preds[TARGET] > max_f1_threshold, 1, 0)
cm = confusion_matrix(y_true=df_train[TARGET], y_pred=oof_labels, labels=[0, 1])

ax = plt.subplot()
# annot=True writes the count in each cell; fmt="g" disables scientific notation
sns.heatmap(cm, annot=True, fmt="g", ax=ax);

# labels, title and ticks
ax.set(xlabel="Predicted labels", ylabel="True labels", title="Confusion Matrix");