In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
import pickle

# Preprocessing
- Random sampling
- Categorical values
- Missing values
- Feature extraction
- Scale variables

In [2]:
train_data = pd.read_parquet('train_data.parquet')

In [3]:
train_data.describe()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
count,5485466.0,5531451.0,5531451.0,5529435.0,5531451.0,4510907.0,5529435.0,5529435.0,791314.0,3873055.0,...,194699.0,194699.0,5429903.0,5490819.0,5429903.0,944408.0,5429903.0,5490724.0,5429903.0,5531451.0
mean,0.6581885,0.1514543,0.1225995,0.6105151,0.07632776,0.2264676,0.05831698,0.1302262,0.184966,0.154414,...,0.01424414,0.1646144,0.1750432,0.02634348,0.1615358,0.390781,0.1749181,0.05131478,0.06084983,0.2490972
std,0.2441398,0.2705955,0.2131456,0.4029192,0.2221323,0.1925726,0.1984337,0.2343981,0.22814,0.2129113,...,0.09569677,0.2670328,0.3696215,0.1442015,0.3456256,0.236142,0.3695211,0.1791743,0.188925,0.4324903
min,-0.4589548,5.02619e-09,-7.588799,9.19228e-09,1.534223e-09,-0.6271321,5.566545e-10,6.285293e-09,-0.000454,1.15455e-07,...,1.078787e-08,3.307923e-08,3.767347e-10,3.725073e-09,1.6501e-10,-0.014539,5.549692e-09,2.500991e-09,1.226024e-09,0.0
25%,0.4803307,0.004528464,0.008863645,0.1053313,0.002895934,0.1272588,0.002873244,0.00522757,0.037516,0.04227546,...,0.00253247,0.003517452,0.003027212,0.002555848,0.003026087,0.199399,0.003028116,0.002752895,0.003028347,0.0
50%,0.694295,0.009056902,0.03132968,0.8143328,0.00578223,0.1639082,0.005746725,0.009777229,0.120519,0.08851244,...,0.00506983,0.007037814,0.00605301,0.005110523,0.00605159,0.382136,0.006053151,0.005508129,0.006053247,0.0
75%,0.8648159,0.2366407,0.1259019,1.002403,0.00866059,0.2581017,0.008615665,0.1550507,0.250869,0.1843206,...,0.007573434,0.5015469,0.009080455,0.007663697,0.009078914,0.559307,0.009076287,0.008260448,0.00908093,0.0
max,1.01,5.389619,1.32406,1.01,3.256284,5.482888,8.988807,1.625262,4.191119,10.11162,...,1.009998,3.005383,1.01,1.01,1.33991,2.229368,1.01,1.343331,4.82763,1.0


## Random Sampling
We select 10% of the clients

In [4]:
clients = train_data['customer_ID'].unique()
n = int(0.1*len(clients))
selected_clients = np.random.choice(clients, n, replace=False)

In [5]:
train = train_data[train_data.customer_ID.isin(selected_clients)]
print("Original train dataset shape was:", train_data.shape)
print("Random sample dataset now is: ", train.shape)

Original train dataset shape was: (5531451, 191)
Random sample dataset now is:  (553697, 191)


We verify our sample represents correctly the original train set

In [6]:
train.describe()

Unnamed: 0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
count,549074.0,553697.0,553697.0,553509.0,553697.0,451255.0,553509.0,553509.0,79158.0,389039.0,...,19391.0,19391.0,543462.0,549535.0,543462.0,94521.0,543462.0,549519.0,543462.0,553697.0
mean,0.656129,0.153519,0.123318,0.6202384,0.07954329,0.22638,0.06033329,0.1324444,0.18579,0.1547614,...,0.01435675,0.1649387,0.1787974,0.02579877,0.1645867,0.395964,0.178673,0.05444669,0.06204691,0.250289
std,0.244189,0.2702014,0.210495,0.4017786,0.2279593,0.194216,0.2050364,0.2342118,0.233211,0.2084194,...,0.0964768,0.2656294,0.379301,0.142706,0.3486367,0.236682,0.3791972,0.1862663,0.1917907,0.43318
min,-0.37584,3.785832e-08,-0.799453,2.482187e-07,2.066058e-08,-0.491539,3.516009e-08,6.285293e-09,-0.000251,6.194853e-07,...,2.534351e-08,9.859704e-08,6.598718e-08,1.807036e-08,1.6501e-10,-0.014228,3.909845e-08,1.325409e-08,2.246565e-08,0.0
25%,0.479678,0.00453883,0.008864,0.1048189,0.002898846,0.126996,0.002878733,0.005214345,0.036873,0.04235244,...,0.002516954,0.003516602,0.003020772,0.002559071,0.003025374,0.205362,0.003031009,0.002759574,0.003028541,0.0
50%,0.694792,0.009080932,0.031372,0.8143299,0.005783508,0.163844,0.005751648,0.009780595,0.118108,0.08934197,...,0.00498927,0.00706266,0.006050527,0.005116322,0.006050216,0.388025,0.006060432,0.005526314,0.00604385,0.0
75%,0.863288,0.2368908,0.125867,1.0024,0.008668484,0.258857,0.008627107,0.1561719,0.249029,0.1863929,...,0.007507271,0.5016676,0.009083773,0.007663543,0.009074696,0.566239,0.009081175,0.008294351,0.00908392,1.0
max,1.009999,5.357074,1.324059,1.01,2.756794,3.502517,7.510052,1.625262,4.186889,9.321696,...,1.009998,2.008491,1.01,1.009999,1.252589,1.837051,1.01,1.343293,3.279571,1.0


## Categorical Values

In [7]:
data_types = { col: train[col].dtypes for col in train.columns}
non_numeric_cols = [d for d in data_types.keys() if data_types[d] == 'O' and d not in ['customer_ID', 'S_2']]
non_numeric_cols

['D_63', 'D_64']

In [8]:
# categorical columns D_63 and D_64
for col in non_numeric_cols:
    le = preprocessing.LabelEncoder()
    train[col] = le.fit_transform(train[col])

## Missing values

In [9]:
# imputation of missing values
X = train.set_index(['customer_ID', 'S_2'])
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
train_new = pd.DataFrame(imp.transform(X), index = X.index, columns= X.columns)
train_new = train_new.reset_index()
train_new

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2017-03-11,0.876615,0.001469,0.001472,0.810796,0.005955,0.226382,0.008401,0.004826,...,0.014357,0.164939,0.003652,0.004395,0.002179,0.395966,0.001196,0.002738,0.000443,0.0
1,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2017-04-11,0.887385,0.004229,0.002125,0.819078,0.004521,0.226382,0.008008,0.007282,...,0.014357,0.164939,0.002876,0.000840,0.009347,0.395966,0.003332,0.009278,0.004498,0.0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2017-05-12,0.884522,0.003399,0.003495,0.818569,0.007541,0.226382,0.007537,0.000859,...,0.014357,0.164939,0.003084,0.000304,0.000891,0.395966,0.002980,0.009026,0.002959,0.0
3,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2017-06-10,0.852514,0.006877,0.007627,0.819987,0.009290,0.226382,0.003959,0.007532,...,0.014357,0.164939,0.007998,0.003724,0.005225,0.395966,0.005043,0.001033,0.005859,0.0
4,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,2017-07-12,0.891656,0.000802,0.009997,0.811041,0.003540,0.226382,0.009034,0.000626,...,0.014357,0.164939,0.009134,0.005561,0.006398,0.395966,0.002139,0.003595,0.005110,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553692,fffd765f1f5fa550f044acb297092807227f4742fa11bb...,2017-11-28,0.686476,0.211116,0.055055,1.004938,0.005941,0.141466,0.008663,0.002887,...,0.000364,0.009860,0.008247,0.000969,0.001880,0.395966,0.009615,0.001447,0.005410,0.0
553693,fffd765f1f5fa550f044acb297092807227f4742fa11bb...,2017-12-03,0.684926,0.361464,0.060740,1.000378,0.002943,0.145406,0.008054,0.006243,...,0.003319,0.003286,0.003205,0.000724,0.002089,0.395966,0.004947,0.005034,0.007218,0.0
553694,fffd765f1f5fa550f044acb297092807227f4742fa11bb...,2018-01-02,0.709665,0.330995,0.081282,1.003688,0.007902,0.131936,0.008661,0.010533,...,0.002360,0.009094,0.002246,0.005936,0.006764,0.395966,0.000953,0.003639,0.002407,0.0
553695,fffd765f1f5fa550f044acb297092807227f4742fa11bb...,2018-02-24,0.733438,0.149128,0.034771,1.006038,0.001128,0.125726,0.002577,0.010326,...,0.001384,0.002081,0.001148,0.004779,0.005688,0.395966,0.001870,0.009649,0.004466,0.0


## Feature extraction

In [10]:
#mean_df = train_new.drop(columns='S_2').groupby('customer_ID').diff(axis = 0).drop(columns = ['target']).fillna(0)
#mean_df['customer_ID'] = train_new['customer_ID']
#mean_df = mean_df.groupby('customer_ID').mean()
#max_df = train_new.drop(columns=['S_2', 'target']).groupby('customer_ID').max()
mean_df = train_new.drop(columns=['S_2', 'target']).groupby('customer_ID').mean()
last_df = train_new.groupby('customer_ID').last().drop(columns = ['S_2'])

In [11]:
df = pd.concat([mean_df, last_df], axis=1)
df

Unnamed: 0_level_0,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,0.878454,0.004181,0.004386,0.815677,0.006621,0.226382,0.006842,0.005493,0.18579,0.154763,...,0.014357,0.164939,0.007383,0.006623,0.000964,0.395966,0.002202,0.000834,0.003444,0.0
000084e5023181993c2e1b665ac88dbb1ce9ef621ec5370150fc2f8bdca6202c,0.794131,0.199325,0.027116,0.974886,0.005724,0.304118,0.004638,0.006662,0.18579,0.011693,...,0.014357,0.164939,0.009401,0.007832,0.000957,0.395966,0.009054,0.006943,0.001446,0.0
0001812036f1558332e5c0880ecbad70b13a6f28ab04a8db6d83a26ef40aadb0,0.386107,0.117855,0.862294,0.023567,0.274521,0.820201,0.005433,0.749541,0.18579,0.387932,...,0.014357,0.164939,0.005706,0.009877,0.003770,0.395966,0.002888,0.009148,0.001400,1.0
000445609ff2a39d2dd02484899affa5696210a95f6869f26390bd26eeb3b651,0.972260,0.005031,0.005157,0.816084,0.003508,0.188652,0.005562,0.006415,0.18579,0.013120,...,0.014357,0.164939,0.009010,0.007145,0.005159,0.395966,0.000031,0.004060,0.000315,0.0
00057576e6eab4633ec2893ca7e0ab76f2094ad2d43b1e3749db49d51e064ee9,0.080502,0.673698,0.105436,0.127109,0.523456,0.140852,0.455548,0.245338,0.18579,0.140496,...,0.014357,0.164939,1.001770,0.006841,0.979688,0.717009,1.009697,0.003155,0.733949,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fff84f243230b1b6b65020994e2c37c196050c21e92e03422aa0c66c5d01c84d,0.519669,0.218172,0.220397,0.185407,0.005225,0.098243,0.004803,0.435218,0.18579,0.097019,...,0.014357,0.164939,0.001928,0.009422,0.008127,0.395966,0.008111,0.008643,0.004765,1.0
fff94ae882244cbe97416913a5ee734eccbc81390db6c15830f89e71c50e3f97,0.853581,0.039299,0.029193,0.917468,0.004437,0.135957,0.004178,0.009728,0.18579,0.159217,...,0.014357,0.164939,0.006014,0.008984,0.007000,0.395966,0.004756,0.005134,0.000925,0.0
fffc59f2e1aa4de05de451121a35c26c61e63cfc38328a6c3500d1a4c43ef317,0.912961,0.005264,0.006156,0.815343,0.004429,0.233056,0.005672,0.006374,0.18579,0.089115,...,0.014357,0.164939,0.009665,0.009193,0.004957,0.395966,0.006643,0.000382,0.007155,0.0
fffd1a37da07c54a930788ab53061cd7bd34d38592205e4b2bc631c79bcb5160,0.870187,0.004345,0.006280,0.814879,0.004485,0.226382,0.004768,0.005497,0.18579,0.154763,...,0.014357,0.164939,0.000321,0.009248,0.003959,0.395966,0.004084,0.009832,0.007089,0.0


In [12]:
X = df.loc[:,df.columns != 'target'].values
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
y = df['target'].values
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# Models

In [13]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns').sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns').sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

def evaluate_metric(X_test, y_test, model):
    ypred = pd.DataFrame(model.predict(X_test), columns = ['prediction'])
    ytest = pd.DataFrame(y_test, columns = ['target'])
    return amex_metric(y_true = ytest, y_pred =ypred)

In [14]:
summary = pd.DataFrame()

### Logistic Regression

In [15]:
lr = LogisticRegression(random_state=0, class_weight= "balanced", n_jobs=-1, max_iter = 500, solver='saga')
parameters = {'penalty' : ['l2', 'none']}
lr_gs = GridSearchCV(lr, parameters, n_jobs=-1)
lr_gs.fit(X_train,y_train)
print("Finalizado entrenamiento")
metric_lr = evaluate_metric(X_test, y_test, lr_gs)
print(lr_gs.best_params_)
print(metric_lr)



Finalizado entrenamiento
{'penalty': 'l2'}
0.5211383977784526


In [16]:
print(classification_report(y_test, lr_gs.predict(X_test)))

              precision    recall  f1-score   support

         0.0       0.96      0.88      0.92      8535
         1.0       0.72      0.90      0.80      2938

    accuracy                           0.88     11473
   macro avg       0.84      0.89      0.86     11473
weighted avg       0.90      0.88      0.89     11473



### Random Forest


In [17]:
rf = RandomForestClassifier( random_state=0, class_weight= "balanced", criterion= 'gini')
parameters = {
        'n_estimators': [100,200,300],
        'max_depth' : [4,5,6,7,8]
        }
rf_gs = GridSearchCV(rf, parameters, n_jobs=-1)
rf_gs.fit(X_train,y_train)
print("Finalizado entrenamiento")
metric_rf = evaluate_metric(X_test, y_test, rf_gs)
print(rf_gs.best_params_)
print(metric_rf)
print(classification_report(y_test, rf_gs.predict(X_test)))

Finalizado entrenamiento
{'max_depth': 8, 'n_estimators': 200}
0.4933419112728931
              precision    recall  f1-score   support

         0.0       0.96      0.86      0.91      8535
         1.0       0.68      0.90      0.78      2938

    accuracy                           0.87     11473
   macro avg       0.82      0.88      0.84     11473
weighted avg       0.89      0.87      0.87     11473



### GradientBoosting

In [18]:
gb = GradientBoostingClassifier(random_state=0)
parameters = {
        'learning_rate': [0.1,0.5,1],
        }
gb_gs = GridSearchCV(gb, parameters, n_jobs=-1)
gb_gs.fit(X_train,y_train)
print("Finalizado entrenamiento")
metric_gb = evaluate_metric(X_test, y_test, gb_gs)
print(metric_gb)
print(classification_report(y_test, gb_gs.predict(X_test)))

Finalizado entrenamiento
0.5618034815916566
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93      8535
         1.0       0.80      0.79      0.80      2938

    accuracy                           0.90     11473
   macro avg       0.86      0.86      0.86     11473
weighted avg       0.90      0.90      0.90     11473



In [19]:
print(gb_gs.best_params_)

{'learning_rate': 0.1}


In [22]:
filename = 'model_gb.sav'
pickle.dump(gb_gs, open(filename, 'wb'))

In [23]:
filename = 'model_rf.sav'
pickle.dump(rf_gs, open(filename, 'wb'))

In [24]:
filename = 'model_lr.sav'
pickle.dump(lr_gs, open(filename, 'wb'))

In [25]:
imp

SimpleImputer()

In [26]:
filename = 'imputer.sav'
pickle.dump(imp, open(filename, 'wb'))