In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Raise pandas' 'display.max_columns' option to 500 so that wide frames are
# displayed without their middle columns being elided.
pd.set_option('display.max_columns', 500)

In [2]:
def _leer_features(ruta):
    """Load the CSV at `ruta` into a DataFrame indexed by its 'person' column."""
    return pd.read_csv(ruta, low_memory=False, index_col='person')


# One DataFrame per feature file (the numbering skips df13).
df1 = _leer_features('../data/features_basicas.csv')
df2 = _leer_features('../data/features_checkouts.csv')
df3 = _leer_features('../data/features_vistos.csv')
df4 = _leer_features('../data/feature_dispositivo_mas_usado.csv')
df5 = _leer_features('../data/feature_dia_mas_activo.csv')
df6 = _leer_features('../data/feature_compra_onsite.csv')
df7 = _leer_features('../data/feature_cantidad_de_eventos_en_intervalos.csv')
df8 = _leer_features('../data/feature_dispositivos_storage.csv')
df9 = _leer_features('../data/featureUsuarioSonDeSaoPablo.csv')
df10 = _leer_features('../data/featureCantidadMaximaPersonaVeUnProducto.csv')
df11 = _leer_features('../data/featureUltimaConexion.csv')
df12 = _leer_features('../data/features_basicas_ult_quin.csv')
df14 = _leer_features('../data/days_elapsed_from_last_event.csv')
df15 = _leer_features('../data/featureUsuarioRealiza30EventosEn20MinsLosUltimos2Dias.csv')
df16 = _leer_features('../data/features_ultimo_checkout.csv')
df17 = _leer_features('../data/diferencia_de_count_de_eventos.csv')
df18 = _leer_features('../data/top_10_celulares.csv')
df19 = _leer_features('../data/feature_cantidad_de_eventos_en_intervalos_ult_quin.csv')

labels = _leer_features('../data/labels_training_set.csv')

# Inner-join the tables one after another on the 'person' index; each step
# keeps only the persons present on both sides of that join.
df_unidos = df1
for _tabla in (df2, df3, df4, df5, df6, df7, df8, df9, df10,
               df11, df12, df14, df15, df16):
    df_unidos = df_unidos.join(_tabla, how='inner')

# df17 is the only table joined with the default (left) join; afterwards every
# NaN in the joined frame (not just in df17's columns) is replaced by 0.
df_unidos = df_unidos.join(df17).fillna(0)

for _tabla in (df18, df19):
    df_unidos = df_unidos.join(_tabla, how='inner')

# Attach the features to the subset of users that Trocafone provides for training.
df_test = df_unidos.join(labels, on='person', how='inner')
df_test.shape

(19414, 93)

In [3]:
# Model inputs: every column of the training frame except the 'label' target.
features = df_test.columns.tolist()
del features[features.index('label')]

## Veamos si las labels están balanceadas

In [4]:
# Relative frequency of each value in the 'label' column
# (normalize=True yields proportions instead of raw counts).
labels['label'].value_counts(normalize=True)

0    0.949521
1    0.050479
Name: label, dtype: float64

# Entrenando el Random Forest

In [5]:
# Hyper-parameter grid handed to GridSearchCV: every combination of the three
# lists below is evaluated.
param_grid = {
    'n_estimators': list(range(300, 500, 50)),
    'max_depth': [6, 7, 8, 9],
    'min_samples_split': list(range(200, 400, 50)),
}

In [6]:
# Split the labelled data in two parts: 80% is used to fit the model and the
# remaining 20% is held out to evaluate it. The fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_test[features],
    df_test['label'],
    test_size=0.20,
    random_state=80,
)

In [35]:
# Grid search over `param_grid` with 5-fold cross-validation, scoring each
# candidate by ROC AUC and running up to 4 jobs in parallel.
rfc = RandomForestClassifier(random_state=80, class_weight='balanced')
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
)
CV_rfc.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=80,
            verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [300, 350, 400, 450], 'max_depth': [6, 7, 8, 9], 'min_samples_split': [200, 250, 300, 350]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [36]:
# Parameter combination that the fitted grid search selected as best.
CV_rfc.best_params_

{'max_depth': 7, 'min_samples_split': 300, 'n_estimators': 350}

In [10]:
# Final forest, fitted on the training split only. max_depth and
# min_samples_split are hard-coded here, as is a much larger n_estimators (3000).
rf = RandomForestClassifier(
    n_estimators=3000,
    max_depth=7,
    min_samples_split=300,
    class_weight='balanced',
    random_state=80,
    n_jobs=-1,
)
rf.fit(X_train, Y_train)

# predict_proba returns one column per class; column 1 is taken as the score
# of the positive class (presumably label == 1 — verify against rf.classes_).
Y_pred = rf.predict_proba(X_test)
Y_pred_proba = list(Y_pred[:, 1])
print(roc_auc_score(Y_test, Y_pred_proba))

0.8606203605039533


In [12]:
# One row per training column holding the forest's importance for it, sorted
# from most to least important.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
# The features end up ordered by their importance for the model; apparently
# the most viewed brand and the brand each user bought the most are not that
# important.
feature_importances.head(20)

Unnamed: 0,importance
storage_mas_checkout,0.126417
tasa_ult_quin_cant_checkouts,0.123706
ult_quin_cant_checkouts,0.121393
cant_checkouts,0.064318
model_encoding,0.054601
tiene_checkouts,0.054135
actividad_total,0.033912
days_elapsed,0.026011
primeraConexion,0.024645
cant_viewed_product,0.021269


# Preparamos el submit

In [14]:
# Persons to score for the submission, indexed by 'person'.
df_submit = pd.read_csv(
    '../data/trocafone_kaggle_test.csv',
    index_col='person',
    low_memory=False,
)
# Inner join on the index: persons without a row in df_unidos are dropped
# from df_events.
df_events = df_submit.join(df_unidos, how='inner')

In [15]:
# Preview the first rows of the joined frame.
df_events.head()

Unnamed: 0_level_0,tiene_checkouts,llegaron_por_ad,llegaron_por_search,tiene_conversions,cant_conversions,cant_checkouts,cant_viewed_product,cant_searched_product,cant_visitas,cant_leads,cant_brand_listings,cant_generic_listings,cant_entradas_ads,cant_entradas_buscador,total_sesiones,promedio_eventos_por_sesion,checkout_Apple,checkout_Asus,checkout_LG,checkout_Lenovo,checkout_Motorola,checkout_Quantum,checkout_Samsung,checkout_Sony,vio_mas_Apple,vio_mas_Asus,vio_mas_LG,vio_mas_Lenovo,vio_mas_Motorola,vio_mas_Quantum,vio_mas_Samsung,vio_mas_Sony,dispositivo_Computer,dispositivo_Smartphone,dispositivo_Tablet,dispositivo_Unknown,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,compro_onsite,cant_eventos_4_a_10hs,cant_eventos_11_a_14hs,cant_eventos_15_a_20hs,cant_eventos_21_a_3hs,storage_mas_visto,storage_mas_checkout,storage_mas_eventos,actividad_total,EsDeSaoPablo,cantidadMaximaQueVeUnMismoProducto,primeraConexion,ultimaConexion,vida_del_usuario,ult_quin_cant_conversions,ult_quin_cant_checkouts,ult_quin_cant_viewed_product,ult_quin_cant_searched_product,ult_quin_cant_visitas,ult_quin_cant_leads,ult_quin_cant_brand_listings,ult_quin_cant_generic_listings,ult_quin_cant_entradas_ads,ult_quin_cant_entradas_buscador,tasa_ult_quin_cant_checkouts,tasa_ult_quin_cant_conversions,tasa_ult_quin_cant_viewed_product,tasa_ult_quin_cant_searched_product,tasa_ult_quin_cant_visitas,tasa_ult_quin_cant_leads,tasa_ult_quin_cant_brand_listings,tasa_ult_quin_cant_generic_listings,tasa_ult_quin_cant_entradas_ads,tasa_ult_quin_cant_entradas_buscador,ult_quin_total_sesiones,ult_quin_promedio_eventos_por_sesion,days_elapsed,realiza30EventosEn20MinsLosUltimos2Dias,model_encoding,diff_checkouts_15_dias,diff_visited_site_15_dias,diff_searched_products_15_dias,diff_viewed_products_15_dias,total_top_10_phones_checkout,different_top_10_phones_checkout,ult_quin_cant_eventos_4_a_10hs,ult_quin_cant_eventos_11_a_14hs,ult_quin_cant_eventos_15_a_20hs,ult_quin_cant_eventos_21_a_3hs
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1
4886f805,True,False,True,False,0.0,1.0,4.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1,9.0,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,0,1,0,0,1,0,0,0,0,0,0,False,0.0,0.0,0.0,9.0,4.0,4.0,4.0,12.0,False,4.0,20180518000722,20180518003030,1.0,0.0,1.0,4.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,100.0,0.0,400.0,100.0,100.0,0.0,0.0,100.0,0.0,100.0,1.0,9.0,13.994884,False,141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
0297fc1e,True,True,False,False,0.0,7.0,404.0,6.0,95.0,1.0,4.0,21.0,29.0,0.0,89,6.370787,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0,1,0,0,0,1,0,0,0,0,0,False,37.0,120.0,199.0,187.0,3.0,4.0,3.0,10.0,False,213.0,20180109225634,20180528115012,1.000021,0.0,1.0,103.0,0.0,25.0,0.0,0.0,2.0,4.0,0.0,99.142857,0.0,132.778146,84.857143,132.816901,49.5,79.2,104.05,110.576923,0.0,23.0,5.869565,51.862187,False,92,0.0,21.0,0.0,73.0,1.0,1.0,3.0,31.0,74.0,27.0
2d681dd8,True,True,True,False,0.0,1.0,13.0,1.0,2.0,0.0,5.0,1.0,1.0,2.0,2,13.0,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,1,0,0,0,0,0,0,1,0,0,0,False,0.0,0.0,16.0,10.0,2.0,4.0,2.0,8.0,False,6.0,20180518000829,20180527194249,1.0,0.0,1.0,13.0,1.0,2.0,0.0,5.0,1.0,1.0,2.0,100.0,0.0,1300.0,100.0,200.0,0.0,500.0,100.0,100.0,200.0,2.0,13.0,4.182963,False,101,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,16.0,10.0
cccea85e,True,True,True,False,0.0,1.0,739.0,1.0,22.0,0.0,7.0,20.0,15.0,26.0,17,49.176471,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,1,0,0,0,0,1,0,0,0,0,0,False,0.0,77.0,241.0,518.0,4.0,0.0,4.0,8.0,False,252.0,20180507223259,20180531143830,1.000001,0.0,0.0,525.0,1.0,17.0,0.0,3.0,15.0,10.0,18.0,49.5,0.0,342.725581,100.0,365.833333,0.0,139.2,332.5,249.166667,288.0,13.0,45.307692,20.024421,True,79,-1.0,12.0,1.0,311.0,0.0,0.0,0.0,77.0,215.0,297.0
4c8a8b93,True,True,True,False,0.0,2.0,177.0,9.0,20.0,0.0,8.0,14.0,14.0,13.0,14,18.357143,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,0,1,0,0,1,0,0,0,0,0,0,False,0.0,66.0,17.0,174.0,3.0,5.0,3.0,11.0,False,83.0,20180518002514,20180522003318,1.0,0.0,2.0,177.0,9.0,20.0,0.0,8.0,14.0,14.0,13.0,200.0,0.0,17700.0,900.0,2000.0,0.0,800.0,1400.0,1400.0,1300.0,14.0,18.357143,13.918056,False,137,0.0,0.0,0.0,0.0,2.0,2.0,0.0,66.0,17.0,174.0


In [16]:
# Score the Kaggle persons with the fitted forest.
# The training columns are selected explicitly (in training order) so the model
# never receives extra or re-ordered columns coming from df_submit.
kaggle_pred = rf.predict_proba(df_events[features])

# Column 1 of predict_proba is taken as the purchase probability
# (presumably the label == 1 class — verify against rf.classes_).
proba_de_comprar = kaggle_pred[:, 1]

# The probabilities are written into df_submit by position. df_events comes
# from an INNER join, so it can lose (or duplicate) persons; in that case a
# positional assignment would either fail with an opaque length error or, worse,
# attach probabilities to the wrong persons. Fail early with a clear message.
if not df_events.index.equals(df_submit.index):
    raise ValueError(
        'df_events and df_submit do not hold the same persons in the same '
        'order ({} vs {} rows); predictions cannot be assigned by position.'
        .format(len(df_events), len(df_submit))
    )
df_submit['label'] = proba_de_comprar

In [17]:
# Write the submission frame to disk; with to_csv's defaults the index is
# written as the first column.
df_submit.to_csv('../submit_rf.csv')