In [21]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

In [22]:
# Feature tables to combine; every CSV is read with 'person' as its index.
feature_files = [
    'data/features_basicas.csv',
    'data/features_compras.csv',
    'data/features_vistos.csv',
    'data/feature_dispositivo_mas_usado.csv',
    'data/feature_dia_mas_activo.csv',
    'data/feature_compra_onsite.csv',
    'data/feature_cantidad_de_eventos_en_intervalos.csv',
    'data/feature_dispositivos_storage.csv',
]
df1, df2, df3, df4, df5, df6, df7, df8 = [
    pd.read_csv(path, low_memory=False, index_col='person') for path in feature_files
]

labels = pd.read_csv('data/labels_training_set.csv', low_memory=False, index_col='person')

# Inner-join the tables one after another on their shared 'person' index, so only
# users present in every feature table are kept.
df_unidos = df1
for extra in (df2, df3, df4, df5, df6, df7, df8):
    df_unidos = df_unidos.join(extra, how='inner')

# Attach the features to the subset of users that Trocafone provides for training.
# Note: despite its name, df_test holds the labelled rows used for training/validation.
df_test = df_unidos.join(labels, on='person', how='inner')

In [23]:
# Preview the first five rows of the labelled feature table.
df_test.head(n=5)

Unnamed: 0_level_0,vio_productos,tiene_checkouts,tiene_brand_listing,tiene_generic_listing,tiene_visitas,llegaron_por_ad,llegaron_por_search,tiene_leads,tiene_busquedas,tiene_conversions,...,Wednesday,compro_onsite,cant_eventos_4_a_10hs,cant_eventos_11_a_14hs,cant_eventos_15_a_20hs,cant_eventos_21_a_3hs,storage_mas_visto,storage_mas_checkout,storage_mas_eventos,label
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ad93850f,True,True,True,True,True,True,True,False,False,False,...,0,False,0.0,0.0,0.0,65.0,5.0,4.0,5.0,0
1b9f7cf6,True,True,True,False,True,False,True,False,True,False,...,0,False,2.0,0.0,15.0,8.0,3.0,3.0,3.0,0
de8fe91b,True,True,True,True,True,False,True,False,True,False,...,0,False,0.0,0.0,0.0,53.0,4.0,4.0,4.0,0
45baf068,True,True,True,True,True,True,False,False,False,False,...,0,False,0.0,0.0,0.0,20.0,4.0,4.0,4.0,0
99abca5a,True,True,True,True,True,True,True,True,True,True,...,0,True,3.0,37.0,78.0,699.0,4.0,4.0,4.0,0


In [24]:
# Model inputs: every column of df_test except the target column 'label'.
features = df_test.columns.tolist()
del features[features.index('label')]

## Veamos si las labels están balanceadas

In [25]:
# Relative frequency of each class in the training labels.
labels.loc[:, 'label'].value_counts(normalize=True)

0    0.949521
1    0.050479
Name: label, dtype: float64

# Entrenando el Random Forest

In [45]:
# Hyper-parameter grid explored by the grid search (4 x 5 x 6 = 120 combinations).
param_grid = {
    'n_estimators': list(range(200, 400, 50)),
    'max_depth': [4, 5, 6, 7, 8],
    'min_samples_split': list(range(50, 301, 50)),
}

In [46]:
# Split the labelled data in two parts: 80% to train the model and the
# remaining 20% held out to test it.
X_train, X_test, y_train, y_test = train_test_split(
    df_test[features],
    df_test['label'],
    test_size=0.20,
    random_state=80,
)

In [47]:
# Base estimator for the search; the seed is fixed so runs are repeatable.
rfc = RandomForestClassifier(random_state=80, class_weight='balanced')

# 5-fold cross-validated search over param_grid, scored by ROC AUC, using 4 workers.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,
)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=None, oob_score=False,
            random_state=80, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'n_estimators': [200, 250, 300, 350], 'max_depth': [4, 5, 6, 7, 8], 'min_samples_split': [50, 100, 150, 200, 250, 300]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [48]:
# Parameter combination selected by the grid search fitted on X_train / y_train.
CV_rfc.best_params_

{'max_depth': 7, 'min_samples_split': 300, 'n_estimators': 350}

In [17]:
# 83-point submission
#  class_weight='balanced' gives the classes to predict the same importance, regardless (I think)
#  of their distribution.
#  The parameter values were chosen by trial and error; it is not yet clear to me how they affect the results.
#rf = RandomForestClassifier(n_estimators=250, n_jobs=2, min_samples_split=200,\
#                             random_state=80, class_weight='balanced')

In [50]:
# Final model, using the hyper-parameters reported by the grid search
# (max_depth=7, min_samples_split=300, n_estimators=350).
rf = RandomForestClassifier(n_estimators=350, n_jobs=2, min_samples_split=300,
                            max_depth=7, random_state=80, class_weight='balanced')
# The split cell binds the targets as lowercase y_train / y_test; the previous
# Y_train / Y_test names are never defined and fail on a fresh kernel.
rf.fit(X_train, y_train)
Y_pred = rf.predict_proba(X_test)
# Second column of predict_proba = probability of the positive class (label == 1).
Y_pred_proba = Y_pred[:, 1]
print(roc_auc_score(y_test, Y_pred_proba))

0.8249423187705018


In [51]:
# Importance the fitted forest assigns to each training column, highest first.
feature_importances = pd.DataFrame(
    {'importance': rf.feature_importances_},
    index=X_train.columns,
).sort_values(by='importance', ascending=False)
#  The features end up ordered by their importance to the model; apparently the most
#  viewed brand and the brand each user bought the most are not that important.
feature_importances.head(40)

Unnamed: 0,importance
cant_checkouts,0.178146
storage_mas_checkout,0.176172
tiene_checkouts,0.163207
total_sesiones,0.042537
cant_viewed_product,0.040983
cant_visitas,0.035424
cant_brand_listings,0.031139
cant_conversions,0.029415
tiene_conversions,0.025613
cant_generic_listings,0.025509


# Preparamos el submit

In [52]:
# Users to score for Kaggle, indexed by 'person'.
df_submit = pd.read_csv(
    'data/trocafone_kaggle_test.csv',
    low_memory=False,
    index_col='person',
)
# Attach the joined feature table; the inner join keeps only users that have features.
df_events = df_submit.join(df_unidos, how='inner')

In [53]:
# Preview the first five rows of the feature table used for the submission.
df_events.head(n=5)

Unnamed: 0_level_0,vio_productos,tiene_checkouts,tiene_brand_listing,tiene_generic_listing,tiene_visitas,llegaron_por_ad,llegaron_por_search,tiene_leads,tiene_busquedas,tiene_conversions,...,Tuesday,Wednesday,compro_onsite,cant_eventos_4_a_10hs,cant_eventos_11_a_14hs,cant_eventos_15_a_20hs,cant_eventos_21_a_3hs,storage_mas_visto,storage_mas_checkout,storage_mas_eventos
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4886f805,True,True,False,True,True,False,True,False,True,False,...,0,0,False,0.0,0.0,0.0,9.0,4.0,4.0,4.0
0297fc1e,True,True,True,True,True,True,False,True,True,False,...,0,0,False,37.0,120.0,199.0,187.0,3.0,3.0,3.0
2d681dd8,True,True,True,True,True,True,True,False,True,False,...,0,0,False,0.0,0.0,16.0,10.0,2.0,4.0,2.0
cccea85e,True,True,True,True,True,True,True,False,True,False,...,0,0,False,0.0,77.0,241.0,518.0,4.0,4.0,4.0
4c8a8b93,True,True,True,True,True,True,True,False,True,False,...,0,0,False,0.0,66.0,17.0,174.0,3.0,5.0,3.0


In [54]:
# Score every submission row with the fitted forest.
kaggle_pred = rf.predict_proba(df_events)
# Keep only the second column: the predicted probability of label == 1.
proba_de_comprar = list(kaggle_pred[:, 1])
series = pd.Series(proba_de_comprar)
# Positional assignment: relies on df_events having the same rows, in the same
# order, as df_submit.
df_submit['label'] = series.values

In [55]:
# Write the submission file; the 'person' index is written as the first column.
df_submit.to_csv(path_or_buf='submit.csv')