In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score

In [2]:
# Per-user feature tables: one CSV per feature group, each indexed by `person`.
feature_files = [
    'data/features_basicas.csv',
    'data/features_checkouts.csv',
    'data/features_vistos.csv',
    'data/feature_dispositivo_mas_usado.csv',
    'data/feature_dia_mas_activo.csv',
    'data/feature_compra_onsite.csv',
    'data/feature_cantidad_de_eventos_en_intervalos.csv',
    'data/feature_dispositivos_storage.csv',
    'data/featureUsuarioSonDeSaoPablo.csv',
    'data/featureCantidadMaximaPersonaVeUnProducto.csv',
    'data/featureUltimaConexion.csv',
    'data/features_basicas_ult_quin.csv',
]
feature_frames = [
    pd.read_csv(path, low_memory=False, index_col='person')
    for path in feature_files
]

# Keep each table reachable under its original name.
(df1, df2, df3, df4, df5, df6,
 df7, df8, df9, df10, df11, df12) = feature_frames

labels = pd.read_csv('data/labels_training_set.csv', low_memory=False, index_col='person')

# Inner-join the tables on their index, left to right, so that only users
# present in every feature table remain.
df_unidos = feature_frames[0]
for frame in feature_frames[1:]:
    df_unidos = df_unidos.join(frame, how='inner')

# Attach the features to the subset of users that Trocafone provides for training.
# NOTE: despite its name, df_test holds the labelled rows used to fit the model.
df_test = df_unidos.join(labels, on='person', how='inner')

In [3]:
# Sanity check: (rows, columns) of the labelled frame; columns = features + `label`.
df_test.shape

(19414, 67)

In [4]:
# Predictor column names: every column of the labelled frame except the target.
features = df_test.columns.tolist()
del features[features.index('label')]

In [5]:
# Split the labelled frame into predictors and target.
X, Y = df_test[features], df_test['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split = train_test_split(X, Y, random_state=311, test_size=0.20)
X_train, X_test, y_train, y_test = split

In [None]:
# Hyperparameter search space handed to the grid search.
param_grid = dict(
    n_estimators=list(range(100, 500, 50)),  # 100, 150, ..., 450
    max_depth=[3, 4, 5],
    scale_pos_weight=list(range(1, 4)),      # 1, 2, 3
)

In [None]:
# Exhaustive search over `param_grid`, scored by ROC AUC with 5-fold
# cross-validation on the training split.
xgb = XGBClassifier(random_state=80)
CV_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=4,  # run up to four fits in parallel
)
CV_xgb.fit(X_train, y_train)

In [None]:
# Parameter combination from `param_grid` that scored best in the grid search.
CV_xgb.best_params_

In [None]:
# Mean cross-validated ROC AUC (scoring='roc_auc') of that best combination.
CV_xgb.best_score_

In [6]:
# Final model with fixed hyperparameters.
# NOTE(review): presumably the values picked by the grid search above; no
# recorded output confirms it.
model_xgb = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    scale_pos_weight=3,
    random_state=80,
)
model_xgb.fit(X_train, y_train)

# predict_proba returns one column per class; column index 1 is taken as the
# probability of the positive label (assumes binary labels, TODO confirm).
y_pred = model_xgb.predict_proba(X_test)
y_pred_proba = list(y_pred[:, 1])

# ROC AUC on the held-out 20% split.
print(roc_auc_score(y_test, y_pred_proba))

0.8542503924646783


In [7]:
# Features sorted by their importance to the model. Apparently the most-viewed
# brand and the brand each user bought the most are not that important.
feature_importances = pd.DataFrame(
    {'importance': model_xgb.feature_importances_},
    index=X.columns,
).sort_values(by='importance', ascending=False)

# Show the ten most important features.
feature_importances.head(10)

Unnamed: 0,importance
ultimaConexion,0.290625
ult_quin_cant_checkouts,0.104687
cant_eventos_21_a_3hs,0.045313
cant_viewed_product,0.040625
cant_eventos_11_a_14hs,0.035937
cant_brand_listings,0.032812
dispositivo_Computer,0.028125
cant_generic_listings,0.028125
cant_searched_product,0.025
tiene_conversions,0.025


In [8]:
# Users to score for the Kaggle submission, indexed by `person`.
submit_path = 'data/trocafone_kaggle_test.csv'
df_submit = pd.read_csv(submit_path, index_col='person', low_memory=False)

# Attach the engineered features; the inner join keeps only users that have
# a row in df_unidos.
df_events = df_submit.join(df_unidos, how='inner')

In [9]:
# Sanity check: (rows, columns) of the frame that will be scored.
df_events.shape

(19415, 66)

In [10]:
# Score the Kaggle users. Selecting `features` explicitly guarantees the model
# receives exactly the columns it was trained on, in the training order, even
# if df_events ever carries extra or reordered columns.
kaggle_pred = model_xgb.predict_proba(df_events[features])

# Column index 1 of predict_proba is taken as the purchase probability
# (assumes binary labels with the positive class second, TODO confirm).
proba_de_comprar = kaggle_pred[:, 1]

# The assignment below is positional, so it is only valid when the inner join
# that built df_events kept every submission row in its original order.
# Fail loudly instead of silently attaching predictions to the wrong users.
if not df_events.index.equals(df_submit.index):
    raise ValueError(
        'df_events is not row-aligned with df_submit: the inner join dropped '
        'or reordered users, so predictions cannot be assigned by position'
    )

series = pd.Series(proba_de_comprar)
df_submit['label'] = series.values

In [11]:
# Write the submission frame, including its `person` index, to submit.csv
# in the current working directory.
df_submit.to_csv('submit.csv')