In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score

from sklearn.manifold import mds
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import RandomForestClassifier

In [2]:
df1 = pd.read_csv('../data/features_basicas.csv', low_memory=False, index_col='person')
df2 = pd.read_csv('../data/features_checkouts.csv', low_memory=False, index_col='person')
df3 = pd.read_csv('../data/features_vistos.csv', low_memory=False, index_col='person')
df4 = pd.read_csv('../data/feature_dispositivo_mas_usado.csv', low_memory=False, index_col='person')
df5 = pd.read_csv('../data/feature_dia_mas_activo.csv', low_memory=False, index_col='person')
df6 = pd.read_csv('../data/feature_compra_onsite.csv', low_memory=False, index_col='person')
df7 = pd.read_csv('../data/feature_cantidad_de_eventos_en_intervalos.csv', low_memory=False, index_col='person')
df8 = pd.read_csv('../data/feature_dispositivos_storage.csv', low_memory=False, index_col='person')
df9 = pd.read_csv('../data/featureUsuarioSonDeSaoPablo.csv', low_memory=False, index_col='person')
df10 = pd.read_csv('../data/featureCantidadMaximaPersonaVeUnProducto.csv', low_memory=False, index_col='person')
df11 = pd.read_csv('../data/featureUltimaConexion.csv', low_memory=False, index_col='person')
df12 = pd.read_csv('../data/features_basicas_ult_quin.csv', low_memory=False, index_col='person')
df14 = pd.read_csv('../data/days_elapsed_from_last_event.csv', low_memory=False, index_col='person')
df15 = pd.read_csv('../data/featureUsuarioRealiza30EventosEn20MinsLosUltimos2Dias.csv', low_memory=False, index_col='person')
df16 = pd.read_csv('../data/features_ultimo_checkout.csv', low_memory=False, index_col='person')
df17 = pd.read_csv('../data/diferencia_de_count_de_eventos.csv', low_memory=False, index_col='person')
df18 = pd.read_csv('../data/top_10_celulares.csv', low_memory=False, index_col='person')
df19 = pd.read_csv('../data/feature_cantidad_de_eventos_en_intervalos_ult_quin.csv', low_memory=False, index_col='person')

labels = pd.read_csv('../data/labels_training_set.csv', low_memory=False, index_col='person')

df_unidos = df1.join(df2, how='inner')
df_unidos = df_unidos.join(df3, how='inner')
df_unidos = df_unidos.join(df4, how='inner')
df_unidos = df_unidos.join(df5, how='inner')
df_unidos = df_unidos.join(df6, how='inner')
df_unidos = df_unidos.join(df7, how='inner')
df_unidos = df_unidos.join(df8, how='inner')
df_unidos = df_unidos.join(df9, how='inner')
df_unidos = df_unidos.join(df10, how='inner')
df_unidos = df_unidos.join(df11, how='inner')
df_unidos = df_unidos.join(df12, how='inner')
df_unidos = df_unidos.join(df14, how='inner')
df_unidos = df_unidos.join(df15, how='inner')
df_unidos = df_unidos.join(df16, how='inner')
df_unidos = df_unidos.join(df17).fillna(0)
df_unidos = df_unidos.join(df18, how='inner')
df_unidos = df_unidos.join(df19, how='inner')

#  Le asigno las features al subset de usuarios que nos da Trocafone para entrenar.
df_test = df_unidos.join(labels, on='person', how='inner')
df_test.shape

(19414, 93)

#### Elegir las mejores features

ahora voy a correr un RF y usar el top10 de las features mas importantes

In [3]:
rf = RandomForestClassifier(n_estimators=100, criterion='entropy')

In [4]:
features = df_test.columns.tolist()
features.remove('label')

In [5]:
X = df_test[features]
Y = df_test['label']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=1, random_state=311)

In [6]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Nos quedamos con las 10 features mas importantes

In [7]:
feature_importances = pd.DataFrame(rf.feature_importances_,\
                                   index = X_train.columns,\
                                    columns=['importance']).sort_values('importance',ascending=False)
#  Acá quedan ordenadas las features por importancia para el modelo, aparentemente la marca más vista y la marca
#  que más compró cada usuario no es tan importante.
feature_importances.head(10)

Unnamed: 0,importance
days_elapsed,0.043111
tasa_ult_quin_cant_checkouts,0.042461
ult_quin_cant_checkouts,0.033429
storage_mas_checkout,0.03237
cant_viewed_product,0.028243
promedio_eventos_por_sesion,0.02807
primeraConexion,0.027116
model_encoding,0.026644
cantidadMaximaQueVeUnMismoProducto,0.025089
ultimaConexion,0.024688


#### Ahora vamos a llevar todos los puntos en 10 dimensiones a 2 o 3 dimensiones

In [8]:
df_a_reducir = df_test[[
    'tasa_ult_quin_cant_checkouts',
    'days_elapsed',
    'storage_mas_checkout',
    'primeraConexion',
    'cant_viewed_product',
    'ult_quin_cant_checkouts',
    'cant_checkouts',
    'promedio_eventos_por_sesion',
    'ultimaConexion',
    'tasa_ult_quin_cant_viewed_product'
]]

In [9]:
svd = TruncatedSVD(n_components=3, random_state=1)
features_reducidas = svd.fit_transform(df_a_reducir)

In [10]:
df_reducido = pd.DataFrame(features_reducidas)

df_test['svd_d1'] = df_reducido[0]
df_test['svd_d2'] = df_reducido[1]
df_test['svd_d3'] = df_reducido[2]

df_unidos['svd_d1'] = df_reducido[0]
df_unidos['svd_d2'] = df_reducido[1]
df_unidos['svd_d3'] = df_reducido[2]

In [11]:
df_unidos.columns

Index(['tiene_checkouts', 'llegaron_por_ad', 'llegaron_por_search',
       'tiene_conversions', 'cant_conversions', 'cant_checkouts',
       'cant_viewed_product', 'cant_searched_product', 'cant_visitas',
       'cant_leads', 'cant_brand_listings', 'cant_generic_listings',
       'cant_entradas_ads', 'cant_entradas_buscador', 'total_sesiones',
       'promedio_eventos_por_sesion', 'checkout_Apple', 'checkout_Asus',
       'checkout_LG', 'checkout_Lenovo', 'checkout_Motorola',
       'checkout_Quantum', 'checkout_Samsung', 'checkout_Sony',
       'vio_mas_Apple', 'vio_mas_Asus', 'vio_mas_LG', 'vio_mas_Lenovo',
       'vio_mas_Motorola', 'vio_mas_Quantum', 'vio_mas_Samsung',
       'vio_mas_Sony', 'dispositivo_Computer', 'dispositivo_Smartphone',
       'dispositivo_Tablet', 'dispositivo_Unknown', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday',
       'compro_onsite', 'cant_eventos_4_a_10hs', 'cant_eventos_11_a_14hs',
       'cant_eventos_15_a_20hs', 

In [12]:
df_test.shape

(19414, 96)

In [13]:
features = df_test.columns.tolist()
features.remove('label')

In [14]:
X = df_test[features]
Y = df_test['label']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=311)

In [15]:
param_grid = { 
    'n_estimators': [100, 300, 500],
    'max_depth' : [3,4,5],
    'scale_pos_weight': [1, 2, 4]
}

In [16]:
xgb = XGBClassifier(random_state=80)
CV_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, cv= 10, scoring='roc_auc', n_jobs=4, verbose=10)
CV_xgb.fit(X_train, y_train)

Fitting 10 folds for each of 27 candidates, totalling 270 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   42.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  3.3min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  7.4min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 10.4min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed: 15.1min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed: 18.8min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 20.3min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 21.8min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 26.7min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 31.6min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed: 40.3min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 44.0min
[Parallel(n_jobs=4)]: Do

GridSearchCV(cv=10, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=80, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'n_estimators': [100, 300, 500], 'max_depth': [3, 4, 5], 'scale_pos_weight': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=10)

In [25]:
CV_xgb.best_params_

{'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 1}

In [26]:
CV_xgb.best_score_

0.8732316017270474

In [27]:
model_xgb = XGBClassifier(scale_pos_weight=1, n_estimators=100, random_state=80, max_depth=3)
model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict_proba(X_test)
y_pred_proba = [p[1] for p in y_pred]
print(roc_auc_score(y_test, y_pred_proba))

0.8579604918890633


In [28]:
#  Acá quedan ordenadas las features por importancia para el modelo, aparentemente la marca más vista y la marca
#  que más compró cada usuario no es tan importante.
feature_importances = pd.DataFrame(model_xgb.feature_importances_,\
                                   index = X.columns,\
                                    columns=['importance']).sort_values('importance',ascending=False)

feature_importances.head(30)

Unnamed: 0,importance
days_elapsed,0.068285
cant_viewed_product,0.066768
storage_mas_checkout,0.06525
tasa_ult_quin_cant_checkouts,0.054628
ultimaConexion,0.047041
cant_eventos_21_a_3hs,0.037936
dispositivo_Computer,0.031866
tasa_ult_quin_cant_entradas_ads,0.027314
tasa_ult_quin_cant_conversions,0.025797
tiene_conversions,0.025797


In [29]:
df_submit = pd.read_csv('../data/trocafone_kaggle_test.csv', low_memory=False, index_col='person')
df_events = df_submit.join(df_unidos, how='inner')

In [30]:
df_events.shape

(19415, 95)

In [32]:
kaggle_pred = CV_xgb.predict_proba(df_events)
proba_de_comprar = [x[1] for x in kaggle_pred]
series = pd.Series(proba_de_comprar)
df_submit['label'] = series.values

In [33]:
df_submit.to_csv('../submit_svd_kfolds10.csv')