In [17]:
import pandas as pd
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

In [9]:
df1 = pd.read_csv('data/encoding_stats_features.csv', index_col='person')
df2 = pd.read_csv('data/viewed_product_stats.csv', index_col='person')
df3 = pd.read_csv('data/features_basicas.csv', index_col='person')

labels = pd.read_csv('data/labels_training_set.csv', index_col='person')

In [10]:
# joineando df1 con df3
df_final = df3.join(df1, how='left', on='person')
df_final = df_final.fillna(df_final.mean())

In [11]:
df_final = df_final.join(df2, how='left', on='person')
df_final = df_final.fillna(df_final.mean())

In [12]:
df_final = df_final.drop(labels=['checked_out'], axis=1)
df_features = df_final.copy()
df_final = df_final.join(labels, how='inner', on='person')

### Balanceamos un poco el set de datos (down-sampling)

Usando el resample de sklearn, desechamos cierta cantidad de no-compradores para que se balanceen las clases un poco mas.

In [14]:
df_no_compradores = df_final[df_final.label==0]
df_compradores = df_final[df_final.label==1]

proporcion_compradores = 0.1  # el 10% ahora son compradores

df_mas_compradores = resample(df_no_compradores, 
                              replace=True,
                              n_samples=int(df_no_compradores.label.value_counts()[0] * (1-proporcion_compradores)),
                              random_state=123)
df_final = pd.concat([df_mas_compradores, df_no_compradores])

In [23]:
features = df_final.columns.tolist()
features.remove('label')

X_train, X_test, Y_train, Y_test = train_test_split(df_final[features],
                                                    df_final['label'], 
                                                    test_size=0.0, 
                                                    random_state=123
                                                   )

### Definimos que parametros vamos a tunear

In [66]:
params = {
    'min_child_weight': [1, 5, 10, 7],
    'gamma': [0.5, 1, 1.5, 2, 5, 0.25],
    'subsample': [0.6, 0.4, 1.0, 0.8],
    'colsample_bytree': [0.6, 0.8, 1.0, 0.4],
    'max_depth': [3, 4, 6, 8, 10]
}

### Instanciamos el modelo de XGB que vamos a usar

In [67]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=300, silent=True, nthread=4)

### Definimos variables necesarias para K-Folds

In [68]:
folds = 10
param_comb = 5

kfolds = StratifiedKFold(n_splits=folds, shuffle=True, random_state=123)

In [69]:
random_search = RandomizedSearchCV(
    xgb, 
    param_distributions=params, 
    n_iter=param_comb, 
    scoring='roc_auc',
    cv=kfolds.split(X_train, Y_train),
    random_state=123,
    verbose=3,
    n_jobs=4
)

In [70]:
random_search.fit(X_train, Y_train)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  6.3min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x000001A7019838E0>,
          error_score='raise',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.02, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=5, n_jobs=4,
          param_distributions={'min_child_weight': [1, 5, 10, 7], 'gamma': [0.5, 1, 1.5, 2, 5, 0.25], 'subsample': [0.6, 0.4, 1.0, 0.8], 'colsample_bytree': [0.6, 0.8, 1.0, 0.4], 'max_depth': [3, 4, 6, 8, 10]},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=3)

In [56]:
print(random_search.best_estimator_)
print()
print('Best params: {}'.format(random_search.best_params_))

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=1, learning_rate=0.02, max_delta_step=0,
       max_depth=4, min_child_weight=10, missing=None, n_estimators=300,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.6)

Best params: {'subsample': 0.6, 'min_child_weight': 10, 'max_depth': 4, 'gamma': 1, 'colsample_bytree': 0.6}


### Hacer un submit

Ahora lo que queda es ver que tan bien le va con el submit de kaggle

In [63]:
df_submit = pd.read_csv('data/trocafone_kaggle_test.csv', low_memory=False, index_col='person')
df_events = df_submit.join(df_features, how='inner')
df_submit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19415 entries, 4886f805 to 80aea0a0
Empty DataFrame

In [64]:
kaggle_pred = random_search.predict_proba(df_events)
proba_de_comprar = [x[1] for x in kaggle_pred]
series = pd.Series(proba_de_comprar)
df_submit['label'] = series.values

In [65]:
df_submit.to_csv('submit.csv')