In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('data/train.csv',na_values=-1)
test = pd.read_csv('data/test.csv',na_values=-1)

In [5]:
# Number of missing values
# train.isnull().sum()

## Preparing the data for machine learning

On met les noms des features dans trois listes, une pour chaque type. La liste features_names contient le nom de toutes les features.

In [6]:
cat_feat = train.filter(like='cat', axis=1).columns.values.tolist()

bin_feat = train.filter(like='bin', axis=1).columns.values.tolist()

num_feat = list(set(list(train)) - set(cat_feat)- set(bin_feat))
num_feat.remove('target')
num_feat.remove('id')

features_names = cat_feat + bin_feat + num_feat

On change le type des variables catégorielles

In [7]:
for var in bin_feat + cat_feat:
    train[var] = train[var].astype('category')
    test[var] = test[var].astype('category')

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     594996 non-null category
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595129 non-null category
ps_ind_05_cat     589403 non-null category
ps_ind_06_bin     595212 non-null category
ps_ind_07_bin     595212 non-null category
ps_ind_08_bin     595212 non-null category
ps_ind_09_bin     595212 non-null category
ps_ind_10_bin     595212 non-null category
ps_ind_11_bin     595212 non-null category
ps_ind_12_bin     595212 non-null category
ps_ind_13_bin     595212 non-null category
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null category
ps_ind_17_bin     595212 non-null category
ps_ind_18_bin     595212 non-null category
ps_reg_01         595212 non-null float64
ps_re

In [9]:
train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2.0,5,1.0,0.0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1.0,7,0.0,0.0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4.0,9,1.0,0.0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1.0,2,0.0,0.0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2.0,0,1.0,0.0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


On change le format des variables catégorielles avec la fonction get_dummies, voir le TP  de Guilloux gro_train.

In [10]:
Train_cnt = train[num_feat]
Train_bin = train[bin_feat]
Train_cat = pd.get_dummies(train[cat_feat],
                              prefix_sep='#',
                              drop_first=True)
Test_cnt = test[num_feat]
Test_bin = test[bin_feat]
Test_cat = pd.get_dummies(test[cat_feat],
                              prefix_sep='#',
                              drop_first=True)

## Save the data without handling missing values and nomalization

In [11]:
X_cnt = Train_cnt.as_matrix()
X_cat = Train_cat.as_matrix()
X_bin = Train_bin.as_matrix()

X_cnt_test = Test_cnt.as_matrix()
X_cat_test = Test_cat.as_matrix()
X_bin_test = Test_bin.as_matrix()

In [12]:
XGB = np.concatenate((X_cnt, X_bin, X_cat), axis=1)
XGB_test = np.concatenate((X_cnt_test, X_bin_test, X_cat_test), axis=1)
yGB = train['target']

In [13]:
np.save('/tmp/XGB', XGB)
np.save('/tmp/XGB_test', XGB_test)

## On remplace les valeurs manquantes des variables continues par la moyenne puis on les met dans une matrice.

In [14]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_cnt = imp.fit_transform(Train_cnt)
X_cnt_test = imp.fit_transform(Test_cnt)

On normalise les données.

In [15]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
X_cnt = MinMaxScaler().fit_transform(X_cnt)
X_cnt_test = MinMaxScaler().fit_transform(X_cnt_test)

In [17]:
X = np.concatenate((X_cnt, X_bin, X_cat), axis=1)
X_test = np.concatenate((X_cnt_test, X_bin_test,X_cat_test), axis=1)
y = train['target']

In [18]:
np.save('/tmp/X', X)
np.save('/tmp/X_test', X_test)
np.save('/tmp/y', y)

## XGBoost (Code de Binh)

In [20]:
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV



In [21]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

# custom normalized gini score for xgb model
def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', normalized_gini(pred, y)

In [22]:
submission = test['id'].to_frame()
submission['target'] = 0

In [23]:
XGB.shape

(595212, 204)

In [25]:
kfold = 5
nrounds=500
params = {
    'eta': 0.07,
    'max_depth': 5,
    'subsample': 0.8, #
    'colsample_bytree':0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True, # ok
    'n_jobs':-1 # use all CPU cores
}

kf = KFold(n_splits=kfold, shuffle=True, random_state=322)

for i, (train_index, cv_index) in enumerate(kf.split(X)):

    print(' xgb kfold: {} of {} : '.format(i+1, kfold))
    X_train, X_eval = XGB[train_index,:], XGB[cv_index,:]
    y_train, y_eval = y[train_index], y[cv_index]

    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_eval, y_eval)

    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    xgb_model = xgb.train(params,
                          d_train,
                          nrounds, # ok
                          watchlist,
                          early_stopping_rounds=100, # ok
                          feval=gini_xgb,
                          maximize=True,
                          verbose_eval=50)
    # Updating prediction by taking the average
    submission['target'] += xgb_model.predict(xgb.DMatrix(XGB_test)) / kfold

 xgb kfold: 1 of 5 : 
[0]	train-gini:0.191481	eval-gini:0.171372
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.29136	eval-gini:0.244537
[100]	train-gini:0.335884	eval-gini:0.261076
[150]	train-gini:0.364687	eval-gini:0.263505
[200]	train-gini:0.392813	eval-gini:0.265438
[250]	train-gini:0.416536	eval-gini:0.265082
[300]	train-gini:0.438769	eval-gini:0.263516
Stopping. Best iteration:
[206]	train-gini:0.395615	eval-gini:0.265989

 xgb kfold: 2 of 5 : 
[0]	train-gini:0.188744	eval-gini:0.173038
Multiple eval metrics have been passed: 'eval-gini' will be used for early stopping.

Will train until eval-gini hasn't improved in 100 rounds.
[50]	train-gini:0.287392	eval-gini:0.251172
[100]	train-gini:0.335465	eval-gini:0.273693
[150]	train-gini:0.366161	eval-gini:0.276636
[200]	train-gini:0.392968	eval-gini:0.27711
[250]	train-gini:0.419909	eval-gini:0.277109
[300]	train-gini:0.

In [26]:
submission.id = submission.id.astype(int)
submission.to_csv('Submissions/PS-02-01.csv',index=False)

In [25]:
scores = {'XGBoost' : (0.28208,0.27825)}

## Régression logistique simple

In [28]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
logreg_cv = LogisticRegressionCV(penalty='l2', class_weight='balanced',
                                 Cs=np.logspace(-3, 7, 10),
                                 cv=5, n_jobs=-1, refit=True)
logreg_cv.fit(X, y)

In [29]:
submission['target'] = logreg_cv.predict(X_test)

In [30]:
submission.id = submission.id.astype(int)
submission.to_csv('subl.csv',index=False)

In [32]:
scores['LogReg'] = (0.18423,0.18040)

In [33]:
scores

{'LogReg': (0.18423, 0.1804), 'XGBoost': (0.28208, 0.27825)}