# P5 - Segmentez les comportements de clients

In [1]:
import pandas as pd
import numpy as np
import os

from sklearn import dummy
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, mean_squared_error, accuracy_score
from sklearn.preprocessing import MinMaxScaler



In [2]:
# Méthodes d'export
from sklearn.externals import joblib
CT_DIR = 'obj/'

def save_sklearn_obj(obj, name):
    fn = CT_DIR + name + '.pkl'
    try:
        os.remove(fn)
    except OSError:
        pass
    joblib.dump(obj, fn)
    print(fn, 'saved')

## Exploration de l'historique des clients

In [3]:
df = pd.read_csv('dfTotale.csv', sep="\t", encoding='utf-8')
print(df.shape)
df.head()

(4372, 88)


Unnamed: 0,CustomerID,recency,frequency,monetary_value,Cancel,Promo,UK,CODE_POST,CODE_DISCOUNT,CODE_CARRIAGE,...,"('TotalPrice', 'min', 4)","('TotalPrice', 'min', 5)","('TotalPrice', 'min', 6)","('TotalPrice', 'min', 7)","('TotalPrice', 'min', 8)","('TotalPrice', 'min', 9)","('TotalPrice', 'min', 10)","('TotalPrice', 'min', 11)","('TotalPrice', 'min', 12)",CustomerCategory
0,12346.0,325,2,0.0,1,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
1,12347.0,2,182,4310.0,0,0,0,0,0,0,...,6.0,0.0,10.2,0.0,8.5,0.0,8.4,0.0,8.4,2
2,12348.0,75,31,1797.24,0,0,0,4,0,0,...,17.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,13.2,5
3,12349.0,18,73,1757.55,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.64,0.0,2
4,12350.0,310,17,334.4,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [4]:
df.index = df.CustomerID
del df['CustomerID']

# On enlève les features redondantes
#del df['recency']
#del df['frequency']
#del df['monetary_value']
#del df['RFMScore']

df.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,Cancel,Promo,UK,CODE_POST,CODE_DISCOUNT,CODE_CARRIAGE,CODE_MANUAL,...,"('TotalPrice', 'min', 4)","('TotalPrice', 'min', 5)","('TotalPrice', 'min', 6)","('TotalPrice', 'min', 7)","('TotalPrice', 'min', 8)","('TotalPrice', 'min', 9)","('TotalPrice', 'min', 10)","('TotalPrice', 'min', 11)","('TotalPrice', 'min', 12)",CustomerCategory
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12346.0,325,2,0.0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
12347.0,2,182,4310.0,0,0,0,0,0,0,0,...,6.0,0.0,10.2,0.0,8.5,0.0,8.4,0.0,8.4,2
12348.0,75,31,1797.24,0,0,0,4,0,0,0,...,17.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,13.2,5
12349.0,18,73,1757.55,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.64,0.0,2
12350.0,310,17,334.4,0,0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [5]:
# La target correspond à la colonne du cluster déterminé précédemment
y = df.CustomerCategory

# Gardons comme données toutes les colonnes sauf la target bien sûr
X = df.copy()
del X['CustomerCategory']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

scaler = MinMaxScaler().fit(X_train)
save_sklearn_obj(scaler, 'model_scaler')

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape)
print(X_test.shape)

obj/model_scaler.pkl saved
(2929, 86)
(1443, 86)


## Méthodes unitaires - Recherche des meilleurs paramètres

In [6]:
parametres = {
    'C': np.logspace(-3, 3, 7),
    'penalty':['l1','l2']}

gs = GridSearchCV(LogisticRegression(), param_grid=parametres, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("Score = %.3f" % gs.score(X_test, y_test))
print ("Best params = %s" % gs.best_params_)

Score = 0.993
Best params = {'C': 1.0, 'penalty': 'l1'}


In [7]:
parametres = {
    'kernel': ['linear','rbf','sigmoid'],
    'C': np.logspace(-3, 4, 8)}

gs = GridSearchCV(SVC(kernel='linear', probability=True, max_iter=500000), param_grid=parametres, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("Score = %.3f" % gs.score(X_test, y_test))
print ("Best params = %s" % gs.best_params_)

Score = 0.999
Best params = {'C': 1000.0, 'kernel': 'linear'}


In [8]:
parametres = {
    "learning_rate":[0.3,0.2,0.1,0.05,0.01],
    "max_depth":[2,3,4,5,6],
    "subsample":[1.0,0.8,0.5],
    "max_features":[None,'sqrt','log2']}

gs = GridSearchCV(GradientBoostingClassifier(), param_grid=parametres, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("Score = %.3f" % gs.score(X_test, y_test))
print ("Best params = %s" % gs.best_params_)


Score = 0.997
Best params = {'learning_rate': 0.2, 'max_depth': 4, 'max_features': None, 'subsample': 1.0}


In [15]:
parametres = {
    "n_estimators":[100,200,300,400,500]}

gs = GridSearchCV(GradientBoostingClassifier(), param_grid={}, cv=5)
gs.fit(X_train, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## Méthodes de classification

In [9]:
def RunModel(model, label, params={}):
    print (label)
    gs = GridSearchCV(model, param_grid=params, cv=5, scoring='accuracy')
    res = gs.fit(X_train, y_train)

    # On affiche le score
    print ("\tScore = %.3f" % res.best_score_)

    sc = gs.score(X_test, y_test)
    print ("\tScore jeu de test = %.3f" % sc)
    if len(gs.best_params_) > 2:
        print ("\tBest params = %s" % gs.best_params_)

    if sc > 0.98:
        # On ajoute l'estimateur à la liste des votant pour le VotingClassifier
        print('\tAjouté au vote !')
        weights.append(1)
        estimators.append((label, gs))
        
    return gs

In [10]:
weights = []
estimators = []

# Méthode naive par la classe la plus fréquente pour comparer
RunModel(dummy.DummyClassifier(strategy='most_frequent'), 'Méthode naive')

RunModel(GaussianNB(), 'GaussianNB')

RunModel(KNeighborsClassifier(), 'KNeighborsClassifier')

parametres = {
    'C': np.logspace(-3, 3, 7),
    'penalty':['l1','l2']}
RunModel(LogisticRegression(), 'LogisticRegression', parametres)

RunModel(BaggingClassifier(n_estimators=100, max_samples=0.5, max_features=0.5), 'BaggingClassifier')

RunModel(RandomForestClassifier(n_estimators=100), 'RandomForestClassifier')

RunModel(ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0), 'ExtraTreesClassifier')

parametres = {
    'kernel': ['linear','rbf','sigmoid'],
    'C': np.logspace(-3, 4, 8)}
RunModel(SVC(probability=True, max_iter=500000, verbose=0), 'LinearSVC', parametres)

RunModel(AdaBoostClassifier(n_estimators=100), 'AdaBoostClassifier')

parametres = {
    "learning_rate":[0.3,0.2,0.1,0.05,0.01],
    "max_depth":[2,3,4,5,6],
    "subsample":[1.0,0.8,0.5],
    "max_features":[None,'sqrt','log2']}
RunModel(GradientBoostingClassifier(n_estimators=100, random_state=0), 'GradientBoostingClassifier', parametres)

RunModel(XGBClassifier(), 'XGBClassifier')

Méthode naive
	Score = 0.284
	Score jeu de test = 0.284
GaussianNB
	Score = 0.723
	Score jeu de test = 0.726
KNeighborsClassifier
	Score = 0.995
	Score jeu de test = 0.993
	Ajouté au vote !
LogisticRegression
	Score = 0.992
	Score jeu de test = 0.993
	Ajouté au vote !
BaggingClassifier
	Score = 0.984
	Score jeu de test = 0.980
RandomForestClassifier
	Score = 0.984
	Score jeu de test = 0.990
	Ajouté au vote !
ExtraTreesClassifier
	Score = 0.992
	Score jeu de test = 0.993
	Ajouté au vote !
LinearSVC
	Score = 0.998
	Score jeu de test = 0.999
	Ajouté au vote !
AdaBoostClassifier
	Score = 0.565
	Score jeu de test = 0.559
GradientBoostingClassifier
	Score = 0.997
	Score jeu de test = 0.996
	Best params = {'learning_rate': 0.2, 'max_depth': 4, 'max_features': None, 'subsample': 1.0}
	Ajouté au vote !
XGBClassifier
	Score = 0.997
	Score jeu de test = 0.995
	Ajouté au vote !


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [12]:
estimators

[('KNeighborsClassifier', GridSearchCV(cv=5, error_score='raise',
         estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=5, p=2,
             weights='uniform'),
         fit_params=None, iid=True, n_jobs=1, param_grid={},
         pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
         scoring='accuracy', verbose=0)),
 ('LogisticRegression', GridSearchCV(cv=5, error_score='raise',
         estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False),
         fit_params=None, iid=True, n_jobs=1,
         param_grid={'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
           1.00000e+01,   1.00000e+02,   1.00000e+03]), 'penalty': ['l1', 'l

In [None]:
# Aggrégation de modèles
# On va maintenant utiliser l'ensemble des estimateurs précédents pour faire un choix à la majorité
eclf = VotingClassifier(estimators=estimators, voting='soft', weights=weights)

gs = GridSearchCV(eclf, param_grid={}, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("\tScore = %.3f" % gs.score(X_test, y_test))
print ("\tBest params = %s" % gs.best_params_)

eclf = RunModel(eclf, 'VotingClassifier')
save_sklearn_obj(eclf, 'model')

	Score = 0.998
	Best params = {}
VotingClassifier


In [None]:
save_sklearn_obj(eclf, 'model')