# P5 - Segmentez les comportements de clients
___

Nous avons maintenant un dataset représentant une liste de clients, un certain nombre de features les représentant, et une colonne désignant le segment auquel chaque client appartient, segment résultant du clustering effectué dans le notebook d'analyse.

Nous allons donc maintenant tâcher de trouver un modèle qui puisse prédire le segment en fonction des autres features, ce modèle sera sauvegardé et réutilisé dans la dernière partie de ce projet
___



In [1]:
# Importation des librairies

import pandas as pd
import numpy as np
import os

from sklearn import dummy
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, auc, mean_squared_error, accuracy_score
from sklearn.preprocessing import MinMaxScaler



In [2]:
# Méthodes d'export pour permettre la réutilisation dans le code final

from sklearn.externals import joblib
CT_DIR = 'obj/'

def save_sklearn_obj(obj, name):
    fn = CT_DIR + name + '.pkl'
    try:
        os.remove(fn)
    except OSError:
        pass
    joblib.dump(obj, fn)
    print(fn, 'saved')

## Récupération et préparation de la table

In [3]:
# Les données nettoyées et transformées par le notebook "Analyse" ont été stockées dans un fichier csv que l'on récupère ici
df = pd.read_csv('dfTotale.csv', sep="\t", encoding='utf-8')
print(df.shape)
pd.set_option('display.max_columns', 150)
df.head()

(4372, 87)


Unnamed: 0,CustomerID,recency,frequency,monetary_value,Cancel,Promo,UK,CODE_POST,CODE_DISCOUNT,CODE_CARRIAGE,...,"('TotalPrice', 'min', 4)","('TotalPrice', 'min', 5)","('TotalPrice', 'min', 6)","('TotalPrice', 'min', 7)","('TotalPrice', 'min', 8)","('TotalPrice', 'min', 9)","('TotalPrice', 'min', 10)","('TotalPrice', 'min', 11)","('TotalPrice', 'min', 12)",CustomerCategory
0,12346.0,325,2,0.0,1,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,12347.0,2,182,4310.0,0,0,0,0,0,0,...,6.0,0.0,10.2,0.0,8.5,0.0,8.4,0.0,8.4,3
2,12348.0,75,31,1797.24,0,0,0,4,0,0,...,17.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,13.2,3
3,12349.0,18,73,1757.55,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.64,0.0,3
4,12350.0,310,17,334.4,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [4]:
# L'identifiant client devient l'index de notre dataset
df.index = df.CustomerID
del df['CustomerID']
df.head()

Unnamed: 0_level_0,recency,frequency,monetary_value,Cancel,Promo,UK,CODE_POST,CODE_DISCOUNT,CODE_CARRIAGE,CODE_MANUAL,r_quartile,f_quartile,m_quartile,"('Quantity', 'sum', 1)","('Quantity', 'sum', 2)","('Quantity', 'sum', 3)","('Quantity', 'sum', 4)","('Quantity', 'sum', 5)","('Quantity', 'sum', 6)","('Quantity', 'sum', 7)","('Quantity', 'sum', 8)","('Quantity', 'sum', 9)","('Quantity', 'sum', 10)","('Quantity', 'sum', 11)","('Quantity', 'sum', 12)","('Quantity', 'max', 1)","('Quantity', 'max', 2)","('Quantity', 'max', 3)","('Quantity', 'max', 4)","('Quantity', 'max', 5)","('Quantity', 'max', 6)","('Quantity', 'max', 7)","('Quantity', 'max', 8)","('Quantity', 'max', 9)","('Quantity', 'max', 10)","('Quantity', 'max', 11)","('Quantity', 'max', 12)","('Quantity', 'min', 1)","('Quantity', 'min', 2)","('Quantity', 'min', 3)","('Quantity', 'min', 4)","('Quantity', 'min', 5)","('Quantity', 'min', 6)","('Quantity', 'min', 7)","('Quantity', 'min', 8)","('Quantity', 'min', 9)","('Quantity', 'min', 10)","('Quantity', 'min', 11)","('Quantity', 'min', 12)","('TotalPrice', 'sum', 1)","('TotalPrice', 'sum', 2)","('TotalPrice', 'sum', 3)","('TotalPrice', 'sum', 4)","('TotalPrice', 'sum', 5)","('TotalPrice', 'sum', 6)","('TotalPrice', 'sum', 7)","('TotalPrice', 'sum', 8)","('TotalPrice', 'sum', 9)","('TotalPrice', 'sum', 10)","('TotalPrice', 'sum', 11)","('TotalPrice', 'sum', 12)","('TotalPrice', 'max', 1)","('TotalPrice', 'max', 2)","('TotalPrice', 'max', 3)","('TotalPrice', 'max', 4)","('TotalPrice', 'max', 5)","('TotalPrice', 'max', 6)","('TotalPrice', 'max', 7)","('TotalPrice', 'max', 8)","('TotalPrice', 'max', 9)","('TotalPrice', 'max', 10)","('TotalPrice', 'max', 11)","('TotalPrice', 'max', 12)","('TotalPrice', 'min', 1)","('TotalPrice', 'min', 2)","('TotalPrice', 'min', 3)","('TotalPrice', 'min', 4)","('TotalPrice', 'min', 5)","('TotalPrice', 'min', 6)","('TotalPrice', 'min', 7)","('TotalPrice', 'min', 8)","('TotalPrice', 'min', 9)","('TotalPrice', 'min', 10)","('TotalPrice', 'min', 11)","('TotalPrice', 'min', 12)",CustomerCategory
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1
12346.0,325,2,0.0,1,0,1,0,0,0,0,4,4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74215.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-74215.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77183.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-77183.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
12347.0,2,182,4310.0,0,0,0,0,0,0,0,1,1,1,315.0,0.0,0.0,483.0,0.0,196.0,0.0,277.0,0.0,676.0,0.0,511.0,24.0,0.0,0.0,240.0,0.0,36.0,0.0,36.0,0.0,48.0,0.0,36.0,3.0,0.0,0.0,3.0,0.0,2.0,0.0,3.0,0.0,2.0,0.0,3.0,475.39,0.0,0.0,636.25,0.0,382.52,0.0,584.91,0.0,1294.32,0.0,936.61,38.25,0.0,0.0,249.6,0.0,53.1,0.0,106.2,0.0,106.2,0.0,70.8,5.04,0.0,0.0,6.0,0.0,10.2,0.0,8.5,0.0,8.4,0.0,8.4,3
12348.0,75,31,1797.24,0,0,0,4,0,0,0,3,3,1,601.0,0.0,0.0,269.0,0.0,0.0,0.0,0.0,217.0,0.0,0.0,1254.0,144.0,0.0,0.0,96.0,0.0,0.0,0.0,0.0,120.0,0.0,0.0,144.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,6.0,227.44,0.0,0.0,367.0,0.0,0.0,0.0,0.0,310.0,0.0,0.0,892.8,41.76,0.0,0.0,120.0,0.0,0.0,0.0,0.0,150.0,0.0,0.0,240.0,20.4,0.0,0.0,17.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,13.2,3
12349.0,18,73,1757.55,0,0,0,1,0,0,0,2,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,631.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1757.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.64,0.0,3
12350.0,310,17,334.4,0,0,0,1,0,0,0,4,4,3,0.0,197.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,334.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5


In [5]:
# La target correspond à la colonne du cluster déterminé précédemment
y = df.CustomerCategory

# Gardons comme données toutes les colonnes sauf la target bien sûr
X = df.copy()
del X['CustomerCategory']

# On sépare les données en jeu d'entrainement et jeu de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

# On crée avec un scaler entre 0 et 1  qu'on entraine avec le jeu d'entrainement
scaler = MinMaxScaler().fit(X_train)

# Le scaler est ensuite enregistré pour être réutilisé dans le code final
save_sklearn_obj(scaler, 'model_scaler')

# On normalise les données
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print(X_train.shape)
print(X_test.shape)

obj/model_scaler.pkl saved
(2929, 85)
(1443, 85)


## Méthodes unitaires - Recherche des meilleurs paramètres

### LogisticRegression

In [6]:
parametres = {
    'C': np.logspace(-3, 3, 7),
    'penalty':['l1','l2']}

gs = GridSearchCV(LogisticRegression(), param_grid=parametres, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("Score = %.3f" % gs.score(X_test, y_test))
print ("Best params = %s" % gs.best_params_)

Score = 0.993
Best params = {'C': 1.0, 'penalty': 'l2'}


On a déjà un très bon score avec la régression logistic, voyons quand même si on peut faire mieux

### SVC

In [7]:
parametres = {
    'kernel': ['linear','rbf','sigmoid'],
    'C': np.logspace(-3, 4, 8)}

gs = GridSearchCV(SVC(kernel='linear', probability=True, max_iter=500000), param_grid=parametres, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("Score = %.3f" % gs.score(X_test, y_test))
print ("Best params = %s" % gs.best_params_)

Score = 0.996
Best params = {'C': 1000.0, 'kernel': 'rbf'}


C'est encore meilleur avec une SVM, allez on sauvegarde et on fait encore un essai avec un GradientBoostingClassifier

In [8]:
save_sklearn_obj(gs.best_estimator_, 'model_SVC')

obj/model.pkl saved


### GradientBoostingClassifier

In [8]:
parametres = {
    "learning_rate":[0.3,0.2,0.1,0.05,0.01],
    "max_depth":[2,3,4,5,6],
    "subsample":[1.0,0.8,0.5],
    "max_features":[None,'sqrt','log2']}

gs = GridSearchCV(GradientBoostingClassifier(), param_grid=parametres, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("Score = %.3f" % gs.score(X_test, y_test))
print ("Best params = %s" % gs.best_params_)


Score = 0.997
Best params = {'learning_rate': 0.2, 'max_depth': 4, 'max_features': None, 'subsample': 1.0}


Le GradientBoostingClassifier donne aussi un excellent score, je crois qu'on peut l'utiliser dans notre code final car maintenant il risque de falloir pas mal d'efforts pour gagner un millième.

On saugarde donc le modèle, mais on va aussi en tester d'autres par principe

In [8]:
save_sklearn_obj(gs.best_estimator_, 'model_GradientBoostingClassifier')

obj/model.pkl saved


## Méthode de recherche sur plusieurs classifieurs

In [9]:
def RunModel(model, label, params={}):
    print (label)
    gs = GridSearchCV(model, param_grid=params, cv=5, scoring='accuracy')
    res = gs.fit(X_train, y_train)

    # On affiche le score
    print ("\tScore = %.3f" % res.best_score_)

    sc = gs.score(X_test, y_test)
    print ("\tScore jeu de test = %.3f" % sc)
    if len(gs.best_params_) > 2:
        print ("\tBest params = %s" % gs.best_params_)

    if sc > 0.98:
        # On ajoute l'estimateur à la liste des votant pour le VotingClassifier
        print('\tAjouté au vote !')
        weights.append(1)
        estimators.append((label, gs))
        
    return gs

In [10]:
weights = []
estimators = []

# Méthode naive par la classe la plus fréquente pour comparer
RunModel(dummy.DummyClassifier(strategy='most_frequent'), 'Méthode naive')

RunModel(GaussianNB(), 'GaussianNB')

RunModel(KNeighborsClassifier(), 'KNeighborsClassifier')

parametres = {
    'C': np.logspace(-3, 3, 7),
    'penalty':['l1','l2']}
RunModel(LogisticRegression(), 'LogisticRegression', parametres)

RunModel(BaggingClassifier(n_estimators=100, max_samples=0.5, max_features=0.5), 'BaggingClassifier')

RunModel(RandomForestClassifier(n_estimators=100), 'RandomForestClassifier')

RunModel(ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0), 'ExtraTreesClassifier')

parametres = {
    'kernel': ['linear','rbf','sigmoid'],
    'C': np.logspace(-3, 4, 8)}
RunModel(SVC(probability=True, max_iter=500000, verbose=0), 'LinearSVC', parametres)

RunModel(AdaBoostClassifier(n_estimators=100), 'AdaBoostClassifier')

parametres = {
    "learning_rate":[0.3,0.2,0.1,0.05,0.01],
    "max_depth":[2,3,4,5,6],
    "subsample":[1.0,0.8,0.5],
    "max_features":[None,'sqrt','log2']}
RunModel(GradientBoostingClassifier(n_estimators=100, random_state=0), 'GradientBoostingClassifier', parametres)

RunModel(XGBClassifier(), 'XGBClassifier')

Méthode naive
	Score = 0.284
	Score jeu de test = 0.284
GaussianNB
	Score = 0.723
	Score jeu de test = 0.726
KNeighborsClassifier
	Score = 0.995
	Score jeu de test = 0.993
	Ajouté au vote !
LogisticRegression
	Score = 0.992
	Score jeu de test = 0.993
	Ajouté au vote !
BaggingClassifier
	Score = 0.984
	Score jeu de test = 0.980
RandomForestClassifier
	Score = 0.984
	Score jeu de test = 0.990
	Ajouté au vote !
ExtraTreesClassifier
	Score = 0.992
	Score jeu de test = 0.993
	Ajouté au vote !
LinearSVC
	Score = 0.998
	Score jeu de test = 0.999
	Ajouté au vote !
AdaBoostClassifier
	Score = 0.565
	Score jeu de test = 0.559
GradientBoostingClassifier
	Score = 0.997
	Score jeu de test = 0.996
	Best params = {'learning_rate': 0.2, 'max_depth': 4, 'max_features': None, 'subsample': 1.0}
	Ajouté au vote !
XGBClassifier
	Score = 0.997
	Score jeu de test = 0.995
	Ajouté au vote !


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [12]:
estimators

[('KNeighborsClassifier', GridSearchCV(cv=5, error_score='raise',
         estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
             metric_params=None, n_jobs=1, n_neighbors=5, p=2,
             weights='uniform'),
         fit_params=None, iid=True, n_jobs=1, param_grid={},
         pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
         scoring='accuracy', verbose=0)),
 ('LogisticRegression', GridSearchCV(cv=5, error_score='raise',
         estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False),
         fit_params=None, iid=True, n_jobs=1,
         param_grid={'C': array([  1.00000e-03,   1.00000e-02,   1.00000e-01,   1.00000e+00,
           1.00000e+01,   1.00000e+02,   1.00000e+03]), 'penalty': ['l1', 'l

In [None]:
# Aggrégation de modèles
# On va maintenant utiliser l'ensemble des estimateurs précédents pour faire un choix à la majorité
eclf = VotingClassifier(estimators=estimators, voting='soft', weights=weights)

gs = GridSearchCV(eclf, param_grid={}, cv=5)
gs.fit(X_train, y_train)

# On affiche le score
print ("\tScore = %.3f" % gs.score(X_test, y_test))
print ("\tBest params = %s" % gs.best_params_)

eclf = RunModel(eclf, 'VotingClassifier')

	Score = 0.998
	Best params = {}
VotingClassifier


In [None]:
# Ce modèle, ou plutôt cette aggégation de modèles est le meilleur, sauvegarde !
save_sklearn_obj(eclf, 'model_VotingClassifier')