In [23]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel      
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

seg = pd.read_csv("data/predict_customer_segment.csv",encoding='latin1')
seg.head()

Unnamed: 0,customer id,channel,Nearest_distance_cosco,Net Sales,Age,children count,Transaction_date,Martial status,Region,Loyality_flag,...,Orders groceries,Orders_cerials,Orders_chokolates,Orders dentals,Orders cosmetics,Orders_ready_eat,Orders_braverage,Orders_frozen,Loyality_amt,segments
0,5102,Email,4,367941,58,4,2017-09-19,0,San Diego,0,...,6,10,1,6,3,7,4,0,335,6
1,3549,Email,33,171305,98,1,2017-02-17,0,Atlanta,0,...,7,8,0,3,5,4,10,8,446,5
2,5885,direct,28,173439,18,2,2017-06-06,0,TampaSt. Petersburg,0,...,5,5,8,9,3,7,5,0,49,6
3,7381,store,40,997988,87,3,2016-09-23,0,Seattle,1,...,5,0,8,1,2,7,7,7,280,3
4,2713,direct,5,898554,31,2,2017-01-20,0,Seattle,0,...,4,5,10,5,10,8,9,0,89,3


In [24]:
categorical = seg.select_dtypes(include='object').columns
numerical = seg.select_dtypes(exclude='object').columns
df = pd.concat([seg[numerical],pd.get_dummies(seg[categorical])],axis=1)
df.head()

Unnamed: 0,customer id,Nearest_distance_cosco,Net Sales,Age,children count,Martial status,Loyality_flag,coupon amount,Income,Orders groceries,...,Brands_ Shiseido Co,Brands_ Spectrum Brands Holdings,Brands_ Stanley Black & Decker,Brands_ Svenska Cellulosa AB SCA,Brands_ Swatch Group SA*,Brands_ Tyson Foods,Brands_ Unilever N,Brands_ VF Corp,Brands_ WH Group Ltd,Brands_ Whirlpool Corp
0,5102,4,367941,58,4,0,0,6,4628,6,...,0,0,0,0,0,0,0,0,0,0
1,3549,33,171305,98,1,0,0,43,4074,7,...,0,0,0,0,0,0,0,0,0,0
2,5885,28,173439,18,2,0,0,42,4910,5,...,0,0,0,0,0,0,0,0,0,0
3,7381,40,997988,87,3,0,1,38,3297,5,...,0,0,0,0,0,0,0,0,0,0
4,2713,5,898554,31,2,0,0,24,4940,4,...,0,0,0,0,0,0,0,0,0,0


In [25]:
X = df.drop(['customer id','segments'],axis=1)
y = df['segments']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=42,shuffle=True)

classifier = XGBClassifier(random_state=2)
classifier.fit(X_train,y_train)
y_pred = classifier.predict(X_test)
f1 = f1_score(y_test,y_pred,average='macro')
xgb_cr = classification_report(y_test,y_pred)
print(f1,"\n",xgb_cr)

0.1784048478126609 
               precision    recall  f1-score   support

           1       0.19      0.24      0.21       247
           2       0.19      0.10      0.14       288
           3       0.19      0.30      0.23       241
           4       0.13      0.13      0.13       231
           5       0.15      0.11      0.13       251
           6       0.23      0.22      0.23       242

    accuracy                           0.18      1500
   macro avg       0.18      0.19      0.18      1500
weighted avg       0.18      0.18      0.18      1500



In [27]:
parameters={'learning_rate':[0.1,0.15,0.2,0.25,0.3],
            'max_depth':range(1,3)}

grid_search = GridSearchCV(estimator=classifier,param_grid=parameters,n_jobs=-1,verbose=1)
grid_search.fit(X_train,y_train)
grid_predictions = grid_search.predict(X_test)
grid_f1 = f1_score(y_test,grid_predictions,average='macro')
report = classification_report(y_test,y_pred)
print(grid_f1,"\n",report)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.1min finished


0.1773890153382315 
 <function classification_report at 0x000001C3734BAB70>


In [29]:
model = RandomForestClassifier(random_state=2)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
f1 = f1_score(y_test,y_pred,average='macro')
report = classification_report(y_test,y_pred)
print(f1,"\n",xgb_cr)

0.16116249335524743 
               precision    recall  f1-score   support

           1       0.19      0.24      0.21       247
           2       0.19      0.10      0.14       288
           3       0.19      0.30      0.23       241
           4       0.13      0.13      0.13       231
           5       0.15      0.11      0.13       251
           6       0.23      0.22      0.23       242

    accuracy                           0.18      1500
   macro avg       0.18      0.19      0.18      1500
weighted avg       0.18      0.18      0.18      1500



In [47]:
parameters = {
    'n_estimators':[50,100,150,200],
    'max_depth':[10,20,50],
    'min_samples_leaf':[1,2]
}

grid_search = GridSearchCV(estimator=model,param_grid=parameters,n_jobs=-1,verbose=1)
grid_search.fit(X_train,y_train)
grid_predictions = grid_search.predict(X_test)
grid_f1 = f1_score(y_test,grid_predictions,average='macro')
report = classification_report(y_test,y_pred)
print(grid_f1,"\n",report)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   54.0s finished


0.1342877076449344 
               precision    recall  f1-score   support

           1       0.18      0.21      0.19       247
           2       0.16      0.10      0.13       288
           3       0.15      0.22      0.18       241
           4       0.16      0.17      0.17       231
           5       0.18      0.14      0.16       251
           6       0.15      0.14      0.14       242

    accuracy                           0.16      1500
   macro avg       0.16      0.16      0.16      1500
weighted avg       0.16      0.16      0.16      1500



In [48]:
grid_search.best_params_

{'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 100}