In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

We can follow 2 different strategies:
- We can try to predict the churn rate (numbers from 0 to 5)
- We can group the churn rate by No risk [0, 1], Low risk [2,3], High risk [4,5]

We will train different models following both strategies and depending on the results, we will choose one approach or the other for the final model

### Load data

In [7]:
X_train = pd.read_csv('../data/processed/outliers_999/X_train.csv', index_col=0).reset_index().drop(['index'], axis=1)
X_val = pd.read_csv('../data/processed/outliers_999/X_val.csv', index_col=0).reset_index().drop(['index'], axis=1)
y_train = pd.read_csv('../data/processed/outliers_999/y_train.csv', index_col=0).reset_index().drop(['index'], axis=1)
y_val = pd.read_csv('../data/processed/outliers_999/y_val.csv', index_col=0).reset_index().drop(['index'], axis=1)

In [8]:
X_train.head()

Unnamed: 0,age,membership_category,joined_through_referral,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,...,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website,feedback_categroy_Negative,feedback_categroy_Neutral,feedback_categroy_Positive
0,0.0,1,1,0.98439,0.421806,0.024991,0.01601,0.0,0,1,...,1,0,0,0,0,0,0,1,0,0
1,0.759259,5,1,0.982439,0.43907,0.025924,0.019013,0.384211,1,0,...,0,0,0,0,1,0,0,0,0,1
2,0.981481,0,1,0.0,0.41976,0.054324,0.021015,0.293765,0,1,...,1,0,0,0,0,0,0,1,0,0
3,0.611111,3,0,0.996098,0.42314,0.190213,0.012006,0.311522,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0.37037,3,0,0.993171,0.42194,0.173387,0.01601,0.100691,0,1,...,0,0,0,0,0,0,0,0,1,0


In [9]:
y_train.head()

Unnamed: 0,churn_risk_score
0,5
1,2
2,4
3,4
4,3


In [10]:
y_train[y_train == -1] = 0
y_val[y_val==-1] = 0

In [11]:
y_train.value_counts()

churn_risk_score
3                   6254
4                   6111
5                   5896
2                   1645
1                   1591
0                    697
Name: count, dtype: int64

In [12]:
# For Grid Search
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])

In [13]:
X

Unnamed: 0,age,membership_category,joined_through_referral,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,...,feedback_Poor Product Quality,feedback_Poor Website,feedback_Products always in Stock,feedback_Quality Customer Care,feedback_Reasonable Price,feedback_Too many ads,feedback_User Friendly Website,feedback_categroy_Negative,feedback_categroy_Neutral,feedback_categroy_Positive
0,0.000000,1,1,0.984390,0.421806,0.024991,0.016010,0.000000,0,1,...,1,0,0,0,0,0,0,1,0,0
1,0.759259,5,1,0.982439,0.439070,0.025924,0.019013,0.384211,1,0,...,0,0,0,0,1,0,0,0,0,1
2,0.981481,0,1,0.000000,0.419760,0.054324,0.021015,0.293765,0,1,...,1,0,0,0,0,0,0,1,0,0
3,0.611111,3,0,0.996098,0.423140,0.190213,0.012006,0.311522,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0.370370,3,0,0.993171,0.421940,0.173387,0.016010,0.100691,0,1,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7394,0.055556,3,0,0.991220,0.570248,0.012901,0.010004,0.376657,1,0,...,0,0,0,0,0,1,0,1,0,0
7395,0.481481,1,1,0.983415,0.453696,0.105664,0.015009,0.453183,0,1,...,0,1,0,0,0,0,0,1,0,0
7396,0.185185,0,0,0.984390,0.468768,0.170820,0.026020,0.324624,1,1,...,0,0,0,0,0,0,0,1,0,0
7397,0.833333,0,0,0.000000,0.465638,0.231195,0.009003,0.286776,1,1,...,0,0,0,0,0,0,0,1,0,0


## First strategy
Predicting churn rate

#### 1.1 Find the best hyperparameters for different models

In [14]:
# Random forest
param_grid = {
    'n_estimators': [200, 250, 300, 400, 500],
    'max_depth': [ 9, 11, 13, 15, 17],
    'min_samples_split': [ 5, 7, 9, 11, 13, 15],
}

gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    n_jobs=-1, 
    cv = StratifiedKFold(4, random_state=42, shuffle=True),
    verbose=1,
    scoring='accuracy')

gs_rf.fit(X, y)

Fitting 4 folds for each of 150 candidates, totalling 600 fits


  return fit_method(estimator, *args, **kwargs)


In [15]:
gs_rf.best_estimator_

In [16]:
gs_rf.best_score_

np.float64(0.7599431056804218)

In [17]:
results_df = pd.DataFrame(gs_rf.cv_results_)
sorted_results = results_df.sort_values(by='rank_test_score')
top_4_parameters = sorted_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(4)
top_4_parameters

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
67,"{'max_depth': 13, 'min_samples_split': 7, 'n_e...",0.759943,0.006686,1
81,"{'max_depth': 13, 'min_samples_split': 13, 'n_...",0.75974,0.004842,2
78,"{'max_depth': 13, 'min_samples_split': 11, 'n_...",0.75947,0.005485,3
64,"{'max_depth': 13, 'min_samples_split': 5, 'n_e...",0.759402,0.0057,4


In [18]:
top_4_parameters.iloc[0, 0]

{'max_depth': 13, 'min_samples_split': 7, 'n_estimators': 300}

In [19]:
top_4_parameters.iloc[3, 0]

{'max_depth': 13, 'min_samples_split': 5, 'n_estimators': 500}

In [20]:
best_rf = gs_rf.best_estimator_
with open('../model/strategy1/outliers_999/best_rf.pkl', 'wb') as file:
    pickle.dump(best_rf, file)

In [21]:
# LogisticRegression

param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'], 
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'solver': ['liblinear', 'saga'],     
    'class_weight': [None, 'balanced'] 
}

gs_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    n_jobs=-1, 
    cv = StratifiedKFold(4, random_state=42, shuffle=True),
    verbose=1,
    scoring='accuracy')

gs_lr.fit(X, y)

Fitting 4 folds for each of 72 candidates, totalling 288 fits


96 fits failed out of a total of 288.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.pe

In [22]:
gs_lr.best_estimator_

In [23]:
gs_lr.best_score_

np.float64(0.5772988473304062)

In [24]:
best_lr = gs_lr.best_estimator_
with open('../model/strategy1/outliers_999/best_lr.pkl', 'wb') as file:
    pickle.dump(best_lr, file)

In [25]:
# KNNeighbours
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21], 
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'], 
    'p': [1, 2], # 1 for Manhattan, 2 for Euclidean
}

gs_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    n_jobs=-1, 
    cv = StratifiedKFold(4, random_state=42, shuffle=True),
    verbose=1,
    scoring='accuracy')

gs_knn.fit(X, y)

Fitting 4 folds for each of 120 candidates, totalling 480 fits


  return self._fit(X, y)


In [26]:
gs_knn.best_score_

np.float64(0.6220727587856012)

In [27]:
best_knn = gs_knn.best_estimator_
with open('../model/strategy1/outliers_999/best_knn.pkl', 'wb') as file:
    pickle.dump(best_knn, file)

In [28]:
# XGBClassifier
param_grid_xgb = {
    # 1. Number of estimators and learning rate are often tuned together.
    # Keep n_estimators moderate for the initial search.
    'n_estimators': [100, 200, 300], 
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],          
    'min_child_weight': [1, 5],   
    'subsample': [0.8, 1.0],          
    'colsample_bytree': [0.8, 1.0],  
    'reg_lambda': [1, 10]  }       

gs_xgb = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid_xgb,
    cv= StratifiedKFold(4, random_state=42, shuffle=True),
    n_jobs=-1,                     
    verbose=2,                     
    scoring='accuracy'           
)      

gs_xgb.fit(X, y)

Fitting 4 folds for each of 432 candidates, totalling 1728 fits


In [29]:
gs_xgb.best_score_

np.float64(0.7653834912845057)

In [30]:
best_xgb = gs_xgb.best_estimator_
with open('../model/strategy1/outliers_999/best_xgb.pkl', 'wb') as file:
    pickle.dump(best_xgb, file)

In [31]:
# SVC
param_grid_linearsvc = {
        'C': [0.01, 0.1, 1, 10],  
        'loss': ['hinge', 'squared_hinge'],
        'penalty': ['l1', 'l2'],       
        'dual': [True, False],             
    }

gs_lsvc = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=param_grid_linearsvc,
    cv= StratifiedKFold(4, random_state=42, shuffle=True),
    n_jobs=-1,                     
    verbose=2,                     
    scoring='accuracy'           
)      

gs_lsvc.fit(X, y)

Fitting 4 folds for each of 32 candidates, totalling 128 fits


64 fits failed out of a total of 128.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\svm\_classes.py", line 321, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(

In [32]:
gs_lsvc.best_score_

np.float64(0.6260602225131364)

In [33]:
best_lsvc = gs_lsvc.best_estimator_
with open('../model/strategy1/outliers_999/best_lsvc.pkl', 'wb') as file:
    pickle.dump(best_lsvc, file)

In [34]:
gs_rf.best_estimator_

In [35]:
# Voting classifier
print(gs_rf.best_params_, gs_lr.best_params_, gs_xgb.best_params_, gs_knn.best_params_, gs_lsvc.best_params_)

{'max_depth': 13, 'min_samples_split': 7, 'n_estimators': 300} {'C': 0.001, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'liblinear'} {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'reg_lambda': 1, 'subsample': 1.0} {'algorithm': 'auto', 'n_neighbors': 21, 'p': 1, 'weights': 'uniform'} {'C': 0.1, 'dual': True, 'loss': 'hinge', 'penalty': 'l2'}


In [36]:
# Voting system 
estimators_hard = [
    ('rf', RandomForestClassifier(max_depth= 15, min_samples_split= 7, n_estimators= 400)),
    ('lr', LogisticRegression(C= 0.001, class_weight= 'balanced', penalty= 'l2', solver= 'liblinear')),
    ('xgb', XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.05, max_depth= 3, min_child_weight= 1, n_estimators= 100, reg_lambda= 1, subsample= 0.8)),
    ('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 21, p= 1, weights= 'uniform')),
    ('lsvc', LinearSVC(C= 0.1, dual= True, loss= 'hinge', penalty= 'l2'))
]

estimators_soft = [
    ('rf', RandomForestClassifier(max_depth= 15, min_samples_split= 7, n_estimators= 400)),
    ('lr', LogisticRegression(C= 0.001, class_weight= 'balanced', penalty= 'l2', solver= 'liblinear')),
    ('xgb', XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.05, max_depth= 3, min_child_weight= 1, n_estimators= 100, reg_lambda= 1, subsample= 0.8)),
    ('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 21, p= 1, weights= 'uniform'))
]

voting_hard = VotingClassifier(
    estimators=estimators_hard,
    voting='hard',
    n_jobs=-1)

voting_soft = VotingClassifier(
    estimators=estimators_soft,
    voting='soft',
    n_jobs=-1)

In [37]:
voting_hard.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [38]:
with open('../model/strategy1/outliers_999/best_hard.pkl', 'wb') as file:
    pickle.dump(voting_hard, file)

In [39]:
voting_soft.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [40]:
with open('../model/strategy1/outliers_999/best_soft.pkl', 'wb') as file:
    pickle.dump(voting_soft, file)

In [41]:
y_pred = voting_hard.predict(X_val)
acc_hard = accuracy_score(y_true=y_val, y_pred=y_pred)
accuracy_score(y_true=y_val, y_pred=y_pred)
acc_hard

0.6771185295310177

In [42]:
y_pred = voting_soft.predict(X_val)
acc_soft = accuracy_score(y_true=y_val, y_pred=y_pred)
acc_soft

0.7680767671306933

In [43]:
my_dict = {'model': ['rf', 'lr', 'xgb', 'knn', 'svc', 'hard', 'soft'],
           'accuracy': [gs_rf.best_score_, gs_lr.best_score_, gs_xgb.best_score_, gs_knn.best_score_, gs_lsvc.best_score_, acc_hard, acc_soft]}

pd.DataFrame(my_dict)

Unnamed: 0,model,accuracy
0,rf,0.759943
1,lr,0.577299
2,xgb,0.765383
3,knn,0.622073
4,svc,0.62606
5,hard,0.677119
6,soft,0.768077


In [44]:
# The bes model is the soft voting. We decide to train it with all the data
voting_soft = VotingClassifier(
    estimators=estimators_soft,
    voting='soft',
    n_jobs=-1)

voting_soft.fit(X, y)

with open('../model/strategy1/outliers_999/best_all.pkl', 'wb') as file:
    pickle.dump(voting_soft, file)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
