In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

We can follow 2 different strategies:
- We can try to predict the churn rate (numbers from 0 to 5)
- We can group the churn rate by No risk [0, 1], Low risk [2,3], High risk [4,5]

We will train different models following both strategies and depending on the results, we will choose one approach or the other for the final model

### Load data

In [2]:
X_train = pd.read_csv('../data/processed/outliers_99/X_train.csv', index_col=0).reset_index().drop(['index'], axis=1)
X_val = pd.read_csv('../data/processed/outliers_99/X_val.csv', index_col=0).reset_index().drop(['index'], axis=1)
y_train = pd.read_csv('../data/processed/outliers_99/y_train.csv', index_col=0).reset_index().drop(['index'], axis=1)
y_val = pd.read_csv('../data/processed/outliers_99/y_val.csv', index_col=0).reset_index().drop(['index'], axis=1)

In [3]:
X_train.head()

Unnamed: 0,age,joined_through_referral,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,...,feedback_User Friendly Website,feedback_categroy_Negative,feedback_categroy_Neutral,feedback_categroy_Positive,membership_category_Basic Membership,membership_category_Gold Membership,membership_category_No Membership,membership_category_Platinum Membership,membership_category_Premium Membership,membership_category_Silver Membership
0,0.0,1,0.872,0.421806,0.024991,0.161563,0.0,0,1,1,...,0,1,0,0,True,False,False,False,False,False
1,0.759259,1,0.856,0.43907,0.025924,0.191868,0.384211,1,0,1,...,0,0,0,1,False,False,False,True,False,False
2,0.981481,1,0.0,0.41976,0.054324,0.212072,0.293765,0,1,1,...,0,1,0,0,False,False,True,False,False,False
3,0.611111,0,0.968,0.42314,0.190213,0.121157,0.311522,1,0,0,...,0,0,1,0,False,False,False,False,False,True
4,0.37037,0,0.944,0.42194,0.173387,0.161563,0.100691,0,1,1,...,0,0,1,0,False,False,False,False,False,True


In [4]:
y_train.head()

Unnamed: 0,churn_risk_score
0,5
1,2
2,4
3,4
4,3


In [5]:
y_train[y_train == -1] = 0
y_val[y_val==-1] = 0

In [6]:
y_train.value_counts()

churn_risk_score
3                   6254
4                   6111
5                   5896
2                   1645
1                   1591
0                    697
Name: count, dtype: int64

In [7]:
# For Grid Search
X = pd.concat([X_train, X_val])
y = pd.concat([y_train, y_val])

In [8]:
X

Unnamed: 0,age,joined_through_referral,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,...,feedback_User Friendly Website,feedback_categroy_Negative,feedback_categroy_Neutral,feedback_categroy_Positive,membership_category_Basic Membership,membership_category_Gold Membership,membership_category_No Membership,membership_category_Platinum Membership,membership_category_Premium Membership,membership_category_Silver Membership
0,0.000000,1,0.872,0.421806,0.024991,0.161563,0.000000,0,1,1,...,0,1,0,0,True,False,False,False,False,False
1,0.759259,1,0.856,0.439070,0.025924,0.191868,0.384211,1,0,1,...,0,0,0,1,False,False,False,True,False,False
2,0.981481,1,0.000,0.419760,0.054324,0.212072,0.293765,0,1,1,...,0,1,0,0,False,False,True,False,False,False
3,0.611111,0,0.968,0.423140,0.190213,0.121157,0.311522,1,0,0,...,0,0,1,0,False,False,False,False,False,True
4,0.370370,0,0.944,0.421940,0.173387,0.161563,0.100691,0,1,1,...,0,0,1,0,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7394,0.055556,0,0.928,0.570248,0.012901,0.100954,0.376657,1,0,0,...,0,1,0,0,False,False,False,False,False,True
7395,0.481481,1,0.864,0.453696,0.105664,0.151462,0.453183,0,1,0,...,0,1,0,0,True,False,False,False,False,False
7396,0.185185,0,0.872,0.468768,0.170820,0.262580,0.324624,1,1,0,...,0,1,0,0,False,False,True,False,False,False
7397,0.833333,0,0.000,0.465638,0.231195,0.090852,0.286776,1,1,1,...,0,1,0,0,False,False,True,False,False,False


## First strategy
Predicting churn rate

#### 1.1 Find the best hyperparameters for different models

In [9]:
# Random forest
param_grid = {
    'n_estimators': [ 300, 400, 500],
    'max_depth': [ 11, 13, 15],
    'min_samples_split': [9, 11, 13, 15, 17],
}

gs_rf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    n_jobs=-1, 
    cv = StratifiedKFold(4, random_state=42, shuffle=True),
    verbose=1,
    scoring='accuracy')

gs_rf.fit(X, y)

Fitting 4 folds for each of 45 candidates, totalling 180 fits


  return fit_method(estimator, *args, **kwargs)


In [10]:
gs_rf.best_estimator_

In [11]:
gs_rf.best_score_

np.float64(0.7599767844167363)

In [12]:
results_df = pd.DataFrame(gs_rf.cv_results_)
sorted_results = results_df.sort_values(by='rank_test_score')
top_4_parameters = sorted_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(4)
top_4_parameters

Unnamed: 0,params,mean_test_score,std_test_score,rank_test_score
14,"{'max_depth': 11, 'min_samples_split': 17, 'n_...",0.759977,0.005265,1
11,"{'max_depth': 11, 'min_samples_split': 15, 'n_...",0.759842,0.005846,2
13,"{'max_depth': 11, 'min_samples_split': 17, 'n_...",0.759842,0.005401,3
26,"{'max_depth': 13, 'min_samples_split': 15, 'n_...",0.75947,0.005625,4


In [13]:
top_4_parameters.iloc[0, 0]

{'max_depth': 11, 'min_samples_split': 17, 'n_estimators': 500}

In [14]:
top_4_parameters.iloc[3, 0]

{'max_depth': 13, 'min_samples_split': 15, 'n_estimators': 500}

In [15]:
best_rf = gs_rf.best_estimator_
with open('../model/strategy1/outliers_99/best_rf.pkl', 'wb') as file:
    pickle.dump(best_rf, file)

In [16]:
# LogisticRegression

param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet'], 
    'C': [ 0.01, 0.1, 1, 10],  
    'solver': ['liblinear', 'saga'],     
    'class_weight': [None, 'balanced'] 
}

gs_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    n_jobs=-1, 
    cv = StratifiedKFold(4, random_state=42, shuffle=True),
    verbose=1,
    scoring='accuracy')

gs_lr.fit(X, y)

Fitting 4 folds for each of 48 candidates, totalling 192 fits


64 fits failed out of a total of 192.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.pe

In [17]:
gs_lr.best_estimator_

In [18]:
gs_lr.best_score_

np.float64(0.6656981495164895)

In [19]:
best_lr = gs_lr.best_estimator_
with open('../model/strategy1/outliers_99/best_lr.pkl', 'wb') as file:
    pickle.dump(best_lr, file)

In [20]:
# KNNeighbours
param_grid = {
    'n_neighbors': [ 19, 21, 23, 25], 
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'], 
    'p': [1, 2], # 1 for Manhattan, 2 for Euclidean
}

gs_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    n_jobs=-1, 
    cv = StratifiedKFold(4, random_state=42, shuffle=True),
    verbose=1,
    scoring='accuracy')

gs_knn.fit(X, y)

Fitting 4 folds for each of 48 candidates, totalling 192 fits


  return self._fit(X, y)


In [21]:
gs_knn.best_score_

np.float64(0.6276144729011954)

In [22]:
best_knn = gs_knn.best_estimator_
with open('../model/strategy1/outliers_99/best_knn.pkl', 'wb') as file:
    pickle.dump(best_knn, file)

In [23]:
# XGBClassifier
param_grid_xgb = {
    'n_estimators': [100, 200, 300], 
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],          
    'min_child_weight': [1, 5],   
    'subsample': [0.8, 1.0],          
    'colsample_bytree': [0.8, 1.0],  
    'reg_lambda': [1, 10]  }       

gs_xgb = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=param_grid_xgb,
    cv= StratifiedKFold(4, random_state=42, shuffle=True),
    n_jobs=-1,                     
    verbose=2,                     
    scoring='accuracy'           
)      

gs_xgb.fit(X, y)

Fitting 4 folds for each of 432 candidates, totalling 1728 fits


In [24]:
gs_xgb.best_score_

np.float64(0.7661608242873911)

In [25]:
best_xgb = gs_xgb.best_estimator_
with open('../model/strategy1/outliers_99/best_xgb.pkl', 'wb') as file:
    pickle.dump(best_xgb, file)

In [26]:
# SVC
param_grid_linearsvc = {
        'C': [0.01, 0.1, 1, 10],  
        'loss': ['hinge', 'squared_hinge'],
        'penalty': ['l1', 'l2'],       
        'dual': [True, False],             
    }

gs_lsvc = GridSearchCV(
    estimator=LinearSVC(),
    param_grid=param_grid_linearsvc,
    cv= StratifiedKFold(4, random_state=42, shuffle=True),
    n_jobs=-1,                     
    verbose=2,                     
    scoring='accuracy'           
)      

gs_lsvc.fit(X, y)

Fitting 4 folds for each of 32 candidates, totalling 128 fits


64 fits failed out of a total of 128.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Pablo\miniconda3\envs\data_analysis_env\Lib\site-packages\sklearn\svm\_classes.py", line 321, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(

In [27]:
gs_lsvc.best_score_

np.float64(0.6559659903771803)

In [28]:
best_lsvc = gs_lsvc.best_estimator_
with open('../model/strategy1/outliers_99/best_lsvc.pkl', 'wb') as file:
    pickle.dump(best_lsvc, file)

In [29]:
gs_rf.best_estimator_

In [30]:
# Voting classifier
print(gs_rf.best_params_, gs_lr.best_params_, gs_xgb.best_params_, gs_knn.best_params_, gs_lsvc.best_params_)

{'max_depth': 11, 'min_samples_split': 17, 'n_estimators': 500} {'C': 0.1, 'class_weight': None, 'penalty': 'l1', 'solver': 'saga'} {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'reg_lambda': 10, 'subsample': 1.0} {'algorithm': 'auto', 'n_neighbors': 25, 'p': 1, 'weights': 'uniform'} {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}


In [31]:
# Voting system 
estimators_hard = [
    ('rf', RandomForestClassifier(max_depth= 13, min_samples_split= 15, n_estimators= 400)),
    ('lr', LogisticRegression(C= 0.1, class_weight= None, penalty= 'l1', solver= 'saga')),
    ('xgb', XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.05, max_depth= 3, min_child_weight= 1, n_estimators= 100, reg_lambda= 1, subsample= 0.8)),
    ('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 21, p= 1, weights= 'uniform')),
    ('lsvc', LinearSVC(C= 0.1, dual= False, loss= 'squared_hinge', penalty= 'l1'))
]

estimators_soft = [
    ('rf', RandomForestClassifier(max_depth= 13, min_samples_split= 15, n_estimators= 400)),
    ('lr', LogisticRegression(C= 0.1, class_weight= None, penalty= 'l1', solver= 'saga')),
    ('xgb', XGBClassifier(colsample_bytree= 0.8, learning_rate= 0.05, max_depth= 3, min_child_weight= 1, n_estimators= 100, reg_lambda= 1, subsample= 0.8)),
    ('knn', KNeighborsClassifier(algorithm= 'auto', n_neighbors= 21, p= 1, weights= 'uniform'))
]

voting_hard = VotingClassifier(
    estimators=estimators_hard,
    voting='hard',
    n_jobs=-1)

voting_soft = VotingClassifier(
    estimators=estimators_soft,
    voting='soft',
    n_jobs=-1)

In [32]:
voting_hard.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [33]:
with open('../model/strategy1/outliers_99/best_hard.pkl', 'wb') as file:
    pickle.dump(voting_hard, file)

In [34]:
voting_soft.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [35]:
with open('../model/strategy1/outliers_99/best_soft.pkl', 'wb') as file:
    pickle.dump(voting_soft, file)

In [36]:
y_pred = voting_hard.predict(X_val)
acc_hard = accuracy_score(y_true=y_val, y_pred=y_pred)
accuracy_score(y_true=y_val, y_pred=y_pred)
acc_hard

0.7077983511285308

In [37]:
y_pred = voting_soft.predict(X_val)
acc_soft = accuracy_score(y_true=y_val, y_pred=y_pred)
acc_soft

0.7688876875253413

In [38]:
my_dict = {'model': ['rf', 'lr', 'xgb', 'knn', 'svc', 'hard', 'soft'],
           'accuracy': [gs_rf.best_score_, gs_lr.best_score_, gs_xgb.best_score_, gs_knn.best_score_, gs_lsvc.best_score_, acc_hard, acc_soft]}

pd.DataFrame(my_dict)

Unnamed: 0,model,accuracy
0,rf,0.759977
1,lr,0.665698
2,xgb,0.766161
3,knn,0.627614
4,svc,0.655966
5,hard,0.707798
6,soft,0.768888


In [39]:
# The bes model is the soft voting. We decide to train it with all the data
voting_soft = VotingClassifier(
    estimators=estimators_soft,
    voting='soft',
    n_jobs=-1)

voting_soft.fit(X, y)

with open('../model/strategy1/outliers_99/best_all.pkl', 'wb') as file:
    pickle.dump(voting_soft, file)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
