# Model Training – Customer Churn Prediction

This notebook trains a Support Vector Machine (SVM) classifier to predict customer churn.
The trained model is saved for later evaluation and inference.


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import joblib 
import warnings 
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("../data/processed/churn_cleaned.csv")
X = pd.read_csv("../data/processed/X.csv")
y = pd.read_csv("../data/processed/y.csv")


## Train-Test Split

The dataset is split into training and testing sets to evaluate model generalization.


In [4]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [5]:
X

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
0,49,0,4,88.35
1,43,0,0,36.67
2,51,1,2,63.79
3,60,1,8,102.34
4,42,0,32,69.01
...,...,...,...,...
995,42,0,41,37.14
996,62,0,9,80.93
997,51,1,15,111.72
998,39,0,68,65.67


## Model Pipeline

A pipeline is used to standardize features and train an SVM classifier.
This ensures consistent preprocessing and avoids data leakage.


In [6]:
model = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC(kernel="rbf", probability=True, random_state=42))
])

model.fit(X_train, y_train)



0,1,2
,steps,"[('scaler', ...), ('svc', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [7]:
scaler= StandardScaler()

In [8]:
X_train = scaler.fit_transform(X_train)

In [9]:
joblib.dump(scaler,"scaler.pkl")

['scaler.pkl']

In [10]:
X_test = scaler.fit_transform(X_test)

In [11]:
def modelperformance(predictions):
    print("Accuracy score on model is {}".format(accuracy_score(y_test,predictions)))

In [12]:
log_model = LogisticRegression()

In [13]:
log_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [14]:
X

Unnamed: 0,Age,Gender,Tenure,MonthlyCharges
0,49,0,4,88.35
1,43,0,0,36.67
2,51,1,2,63.79
3,60,1,8,102.34
4,42,0,32,69.01
...,...,...,...,...
995,42,0,41,37.14
996,62,0,9,80.93
997,51,1,15,111.72
998,39,0,68,65.67


Predictions for the X test

In [15]:
log_model.predict(X_test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [16]:
y_pred = log_model.predict(X_test)

In [17]:
modelperformance(y_pred)

Accuracy score on model is 0.865


### K-Neighbors Classifier

In [18]:
param_grid = {
    "n_neighbors" : [3,5,7,9],
    "weights" : ["uniform","distance"],
}

In [19]:
gridkn = GridSearchCV(KNeighborsClassifier(),param_grid, cv = 5)

In [20]:
gridkn.fit(X_train,y_train)

0,1,2
,estimator,KNeighborsClassifier()
,param_grid,"{'n_neighbors': [3, 5, ...], 'weights': ['uniform', 'distance']}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,9
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [21]:
gridkn.best_params_

{'n_neighbors': 9, 'weights': 'distance'}

In [22]:
y_pred = gridkn.predict(X_test)

In [23]:
modelperformance(y_pred)

Accuracy score on model is 0.87


Here, we understand that Logistic Regression performs better than KNeighbors Classifiers

### Support Vector Machines

In [24]:
model = SVC(probability=True)


In [25]:
param_grid = {
    "C" : [0.01,0.1,0.5,1],
    "kernel" : ["linear","rbf","poly"]
}

In [26]:
gridsvc = GridSearchCV(model, param_grid,cv = 5)

In [27]:
gridsvc.fit(X_train,y_train)

0,1,2
,estimator,SVC(probability=True)
,param_grid,"{'C': [0.01, 0.1, ...], 'kernel': ['linear', 'rbf', ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,0.01
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [28]:
import joblib
joblib.dump(model, "../models/model.pkl")


['../models/model.pkl']

In [29]:
model


0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [30]:
gridsvc.best_params_

{'C': 0.01, 'kernel': 'linear'}

In [31]:
y_pred = gridsvc.predict(X_test)

In [32]:
modelperformance(y_pred)

Accuracy score on model is 0.865


Here,SVC leads on accuracy after Logistic Regression.

### Decision Tree Classifier


In [33]:
param_grid = {
    "criterion" : ["gini","entropy"],
    "splitter" : ["best","random"],
    "max_depth" : [None, 10, 20, 30],
    "min_samples_split" :[2,5,10],
    "min_samples_leaf" : [1,2,4]
}

In [34]:
grid_tree = GridSearchCV(DecisionTreeClassifier(),param_grid,cv=5)

In [35]:
grid_tree.fit(X_train,y_train)

0,1,2
,estimator,DecisionTreeClassifier()
,param_grid,"{'criterion': ['gini', 'entropy'], 'max_depth': [None, 10, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'entropy'
,splitter,'random'
,max_depth,10
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [36]:
grid_tree.best_params_

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'splitter': 'random'}

In [37]:
y_pred = grid_tree.predict(X_test)

In [38]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1])

In [39]:
modelperformance(y_pred)

Accuracy score on model is 0.865


Random Forest

In [40]:
rfc_model = RandomForestClassifier()

In [41]:
param_grid = {
    "n_estimators": [32,64,128,256],
    "max_features" : [2,3,4],
    "bootstrap" : [True,False]
}

In [42]:
grid_rfc = GridSearchCV(rfc_model, param_grid, cv = 5)

In [43]:
grid_rfc.fit(X_train,y_train)

0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'bootstrap': [True, False], 'max_features': [2, 3, ...], 'n_estimators': [32, 64, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,128
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,2
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [44]:
grid_rfc.best_params_

{'bootstrap': True, 'max_features': 2, 'n_estimators': 128}

In [45]:
y_pred = grid_rfc.predict(X_test)

In [46]:
modelperformance(y_pred)

Accuracy score on model is 0.87


Best performning one is Support Vector Classifier

In [47]:
best_model = gridsvc.best_estimator_

In [48]:
joblib.dump(best_model,"model.pkl")

['model.pkl']

In [49]:
pip freeze > requirements.txt


Note: you may need to restart the kernel to use updated packages.
