In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action = 'ignore')
plt.style.use(['seaborn-bright','dark_background'])

In [5]:
data = pd.read_csv('churn_prediction_simple.csv')
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,0,0.0,0,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,6,2531,42,0,2.0,0,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1
2,7,263,42,1,0.0,0,1096.0,2,1666,60.0,...,16059.34,15211.29,13798.82,0.36,0.36,857.5,286.07,15719.44,15349.75,0
3,8,5922,72,0,0.0,1,1020.0,1,1,98.0,...,7714.19,7859.74,11232.37,0.64,0.64,1299.64,439.26,7076.06,7755.98,0
4,9,1145,46,0,0.0,0,623.0,2,317,172.0,...,8519.53,6511.82,16314.17,0.27,0.27,443.13,5688.44,8563.84,5317.04,0


In [6]:
data =  data.dropna()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22067 entries, 0 to 22066
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   customer_id                     22067 non-null  int64  
 1   vintage                         22067 non-null  int64  
 2   age                             22067 non-null  int64  
 3   gender                          22067 non-null  int64  
 4   dependents                      22067 non-null  float64
 5   occupation                      22067 non-null  int64  
 6   city                            22067 non-null  float64
 7   customer_nw_category            22067 non-null  int64  
 8   branch_code                     22067 non-null  int64  
 9   days_since_last_transaction     22067 non-null  float64
 10  current_balance                 22067 non-null  float64
 11  previous_month_end_balance      22067 non-null  float64
 12  average_monthly_balance_prevQ   

In [7]:
#checking the dataset distribution
data['churn'].value_counts()/len(data)

0    0.806317
1    0.193683
Name: churn, dtype: float64

In [8]:
#seperating dependent and independent variable
X = data.drop(columns =['churn','customer_id'])
Y = data['churn']

In [9]:
#scaling the data(into a standard format)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [10]:
#splitting the dataset
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(scaled_X, Y, train_size = 0.80, stratify = Y)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((17653, 19), (4414, 19), (17653,), (4414,))

In [11]:
#to emphasize the classes dapending upon no of observations
from sklearn.linear_model import LogisticRegression as lr
classifier = lr(class_weight = 'balanced')

In [12]:
classifier.fit(x_train, y_train)
predicted_values = classifier.predict(x_test)                         #predicting class
predicted_probabilities = classifier.predict_proba(x_test)            #predicting probabilities

In [13]:
predicted_values

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [14]:
predicted_probabilities, predicted_probabilities.shape

(array([[0.70526933, 0.29473067],
        [0.46525341, 0.53474659],
        [0.49756969, 0.50243031],
        ...,
        [0.63097878, 0.36902122],
        [0.24478694, 0.75521306],
        [0.46057817, 0.53942183]]),
 (4414, 2))

# Evaluation Metrices

In [15]:
#accuracy
classifier.score(x_test, y_test)

0.7206615314907113

In [16]:
#calculating the precision score
from sklearn.metrics import precision_score
precision = precision_score(y_test, predicted_values)
precision

0.369475138121547

In [17]:
#calculating recall score
from sklearn.metrics import recall_score
Recall = recall_score(y_test, predicted_values)
Recall

0.6257309941520468

In [18]:
#calculating the f1 score
from sklearn.metrics import f1_score
F1 = f1_score(y_test, predicted_values)
F1


0.46461137646547984

In [19]:
#calculating precision,recall,f1-score and support at once
from sklearn.metrics import precision_recall_fscore_support as PRF_summary
precision, recall, f1, support = PRF_summary(y_test, predicted_values)

In [20]:
precision

array([0.89211059, 0.36947514])

In [21]:
recall

array([0.74346727, 0.62573099])

In [22]:
f1

array([0.81103448, 0.46461138])

In [23]:
from sklearn.metrics import classification_report
k = classification_report(y_test, predicted_values)
print(k)

              precision    recall  f1-score   support

           0       0.89      0.74      0.81      3559
           1       0.37      0.63      0.46       855

    accuracy                           0.72      4414
   macro avg       0.63      0.68      0.64      4414
weighted avg       0.79      0.72      0.74      4414

