# Lab | Imbalanced data

In [1]:
import pandas as pd

df = pd.read_csv('customer_churn.csv')

# Display the first few rows of the dataset
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Split the Dataset into X ('tenure', 'SeniorCitizen', 'MonthlyCharges') and y ('Churn')


In [2]:
# Splitting the dataset into X and y
X = df[['tenure', 'SeniorCitizen', 'MonthlyCharges']]
y = df['Churn']

X.head(), y.head()

(   tenure  SeniorCitizen  MonthlyCharges
 0       1              0           29.85
 1      34              0           56.95
 2       2              0           53.85
 3      45              0           42.30
 4       2              0           70.70,
 0     No
 1     No
 2    Yes
 3     No
 4    Yes
 Name: Churn, dtype: object)

# Build the logistic regression model.


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [9]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# Building the logistic regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predicting on the test set
y_pred = logreg.predict(X_test)

In [11]:
# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

In [12]:
accuracy

0.7906316536550745

In [13]:
print(classification_rep)

              precision    recall  f1-score   support

          No       0.83      0.90      0.86      1033
         Yes       0.64      0.49      0.56       376

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409



In [20]:
# Getting classification report in dictionary format
report_dict = classification_report(y_test, y_pred, output_dict=True)

# Extracting individual metrics for the positive class ("Yes")
precision_from_report = report_dict['Yes']['precision']
recall_from_report = report_dict['Yes']['recall']
f1_from_report = report_dict['Yes']['f1-score']

precision_from_report, recall_from_report, f1_from_report

(0.6411149825783972, 0.48936170212765956, 0.5550527903469079)

In [21]:
confusion_mat

array([[930, 103],
       [192, 184]], dtype=int64)

In [22]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Even a simple model will give us more than 70% accuracy. Why?

The higher accuracy in predicting customer churn is likely due to an imbalance in the dataset. If one class (e.g., customers who didn't churn) significantly outnumbers the other class (e.g., customers who churned), even a naive model that always predicts the majority class will achieve high accuracy. This is why accuracy alone can sometimes be misleading in imbalanced datasets.

# Applying SMOTE and Building a Logistic Regression Model Again

In [23]:
from sklearn.neighbors import NearestNeighbors
import numpy as np



In [24]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train,y_train)

In [25]:
X_train_SMOTE.shape

(8282, 3)

In [28]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test)

In [30]:
# Predicting using the  test data
pred = LR.predict(X_test)

# Calculating the evaluation metrics
precision_corrected = precision_score(y_test, pred, pos_label="Yes")
recall_corrected = recall_score(y_test, pred, pos_label="Yes")
f1_corrected = f1_score(y_test, pred, pos_label="Yes")
accuracy_corrected = accuracy_score(y_test, pred)

precision_corrected, recall_corrected, f1_corrected, accuracy_corrected


(0.4854014598540146,
 0.7074468085106383,
 0.5757575757575758,
 0.7217885024840313)