In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

pd.set_option('display.max_columns',None)

In [3]:
data = pd.read_csv('Customer-Churn.csv')
data.shape

(7043, 16)

In [4]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')


In [5]:
missing_values_count = data.isnull().sum()
print("Missing values count:\n", missing_values_count)

Missing values count:
 gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


In [6]:
data = data.dropna()

In [25]:
features = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
X = data[features]
y=data['Churn']

In [30]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, data['Churn'], test_size=0.2, random_state=42)
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_pred = logreg_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy on test data: {accuracy:.2f}")

Accuracy on test data: 0.78


In [17]:
print("Class distribution before resampling:")
print(y_train.value_counts())

Class distribution before resampling:
No     4130
Yes    1495
Name: Churn, dtype: int64


In [26]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5163
Yes    5163
Name: Churn, dtype: int64

In [31]:
scaler = StandardScaler()
X_scaled_sm = scaler.fit_transform(X_sm)
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X_scaled_sm, y_sm, test_size=0.2, random_state=42)
logreg_model = LogisticRegression()
logreg_model.fit(X_train_sm, y_train_sm)
y_pred_sm = logreg_model.predict(X_test_sm)
accuracy = accuracy_score(y_test_sm, y_pred_sm)

print(f"Accuracy on test data: {accuracy:.2f}")

Accuracy on test data: 0.74


In [29]:
from imblearn.under_sampling import TomekLinks
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

No     4610
Yes    1869
Name: Churn, dtype: int64

In [32]:
scaler = StandardScaler()
X_scaled_tl = scaler.fit_transform(X_tl)
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_scaled_tl, y_tl, test_size=0.2, random_state=42)
logreg_model = LogisticRegression()
logreg_model.fit(X_train_tl, y_train_tl)
y_pred_tl = logreg_model.predict(X_test_tl)
accuracy = accuracy_score(y_test_tl, y_pred_tl)

print(f"Accuracy on test data: {accuracy:.2f}")

Accuracy on test data: 0.77


The upsampler had the best accuracy score