In [16]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')


In [17]:
churnData = pd.read_csv('customer_churn.csv')
churnData.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [18]:
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [19]:
churnData.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [20]:
data = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'Churn']]

In [21]:
data

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,Churn
0,1,0,29.85,No
1,34,0,56.95,No
2,2,0,53.85,Yes
3,45,0,42.30,No
4,2,0,70.70,Yes
...,...,...,...,...
7038,24,0,84.80,No
7039,72,0,103.20,No
7040,11,0,29.60,No
7041,4,1,74.40,Yes


In [22]:
data.describe()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,32.371149,0.162147,64.761692
std,24.559481,0.368612,30.090047
min,0.0,0.0,18.25
25%,9.0,0.0,35.5
50%,29.0,0.0,70.35
75%,55.0,0.0,89.85
max,72.0,1.0,118.75


In [23]:
y = pd.DataFrame(data=data, columns=['Churn'])
X = data.select_dtypes(include=['float64','int64'])

In [24]:
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.2, random_state=42)

In [26]:
classification = LogisticRegression(random_state=42, max_iter=1000)
classification.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42)

In [27]:
classification.score(X_test, y_test)

0.8076650106458482

In [28]:
X_smote = X.copy()
y_smote = y.copy()

In [42]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()

transformer = StandardScaler().fit(X_smote)
X_smote_scaled = transformer.transform(X_smote)
X_sm, y_sm = smote.fit_resample(X_smote_scaled, y_smote)
y_sm.value_counts()

Churn
Yes      5174
No       5174
dtype: int64

In [43]:
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_sm, y_sm, test_size=0.2, random_state=70)

In [44]:
classification = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_smote, y_train_smote)

y_predict_smote = classification.predict(X_test_smote)

In [45]:
from sklearn.metrics import classification_report

print(classification_report(y_test_smote, y_predict_smote, labels=['Yes','No']))

              precision    recall  f1-score   support

         Yes       0.75      0.75      0.75      1050
          No       0.74      0.74      0.74      1020

    accuracy                           0.74      2070
   macro avg       0.74      0.74      0.74      2070
weighted avg       0.74      0.74      0.74      2070

