In [433]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
#from sklearn.preprocessing import StandardScaler

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE


pd.set_option('display.max_columns', None)

# Load the dataset and explore the variables.

In [434]:
data = pd.read_csv('customer_churn.csv')
display(data.head(2))
display(data.tail(2))

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [435]:
#data.shape
#data.dtypes
#data.isna().sum()
#data.drop_duplicates()
round(data['Churn'].value_counts()/len(data['Churn']*100), 2)

No     0.73
Yes    0.27
Name: Churn, dtype: float64

NOTE: our dataset contains 73% of 'noes' and 27% of 'yeses'. This imbalance will have an effect on the development of our model, and therefore corrective measures will have to be introduced.

# Cleaning and preparing data for analysis

In [436]:
# Standardizing header names to all lower

cols = []
for i in range(len(data.columns)):
    cols.append(data.columns[i].lower())
data.columns = cols
data.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [437]:
# Dropping columns that will not be used for the analytical process

data = data.drop(['customerid', 'gender', 'partner', 'dependents', 
                  'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'totalcharges'], axis = 1)


In [438]:
# Renaming headers

data = data.rename(columns = {'seniorcitizen' : 'senior', 'monthlycharges': 'charges'})
data

Unnamed: 0,senior,tenure,charges,churn
0,0,1,29.85,No
1,0,34,56.95,No
2,0,2,53.85,Yes
3,0,45,42.30,No
4,0,2,70.70,Yes
...,...,...,...,...
7038,0,24,84.80,No
7039,0,72,103.20,No
7040,0,11,29.60,No
7041,1,4,74.40,Yes


In [439]:
# Converting target column 'churn' into boolean

def churn(x):
        
        if x == 'No':
            return 0
        else:
            return 1

data['churn'] = list(map(churn, data['churn']))
data.head(10)

Unnamed: 0,senior,tenure,charges,churn
0,0,1,29.85,0
1,0,34,56.95,0
2,0,2,53.85,1
3,0,45,42.3,0
4,0,2,70.7,1
5,0,8,99.65,1
6,0,22,89.1,0
7,0,10,29.75,0
8,0,28,104.8,1
9,0,62,56.15,0


In [440]:
# Converting feature column 'senior' into categorical

def seniority(x):
        
        if x == 1:
            return 'senior'
        else:
            return 'no_senior'

data['senior'] = list(map(seniority, data['senior']))
data.head(10)

Unnamed: 0,senior,tenure,charges,churn
0,no_senior,1,29.85,0
1,no_senior,34,56.95,0
2,no_senior,2,53.85,1
3,no_senior,45,42.3,0
4,no_senior,2,70.7,1
5,no_senior,8,99.65,1
6,no_senior,22,89.1,0
7,no_senior,10,29.75,0
8,no_senior,28,104.8,1
9,no_senior,62,56.15,0


# X/y split

In [441]:
y = data['churn']
X = data.drop(['churn'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

# Scaling process

### Scaling train numericals

In [442]:
X_train_num = X_train.select_dtypes(include = np.number)

transformer = StandardScaler().fit(X_train_num)
X_train_num_norm = pd.DataFrame(transformer.transform(X_train_num),columns=X_train_num.columns)
X_train_num_norm

Unnamed: 0,tenure,charges
0,-0.961848,0.651056
1,-0.350268,-1.505296
2,1.280614,0.539406
3,0.220541,-1.488632
4,-0.676444,-1.505296
...,...,...
5629,-0.676444,-0.513774
5630,1.606791,0.701049
5631,-0.268724,-1.501963
5632,0.302085,-0.275475


### Scaling train categoricals

In [443]:
X_train_cat = X_train.select_dtypes(object)

encoder = OneHotEncoder(drop='first').fit(X_train_cat)
encoded = encoder.transform(X_train_cat).toarray()

cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)

X_train_cat_norm = onehot_encoded = pd.DataFrame(encoded, columns=cols)
X_train_cat_norm.head()

Unnamed: 0,senior_senior
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0


### Concatenating X_train_scaled

In [444]:
X_train_scaled = pd.concat([X_train_num_norm, X_train_cat_norm], axis=1)
X_train_scaled

Unnamed: 0,tenure,charges,senior_senior
0,-0.961848,0.651056,1.0
1,-0.350268,-1.505296,0.0
2,1.280614,0.539406,0.0
3,0.220541,-1.488632,0.0
4,-0.676444,-1.505296,0.0
...,...,...,...
5629,-0.676444,-0.513774,0.0
5630,1.606791,0.701049,0.0
5631,-0.268724,-1.501963,0.0
5632,0.302085,-0.275475,0.0


### Scaling test numericals

In [445]:
X_test_num = X_test.select_dtypes(include = np.number)

transformer = StandardScaler().fit(X_test_num)
X_test_num_norm = pd.DataFrame(transformer.transform(X_test_num),columns=X_test_num.columns)
# because this is the only tranformation we apply to the numerics, 
# we can immediately transform the X_test as well
#X_test_num_norm = pd.DataFrame(transformer.transform(X_test_num),columns=X.columns)
#X_test_scaled.head()
X_test_num_norm

Unnamed: 0,tenure,charges
0,-1.236405,-0.657440
1,-0.871475,0.051066
2,-0.912023,-1.446495
3,1.601939,-1.305123
4,-0.668736,-0.839909
...,...,...
1404,-0.547093,-1.454714
1405,0.750436,1.293828
1406,0.790984,-1.295260
1407,-1.195857,-1.439920


### Scaling test categoricals

In [446]:
X_test_cat = X_test.select_dtypes(object)

encoder = OneHotEncoder(drop='first').fit(X_test_cat)
encoded = encoder.transform(X_test_cat).toarray()

cols = encoder.get_feature_names_out(input_features=X_test_cat.columns)

X_test_cat_norm = onehot_encoded = pd.DataFrame(encoded, columns=cols)
X_test_cat_norm.head()

Unnamed: 0,senior_senior
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


### Concatenating X_test_scaled

In [447]:
X_test_scaled = pd.concat([X_test_num_norm, X_test_cat_norm], axis=1)
X_test_scaled

Unnamed: 0,tenure,charges,senior_senior
0,-1.236405,-0.657440,0.0
1,-0.871475,0.051066,0.0
2,-0.912023,-1.446495,0.0
3,1.601939,-1.305123,0.0
4,-0.668736,-0.839909,0.0
...,...,...,...
1404,-0.547093,-1.454714,0.0
1405,0.750436,1.293828,0.0
1406,0.790984,-1.295260,0.0
1407,-1.195857,-1.439920,0.0


### y_train & y_test index reset

In [448]:
# This step is needed because during the transformation process of X_train_scaled the index has been reset.
# We then need to reset the index for y_train & y_test to make them match.

y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 

# Model

In [449]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_scaled, y_train)

In [450]:
LR.predict_proba(X_train_scaled)

array([[0.26700132, 0.73299868],
       [0.92385077, 0.07614923],
       [0.9427654 , 0.0572346 ],
       ...,
       [0.93104209, 0.06895791],
       [0.9034702 , 0.0965298 ],
       [0.73661379, 0.26338621]])

In [451]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_scaled, y_train)
LR.score(X_test_scaled, y_test)

0.7863733144073811

In [452]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.6482213438735178
recall:  0.43617021276595747
f1:  0.5214626391096979


In [453]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred)

array([[944,  89],
       [212, 164]], dtype=int64)

In [454]:
X_train_scaled

Unnamed: 0,tenure,charges,senior_senior
0,-0.961848,0.651056,1.0
1,-0.350268,-1.505296,0.0
2,1.280614,0.539406,0.0
3,0.220541,-1.488632,0.0
4,-0.676444,-1.505296,0.0
...,...,...,...
5629,-0.676444,-0.513774,0.0
5630,1.606791,0.701049,0.0
5631,-0.268724,-1.501963,0.0
5632,0.302085,-0.275475,0.0


In [455]:
from sklearn.utils import resample

In [456]:
train = pd.concat([X_train_scaled, y_train],axis=1)
train.head()

Unnamed: 0,tenure,charges,senior_senior,churn
0,-0.961848,0.651056,1.0,1
1,-0.350268,-1.505296,0.0,0
2,1.280614,0.539406,0.0,0
3,0.220541,-1.488632,0.0,0
4,-0.676444,-1.505296,0.0,0


# SMOTE

In [457]:
sm = SMOTE(random_state=100, k_neighbors=3)
X_train_SMOTE,y_train_SMOTE = sm.fit_resample(X_train_scaled,y_train)

In [458]:
X_train_SMOTE.shape

(8282, 3)

In [460]:
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
LR.score(X_train_SMOTE, y_train_SMOTE)

0.7285679787490944

In [461]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_scaled)

print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

precision:  0.5018587360594795
recall:  0.7180851063829787
f1:  0.5908096280087527


In [462]:
confusion_matrix(y_test,pred)

array([[765, 268],
       [106, 270]], dtype=int64)

In [463]:
# Previous scores
# accuracy 0.7863733144073811
# precision:  0.6482213438735178
# recall:  0.43617021276595747
# f1:  0.5214626391096979
# confusion matrix
# array([[944,  89],
  #     [212, 164]], dtype=int64)

Comments: through the application of SMOTE our model has lost precision and accuracy, but there has been a significant improvement in recall and f1 values. 