In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [5]:
# 1) Load the dataset and explore the variables. √
df = pd.read_csv('customer_churn.csv')
df.shape

(7043, 21)

In [9]:
# 2) We will try to predict variable Churn using a logistic regression on variables tenure, SeniorCitizen,
# MonthlyCharges. √
# 3) Extract the target variable. √
# 4) Extract the independent variables and scale them. √
# 5) Build the logistic regression model. √
# 6) Evaluate the model. √
numericData = df[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(df[['tenure','SeniorCitizen','MonthlyCharges']])
scaled_x = transformer.transform(df[['tenure','SeniorCitizen','MonthlyCharges']])

#y = pd.DataFrame(data=df, columns=['Churn'])
y = df['Churn']


X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

classification.score(X_test, y_test)

0.7808802650260294

In [10]:
# 7) Even a simple model will give us more than 70% accuracy. Why? A: it is because of the imbalance of the data
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [None]:
# 8) Synthetic Minority Oversampling TEchnique (SMOTE) is an over sampling technique based on nearest neighbors 
# that adds new points between existing points. Apply imblearn.over_sampling.SMOTE to the dataset. 
# Build and evaluate the logistic regression model. Is it there any improvement?

In [22]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X = df[['tenure', 'SeniorCitizen','MonthlyCharges']]
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = pd.DataFrame(data=df, columns=['Churn'])
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

Churn
No       5174
Yes      5174
dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

#check accuracity
classification.score(X_test, y_test)

0.744927536231884

In [24]:
# 9) Tomek links are pairs of very close instances, but of opposite classes. Removing the instances 
# of the majority class of each pair increases the space between the two classes, 
# facilitating the classification process. Apply imblearn.under_sampling.TomekLinks to the dataset. 
# Build and evaluate the logistic regression model. Is it there any improvement?

In [28]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()

Churn
No       4694
Yes      1869
dtype: int64

In [29]:
X_tl2, y_tl2 = tl.fit_resample(X_tl, y_tl)
y_tl2.value_counts()

Churn
No       4541
Yes      1869
dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

#check for accuracity
classification.score(X_test, y_test)

0.7973590655154901