In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.utils import resample

In [None]:
# Load the raw customer-churn table; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="customer_churn.csv")

In [None]:
# Feature matrix: the three predictor columns used by every model below.
X = data.loc[:, ['tenure', 'SeniorCitizen', 'MonthlyCharges']]
# Target: the churn label column.
y = data["Churn"]

In [None]:
# Encode the target as integers (1 = 'Yes', 0 = 'No').
# The labels are mapped from `data` into a new Series instead of being replaced
# in place on a column pulled out of the frame: the outcome no longer depends
# on whether pandas returned a view or a copy, `data` itself is left untouched,
# and re-running this cell gives the same result.
y = data["Churn"].map({'Yes': 1, 'No': 0})
# map() turns any unexpected label into NaN; fail loudly rather than train on it.
assert y.notna().all(), "Churn contains values other than 'Yes'/'No'"
# Explicit integer dtype so the classifier sees a clean binary target.
y = y.astype(int)
y

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test partitions; the fixed
# random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
# All features are numeric, so there is no need to split into _num and _cat.
# The scaler is fitted on the training rows only, so test-set statistics never
# influence the scaling parameters.
transformer = StandardScaler().fit(X_train)
# transform() hands back a plain array, so the frame is rebuilt here. Keep the
# ORIGINAL row index: without it the scaled rows get a fresh 0..n-1 index and
# no longer line up with y_train in index-aligned operations such as
# pd.concat(..., axis=1).
X_train_scaled = pd.DataFrame(
    transformer.transform(X_train),
    columns=X.columns,
    index=X_train.index,
)
# Scaling is the only transformation applied to the numerics, so the test set
# can be transformed right away with the scaler fitted on the training data.
X_test_scaled = pd.DataFrame(
    transformer.transform(X_test),
    columns=X.columns,
    index=X_test.index,
)
X_train_scaled.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on the scaled training set (classes not yet rebalanced).
LR = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_scaled, y_train)
LR

In [None]:
# Log-odds of the positive class (LR.classes_[1]) for every training row.
# The earlier expression log(1/p - 1) equals log((1 - p)/p), which is the
# NEGATIVE of the logit. decision_function returns log(p/(1 - p)) directly and
# avoids dividing by, or taking the log of, probabilities that round to 0.
logits = LR.decision_function(X_train_scaled)

In [None]:
# Bar chart of how many customers fall into each churn class.
count_classes = data['Churn'].value_counts()
count_classes.plot.bar()

In [None]:
# The same per-class counts, shown as numbers instead of bars.
count_classes = data.loc[:, 'Churn'].value_counts()
count_classes

In [None]:
# Share of the majority class, i.e. the accuracy a model gets by always
# predicting that class. Computed from the data rather than from hand-copied
# counts (previously 5174 / (5174 + 1869)) so it stays correct if the CSV changes.
data['Churn'].value_counts(normalize=True).max()

In [None]:
# Re-fit the baseline model and report plain accuracy on the held-out set.
LR = LogisticRegression(solver='lbfgs', random_state=0).fit(X_train_scaled, y_train)
LR.score(X_test_scaled, y_test)
# Accuracy above 70% is not impressive here: about 73% of the samples belong
# to the majority class (the non-churners), so a model that almost always
# predicts that class is already right ~73% of the time without really
# identifying the customers who churn.

In [None]:
# Accuracy hides the minority class, so score the positive class directly.
pred = LR.predict(X_test_scaled)

for label, scorer in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, scorer(y_test, pred))

In [None]:
# SMOTE: rebalance the training set by synthesising new minority-class rows
# from each sample's 3 nearest neighbours. Only the training data is resampled.
sm = SMOTE(k_neighbors=3, random_state=100)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train_scaled, y_train)

In [None]:
# Train on the SMOTE-balanced data, then evaluate on the untouched test set.
LR = LogisticRegression(max_iter=1000).fit(X_train_SMOTE, y_train_SMOTE)
pred = LR.predict(X_test_scaled)

for label, scorer in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, scorer(y_test, pred))

In [None]:
# Oversampling, step 1: put features and target side by side in one frame.
# pd.concat(axis=1) aligns rows by index label. X_train_scaled can carry a
# fresh 0..n-1 index while y_train keeps the row labels from the original
# split, which would pair features with the wrong labels and add NaN-filled
# rows. Dropping both indexes pairs the rows by position instead, i.e. in the
# order train_test_split returned them.
train = pd.concat(
    [X_train_scaled.reset_index(drop=True), y_train.reset_index(drop=True)],
    axis=1,
)
train.head()

In [None]:
# Separate the training rows by class so the smaller group can be resampled.
no = train.loc[train['Churn'].eq(0)]
yes = train.loc[train['Churn'].eq(1)]

In [None]:
# Draw churn rows with replacement until there are as many as non-churn rows.
yes_oversampled = resample(
    yes,                    # population to sample from
    replace=True,           # replacement is required: there are fewer 'yes' rows than requested
    n_samples=no.shape[0],  # match the size of the 'no' group
    random_state=0,         # reproducible draw
)

In [None]:
# Both groups should now report the same number of rows.
display(no.shape, yes_oversampled.shape)

In [None]:
# Stack the untouched 'no' rows on top of the oversampled 'yes' rows.
train_oversampled = pd.concat(
    [no, yes_oversampled],
    axis="index",
)
train_oversampled.head()

In [None]:
# Split the balanced frame back into features and target (independent copies).
X_train_over = train_oversampled.drop(columns='Churn').copy()
y_train_over = train_oversampled.loc[:, 'Churn'].copy()

In [None]:
# Train on the randomly oversampled data, then evaluate on the untouched test set.
LR = LogisticRegression(max_iter=1000).fit(X_train_over, y_train_over)
pred = LR.predict(X_test_scaled)

for label, scorer in (
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("f1: ", f1_score),
):
    print(label, scorer(y_test, pred))

In [None]:
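# My predictions are not getting much better, and I don't fully understand why.
# Note: resampling only rebalances the classes the model sees during training;
# it adds no new information, so with the same three features it mainly trades
# precision for recall rather than improving every metric at once.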