In [251]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [252]:
start = time.time()

In [253]:
df = pd.read_csv("du_lieu_chuan.csv")

In [254]:
df.head()

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,7590-VHVEG,0,0,1,0,1,0,2,1,...,0,0,0,0,0,1,0,29.85,29.85,0
1,1,5575-GNVDE,1,0,0,0,34,1,0,1,...,1,0,0,0,1,0,1,56.95,1889.5,0
2,2,3668-QPYBK,1,0,0,0,2,1,0,1,...,0,0,0,0,0,1,1,53.85,108.15,1
3,3,7795-CFOCW,1,0,0,0,45,0,2,1,...,1,1,0,0,1,0,2,42.3,1840.75,0
4,4,9237-HQITU,0,0,0,0,2,1,0,2,...,0,0,0,0,0,1,0,70.7,151.65,1


In [255]:
df.drop('customerID', axis=1, inplace=True)
df.drop('Unnamed: 0', axis=1, inplace=True)

In [256]:
X = df.drop("Churn", axis=1)
y = df["Churn"]

In [257]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (6328, 19)
Number transactions y_train dataset:  (6328,)
Number transactions X_test dataset:  (704, 19)
Number transactions y_test dataset:  (704,)


In [258]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression(C=0.095, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [259]:
clf_lr.fit(X_train, y_train)



LogisticRegression(C=0.095, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [260]:
clf_lr.intercept_

array([-0.46024781])

In [261]:
clf_lr.coef_

array([[-2.53997407e-02,  2.02382658e-01,  0.00000000e+00,
        -1.55335324e-01, -6.21635933e-02,  0.00000000e+00,
         1.78944141e-01,  6.80624377e-01, -3.64338501e-01,
        -5.73318114e-02,  0.00000000e+00, -3.47093937e-01,
         2.66394235e-01,  2.29243483e-01, -6.76836018e-01,
         2.81372324e-01, -1.53316437e-01, -5.02316970e-03,
         3.54244398e-04]])

In [262]:
Yhat_test = clf_lr.predict(X_test)

In [263]:
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report
clf_lr_conf_matrix = confusion_matrix(y_test, Yhat_test)
clf_lr_conf_matrix

array([[452,  65],
       [ 86, 101]], dtype=int64)

In [264]:
clf_lr_acc_score = accuracy_score(y_test, Yhat_test)
clf_lr_acc_score

0.7855113636363636

In [265]:
from sklearn.metrics import f1_score
f1_score(y_test, Yhat_test)

0.5722379603399435

In [266]:
from sklearn.metrics import precision_score
precision_score(y_test, Yhat_test)

0.608433734939759

In [267]:
from sklearn.metrics import recall_score
recall_score(y_test, Yhat_test)

0.5401069518716578

In [268]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 20 columns):
gender              7032 non-null int64
SeniorCitizen       7032 non-null int64
Partner             7032 non-null int64
Dependents          7032 non-null int64
tenure              7032 non-null int64
PhoneService        7032 non-null int64
MultipleLines       7032 non-null int64
InternetService     7032 non-null int64
OnlineSecurity      7032 non-null int64
OnlineBackup        7032 non-null int64
DeviceProtection    7032 non-null int64
TechSupport         7032 non-null int64
StreamingTV         7032 non-null int64
StreamingMovies     7032 non-null int64
Contract            7032 non-null int64
PaperlessBilling    7032 non-null int64
PaymentMethod       7032 non-null int64
MonthlyCharges      7032 non-null float64
TotalCharges        7032 non-null float64
Churn               7032 non-null int64
dtypes: float64(2), int64(18)
memory usage: 1.1 MB


In [269]:
from sklearn.model_selection import KFold
def run_kfold(clf_lr):
    kf = KFold(n_splits =10,random_state =42, shuffle = True)
    outcomes1 = []
    outcomes2 = []
    outcomes3 = []
    outcomes4 = []
    fold = 0
    for train_index, test_index in  kf.split(df):
        fold += 1
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf_lr.fit(X_train, y_train)
        Yhat_test = clf_lr.predict(X_test)
        precision = precision_score(y_test, Yhat_test)
        recall = recall_score(y_test, Yhat_test)
        f1 = f1_score(y_test, Yhat_test)
        accuracy = accuracy_score(y_test, Yhat_test)
        outcomes1.append(precision)
        outcomes2.append(recall)
        outcomes3.append(f1)
        outcomes4.append(accuracy)
        print("Fold {0} precision: {1}".format(fold, precision))
        print("Fold {0} recall: {1}".format(fold, recall))
        print("Fold {0} f1: {1}".format(fold, f1))
        print("Fold {0} accuracy: {1}".format(fold, accuracy))
    mean_outcome1 = np.mean(outcomes1)
    mean_outcome2 = np.mean(outcomes2)
    mean_outcome3 = np.mean(outcomes3)    
    mean_outcome4 = np.mean(outcomes4)
    print("Mean precision: {0}".format(mean_outcome1))
    print("Mean recall: {0}".format(mean_outcome2))
    print("Mean f1: {0}".format(mean_outcome3))
    print("Mean Accuracy: {0}".format(mean_outcome4)) 
run_kfold(clf_lr)

Fold 1 precision: 0.6645161290322581
Fold 1 recall: 0.533678756476684
Fold 1 f1: 0.5919540229885057
Fold 1 accuracy: 0.7982954545454546




Fold 2 precision: 0.5828220858895705
Fold 2 recall: 0.521978021978022
Fold 2 f1: 0.5507246376811593
Fold 2 accuracy: 0.7798295454545454
Fold 3 precision: 0.6790123456790124
Fold 3 recall: 0.5882352941176471
Fold 3 f1: 0.6303724928366763
Fold 3 accuracy: 0.8165007112375533
Fold 4 precision: 0.6642335766423357
Fold 4 recall: 0.5290697674418605
Fold 4 f1: 0.5889967637540453
Fold 4 accuracy: 0.8193456614509246




Fold 5 precision: 0.7
Fold 5 recall: 0.6028708133971292
Fold 5 f1: 0.647814910025707
Fold 5 accuracy: 0.8051209103840683
Fold 6 precision: 0.5870967741935483
Fold 6 recall: 0.4918918918918919
Fold 6 f1: 0.5352941176470588
Fold 6 accuracy: 0.7752489331436699




Fold 7 precision: 0.6363636363636364
Fold 7 recall: 0.621301775147929
Fold 7 f1: 0.6287425149700598
Fold 7 accuracy: 0.8236130867709816
Fold 8 precision: 0.6266666666666667
Fold 8 recall: 0.5164835164835165
Fold 8 f1: 0.5662650602409638
Fold 8 accuracy: 0.7951635846372689
Fold 9 precision: 0.6951871657754011




Fold 9 recall: 0.6132075471698113
Fold 9 f1: 0.6516290726817042
Fold 9 accuracy: 0.802275960170697
Fold 10 precision: 0.7096774193548387
Fold 10 recall: 0.6179775280898876
Fold 10 f1: 0.6606606606606606
Fold 10 accuracy: 0.8392603129445235
Mean precision: 0.6545575799597269
Mean recall: 0.5636694912194379
Mean f1: 0.605245425348654
Mean Accuracy: 0.8054654160739686




In [270]:
end = time.time()

In [271]:
print(end - start)

1.4582598209381104
