In [200]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [150]:
# Read the Telco customer-churn CSV into a DataFrame and display it.
dataframe = pd.read_csv(
    filepath_or_buffer='data/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv',
)
dataframe

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [151]:
# Report the column labels together with the frame's width and height.
row_count, column_count = dataframe.shape
print("Dataset columns: ", dataframe.columns)
print("Dataset column count: ", column_count)
print("Dataset row count: ", row_count)

Dataset columns:  Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')
Dataset column count:  21
Dataset row count:  7043


Process Customer ID

In [152]:
# Remove the customerID column so it is not carried into the feature set.
dataframe = dataframe.drop(columns='customerID')

Process Gender

In [153]:
# Tally the number of rows for each gender value.
(
    dataframe
    .groupby(by=['gender'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,gender,counts
0,Female,3488
1,Male,3555


In [154]:
# Integer-code gender into a new column. LabelEncoder numbers the classes in
# sorted order, so the two values seen above map to Female -> 0, Male -> 1.
gender_encoder = LabelEncoder()
dataframe["gender_encoded"] = gender_encoder.fit_transform(dataframe['gender'])

In [155]:
# Show the current column labels of the frame.
dataframe.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'gender_encoded'],
      dtype='object')

In [156]:
# Remove the raw gender strings; the integer-coded gender_encoded column stays.
dataframe = dataframe.drop(columns='gender')

Process SeniorCitizen

In [157]:
# Tally the number of rows for each SeniorCitizen value.
(
    dataframe
    .groupby(by=['SeniorCitizen'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,SeniorCitizen,counts
0,0,5901
1,1,1142


Process Partner

In [158]:
# Tally the number of rows for each Partner value.
(
    dataframe
    .groupby(by=['Partner'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,Partner,counts
0,No,3641
1,Yes,3402


In [159]:
# Swap the Partner strings for an integer-coded column (classes are numbered
# in sorted order), then remove the original text column.
Partner_encoder = LabelEncoder()
dataframe["Partner_encoded"] = Partner_encoder.fit_transform(dataframe['Partner'])
dataframe = dataframe.drop(columns='Partner')

Process Dependents

In [160]:
# Tally the number of rows for each Dependents value.
(
    dataframe
    .groupby(by=['Dependents'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,Dependents,counts
0,No,4933
1,Yes,2110


In [161]:
# Swap the Dependents strings for an integer-coded column (classes are
# numbered in sorted order), then remove the original text column.
Dependents_encoder = LabelEncoder()
dataframe["Dependents_encoded"] = Dependents_encoder.fit_transform(dataframe['Dependents'])
dataframe = dataframe.drop(columns='Dependents')

Process PhoneService

In [162]:
# Tally the number of rows for each PhoneService value.
(
    dataframe
    .groupby(by=['PhoneService'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,PhoneService,counts
0,No,682
1,Yes,6361


In [163]:
# Swap the PhoneService strings for an integer-coded column (classes are
# numbered in sorted order), then remove the original text column.
PhoneService_encoder = LabelEncoder()
dataframe["PhoneService_encoded"] = PhoneService_encoder.fit_transform(dataframe['PhoneService'])
dataframe = dataframe.drop(columns='PhoneService')

Process MultipleLines

In [164]:
# Tally the number of rows for each MultipleLines value.
(
    dataframe
    .groupby(by=['MultipleLines'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,MultipleLines,counts
0,No,3390
1,No phone service,682
2,Yes,2971


In [165]:
# Build 0/1 indicator columns for two of the three MultipleLines values;
# 'Yes' is the case where both indicators are 0. A value missing from the
# mapping becomes NaN and makes astype(int) raise.
multiple_lines = dataframe['MultipleLines']
dataframe['MultipleLines_No'] = multiple_lines.map(
    {'No': 1, 'No phone service': 0, 'Yes': 0}
).astype(int)
dataframe['MultipleLines_No_phone_service'] = multiple_lines.map(
    {'No': 0, 'No phone service': 1, 'Yes': 0}
).astype(int)
dataframe = dataframe.drop(columns='MultipleLines')

Process InternetService

In [166]:
# Tally the number of rows for each InternetService value.
(
    dataframe
    .groupby(by=['InternetService'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,InternetService,counts
0,DSL,2421
1,Fiber optic,3096
2,No,1526


In [167]:
# Build 0/1 indicator columns for DSL and Fiber optic; 'No' is the case
# where both indicators are 0. A value missing from the mapping becomes NaN
# and makes astype(int) raise.
internet_service = dataframe['InternetService']
dataframe['InternetService_DSL'] = internet_service.map(
    {'DSL': 1, 'Fiber optic': 0, 'No': 0}
).astype(int)
dataframe['InternetService_Fiber_optic'] = internet_service.map(
    {'DSL': 0, 'Fiber optic': 1, 'No': 0}
).astype(int)
dataframe = dataframe.drop(columns='InternetService')

Process OnlineSecurity

In [168]:
# Tally the number of rows for each OnlineSecurity value.
(
    dataframe
    .groupby(by=['OnlineSecurity'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,OnlineSecurity,counts
0,No,3498
1,No internet service,1526
2,Yes,2019


In [169]:
# Build 0/1 indicator columns for the 'No' and 'Yes' answers;
# 'No internet service' is the case where both indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
online_security = dataframe['OnlineSecurity']
dataframe['OnlineSecurity_No'] = online_security.map(
    {'No': 1, 'Yes': 0, 'No internet service': 0}
).astype(int)
dataframe['OnlineSecurity_Yes'] = online_security.map(
    {'No': 0, 'Yes': 1, 'No internet service': 0}
).astype(int)
dataframe = dataframe.drop(columns='OnlineSecurity')

Process OnlineBackup

In [170]:
# Tally the number of rows for each OnlineBackup value.
(
    dataframe
    .groupby(by=['OnlineBackup'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,OnlineBackup,counts
0,No,3088
1,No internet service,1526
2,Yes,2429


In [171]:
# Build 0/1 indicator columns for the 'No' and 'Yes' answers;
# 'No internet service' is the case where both indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
online_backup = dataframe['OnlineBackup']
dataframe['OnlineBackup_No'] = online_backup.map(
    {'No': 1, 'Yes': 0, 'No internet service': 0}
).astype(int)
dataframe['OnlineBackup_Yes'] = online_backup.map(
    {'No': 0, 'Yes': 1, 'No internet service': 0}
).astype(int)
dataframe = dataframe.drop(columns='OnlineBackup')

Process DeviceProtection

In [172]:
# Tally the number of rows for each DeviceProtection value.
(
    dataframe
    .groupby(by=['DeviceProtection'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,DeviceProtection,counts
0,No,3095
1,No internet service,1526
2,Yes,2422


In [173]:
# Build 0/1 indicator columns for the 'No' and 'Yes' answers;
# 'No internet service' is the case where both indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
device_protection = dataframe['DeviceProtection']
dataframe['DeviceProtection_No'] = device_protection.map(
    {'No': 1, 'Yes': 0, 'No internet service': 0}
).astype(int)
dataframe['DeviceProtection_Yes'] = device_protection.map(
    {'No': 0, 'Yes': 1, 'No internet service': 0}
).astype(int)
dataframe = dataframe.drop(columns='DeviceProtection')

Process TechSupport

In [174]:
# Tally the number of rows for each TechSupport value.
(
    dataframe
    .groupby(by=['TechSupport'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,TechSupport,counts
0,No,3473
1,No internet service,1526
2,Yes,2044


In [175]:
# Build 0/1 indicator columns for the 'No' and 'Yes' answers;
# 'No internet service' is the case where both indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
tech_support = dataframe['TechSupport']
dataframe['TechSupport_No'] = tech_support.map(
    {'No': 1, 'Yes': 0, 'No internet service': 0}
).astype(int)
dataframe['TechSupport_Yes'] = tech_support.map(
    {'No': 0, 'Yes': 1, 'No internet service': 0}
).astype(int)
dataframe = dataframe.drop(columns='TechSupport')

Process StreamingTV

In [176]:
# Tally the number of rows for each StreamingTV value.
(
    dataframe
    .groupby(by=['StreamingTV'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,StreamingTV,counts
0,No,2810
1,No internet service,1526
2,Yes,2707


In [177]:
# Build 0/1 indicator columns for the 'No' and 'Yes' answers;
# 'No internet service' is the case where both indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
streaming_tv = dataframe['StreamingTV']
dataframe['StreamingTV_No'] = streaming_tv.map(
    {'No': 1, 'Yes': 0, 'No internet service': 0}
).astype(int)
dataframe['StreamingTV_Yes'] = streaming_tv.map(
    {'No': 0, 'Yes': 1, 'No internet service': 0}
).astype(int)
dataframe = dataframe.drop(columns='StreamingTV')

Process StreamingMovies

In [178]:
# Tally the number of rows for each StreamingMovies value.
(
    dataframe
    .groupby(by=['StreamingMovies'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,StreamingMovies,counts
0,No,2785
1,No internet service,1526
2,Yes,2732


In [179]:
# Build 0/1 indicator columns for the 'No' and 'Yes' answers;
# 'No internet service' is the case where both indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
streaming_movies = dataframe['StreamingMovies']
dataframe['StreamingMovies_No'] = streaming_movies.map(
    {'No': 1, 'Yes': 0, 'No internet service': 0}
).astype(int)
dataframe['StreamingMovies_Yes'] = streaming_movies.map(
    {'No': 0, 'Yes': 1, 'No internet service': 0}
).astype(int)
dataframe = dataframe.drop(columns='StreamingMovies')

Process Contract

In [180]:
# Tally the number of rows for each Contract value.
(
    dataframe
    .groupby(by=['Contract'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,Contract,counts
0,Month-to-month,3875
1,One year,1473
2,Two year,1695


In [181]:
# Build 0/1 indicator columns for the one-year and two-year contracts;
# 'Month-to-month' is the case where both indicators are 0. A value missing
# from the mapping becomes NaN and makes astype(int) raise.
contract = dataframe['Contract']
dataframe['Contract_One_year'] = contract.map(
    {'One year': 1, 'Two year': 0, 'Month-to-month': 0}
).astype(int)
dataframe['Contract_Two_year'] = contract.map(
    {'One year': 0, 'Two year': 1, 'Month-to-month': 0}
).astype(int)
dataframe = dataframe.drop(columns='Contract')

Process PaperlessBilling

In [182]:
# Tally the number of rows for each PaperlessBilling value.
(
    dataframe
    .groupby(by=['PaperlessBilling'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,PaperlessBilling,counts
0,No,2872
1,Yes,4171


In [183]:
# Swap the PaperlessBilling strings for an integer-coded column (classes are
# numbered in sorted order), then remove the original text column.
PaperlessBilling_encoder = LabelEncoder()
dataframe["PaperlessBilling_encoded"] = PaperlessBilling_encoder.fit_transform(dataframe['PaperlessBilling'])
dataframe = dataframe.drop(columns='PaperlessBilling')

Process PaymentMethod

In [184]:
# Tally the number of rows for each PaymentMethod value.
(
    dataframe
    .groupby(by=['PaymentMethod'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,PaymentMethod,counts
0,Bank transfer (automatic),1544
1,Credit card (automatic),1522
2,Electronic check,2365
3,Mailed check,1612


In [185]:
# Build 0/1 indicator columns for three of the four payment methods;
# 'Mailed check' is the case where all three indicators are 0. A value
# missing from the mapping becomes NaN and makes astype(int) raise.
payment_method = dataframe['PaymentMethod']
dataframe['PaymentMethod_Bank_transfer'] = payment_method.map(
    {'Bank transfer (automatic)': 1, 'Credit card (automatic)': 0, 'Electronic check': 0, 'Mailed check': 0}
).astype(int)
dataframe['PaymentMethod_Credit_card'] = payment_method.map(
    {'Bank transfer (automatic)': 0, 'Credit card (automatic)': 1, 'Electronic check': 0, 'Mailed check': 0}
).astype(int)
dataframe['PaymentMethod_Electronic_check'] = payment_method.map(
    {'Bank transfer (automatic)': 0, 'Credit card (automatic)': 0, 'Electronic check': 1, 'Mailed check': 0}
).astype(int)
dataframe = dataframe.drop(columns='PaymentMethod')

Process Churn

In [186]:
# Integer-code the Churn target into Churn_encoded (classes are numbered in
# sorted order), then remove the original text column.
Churn_encoder = LabelEncoder()
dataframe["Churn_encoded"] = Churn_encoder.fit_transform(dataframe['Churn'])
dataframe = dataframe.drop(columns='Churn')

In [187]:
# Print a per-value row count for every column still in the frame, followed
# by the number of columns.
for column in dataframe:
    value_counts = dataframe.groupby(by=[column]).size().reset_index(name='counts')
    print(value_counts)
print("dataframe_columns: ", dataframe.shape[1])

   SeniorCitizen  counts
0              0    5901
1              1    1142
    tenure  counts
0        0      11
1        1     613
2        2     238
3        3     200
4        4     176
..     ...     ...
68      68     100
69      69      95
70      70     119
71      71     170
72      72     362

[73 rows x 2 columns]
      MonthlyCharges  counts
0              18.25       1
1              18.40       1
2              18.55       1
3              18.70       2
4              18.75       1
...              ...     ...
1580          118.20       1
1581          118.35       1
1582          118.60       2
1583          118.65       1
1584          118.75       1

[1585 rows x 2 columns]
     TotalCharges  counts
0                      11
1           100.2       1
2          100.25       1
3          100.35       1
4           100.4       1
...           ...     ...
6526       997.75       1
6527        998.1       1
6528       999.45       1
6529        999.8       1
6530        999

In [188]:
# TotalCharges arrives as text because some rows hold a blank string instead
# of a number (see the blank entry in the value counts above).
# Parse the whole column as numeric in one vectorised step: anything that is
# not a number becomes NaN rather than crashing, and the column ends up
# float64 instead of an object column mixing numeric strings with one float.
numeric_total_charges = pd.to_numeric(dataframe['TotalCharges'], errors='coerce')

# Keep the list of valid charges and their mean under the original names.
total_charges = numeric_total_charges.dropna().tolist()
mean_of_total_charge = np.mean(total_charges)

# Fill the unparseable entries with the mean of the valid ones.
# NOTE(review): the count of blank rows (11) equals the count of tenure == 0
# rows shown above; if they are the same customers, 0 may be a more faithful
# fill value than the mean - confirm before changing.
dataframe['TotalCharges'] = numeric_total_charges.fillna(mean_of_total_charge)

In [189]:
# Separate the encoded churn target from the feature columns; `dataframe`
# itself is left unchanged.
Y_dataframe = dataframe["Churn_encoded"]
X_dataframe = dataframe.drop(columns="Churn_encoded")

In [190]:
# Sanity check: report every feature value that cannot be converted to float.
# Values are walked by position, so the reported row number is positional and
# the check does not rely on the frame having a default 0..n-1 integer index
# (a lookup such as X_dataframe[col][i] is label-based). Iterating the raw
# array also avoids a slow per-element Series lookup.
for col in X_dataframe:
    for i, value in enumerate(X_dataframe[col].to_numpy()):
        try:
            float(value)
        except (TypeError, ValueError):
            # TypeError covers None and other non-string, non-numeric objects.
            print(value == ' ')
            print(f"found missing value {value} in {col} column, {i} row")

In [191]:
# Convert the feature frame and the target series to NumPy arrays.
X_numpy, Y_numpy = X_dataframe.to_numpy(), Y_dataframe.to_numpy()

In [192]:
# Standardise every feature column with StandardScaler and display the result.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed further down, so test-row statistics leak into the training
# features - consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(X_numpy)
X_numpy = scaler.transform(X_numpy)
X_numpy

array([[-0.43991649, -1.27744458, -1.16032292, ..., -0.5298852 ,
        -0.52504733,  1.40641839],
       [-0.43991649,  0.06632742, -0.25962894, ..., -0.5298852 ,
        -0.52504733, -0.71102597],
       [-0.43991649, -1.23672422, -0.36266036, ..., -0.5298852 ,
        -0.52504733, -0.71102597],
       ...,
       [-0.43991649, -0.87024095, -1.1686319 , ..., -0.5298852 ,
        -0.52504733,  1.40641839],
       [ 2.27315869, -1.15528349,  0.32033821, ..., -0.5298852 ,
        -0.52504733, -0.71102597],
       [-0.43991649,  1.36937906,  1.35896134, ...,  1.88720123,
        -0.52504733, -0.71102597]])

In [193]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_numpy,
    Y_numpy,
    test_size=0.20,
    random_state=42,
)

In [194]:
# Fit a logistic-regression classifier with default hyper-parameters on the
# training split; the fitted estimator is the cell's displayed value.
logreg = LogisticRegression().fit(X_train, y_train)
logreg

LogisticRegression()

In [195]:
# Predict churn labels for the held-out test rows.
Y_pred = logreg.predict(X=X_test)

In [201]:
# Print accuracy on the training split, then on the held-out test split.
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = accuracy_score(y_test, Y_pred)
print(train_accuracy, test_accuracy, sep="\n")

0.8031593894213702
0.8204400283889283
