In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [105]:
internet = pd.read_csv('../datasets/internet_data.csv')
customer = pd.read_csv('../datasets/customer_data.csv')
churn = pd.read_csv('../datasets/churn_data.csv')

In [106]:
churn.head()


Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [107]:
customer.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents
0,7590-VHVEG,Female,0,Yes,No
1,5575-GNVDE,Male,0,No,No
2,3668-QPYBK,Male,0,No,No
3,7795-CFOCW,Male,0,No,No
4,9237-HQITU,Female,0,No,No


In [108]:
internet.head()

Unnamed: 0,customerID,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
0,7590-VHVEG,No phone service,DSL,No,Yes,No,No,No,No
1,5575-GNVDE,No,DSL,Yes,No,Yes,No,No,No
2,3668-QPYBK,No,DSL,Yes,Yes,No,No,No,No
3,7795-CFOCW,No phone service,DSL,Yes,No,Yes,Yes,No,No
4,9237-HQITU,No,Fiber optic,No,No,No,No,No,No


In [109]:
telecom = pd.merge(churn, customer, on = 'customerID', how = 'inner')
telecom = pd.merge(telecom, internet, on = 'customerID', how = 'inner')

In [110]:
telecom.shape

(7043, 21)

In [111]:
telecom.describe()
telecom.OnlineBackup.value_counts()

OnlineBackup
No                     3088
Yes                    2429
No internet service    1526
Name: count, dtype: int64

In [112]:
telecom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   tenure            7043 non-null   int64  
 2   PhoneService      7043 non-null   object 
 3   Contract          7043 non-null   object 
 4   PaperlessBilling  7043 non-null   object 
 5   PaymentMethod     7043 non-null   object 
 6   MonthlyCharges    7043 non-null   float64
 7   TotalCharges      7043 non-null   object 
 8   Churn             7043 non-null   object 
 9   gender            7043 non-null   object 
 10  SeniorCitizen     7043 non-null   int64  
 11  Partner           7043 non-null   object 
 12  Dependents        7043 non-null   object 
 13  MultipleLines     7043 non-null   object 
 14  InternetService   7043 non-null   object 
 15  OnlineSecurity    7043 non-null   object 
 16  OnlineBackup      7043 non-null   object 


In [113]:
def binary_map(x):
    return x.map({'Yes':1, 'No':0})

featurelist = ['PhoneService', 'PaperlessBilling', 'Churn', 'Partner', 'Dependents']
telecom[featurelist] = telecom[featurelist].apply(binary_map)

In [114]:
dummy_1 = pd.get_dummies(['Contract', 'PaymentMethod', 'gender', 'InternetService'], drop_first=True)
telecom = pd.concat([telecom, dummy_1], axis = 1)

In [115]:
dummy_2 = pd.get_dummies(telecom['MultipleLines'], prefix='MultipleLines')
dummy_2 = dummy_2.drop(['MultipleLines_No phone service'], axis=1)
telecom = pd.concat([telecom, dummy_2], axis=1)

dummy_3 = pd.get_dummies(telecom['OnlineSecurity'], prefix='OnlineSecurity')
dummy_3 = dummy_3.drop(['OnlineSecurity_No internet service'], axis=1)
telecom = pd.concat([telecom, dummy_3], axis=1)

dummy_4 = pd.get_dummies(telecom['OnlineBackup'], prefix='OnlineBackup')
dummy_4 = dummy_4.drop(['OnlineBackup_No internet service'], axis=1)
telecom = pd.concat([telecom, dummy_4], axis=1)

dummy_5 = pd.get_dummies(telecom['DeviceProtection'], prefix='DeviceProtection')
dummy_5 = dummy_5.drop(['DeviceProtection_No internet service'], axis=1)
telecom = pd.concat([telecom, dummy_5], axis=1)

dummy_6 = pd.get_dummies(telecom['TechSupport'], prefix='TechSupport')
dummy_6 = dummy_6.drop(['TechSupport_No internet service'], axis=1)
telecom = pd.concat([telecom, dummy_6], axis=1)

dummy_7 = pd.get_dummies(telecom['StreamingTV'], prefix='StreamingTV')
dummy_7 = dummy_7.drop(['StreamingTV_No internet service'], axis=1)
telecom = pd.concat([telecom, dummy_7], axis=1)

dummy_8 = pd.get_dummies(telecom['StreamingMovies'], prefix='StreamingMovies')
dummy_8 = dummy_8.drop(['StreamingMovies_No internet service'], axis=1)
telecom = pd.concat([telecom, dummy_8], axis=1)

In [116]:
telecom = telecom.drop(['Contract', 'PaymentMethod', 'gender', 'MultipleLines','InternetService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'], axis=1)

In [117]:
telecom.TotalCharges = telecom.TotalCharges.convert_dtypes(convert_string=True).convert_dtypes(convert_floating=True)

In [118]:
telecom.info()
telecom['TotalCharges'] = pd.to_numeric(telecom['TotalCharges'], errors='coerce')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   customerID            7043 non-null   object 
 1   tenure                7043 non-null   int64  
 2   PhoneService          7043 non-null   int64  
 3   PaperlessBilling      7043 non-null   int64  
 4   MonthlyCharges        7043 non-null   float64
 5   TotalCharges          7043 non-null   string 
 6   Churn                 7043 non-null   int64  
 7   SeniorCitizen         7043 non-null   int64  
 8   Partner               7043 non-null   int64  
 9   Dependents            7043 non-null   int64  
 10  MultipleLines_No      7043 non-null   bool   
 11  MultipleLines_Yes     7043 non-null   bool   
 12  OnlineSecurity_No     7043 non-null   bool   
 13  OnlineSecurity_Yes    7043 non-null   bool   
 14  OnlineBackup_No       7043 non-null   bool   
 15  OnlineBackup_Yes     

In [119]:
num_telecom = telecom[['tenure', 'MonthlyCharges', 'SeniorCitizen', 'TotalCharges']]
num_telecom.describe(percentiles=[.25,.5,.75,.90,.95,.99])

Unnamed: 0,tenure,MonthlyCharges,SeniorCitizen,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,32.371149,64.761692,0.162147,2283.300441
std,24.559481,30.090047,0.368612,2266.771362
min,0.0,18.25,0.0,18.8
25%,9.0,35.5,0.0,401.45
50%,29.0,70.35,0.0,1397.475
75%,55.0,89.85,0.0,3794.7375
90%,69.0,102.6,1.0,5976.64
95%,72.0,107.4,1.0,6923.59
99%,72.0,114.729,1.0,8039.883


In [124]:
telecom = telecom[~np.isnan(telecom['TotalCharges'])]

In [125]:
X = telecom.drop(['Churn', 'customerID'], axis=1)
y = telecom['Churn']

X_test, X_train, y_test, y_train = train_test_split(X, y, train_size=0.7, random_state=100)

In [127]:
scaler = MinMaxScaler()
X_train[['tenure', 'MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(X_train[['tenure', 'MonthlyCharges', 'TotalCharges']])
X_train.head()

Unnamed: 0,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,SeniorCitizen,Partner,Dependents,MultipleLines_No,MultipleLines_Yes,...,OnlineBackup_No,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_Yes,TechSupport_No,TechSupport_Yes,StreamingTV_No,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_Yes
942,0.323944,1,1,0.611612,0.231144,0,0,0,True,False,...,True,False,True,False,True,False,True,False,False,True
3730,0.788732,1,1,0.858358,0.688296,0,1,0,False,True,...,False,True,False,True,True,False,False,True,False,True
1761,0.802817,1,1,0.057057,0.174309,0,0,0,False,True,...,False,False,False,False,False,False,False,False,False,False
2283,0.0,1,1,0.528529,0.006151,0,0,0,True,False,...,True,False,True,False,True,False,True,False,True,False
1872,0.56338,1,0,0.00951,0.091591,0,1,1,True,False,...,False,False,False,False,False,False,False,False,False,False
