In [2]:
#%matplotlib inline
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import warnings
import scipy.stats as stats
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
from sklearn import datasets, linear_model

In [75]:
df = pd.read_csv('Telco_Customer_Churn.csv')

In [4]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
column_desc = {'customerID':'Customer ID',
'gender':'Whether the customer is a male or a female',
'SeniorCitizen':'Whether the customer is a senior citizen or not (1, 0)',
'Partner':'Whether the customer has a partner or not (Yes, No)',
'Dependents':'Whether the customer has dependents or not (Yes, No)',
'tenure':'Number of months the customer has stayed with the company',
'PhoneService':'Whether the customer has a phone service or not (Yes, No)',
'MultipleLines':'Whether the customer has multiple lines or not (Yes, No, No phone service)',
'InternetService':'Customer’s internet service provider (DSL, Fiber optic, No)',
'OnlineSecurity':'Whether the customer has online security or not (Yes, No, No internet service)',
'OnlineBackup':'Whether the customer has online backup or not (Yes, No, No internet service)',
'DeviceProtection':'Whether the customer has device protection or not (Yes, No, No internet service)',
'TechSupport':'Whether the customer has tech support or not (Yes, No, No internet service)',
'StreamingTV':'Whether the customer has streaming TV or not (Yes, No, No internet service)',
'StreamingMovies':'Whether the customer has streaming movies or not (Yes, No, No internet service)',
'Contract':'The contract term of the customer (Month-to-month, One year, Two year)',
'PaperlessBilling':'Whether the customer has paperless billing or not (Yes, No)',
'PaymentMethod':'The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))',
'MonthlyCharges':'The amount charged to the customer monthly',
'TotalCharges':'The total amount charged to the customer',
'Churn':'Whether the customer churned or not (Yes or No)',}

In [6]:
column_info = pd.DataFrame.from_dict([column_desc])

In [7]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Data Preprocessing

In [8]:
print ("Rows     : " ,df.shape[0])
print ("Columns  : " ,df.shape[1])
print ("\nFeatures : \n" ,df.columns.tolist())
print ("\nMissing values :  ", df.isnull().sum().values.sum())
print ("\nUnique values :  \n",df.nunique())

Rows     :  7043
Columns  :  21

Features : 
 ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :   0

Unique values :  
 customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


In [9]:
df.dtypes # check the data types in the data

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [63]:
df.gender.unique()

array(['Female', 'Male'], dtype=object)

In [10]:
# TotalCharges should be numerical

In [61]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
#df.dtypes

In [71]:
df.Partner.unique()

array(['Yes', 'No'], dtype=object)

In [None]:
#df.isnull().sum()

In [30]:
from missingpy import MissForest

In [31]:
type(df['TotalCharges'])

pandas.core.series.Series

In [56]:
X = df[['TotalCharges']]

In [57]:
for col in X.columns:
    print((col, sum(X[col].isnull())))

('TotalCharges', 11)


In [62]:
imputer = MissForest(max_iter=10)
X_imputed = imputer.fit_transform(X)

ValueError: Found array with 0 feature(s) (shape=(7032, 0)) while a minimum of 1 is required.

In [54]:
X_imputed

array([[  29.85,   29.85],
       [1889.5 ,   56.95],
       [ 108.15,   53.85],
       ...,
       [ 346.45,   29.6 ],
       [ 306.6 ,   74.4 ],
       [6844.5 ,  105.65]])

In [59]:
df['MonthlyCharges']

0        29.85
1        56.95
2        53.85
3        42.30
4        70.70
5        99.65
6        89.10
7        29.75
8       104.80
9        56.15
10       49.95
11       18.95
12      100.35
13      103.70
14      105.50
15      113.25
16       20.65
17      106.70
18       55.20
19       90.05
20       39.65
21       19.80
22       20.15
23       59.90
24       59.60
25       55.30
26       99.35
27       30.20
28       90.25
29       64.70
         ...  
7013     93.40
7014     89.20
7015     85.20
7016     49.95
7017     20.65
7018     70.65
7019     20.15
7020     19.20
7021     59.80
7022    104.95
7023    103.50
7024     84.80
7025     95.05
7026     44.20
7027     73.35
7028     64.10
7029     44.40
7030     20.05
7031     60.00
7032     75.75
7033     69.50
7034    102.95
7035     78.70
7036     60.65
7037     21.15
7038     84.80
7039    103.20
7040     29.60
7041     74.40
7042    105.65
Name: MonthlyCharges, Length: 7043, dtype: float64

In [12]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7032.0
mean,0.162147,32.371149,64.761692,2283.300441
std,0.368612,24.559481,30.090047,2266.771362
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3794.7375
max,1.0,72.0,118.75,8684.8


### Check Different Group for Class Imbalance

In [13]:
Tran_df = df.copy()
Tran_df['SeniorCitizen'].replace({1 : 'Yes', 0 : 'No'}, inplace=True)

In [14]:
def groups(group):
    num = []
    gname = []
    for name in list(Tran_df[group].unique()):
        num.append(len(name))
        gname.append(name)
    # Pie Chart
    group_unique = []
    plt.figure(figsize=(10, 8))
    plt.pie(num, labels=list(Tran_df[group].unique()), autopct="%1d%%")
    plt.axis('equal')
    plt.title('Data Set Based on {}'.format(group))
    plt.show()
    
    # Bar Chart
    ax = plt.subplot()
    plt.bar(range(len(num)), num, color = 'green')
    ax.set_xticks(range(0, len(num)))
    ax.set_xticklabels(gname)
    plt.title('Data Set Based on {}'.format(group))
    plt.xlabel('{}'.format(group))
    plt.ylabel('Count of {}'.format(group))
    plt.show()

In [15]:
interact(groups, group=['gender', 'SeniorCitizen', 'Partner', 'Churn', 'Dependents']);

interactive(children=(Dropdown(description='group', options=('gender', 'SeniorCitizen', 'Partner', 'Churn', 'D…

### Group by Senior Citizen and Check the number of Senior Citizen that are Female

In [16]:
seniors = Tran_df.groupby('SeniorCitizen').get_group('Yes')

In [17]:
senior_gender = seniors.groupby('gender')
Male_Senior_Citizen = senior_gender.get_group('Male')
Female_Senior_Citizen = senior_gender.get_group('Female')

### Explore Data about Senior Citizens

In [18]:
def senior_groups(group):
    snum = []
    sname = []
    for name in list(seniors[group].unique()):
        snum.append(len(name))
        sname.append(name)
    # Pie Chart
    group_unique = []
    plt.figure(figsize=(10, 8))
    plt.pie(snum, labels=list(seniors[group].unique()), autopct="%1d%%")
    plt.axis('equal')
    plt.title('Data Set Based on {}'.format(group))
    plt.show()
    
    # Bar Chart
    ax = plt.subplot()
    plt.bar(range(len(snum)), snum, color = 'green')
    ax.set_xticks(range(0, len(snum)))
    ax.set_xticklabels(sname)
    plt.title('Data Set Based on {}'.format(group))
    plt.xlabel('{}'.format(group))
    plt.ylabel('Count of {}'.format(group))
    plt.show()

In [19]:
interact(senior_groups, group=['gender', 'Partner', 'Churn', 'Dependents']);

interactive(children=(Dropdown(description='group', options=('gender', 'Partner', 'Churn', 'Dependents'), valu…

In [20]:
from pivottablejs import pivot_ui
pivot_ui(df)

### Data Preprocessing

In [76]:
# Rename the gender categories
df['gender'].replace({'Female':0,'Male':1}, inplace=True)

# Rename the Partner column
df['Partner'].replace({'No':0,'Yes':1}, inplace=True)

# Rename the Dependent column
df['Dependents'].replace({'No':0,'Yes':1}, inplace=True)

In [104]:
# Rename the Dependent column
df['PaperlessBilling'].replace({'No':0,'Yes':1}, inplace=True)

In [107]:
df['Churn'].replace({'No':0,'Yes':1}, inplace=True)

In [82]:
# Rename the Dependent column
df['PhoneService'].replace({'No':0,'Yes':1}, inplace=True)

In [86]:
# Rename the Dependent column
df['MultipleLines'].replace({'No phone service':0,'No':1, 'Yes': 2}, inplace=True)

In [88]:
df['InternetService'].replace({'No':0,'DSL':1, 'Fiber optic': 2}, inplace=True)

In [91]:
df['OnlineSecurity'].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)

In [93]:
df['OnlineBackup'].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)

In [95]:
df['DeviceProtection'].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)

In [97]:
df['TechSupport'].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)

In [99]:
df['StreamingTV'].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)

In [101]:
df['StreamingMovies'].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)

In [106]:
df.Churn.unique()

array(['No', 'Yes'], dtype=object)

In [84]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')