<div class="alert alert-block alert-info">
    
# <font color=red>TELCO CUSTOMER CHURN PREDICTION</font>

### USING LOGISTIC REGRESSION ANALYSIS

In [79]:
#%matplotlib inline
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import warnings
import scipy.stats as stats
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
from sklearn import datasets, linear_model

In [90]:
df = pd.read_csv('Telco_Customer_Churn.csv')

In [81]:
# df.columns

In [96]:
column_desc = {'CustomerID':'Customer ID',
'Gender':'Whether the customer is a male or a female',
'SeniorCitizen':'Whether the customer is a senior citizen or not (1, 0)',
'Partner':'Whether the customer has a partner or not (Yes, No)',
'Dependents':'Whether the customer has dependents or not (Yes, No)',
'Tenure':'Number of months the customer has stayed with the company',
'PhoneService':'Whether the customer has a phone service or not (Yes, No)',
'MultipleLines':'Whether the customer has multiple lines or not (Yes, No, No phone service)',
'InternetService':'Customer’s internet service provider (DSL, Fiber optic, No)',
'OnlineSecurity':'Whether the customer has online security or not (Yes, No, No internet service)',
'OnlineBackup':'Whether the customer has online backup or not (Yes, No, No internet service)',
'DeviceProtection':'Whether the customer has device protection or not (Yes, No, No internet service)',
'TechSupport':'Whether the customer has tech support or not (Yes, No, No internet service)',
'StreamingTV':'Whether the customer has streaming TV or not (Yes, No, No internet service)',
'StreamingMovies':'Whether the customer has streaming movies or not (Yes, No, No internet service)',
'Contract':'The contract term of the customer (Month-to-month, One year, Two year)',
'PaperlessBilling':'Whether the customer has paperless billing or not (Yes, No)',
'PaymentMethod':'The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))',
'MonthlyCharges':'The amount charged to the customer monthly',
'TotalCharges':'The total amount charged to the customer',
'Churn':'Whether the customer churned or not (Yes or No)',}

In [97]:
column_info = pd.DataFrame.from_dict([column_desc])

In [98]:
#column_info

In [99]:
# Renaming the 3 columns.
df = df.rename(columns={'customerID' : 'CustomerID' , 'gender': 'Gender', 'tenure':'Tenure'})

## Data Preprocessing

In [86]:
print ("Rows     : " ,df.shape[0])
print ("Columns  : " ,df.shape[1])
print ("\nFeatures : \n" ,df.columns.tolist())
print ("\nMissing values :  ", df.isnull().sum().values.sum())
print ("\nUnique values :  \n",df.nunique())

Rows     :  7043
Columns  :  21

Features : 
 ['CustomerID', 'Gender', 'SeniorCitizen', 'Partner', 'Dependents', 'Tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values :   0

Unique values :  
 CustomerID          7043
Gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
Tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64


In [87]:
#df.dtypes # Check the data types of each features

In [91]:
# Check for Missing Data
for col in df.columns:
    print((col, sum(df[col].isnull())))

('customerID', 0)
('gender', 0)
('SeniorCitizen', 0)
('Partner', 0)
('Dependents', 0)
('tenure', 0)
('PhoneService', 0)
('MultipleLines', 0)
('InternetService', 0)
('OnlineSecurity', 0)
('OnlineBackup', 0)
('DeviceProtection', 0)
('TechSupport', 0)
('StreamingTV', 0)
('StreamingMovies', 0)
('Contract', 0)
('PaperlessBilling', 0)
('PaymentMethod', 0)
('MonthlyCharges', 0)
('TotalCharges', 0)
('Churn', 0)


In [88]:
# Print the unique entries in the features
column_ex = ['MonthlyCharges', 'CustomerID', 'Tenure', 'TotalCharges']
for name in df.columns:
    if name not in column_ex:
        print('{} is : '.format(name), df[name].unique())

Gender is :  ['Female' 'Male']
SeniorCitizen is :  [0 1]
Partner is :  ['Yes' 'No']
Dependents is :  ['No' 'Yes']
PhoneService is :  ['No' 'Yes']
MultipleLines is :  ['No phone service' 'No' 'Yes']
InternetService is :  ['DSL' 'Fiber optic' 'No']
OnlineSecurity is :  ['No' 'Yes' 'No internet service']
OnlineBackup is :  ['Yes' 'No' 'No internet service']
DeviceProtection is :  ['No' 'Yes' 'No internet service']
TechSupport is :  ['No' 'Yes' 'No internet service']
StreamingTV is :  ['No' 'Yes' 'No internet service']
StreamingMovies is :  ['No' 'Yes' 'No internet service']
Contract is :  ['Month-to-month' 'One year' 'Two year']
PaperlessBilling is :  ['Yes' 'No']
PaymentMethod is :  ['Electronic check' 'Mailed check' 'Bank transfer (automatic)'
 'Credit card (automatic)']
Churn is :  ['No' 'Yes']


In [60]:
Tran_df = df.copy()
Tran_df['SeniorCitizen'].replace({1 : 'Yes', 0 : 'No'}, inplace=True)

In [61]:
# Quickly view the DataSet
def groups(group):
    num = []
    gname = []
    for name in list(Tran_df[group].unique()):
        num.append(len(name))
        gname.append(name)
    # Pie Chart
    group_unique = []
    plt.figure(figsize=(10, 8))
    plt.pie(num, labels=list(Tran_df[group].unique()), autopct="%1d%%")
    plt.axis('equal')
    plt.title('Data Set Based on {}'.format(group))
    plt.show()
    
    # Bar Chart
    ax = plt.subplot()
    plt.bar(range(len(num)), num, color = 'green')
    ax.set_xticks(range(0, len(num)))
    ax.set_xticklabels(gname)
    plt.title('Data Set Based on {}'.format(group))
    plt.xlabel('{}'.format(group))
    plt.ylabel('Count of {}'.format(group))
    plt.show()

In [62]:
interact(groups, group=['Gender', 'SeniorCitizen', 'Partner', 'Churn', 'Dependents']);

interactive(children=(Dropdown(description='group', options=('Gender', 'SeniorCitizen', 'Partner', 'Churn', 'D…

In [65]:
# Reshape the DataFrame
ls_3 = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

for name in ls_3:
    df[name].replace({'No internet service':0,'No':1, 'Yes': 2}, inplace=True)
    
ls_2 = ['Partner', 'Dependents', 'PaperlessBilling', 'PhoneService', 'Churn']

for name in ls_2:
    df[name].replace({'No':0,'Yes':1}, inplace=True)

df['Gender'].replace({'Female':0,'Male':1}, inplace=True)
df['MultipleLines'].replace({'No phone service':0,'No':1, 'Yes': 2}, inplace=True)
df['InternetService'].replace({'No':0,'DSL':1, 'Fiber optic': 2}, inplace=True)
df['Contract'].replace({'Month-to-month': 0, 'One year': 1, 'Two year': 2}, inplace=True)
df['PaymentMethod'].replace({'Electronic check': 0, 'Mailed check': 1, 'Bank transfer (automatic)': 2,
                             'Credit card (automatic)': 3}, inplace=True)

TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'str'

In [66]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')

In [67]:
df.dtypes

CustomerID           object
Gender               object
SeniorCitizen         int64
Partner               int64
Dependents            int64
Tenure                int64
PhoneService          int64
MultipleLines        object
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
CustomerID          7043 non-null object
Gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null int64
Dependents          7043 non-null int64
Tenure              7043 non-null int64
PhoneService        7043 non-null int64
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null int64
OnlineBackup        7043 non-null int64
DeviceProtection    7043 non-null int64
TechSupport         7043 non-null int64
StreamingTV         7043 non-null int64
StreamingMovies     7043 non-null int64
Contract            7043 non-null object
PaperlessBilling    7043 non-null int64
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7032 non-null float64
Churn               7043 non-null int64
dtypes: float64(2), int64(13), object(6)
me

In [75]:
#Identifying the rows containing missing data
missing_value_row = list(df[df['TotalCharges'] == " "].index)
print('Missing Value Rows-->', missing_value_row , '\nTotal rows-->', len(missing_value_row))

Missing Value Rows--> [] 
Total rows--> 0


In [76]:
# Replacing the spaces with 0
for missing_row in missing_value_row :
    df['TotalCharges'][missing_row] = 0

In [78]:
for col in df.columns:
    print((col, sum(df[col].isnull())))

('CustomerID', 0)
('Gender', 0)
('SeniorCitizen', 0)
('Partner', 0)
('Dependents', 0)
('Tenure', 0)
('PhoneService', 0)
('MultipleLines', 0)
('InternetService', 0)
('OnlineSecurity', 0)
('OnlineBackup', 0)
('DeviceProtection', 0)
('TechSupport', 0)
('StreamingTV', 0)
('StreamingMovies', 0)
('Contract', 0)
('PaperlessBilling', 0)
('PaymentMethod', 0)
('MonthlyCharges', 0)
('TotalCharges', 11)
('Churn', 0)


In [68]:
# Check for missing data in each columns
for col in df.columns:
    print((col, sum(df[col].isnull())))

('CustomerID', 0)
('Gender', 0)
('SeniorCitizen', 0)
('Partner', 0)
('Dependents', 0)
('Tenure', 0)
('PhoneService', 0)
('MultipleLines', 0)
('InternetService', 0)
('OnlineSecurity', 0)
('OnlineBackup', 0)
('DeviceProtection', 0)
('TechSupport', 0)
('StreamingTV', 0)
('StreamingMovies', 0)
('Contract', 0)
('PaperlessBilling', 0)
('PaymentMethod', 0)
('MonthlyCharges', 0)
('TotalCharges', 11)
('Churn', 0)


### Check Different Group for Class Imbalance

In [69]:
from pivottablejs import pivot_ui
pivot_ui(df)

In [100]:
!pip install sklearn.grid_search

Collecting sklearn.grid_search


  ERROR: Could not find a version that satisfies the requirement sklearn.grid_search (from versions: none)
ERROR: No matching distribution found for sklearn.grid_search
