In [1]:
#########Importing libraries##########
import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from PIL import  Image
import itertools
import warnings
warnings.filterwarnings("ignore")
import io

############Visualization#############
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
######################################

In [2]:
########Bring in the DATA!###########

telcom = pd.read_csv("db/telemarker_db.csv")
print(telcom.head(2))
#######Data Cleaning Process#########

#Replacing spaces with null values in total charges column
telcom['TotalCharges'] = telcom["TotalCharges"].replace(" ",np.nan)
print("Percentage of null: " + str(round(telcom['TotalCharges'].isnull().sum()/telcom['TotalCharges'].count()*100,4))+"%")

#Dropping null values from total charges column which contain .16% missing data
telcom = telcom[telcom["TotalCharges"].notnull()]
telcom = telcom.reset_index()[telcom.columns]

#convert to float type
telcom["TotalCharges"] = telcom["TotalCharges"].astype(float)
telcom["MonthlyCharges"] = telcom["MonthlyCharges"].astype(float)

#replace 'No phone service' to No
telcom["MultipleLines"] = telcom["MultipleLines"].replace("No phone service","No")

#replace 'No internet service' to No for the following columns
replace_cols = [ 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport','StreamingTV', 'StreamingMovies']
for i in replace_cols : 
    telcom[i]  = telcom[i].replace({'No internet service' : 'No'})

#Change types of columns to boolean
YesNoColumns = ["Partner","Dependents","PhoneService","MultipleLines","OnlineSecurity",
                "OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
                "PaperlessBilling","Churn"]
for i in YesNoColumns :
    telcom[i]  = telcom[i].replace({'Yes':1,'No':0})
BoolColumns = YesNoColumns.copy()
BoolColumns.append("SeniorCitizen")
telcom[BoolColumns] = telcom[BoolColumns].astype('bool')

telcom.head()

   customerID  gender       City State           Phone  SeniorCitizen Partner  \
0  7590-VHVEG  Female  Sujzojpab    CA  (461) 230-8635              0     Yes   
1  5575-GNVDE    Male     Bozuto    WY  (500) 288-7672              0      No   

  Dependents  tenure PhoneService       ...       StreamingTV StreamingMovies  \
0         No       1           No       ...                No              No   
1         No      34          Yes       ...                No              No   

         Contract PaperlessBilling     PaymentMethod MonthlyCharges  \
0  Month-to-month              Yes  Electronic check          29.85   
1        One year               No      Mailed check          56.95   

  TotalCharges Churn              Name         Address  
0        29.85    No  Margaret Gregory  1927 Gusta Way  
1       1889.5    No      Allie Valdez  525 Ahev Grove  

[2 rows x 26 columns]
Percentage of null: 0.1564%


Unnamed: 0,customerID,gender,City,State,Phone,SeniorCitizen,Partner,Dependents,tenure,PhoneService,...,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,Name,Address
0,7590-VHVEG,Female,Sujzojpab,CA,(461) 230-8635,False,True,False,1,False,...,False,False,Month-to-month,True,Electronic check,29.85,29.85,False,Margaret Gregory,1927 Gusta Way
1,5575-GNVDE,Male,Bozuto,WY,(500) 288-7672,False,False,False,34,True,...,False,False,One year,False,Mailed check,56.95,1889.5,False,Allie Valdez,525 Ahev Grove
2,3668-QPYBK,Male,Cefzizof,NY,(368) 757-6896,False,False,False,2,True,...,False,False,Month-to-month,True,Mailed check,53.85,108.15,True,Bertha Padilla,985 Utomez Extension
3,7795-CFOCW,Male,Zavictej,NV,(865) 723-2538,False,False,False,45,False,...,False,False,One year,False,Bank transfer (automatic),42.3,1840.75,False,Ian Perez,1030 Gagicu Avenue
4,9237-HQITU,Female,Socgimo,HI,(206) 668-9076,False,False,False,2,True,...,False,False,Month-to-month,True,Electronic check,70.7,151.65,True,Gilbert Murray,1905 Ukiij Path


In [3]:
#Separating churn and non churn customers
churn     = telcom[telcom["Churn"] == True]
not_churn = telcom[telcom["Churn"] == False]

#Separating catagorical and numerical columns
Id_col     = ['customerID']
target_col = ["Churn"]
cat_cols   = telcom.nunique()[telcom.nunique() < 6].keys().tolist()
cat_cols   = [x for x in cat_cols if x not in target_col]
pers_cols =  ["City","State","Phone","Name","Address"]
num_cols   = [x for x in telcom.columns if x not in cat_cols + target_col + Id_col + pers_cols]
num_cols

['tenure', 'MonthlyCharges', 'TotalCharges']

In [4]:
#labels
lab = telcom["Churn"].value_counts().keys().tolist()
lab = ["Churn" if x else "Not Churn" for x in lab]
#values
val = telcom["Churn"].value_counts().values.tolist()

trace = go.Pie(labels = lab ,
               values = val ,
               marker = dict(colors =  [ 'royalblue' ,'lime'],
                             line = dict(color = "white",
                                         width =  1.3)
                            ),
               rotation = 90,
               hoverinfo = "label+value+text",
               hole = .5
              )
layout = go.Layout(dict(title = "Customer attrition in data",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                       )
                  )

data = [trace]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)

In [5]:
def plot_pie(column) :
    
    trace1 = go.Pie(values  = churn[column].value_counts().values.tolist(),
                    labels  = churn[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    domain  = dict(x = [0,.48]),
                    name    = "Churn Customers",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    hole    = .6
                   )
    trace2 = go.Pie(values  = not_churn[column].value_counts().values.tolist(),
                    labels  = not_churn[column].value_counts().keys().tolist(),
                    hoverinfo = "label+percent+name",
                    marker  = dict(line = dict(width = 2,
                                               color = "rgb(243,243,243)")
                                  ),
                    domain  = dict(x = [.52,1]),
                    hole    = .6,
                    name    = "Non churn customers" 
                   )


    layout = go.Layout(dict(title = column + " distribution in customer attrition ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            annotations = [dict(text = "churn customers",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .15, y = .5),
                                           dict(text = "Non churn customers",
                                                font = dict(size = 13),
                                                showarrow = False,
                                                x = .88,y = .5
                                               )
                                          ]
                           )
                      )
    data = [trace1,trace2]
    fig  = go.Figure(data = data,layout = layout)
    py.iplot(fig)
    
for i in cat_cols :
    plot_pie(i)    

In [6]:
def histogram(column) :
    trace1 = go.Histogram(x  = churn[column],
                          histnorm= "percent",
                          name = "Churn Customers",
                          marker = dict(line = dict(width = .5,
                                                    color = "black"
                                                    )
                                        ),
                         opacity = .9 
                         ) 
    
    trace2 = go.Histogram(x  = not_churn[column],
                          histnorm = "percent",
                          name = "Non churn customers",
                          marker = dict(line = dict(width = .5,
                                              color = "black"
                                             )
                                 ),
                          opacity = .9
                         )
    
    data = [trace1,trace2]
    layout = go.Layout(dict(title =column + " distribution in customer attrition ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                             title = column,
                                             zerolinewidth=1,
                                             ticklen=5,
                                             gridwidth=2
                                            ),
                            yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                             title = "percent",
                                             zerolinewidth=1,
                                             ticklen=5,
                                             gridwidth=2
                                            ),
                           )
                      )
    fig  = go.Figure(data=data,layout=layout)
    
    py.iplot(fig)

    
for i in num_cols :
    histogram(i)

In [7]:
#correlation
correlation = telcom.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"
                                    ) ,
                  )

layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)