In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
# Load the raw Telco CSV; the path is relative to the notebook's working directory.
data = '../data/telco.csv'
df = pd.read_csv(data)

In [3]:
df.shape

(7043, 21)

In [4]:
df.head(2)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [7]:
df.isnull().sum().sum()

0

#### Data Preparation

In [8]:
# errors='coerce' turns every entry that cannot be parsed as a number into NaN
# instead of raising, so the column becomes numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [9]:
df.TotalCharges.isnull().sum()

11

In [10]:
# replace the NaNs left by the numeric coercion with 0
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [11]:
# lower-case the column names and use underscores instead of spaces
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [12]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [13]:
# get the string columns
df.dtypes[df.dtypes == 'object']

customerid          object
gender              object
partner             object
dependents          object
phoneservice        object
multiplelines       object
internetservice     object
onlinesecurity      object
onlinebackup        object
deviceprotection    object
techsupport         object
streamingtv         object
streamingmovies     object
contract            object
paperlessbilling    object
paymentmethod       object
churn               object
dtype: object

In [14]:
# Drop the customer identifier column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone is a no-op instead of a KeyError.
df = df.drop(columns='customerid', errors='ignore')

In [15]:
# names of the columns whose dtype is 'object' (the string columns)
string_cols = df.columns[(df.dtypes == 'object').to_numpy()]

In [16]:
# normalise the values of every string column: lower case, spaces -> underscores
df[string_cols] = df[string_cols].apply(
    lambda values: values.str.lower().str.replace(' ', '_')
)

In [17]:
df.duplicated().sum()

22

In [18]:
# from yes/no to 1/0
# Only encode while the column still holds the 'yes'/'no' strings. Without this
# guard, re-running the cell would compare the already-encoded 0/1 values with
# 'yes' and silently turn the whole target into zeros.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype('uint8')

#### Split data with `train_test_split`

In [19]:
# hold out 20% of the rows as the test set
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
# 0.25 of the remaining 80% is 20% of the full data -> 60/20/20 train/val/test
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

In [20]:
df_train.shape[0], df_val.shape[0], df_test.shape[0]

(4225, 1409, 1409)

In [21]:
# pull the target out of each split as a NumPy array
y_train, y_val, y_test = (
    part['churn'].to_numpy() for part in (df_train, df_val, df_test)
)

In [22]:
# Remove the target from the feature frames.
# errors='ignore' makes the cell safe to re-run: a second execution is a no-op
# instead of raising KeyError because 'churn' is already gone.
df_train, df_val, df_test = (
    part.drop(columns='churn', errors='ignore')
    for part in (df_train, df_val, df_test)
)

#### EDA

In [23]:
# double check for nulls
df_train_full.isnull().sum().sum()

0

In [24]:
df_train_full.churn.value_counts(normalize=True).round(2)

0    0.73
1    0.27
Name: churn, dtype: float64

In [25]:
# overall share of churned customers in the train+validation data, to 2 decimals
churn_rate = np.round(df_train_full['churn'].mean(), 2)
churn_rate

0.27

In [26]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']
# use df_train.columns because the target variable was already dropped from that frame
categorical = [name for name in df_train.columns if name not in numerical]
categorical

['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [27]:
df_train_full[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [28]:
# NOTE(review): `vc` is never read in this cell - confirm nothing later uses it
# before removing it
vc = pd.DataFrame(columns=[])
# show the share of each category, one small table per categorical feature
for col in categorical:
    shares = df_train_full[col].value_counts(normalize=True)
    display(shares.round(2).to_frame())

Unnamed: 0,gender
male,0.51
female,0.49


Unnamed: 0,seniorcitizen
0,0.84
1,0.16


Unnamed: 0,partner
no,0.52
yes,0.48


Unnamed: 0,dependents
no,0.7
yes,0.3


Unnamed: 0,phoneservice
yes,0.9
no,0.1


Unnamed: 0,multiplelines
no,0.48
yes,0.43
no_phone_service,0.1


Unnamed: 0,internetservice
fiber_optic,0.44
dsl,0.34
no,0.22


Unnamed: 0,onlinesecurity
no,0.5
yes,0.29
no_internet_service,0.22


Unnamed: 0,onlinebackup
no,0.43
yes,0.35
no_internet_service,0.22


Unnamed: 0,deviceprotection
no,0.44
yes,0.34
no_internet_service,0.22


Unnamed: 0,techsupport
no,0.5
yes,0.29
no_internet_service,0.22


Unnamed: 0,streamingtv
no,0.4
yes,0.39
no_internet_service,0.22


Unnamed: 0,streamingmovies
no,0.39
yes,0.39
no_internet_service,0.22


Unnamed: 0,contract
month-to-month,0.55
two_year,0.24
one_year,0.22


Unnamed: 0,paperlessbilling
yes,0.59
no,0.41


Unnamed: 0,paymentmethod
electronic_check,0.33
mailed_check,0.23
bank_transfer_(automatic),0.22
credit_card_(automatic),0.22


In [29]:
def get_telco_data(path='../data/telco.csv'):
    """Load the Telco churn CSV and apply the notebook's cleaning steps.

    Steps, in order:
      1. read the CSV at ``path``;
      2. convert ``TotalCharges`` to numeric, turning unparseable entries into 0;
      3. lower-case the column names and replace spaces with underscores;
      4. drop the ``customerid`` column;
      5. lower-case the values of every string column and replace spaces with
         underscores;
      6. encode ``churn`` as uint8: 1 for ``'yes'``, 0 for anything else.

    Parameters
    ----------
    path : str or path-like, default '../data/telco.csv'
        Location of the CSV file. The default is the path this function
        originally had hard-coded, so existing zero-argument calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        A freshly loaded, cleaned frame (a new object on every call).

    Raises
    ------
    KeyError
        If the file lacks the ``TotalCharges``, ``customerID`` or ``Churn`` column.
    """
    df = pd.read_csv(path)
    # errors='coerce' turns unparseable entries into NaN; those become 0
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
    # rename columns; regex=False states that the space is a literal, not a pattern
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
    # drop customer id
    df = df.drop(columns='customerid')
    # Detect string columns via the dtype helper rather than `== 'object'`, so
    # columns read with a dedicated string dtype are normalised as well.
    string_cols = [
        name for name in df.columns
        if pd.api.types.is_string_dtype(df[name].dtype)
    ]
    # bring all values in string columns to lower case
    for col in string_cols:
        df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)
    # from yes/no to 1/0
    df['churn'] = (df['churn'] == 'yes').astype('uint8')

    return df
    

In [31]:
get_telco_data().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   seniorcitizen     7043 non-null   int64  
 2   partner           7043 non-null   object 
 3   dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   phoneservice      7043 non-null   object 
 6   multiplelines     7043 non-null   object 
 7   internetservice   7043 non-null   object 
 8   onlinesecurity    7043 non-null   object 
 9   onlinebackup      7043 non-null   object 
 10  deviceprotection  7043 non-null   object 
 11  techsupport       7043 non-null   object 
 12  streamingtv       7043 non-null   object 
 13  streamingmovies   7043 non-null   object 
 14  contract          7043 non-null   object 
 15  paperlessbilling  7043 non-null   object 
 16  paymentmethod     7043 non-null   object 


In [32]:
def split_telco_data(df=None, seed=42, explore=True):
    """Split the Telco data into train/validation/test parts.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Cleaned data containing a ``churn`` column. When omitted, the data is
        loaded with ``get_telco_data()`` at call time. (A default of
        ``df=get_telco_data()`` would be evaluated only once, when the ``def``
        statement runs, reading the file at definition time and reusing that
        one frame for every later call.)
    seed : int, default 42
        ``random_state`` passed to both ``train_test_split`` calls.
    explore : bool, default True
        If True, return only the combined train+validation frame (80% of the
        rows, target column included). If False, return the full split.

    Returns
    -------
    pandas.DataFrame
        ``df_train_full`` when ``explore`` is True.
    tuple
        ``(df_train, df_val, df_test, y_train, y_val, y_test)`` when ``explore``
        is False; the frames have ``churn`` removed and the ``y_*`` values are
        the corresponding ``churn`` arrays.
    """
    if df is None:
        df = get_telco_data()

    # hold out 20% of the rows as the test set
    df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=seed)
    if explore:
        return df_train_full

    # 0.25 of the remaining 80% is 20% of the full data -> 60/20/20 split
    df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=seed)
    # get y arrays
    y_train = df_train['churn'].values
    y_val = df_val['churn'].values
    y_test = df_test['churn'].values
    # remove the target from the feature frames
    df_train = df_train.drop(columns='churn')
    df_val = df_val.drop(columns='churn')
    df_test = df_test.drop(columns='churn')

    return df_train, df_val, df_test, y_train, y_val, y_test

In [34]:
split_telco_data().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5634 entries, 2142 to 860
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5634 non-null   object 
 1   seniorcitizen     5634 non-null   int64  
 2   partner           5634 non-null   object 
 3   dependents        5634 non-null   object 
 4   tenure            5634 non-null   int64  
 5   phoneservice      5634 non-null   object 
 6   multiplelines     5634 non-null   object 
 7   internetservice   5634 non-null   object 
 8   onlinesecurity    5634 non-null   object 
 9   onlinebackup      5634 non-null   object 
 10  deviceprotection  5634 non-null   object 
 11  techsupport       5634 non-null   object 
 12  streamingtv       5634 non-null   object 
 13  streamingmovies   5634 non-null   object 
 14  contract          5634 non-null   object 
 15  paperlessbilling  5634 non-null   object 
 16  paymentmethod     5634 non-null   object