In [2]:
import pandas as pd

In [3]:
pd.__version__

'2.2.3'

## Data Preparation

In [101]:
df = pd.read_csv("dataset.csv")
SizeDF = len(df)
SizeDF

7043

In [102]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [104]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [105]:
df.gender.str.lower().str.replace(' ', '_')

0       female
1         male
2         male
3         male
4       female
         ...  
7038      male
7039    female
7040    female
7041      male
7042      male
Name: gender, Length: 7043, dtype: object

In [106]:
ColsValsToModify = list(df.dtypes[df.dtypes == 'object'].index)
for c in ColsValsToModify:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [107]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [108]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [109]:
df.totalcharges

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: totalcharges, Length: 7043, dtype: object

In [110]:
TC = pd.to_numeric(df.totalcharges, errors="coerce")
TC

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: totalcharges, Length: 7043, dtype: float64

In [111]:
df[["customerid", "totalcharges"]][TC.isnull()]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [112]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors = "coerce")

In [113]:
df.totalcharges = df.totalcharges.fillna(0)

In [114]:
df.totalcharges.dtypes

dtype('float64')

In [115]:
df.totalcharges.isnull().sum()

np.int64(0)

In [116]:
df.churn = (df.churn == 'yes').astype(int)

In [117]:
df[['totalcharges', 'churn']].head(8)

Unnamed: 0,totalcharges,churn
0,29.85,0
1,1889.5,0
2,108.15,1
3,1840.75,0
4,151.65,1
5,820.5,1
6,1949.4,0
7,301.9,0


In [118]:
df.head(5).T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [119]:
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

## Set up validation framework

In [120]:
from sklearn.model_selection import train_test_split

In [122]:
FullTrainDF, TestDF = train_test_split(df, test_size=0.2, random_state=1)

In [123]:
len(FullTrainDF) + len(TestDF) == SizeDF

True

In [124]:
# the test_size is 0.25 here instead of 0.2 since 20% of the full dataset (df) is 25% of 80% of the dataset (FullTrainDF)
TrainDF, ValDF = train_test_split(FullTrainDF, test_size=0.25, random_state=1)  

In [125]:
len(TrainDF), len(ValDF), len(TestDF)

(4225, 1409, 1409)

In [126]:
len(TrainDF) + len(ValDF) + len(TestDF) == SizeDF

True

In [127]:
# since the indices were shuffled, you can either reset them to make it ordered or delete them as well, doesn't matter
TrainDF = TrainDF.reset_index(drop=True)
ValDF = ValDF.reset_index(drop=True)
TestDF = TestDF.reset_index(drop=True)

In [128]:
TrainDF.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15,0
1,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,...,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55,0
2,9250-wypll,female,0,no,no,5,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,no,electronic_check,75.55,413.65,1
3,6786-obwqr,female,0,yes,yes,5,yes,no,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,electronic_check,80.85,356.1,0
4,1328-euzhc,female,0,yes,no,18,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,20.1,370.5,0


In [141]:
# storing y values as numpy array values
yTrain = TrainDF.churn.values
yVal = ValDF.churn.values
yTest = TestDF.churn.values

# deleting y values from dataset
del TrainDF['churn']
del ValDF['churn']
del TestDF['churn']

AttributeError: 'DataFrame' object has no attribute 'churn'

In [142]:
# validating deletion of 'y value' column
"churn" in TrainDF, "churn" in ValDF, "churn" in TestDF

(False, False, False)

## EDA

In [166]:
FullTrainDF = FullTrainDF.reset_index(drop=True)

In [167]:
# checking for any null values in the dataset
FullTrainDF.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [175]:
# checking churn rate (i.e rate of counts of 1 in the datset)
GlobChurnRate = FullTrainDF['churn'].value_counts(normalize=True)[1]
# GlobChurnRate = FullTrainDF.churn.mean()
# can also be checked using mean of the value since it only returns number of 1s

In [178]:
print("Global churn rate:", round(GlobChurnRate, 2))

Global churn rate: 0.27


In [179]:
FullTrainDF.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [181]:
# numerical value columns
NumValues = ["tenure", "monthlycharges", "totalcharges"]

In [182]:
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [183]:
# categorical value columns
CategoricalValues = ['gender', 'seniorcitizen', 'partner', 'dependents',
       'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod']

In [187]:
# no. of unique values in each categorical column
FullTrainDF[CategoricalValues].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64