In [1]:
# Import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Wrangling

In [37]:
# Import dataset

data = pd.read_csv('data_1.csv')
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [38]:
data.shape

(7043, 21)

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [40]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [41]:
data.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Data Cleaning

In [42]:
# Replace yes and no to 1 and o in the Churn column

data['Churn'] = data['Churn'].apply(lambda x:1 if x=='Yes' else 0)
data['Churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [43]:
# Handle missing values and change the data type for the TotalCharges column

data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan).astype(float)
data['TotalCharges'].dtypes

dtype('float64')

In [44]:
# Drop null values (just in case)

data.dropna(inplace=True)

In [45]:
# Normalize and transform continuous variables

data['MonthlyCharges'] = np.log(data['MonthlyCharges'])
data['MonthlyCharges'] = (data['MonthlyCharges']-data['MonthlyCharges'].mean())/data['MonthlyCharges'].std()
data['TotalCharges'] = np.log(data['TotalCharges'])
data['TotalCharges'] = (data['TotalCharges']-data['TotalCharges'].mean())/data['TotalCharges'].std()
data['tenure'] = (data['tenure']-data['tenure'].mean())/data['tenure'].std()

In [46]:
data['MonthlyCharges']

0      -1.054244
1       0.032896
2      -0.061298
3      -0.467578
4       0.396862
          ...   
7038    0.702899
7039    1.033378
7040   -1.068398
7041    0.482708
7042    1.072863
Name: MonthlyCharges, Length: 7032, dtype: float64

In [47]:
data['TotalCharges']

0      -2.281382
1       0.389269
2      -1.452520
3       0.372439
4      -1.234860
          ...   
7038    0.422797
7039    1.265008
7040   -0.702928
7041   -0.781604
7042    1.218001
Name: TotalCharges, Length: 7032, dtype: float64

In [48]:
data['tenure']

0      -1.280157
1       0.064298
2      -1.239416
3       0.512450
4      -1.239416
          ...   
7038   -0.343113
7039    1.612459
7040   -0.872746
7041   -1.157934
7042    1.368012
Name: tenure, Length: 7032, dtype: float64

In [49]:
# Convert Boolean values with one-hot encoding

data['SeniorCitizen'].loc[data.SeniorCitizen == 1] = 'Yes'
data['SeniorCitizen'].loc[data.SeniorCitizen == 0] = 'No'

data['SeniorCitizen']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['SeniorCitizen'].loc[data.SeniorCitizen == 1] = 'Yes'
  data['SeniorCitizen'].loc[data.SeniorCitizen == 1] = 'Yes'


0        No
1        No
2        No
3        No
4        No
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: SeniorCitizen, Length: 7032, dtype: object

In [50]:
data_updated = data[['tenure','MonthlyCharges','TotalCharges','Churn']].copy(deep=True)

for col in list(data.columns):
    if col not in ['tenure','MonthlyCharges','TotalCharges','Churn'] and data[col].nunique() < 5:
        dummy_vars = pd.get_dummies(data[col])
        dummy_vars.columns = [col + '_' + str(x) for x in dummy_vars.columns]
        data_updated = pd.concat([data_updated, dummy_vars], axis=1)

data_updated

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,SeniorCitizen_No,SeniorCitizen_Yes,Partner_No,Partner_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,-1.280157,-1.054244,-2.281382,0,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False
1,0.064298,0.032896,0.389269,0,False,True,True,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,-1.239416,-0.061298,-1.452520,1,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,0.512450,-0.467578,0.372439,0,False,True,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,-1.239416,0.396862,-1.234860,1,True,False,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,-0.343113,0.702899,0.422797,0,False,True,True,False,False,True,...,True,False,True,False,False,True,False,False,False,True
7039,1.612459,1.033378,1.265008,0,True,False,True,False,False,True,...,True,False,True,False,False,True,False,True,False,False
7040,-0.872746,-1.068398,-0.702928,0,True,False,True,False,False,True,...,False,True,False,False,False,True,False,False,True,False
7041,-1.157934,0.482708,-0.781604,1,False,True,False,True,False,True,...,False,True,False,False,False,True,False,False,False,True


## Multilayer Perceptron Model (MLP)

In [51]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.model_selection import train_test_split

In [52]:
# Define independent and dependent variables

target = 'Churn'
features = [x for x in list(data_updated.columns) if x != target]

In [53]:
# Build model

model = Sequential()
model.add(Dense(16, input_dim = len(features), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


# Compile model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# Split and train dataset

X_train, X_test, y_train, y_test = train_test_split(data_updated[features], data_updated[target], test_size=0.2, random_state=23)

history = model.fit(X_train, y_train, epochs=50, batch_size=100)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).