#### Loading data sources

In [432]:
!pip install imblearn
!pip install imbalanced-learn
!pip install patsy

Looking in indexes: https://pypi.python.org/simple, https://cognite.jfrog.io/cognite/api/pypi/snakepit/simple
Looking in indexes: https://pypi.python.org/simple, https://cognite.jfrog.io/cognite/api/pypi/snakepit/simple
Looking in indexes: https://pypi.python.org/simple, https://cognite.jfrog.io/cognite/api/pypi/snakepit/simple


In [433]:
# This Jupyter Notebook does feature engineering joining all features in all tables.

import matplotlib.pyplot as plt
import pandas as pd
import seaborn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from datetime import datetime
from imblearn.over_sampling import SMOTE
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report,roc_curve,roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Shared encoder instances reused by the encoding cells below; every
# fit_transform call refits them on the column that is passed in.
le = LabelEncoder()
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the old `sparse` keyword is deprecated there and removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    # Older scikit-learn releases only know the original `sparse` keyword.
    ohe = OneHotEncoder(sparse=False)


#### Parameters

#### Load the dataset

In [434]:
# Read the Telco customer-churn dataset from the local data/ directory.
df_customer_raw = pd.read_csv('data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [435]:
# Summarize the raw frame: column names, non-null counts and dtypes.
df_customer_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [436]:
# Work on an independent copy: a plain assignment would only alias the raw
# frame, so any in-place edit of df_customer would also alter df_customer_raw.
df_customer = df_customer_raw.copy()

In [437]:
# Preview the first rows of the working frame.
df_customer.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [438]:
# List the distinct values of `gender` before encoding it.
df_customer['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [439]:
# One-hot encode `gender`: one indicator column per category.
encoded_data = ohe.fit_transform(df_customer[['gender']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe.get_feature_names_out(['gender']),
    # Reuse the source index: concat(axis=1) aligns rows on index labels, so a
    # fresh 0..n-1 index would yield NaN rows if df_customer were re-indexed.
    index=df_customer.index,
)
# Replace the raw column with its indicators without mutating in place; the
# resulting column order is the same as concat-then-drop.
df_customer = pd.concat([df_customer.drop(columns=['gender']), encoded_df], axis=1)
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [440]:
# List the distinct values of `Partner` before encoding it.
df_customer['Partner'].unique()

array(['Yes', 'No'], dtype=object)

In [441]:
# Turn the Yes/No `Partner` flag into integer codes. LabelEncoder numbers the
# classes in sorted order, so 'No' becomes 0 and 'Yes' becomes 1.
partner_codes = le.fit_transform(df_customer['Partner'])
df_customer['Partner'] = partner_codes
# Confirm that only the two integer codes remain.
df_customer['Partner'].unique()

array([1, 0])

In [442]:
# List the distinct values of `Dependents` before encoding it.
df_customer['Dependents'].unique()

array(['No', 'Yes'], dtype=object)

In [443]:
# Turn the Yes/No `Dependents` flag into integer codes. LabelEncoder numbers
# the classes in sorted order, so 'No' becomes 0 and 'Yes' becomes 1.
dependents_codes = le.fit_transform(df_customer['Dependents'])
df_customer['Dependents'] = dependents_codes
# Confirm that only the two integer codes remain.
df_customer['Dependents'].unique()

array([0, 1])

In [444]:
# List the distinct values of `PhoneService` before encoding it.
df_customer['PhoneService'].unique()

array(['No', 'Yes'], dtype=object)

In [445]:
# Turn the Yes/No `PhoneService` flag into integer codes. LabelEncoder numbers
# the classes in sorted order, so 'No' becomes 0 and 'Yes' becomes 1.
phone_service_codes = le.fit_transform(df_customer['PhoneService'])
df_customer['PhoneService'] = phone_service_codes
# Confirm that only the two integer codes remain.
df_customer['PhoneService'].unique()

array([0, 1])

In [446]:
# List the distinct values of `InternetService` before encoding it.
df_customer['InternetService'].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)

In [447]:
# One-hot encode `InternetService`: one indicator column per category.
encoded_data = ohe.fit_transform(df_customer[['InternetService']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe.get_feature_names_out(['InternetService']),
    # Reuse the source index: concat(axis=1) aligns rows on index labels, so a
    # fresh 0..n-1 index would yield NaN rows if df_customer were re-indexed.
    index=df_customer.index,
)
# Replace the raw column with its indicators without mutating in place; the
# resulting column order is the same as concat-then-drop.
df_customer = pd.concat(
    [df_customer.drop(columns=['InternetService']), encoded_df], axis=1
)
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   customerID                   7043 non-null   object 
 1   SeniorCitizen                7043 non-null   int64  
 2   Partner                      7043 non-null   int64  
 3   Dependents                   7043 non-null   int64  
 4   tenure                       7043 non-null   int64  
 5   PhoneService                 7043 non-null   int64  
 6   MultipleLines                7043 non-null   object 
 7   OnlineSecurity               7043 non-null   object 
 8   OnlineBackup                 7043 non-null   object 
 9   DeviceProtection             7043 non-null   object 
 10  TechSupport                  7043 non-null   object 
 11  StreamingTV                  7043 non-null   object 
 12  StreamingMovies              7043 non-null   object 
 13  Contract          

In [448]:
# One-hot encode `OnlineSecurity`: one indicator column per category.
encoded_data = ohe.fit_transform(df_customer[['OnlineSecurity']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe.get_feature_names_out(['OnlineSecurity']),
    # Reuse the source index: concat(axis=1) aligns rows on index labels, so a
    # fresh 0..n-1 index would yield NaN rows if df_customer were re-indexed.
    index=df_customer.index,
)
# Replace the raw column with its indicators without mutating in place; the
# resulting column order is the same as concat-then-drop.
df_customer = pd.concat(
    [df_customer.drop(columns=['OnlineSecurity']), encoded_df], axis=1
)
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 26 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   customerID                          7043 non-null   object 
 1   SeniorCitizen                       7043 non-null   int64  
 2   Partner                             7043 non-null   int64  
 3   Dependents                          7043 non-null   int64  
 4   tenure                              7043 non-null   int64  
 5   PhoneService                        7043 non-null   int64  
 6   MultipleLines                       7043 non-null   object 
 7   OnlineBackup                        7043 non-null   object 
 8   DeviceProtection                    7043 non-null   object 
 9   TechSupport                         7043 non-null   object 
 10  StreamingTV                         7043 non-null   object 
 11  StreamingMovies                     7043 no

In [449]:
# List the distinct values of `OnlineBackup` before encoding it.
df_customer['OnlineBackup'].unique()

array(['Yes', 'No', 'No internet service'], dtype=object)

In [450]:
# One-hot encode `OnlineBackup`: one indicator column per category.
encoded_data = ohe.fit_transform(df_customer[['OnlineBackup']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe.get_feature_names_out(['OnlineBackup']),
    # Reuse the source index: concat(axis=1) aligns rows on index labels, so a
    # fresh 0..n-1 index would yield NaN rows if df_customer were re-indexed.
    index=df_customer.index,
)
# Replace the raw column with its indicators without mutating in place; the
# resulting column order is the same as concat-then-drop.
df_customer = pd.concat(
    [df_customer.drop(columns=['OnlineBackup']), encoded_df], axis=1
)
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   customerID                          7043 non-null   object 
 1   SeniorCitizen                       7043 non-null   int64  
 2   Partner                             7043 non-null   int64  
 3   Dependents                          7043 non-null   int64  
 4   tenure                              7043 non-null   int64  
 5   PhoneService                        7043 non-null   int64  
 6   MultipleLines                       7043 non-null   object 
 7   DeviceProtection                    7043 non-null   object 
 8   TechSupport                         7043 non-null   object 
 9   StreamingTV                         7043 non-null   object 
 10  StreamingMovies                     7043 non-null   object 
 11  Contract                            7043 no

In [451]:
# List the distinct values of `DeviceProtection`.
df_customer['DeviceProtection'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [452]:
# List the distinct values of `TechSupport`.
df_customer['TechSupport'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [453]:
# List the distinct values of `StreamingMovies`.
df_customer['StreamingMovies'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [454]:
# List the distinct values of `StreamingTV`.
df_customer['StreamingTV'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [455]:
# Re-check the distinct values of `DeviceProtection` right before encoding it.
# NOTE(review): the same expression was already evaluated a few cells earlier.
df_customer['DeviceProtection'].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)

In [456]:
# One-hot encode `DeviceProtection`: one indicator column per category.
encoded_data = ohe.fit_transform(df_customer[['DeviceProtection']])
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe.get_feature_names_out(['DeviceProtection']),
    # Reuse the source index: concat(axis=1) aligns rows on index labels, so a
    # fresh 0..n-1 index would yield NaN rows if df_customer were re-indexed.
    index=df_customer.index,
)
# Replace the raw column with its indicators without mutating in place; the
# resulting column order is the same as concat-then-drop.
df_customer = pd.concat(
    [df_customer.drop(columns=['DeviceProtection']), encoded_df], axis=1
)
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 30 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   customerID                            7043 non-null   object 
 1   SeniorCitizen                         7043 non-null   int64  
 2   Partner                               7043 non-null   int64  
 3   Dependents                            7043 non-null   int64  
 4   tenure                                7043 non-null   int64  
 5   PhoneService                          7043 non-null   int64  
 6   MultipleLines                         7043 non-null   object 
 7   TechSupport                           7043 non-null   object 
 8   StreamingTV                           7043 non-null   object 
 9   StreamingMovies                       7043 non-null   object 
 10  Contract                              7043 non-null   object 
 11  PaperlessBilling 