In [1]:
#import packages
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
# Read the raw customer-churn CSV into `df`; the later cells derive every
# feature from this frame.
df = pd.read_csv(filepath_or_buffer='data/TelcoCustomerChurn.csv')

In [3]:
# Columns that each describe one service a customer may be subscribed to.
services = ['PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies']

# servicesCount = number of service columns whose value is 'Yes', 'DSL' or
# 'Fiber optic'; every other value contributes nothing to the count.
# Computed in one vectorized pass (boolean frame summed across columns) rather
# than a per-row `.loc` loop, so it is fast and does not depend on the index
# labels being exactly 0..n-1.
df['servicesCount'] = df[services].isin(['Yes','DSL','Fiber optic']).sum(axis=1)

In [4]:
# Numeric variables that go into the model frame as they are.
# `.copy()` keeps later column assignments from touching `df`.
df_model = df[['tenure','MonthlyCharges','TotalCharges','servicesCount','SeniorCitizen']].copy()

# Single-space entries in TotalCharges are replaced with NaN so the column can
# be cast to float. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
df_model['TotalCharges'] = df_model['TotalCharges'].replace(' ', np.nan).astype(float)
df_model

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,servicesCount,SeniorCitizen
0,1,29.85,29.85,2,0
1,34,56.95,1889.50,4,0
2,2,53.85,108.15,4,0
3,45,42.30,1840.75,4,0
4,2,70.70,151.65,2,0
...,...,...,...,...,...
7038,24,84.80,1990.50,8,0
7039,72,103.20,7362.90,7,0
7040,11,29.60,346.45,2,0
7041,4,74.40,306.60,3,1


In [5]:
# Creating flag columns
#   'Yes'                                              -> 1
#   'No', 'No phone service', 'No internet service'    -> 0
flag_encoding = {
    'No': 0,
    'No phone service': 0,
    'No internet service': 0,
    'Yes': 1,
}

# Listed in the order the columns should appear in df_model.
flag_columns = [
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]

for column in flag_columns:
    # `map` + explicit `astype` instead of `replace`: the integer dtype no
    # longer depends on pandas' deprecated silent downcasting in `replace`,
    # and a value outside `flag_encoding` becomes NaN, which makes the int
    # cast raise instead of letting an unexpected category slip through.
    df_model[column] = df[column].map(flag_encoding).astype('int64')

In [6]:
#Creating dummy variables
# - Internet Service Dummies ('No', 'DSL', 'Fiber optic' -> 2 vars)
# - Type of Contract Dummies ('Month-to-month','One year','Two year' -> 2 vars)
# - Payment Method Dummies ('Electronic check','Mailed check','Bank transfer (automatic)','Credit card (automatic)' -> 3 vars)
#
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas
# version; from pandas 2.0 on, get_dummies defaults to bool, which would write
# True/False into the exported CSV.
dummies = pd.get_dummies(df[['InternetService','Contract','PaymentMethod']], dtype=np.uint8)

# One level per variable is dropped; it is implied when the remaining
# indicators of that variable are all 0.
dummies = dummies.drop(columns=['InternetService_No','Contract_Two year','PaymentMethod_Credit card (automatic)'])

# Shorter column names for the model frame.
dummies = dummies.rename(columns={"InternetService_DSL":"IS_DSL",
                                  "InternetService_Fiber optic":"IS_FiberOptic",
                                  "Contract_Month-to-month":"C_MonthToMonth",
                                  "Contract_One year":"C_OneYear",
                                  "PaymentMethod_Bank transfer (automatic)": "PM_BankTransfer",
                                  "PaymentMethod_Electronic check": "PM_ElectronicCheck",
                                  "PaymentMethod_Mailed check": "PM_MailedCheck"})

In [7]:
# Append the dummy columns to the model frame. Columns already named like the
# dummies are dropped first, so re-running this cell does not add them twice.
df_model = pd.concat(
    [df_model.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)

In [8]:
# Finally, add the churn target in binary form ('No' -> 0, 'Yes' -> 1).
# `map` + explicit `astype` keeps the int64 dtype independent of pandas'
# deprecated silent downcasting in `replace`; any other value becomes NaN and
# makes the cast raise instead of being carried along unnoticed.
df_model['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1}).astype('int64')
df_model.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,servicesCount,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,...,StreamingTV,StreamingMovies,IS_DSL,IS_FiberOptic,C_MonthToMonth,C_OneYear,PM_BankTransfer,PM_ElectronicCheck,PM_MailedCheck,Churn
0,1,29.85,29.85,2,0,1,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
1,34,56.95,1889.5,4,0,0,0,1,0,1,...,0,0,1,0,0,1,0,0,1,0
2,2,53.85,108.15,4,0,0,0,1,0,1,...,0,0,1,0,1,0,0,0,1,1
3,45,42.3,1840.75,4,0,0,0,0,0,1,...,0,0,1,0,0,1,1,0,0,0
4,2,70.7,151.65,2,0,0,0,1,0,0,...,0,0,0,1,1,0,0,1,0,1


In [9]:
# Print a summary of df_model (columns, non-null counts, dtypes) to check the
# encodings above and spot columns that still contain nulls.
df_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tenure              7043 non-null   int64  
 1   MonthlyCharges      7043 non-null   float64
 2   TotalCharges        7032 non-null   float64
 3   servicesCount       7043 non-null   int64  
 4   SeniorCitizen       7043 non-null   int64  
 5   Partner             7043 non-null   int64  
 6   Dependents          7043 non-null   int64  
 7   PhoneService        7043 non-null   int64  
 8   MultipleLines       7043 non-null   int64  
 9   OnlineSecurity      7043 non-null   int64  
 10  OnlineBackup        7043 non-null   int64  
 11  DeviceProtection    7043 non-null   int64  
 12  TechSupport         7043 non-null   int64  
 13  StreamingTV         7043 non-null   int64  
 14  StreamingMovies     7043 non-null   int64  
 15  IS_DSL              7043 non-null   uint8  
 16  IS_Fib

In [10]:
# TotalCharges contains null values, so every row that still has a missing
# value is removed from the model frame.
df_model = df_model.dropna(axis=0, how='any')

In [11]:
# Write the transformed frame to CSV, leaving the index out of the file.
df_model.to_csv(path_or_buf='data/TransformedTelcoCustomerChurn.csv', index=False)