In [1]:
import numpy as np
import pandas as pd

In [26]:
# Load the raw churn data and flatten its nested dict columns into one wide frame.
DATA_URL = 'https://raw.githubusercontent.com/sthemonica/alura-voz/main/Dados/Telco-Customer-Churn.json'


def _expand_dict_column(frame, column):
    """Expand a column of dicts into its own DataFrame.

    The output columns are the keys of the first record (label 0) of
    `frame[column]`, in that record's key order.
    """
    records = frame[column].values.tolist()
    field_names = list(frame[column][0].keys())
    return pd.DataFrame(records, columns=field_names)


df = pd.read_json(DATA_URL)

# One flat frame per nested section of the raw JSON.
customer_df, phone_df, internet_df, account_df = (
    _expand_dict_column(df, section)
    for section in ('customer', 'phone', 'internet', 'account')
)

df = pd.concat(
    [df[['customerID', 'Churn']], customer_df, phone_df, internet_df, account_df],
    axis=1,
)

# The expanded 'account' section still carries a nested 'Charges' dict:
# expand it as well and swap the nested column for the flat ones.
charges_df = _expand_dict_column(df, 'Charges').rename(
    columns={'Monthly': 'MonthlyCharges', 'Total': 'TotalCharges'}
)

df = pd.concat([df.drop('Charges', axis=1), charges_df], axis=1)

In [27]:
# Many 'TotalCharges' values are strings and some are numbers, so the whole
# column is converted to float. Some of the strings are blank, which float()
# cannot parse, so blank (empty or whitespace-only) strings become 0.0 and
# every other value goes through float().
# NOTE(review): 0 is kept as the fill value for blank totals to match the
# previous behaviour; confirm that 0 (rather than NaN) is what is wanted.


def _parse_total_charge(value):
    """Return `value` as a float, treating blank or whitespace-only strings as 0.0."""
    if isinstance(value, str) and not value.strip():
        return 0.0
    return float(value)


# astype(float) pins the dtype even when the column is empty.
df['TotalCharges'] = df['TotalCharges'].map(_parse_total_charge).astype(float)

In [28]:
# Create the DailyCharges column: the monthly charge divided over a 30-day month.
DAYS_PER_MONTH = 30
DAILY_CHARGES_POSITION = 18  # same column position as before

daily_charges = df['MonthlyCharges'] / DAYS_PER_MONTH

if 'DailyCharges' in df.columns:
    # The cell was already run: df.insert() raises ValueError for an existing
    # column, so refresh the values in place instead.
    df['DailyCharges'] = daily_charges
else:
    df.insert(DAILY_CHARGES_POSITION, 'DailyCharges', daily_charges)

In [29]:
# Display the column labels of df to check the resulting column order.
df.columns

Index(['customerID', 'Churn', 'gender', 'SeniorCitizen', 'Partner',
       'Dependents', 'tenure', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'DailyCharges', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges'],
      dtype='object')

In [30]:
# Preview the first rows of the flattened frame.
df.head()

Unnamed: 0,customerID,Churn,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,DailyCharges,PaymentMethod,MonthlyCharges,TotalCharges
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,No,Yes,Yes,No,One year,Yes,2.186667,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,Yes,Month-to-month,No,1.996667,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,Yes,No,No,No,Month-to-month,Yes,2.463333,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,No,Yes,Yes,Month-to-month,Yes,3.266667,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,Yes,Yes,No,Month-to-month,Yes,2.796667,Mailed check,83.9,267.4


In [31]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7267 entries, 0 to 7266
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7267 non-null   object 
 1   Churn             7267 non-null   object 
 2   gender            7267 non-null   object 
 3   SeniorCitizen     7267 non-null   int64  
 4   Partner           7267 non-null   object 
 5   Dependents        7267 non-null   object 
 6   tenure            7267 non-null   int64  
 7   PhoneService      7267 non-null   object 
 8   MultipleLines     7267 non-null   object 
 9   InternetService   7267 non-null   object 
 10  OnlineSecurity    7267 non-null   object 
 11  OnlineBackup      7267 non-null   object 
 12  DeviceProtection  7267 non-null   object 
 13  TechSupport       7267 non-null   object 
 14  StreamingTV       7267 non-null   object 
 15  StreamingMovies   7267 non-null   object 
 16  Contract          7267 non-null   object 


In [32]:
# Show only the columns whose dtype is object.
df.select_dtypes(include='object')

Unnamed: 0,customerID,Churn,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod
0,0002-ORFBO,No,Female,Yes,Yes,Yes,No,DSL,No,Yes,No,Yes,Yes,No,One year,Yes,Mailed check
1,0003-MKNFE,No,Male,No,No,Yes,Yes,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check
2,0004-TLHLJ,Yes,Male,No,No,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check
3,0011-IGKFF,Yes,Male,Yes,No,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check
4,0013-EXCHZ,Yes,Female,Yes,No,Yes,No,Fiber optic,No,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7262,9987-LUTYD,No,Female,No,No,Yes,No,DSL,Yes,No,No,Yes,No,No,One year,No,Mailed check
7263,9992-RRAMN,Yes,Male,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Electronic check
7264,9992-UJOEL,No,Male,No,No,Yes,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Mailed check
7265,9993-LHIEB,No,Male,Yes,Yes,Yes,No,DSL,Yes,No,Yes,Yes,No,Yes,Two year,No,Mailed check


In [33]:
# Display the column labels of df.
# NOTE(review): this repeats the earlier `df.columns` cell and no visible cell
# in between modifies df; consider removing it.
df.columns

Index(['customerID', 'Churn', 'gender', 'SeniorCitizen', 'Partner',
       'Dependents', 'tenure', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'DailyCharges', 'PaymentMethod', 'MonthlyCharges',
       'TotalCharges'],
      dtype='object')