<a href="https://colab.research.google.com/github/nd823/data-cleaning/blob/master/telco_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [0]:
import pandas as pd
import numpy as np

# Import data

In [0]:
# Load the Telco customer-churn CSV directly from its GitHub raw URL.
telco_csv_url = "https://github.com/treselle-systems/customer_churn_analysis/raw/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(telco_csv_url)

# Initial check


## Preview data

In [3]:
# Show the first five rows transposed, so each column of the wide frame reads as a row.
df.iloc[:5].transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


## Check column data types

In [4]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

Need to:
- Drop `customerID` column
- Convert `SeniorCitizen` column from a 0/1 integer flag to a categorical label (stored as `object`, like the other categorical columns)
- Rename levels of all categorical variables to reflect column name
- Convert `TotalCharges` column to `float64` type

# Data cleaning

## Drop rows with tenure=0 (n=11)

In [5]:
# Inspect the customers whose tenure is zero before they are removed.
df.loc[df['tenure'].eq(0)]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,No,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,Yes,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,Yes,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [0]:
# Keep only the customers with a positive tenure.
df = df.loc[df['tenure'].gt(0)]

## Drop `customerID` column

In [0]:
# customerID is a per-row identifier, not a feature; remove the column.
df = df.drop(columns=['customerID'])

In [0]:
# Recode the 1/0 SeniorCitizen flag as 'Yes'/'No' (any value other than 1 becomes 'No').
df['SeniorCitizen'] = np.where(df['SeniorCitizen'].eq(1), 'Yes', 'No')

# Keep the recoded labels as a plain object-dtype column.
df['SeniorCitizen'] = df['SeniorCitizen'].astype('object', copy=False)

## Rename levels of all categorical variables to reflect column name

In [0]:
# Yes/No-style columns whose levels are relabelled with the column name, so that a
# value stays self-describing on its own (e.g. 'Partner' / 'No Partner').
col_list = ['Partner', 'Dependents','PhoneService', 'DeviceProtection', 'MultipleLines', 'OnlineSecurity', 
            'OnlineBackup', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

for col in col_list:
    # 'Yes' -> '<col>'; every other level (plain 'No' as well as 'No phone service' /
    # 'No internet service') -> 'No <col>'.
    # The already-relabelled value `col` is accepted too, so re-running this cell is a
    # no-op; matching on 'Yes' alone would turn every value into 'No <col>' on a
    # second run, because no 'Yes' is left after the first.
    df[col] = np.where(df[col].isin(['Yes', col]), col, 'No'+' '+col)

In [0]:
# SeniorCitizen holds 'Yes'/'No' at this point (an earlier cell recoded the original
# 1/0 flag), so the previous comparison against '1' never matched and labelled every
# customer 'Not SeniorCitizen'. Match on 'Yes' instead; also accepting the
# already-relabelled 'SeniorCitizen' keeps the cell safe to re-run.
df['SeniorCitizen'] = np.where(df['SeniorCitizen'].isin(['Yes', 'SeniorCitizen']),
                               'SeniorCitizen', 'Not SeniorCitizen')

# Only the 'No' level of InternetService is relabelled; other levels such as 'DSL'
# and 'Fiber optic' are left unchanged.
df['InternetService'] = df['InternetService'].replace({'No':'No InternetService'})

## Convert `TotalCharges` column to `float64` type

In [0]:
# Parse TotalCharges as numbers; entries that cannot be parsed are coerced to NaN.
df = df.assign(TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce'))

In [12]:
# Re-check dtypes and non-null counts after the cleaning steps above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
gender              7032 non-null object
SeniorCitizen       7032 non-null object
Partner             7032 non-null object
Dependents          7032 non-null object
tenure              7032 non-null int64
PhoneService        7032 non-null object
MultipleLines       7032 non-null object
InternetService     7032 non-null object
OnlineSecurity      7032 non-null object
OnlineBackup        7032 non-null object
DeviceProtection    7032 non-null object
TechSupport         7032 non-null object
StreamingTV         7032 non-null object
StreamingMovies     7032 non-null object
Contract            7032 non-null object
PaperlessBilling    7032 non-null object
PaymentMethod       7032 non-null object
MonthlyCharges      7032 non-null float64
TotalCharges        7032 non-null float64
Churn               7032 non-null object
dtypes: float64(2), int64(1), object(17)
memory usage: 1.1+ MB


## Create new `TotalCharges` column

The raw `TotalCharges` column has missing (blank) values, as seen in the `tenure`=0 rows above. We will drop it and create a new `Calculated_TotalCharges` column by multiplying the `MonthlyCharges` and `tenure` columns.


In [0]:
## Remove the TotalCharges column from the frame
del df['TotalCharges']

## Rebuild the total as MonthlyCharges x tenure; both inputs are fully populated,
## so the new Calculated_TotalCharges column will not have missing values
df["Calculated_TotalCharges"] = df['MonthlyCharges'].mul(df['tenure'])

# Final check

In [14]:
# Final look at the dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
gender                     7032 non-null object
SeniorCitizen              7032 non-null object
Partner                    7032 non-null object
Dependents                 7032 non-null object
tenure                     7032 non-null int64
PhoneService               7032 non-null object
MultipleLines              7032 non-null object
InternetService            7032 non-null object
OnlineSecurity             7032 non-null object
OnlineBackup               7032 non-null object
DeviceProtection           7032 non-null object
TechSupport                7032 non-null object
StreamingTV                7032 non-null object
StreamingMovies            7032 non-null object
Contract                   7032 non-null object
PaperlessBilling           7032 non-null object
PaymentMethod              7032 non-null object
MonthlyCharges             7032 non-null float64
Churn                      70

In [15]:
# Preview the first five rows of the cleaned frame.
df.iloc[:5]

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn,Calculated_TotalCharges
0,Female,Not SeniorCitizen,Partner,No Dependents,1,No PhoneService,No MultipleLines,DSL,No OnlineSecurity,OnlineBackup,No DeviceProtection,No TechSupport,No StreamingTV,No StreamingMovies,Month-to-month,PaperlessBilling,Electronic check,29.85,No Churn,29.85
1,Male,Not SeniorCitizen,No Partner,No Dependents,34,PhoneService,No MultipleLines,DSL,OnlineSecurity,No OnlineBackup,DeviceProtection,No TechSupport,No StreamingTV,No StreamingMovies,One year,No PaperlessBilling,Mailed check,56.95,No Churn,1936.3
2,Male,Not SeniorCitizen,No Partner,No Dependents,2,PhoneService,No MultipleLines,DSL,OnlineSecurity,OnlineBackup,No DeviceProtection,No TechSupport,No StreamingTV,No StreamingMovies,Month-to-month,PaperlessBilling,Mailed check,53.85,Churn,107.7
3,Male,Not SeniorCitizen,No Partner,No Dependents,45,No PhoneService,No MultipleLines,DSL,OnlineSecurity,No OnlineBackup,DeviceProtection,TechSupport,No StreamingTV,No StreamingMovies,One year,No PaperlessBilling,Bank transfer (automatic),42.3,No Churn,1903.5
4,Female,Not SeniorCitizen,No Partner,No Dependents,2,PhoneService,No MultipleLines,Fiber optic,No OnlineSecurity,No OnlineBackup,No DeviceProtection,No TechSupport,No StreamingTV,No StreamingMovies,Month-to-month,PaperlessBilling,Electronic check,70.7,Churn,141.4


# Export to file

In [0]:
# Write the cleaned frame to a CSV in the working directory, omitting the row index.
df.to_csv(path_or_buf='./telco_cleaned_Jun13.csv', index=False)