<a href="https://colab.research.google.com/github/nd823/data-cleaning/blob/master/telco_data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import data

In [1]:
import pandas as pd
import numpy as np

# Location of the raw Telco customer-churn CSV (hosted on GitHub).
TELCO_CSV_URL = "https://github.com/treselle-systems/customer_churn_analysis/raw/master/WA_Fn-UseC_-Telco-Customer-Churn.csv"

# Read the whole file into the working DataFrame used by every later cell.
df = pd.read_csv(TELCO_CSV_URL)

# Initial check


## Preview data

In [2]:
# Show the first five records transposed, so each column name appears as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


## Check column data types

In [3]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

# Feature engineering

## Bin `MonthlyCharges` column into categories

In [4]:
# Bin edges in dollars. pd.cut uses right-inclusive intervals by default, so the
# bins are (0, 30], (30, 55], ..., (110, 120]; values outside them become NaN.
monthly_charge_edges = [0, 30, 55, 70, 90, 110, 120]
monthly_charge_labels = ['$0-30', '$30-55', '$55-70', '$70-90', '$90-110', '$110-120']

# pd.cut already returns an ordered categorical whose categories follow the
# order of `labels`, so no extra astype('category') / reorder_categories step
# is required. (The previous `reorder_categories(..., inplace=True)` call also
# fails on pandas >= 2.0, where the `inplace` keyword was removed.)
df['Binned_MonthlyCharges'] = pd.cut(x=df['MonthlyCharges'],
                                     bins=monthly_charge_edges,
                                     labels=monthly_charge_labels)

## Bin `tenure` column into categories

In [5]:
# Bin edges in months. Intervals are right-inclusive: (0, 5], (5, 20], ...,
# (65, 72]. A tenure of exactly 0 lies outside the first bin and is binned as
# NaN; rows with zero tenure are dropped later in the notebook.
tenure_edges = [0, 5, 20, 40, 65, 72]
tenure_labels = ['0-5m', '5-20m', '20-40m', '40-65m', '65-72m']

# pd.cut already returns an ordered categorical whose categories follow the
# order of `labels`, so no extra astype('category') / reorder_categories step
# is required. (The previous `reorder_categories(..., inplace=True)` call also
# fails on pandas >= 2.0, where the `inplace` keyword was removed.)
df['Binned_Tenure'] = pd.cut(x=df['tenure'],
                             bins=tenure_edges,
                             labels=tenure_labels)

# Data cleaning

## Drop `customerID` column

In [6]:
# Remove the customerID column from the working frame.
df = df.drop(columns=['customerID'])

## Re-encode `SeniorCitizen` column

In [7]:
# Recode the integer 0/1 flag as the strings 'No'/'Yes', matching the other
# Yes/No columns in the data set.
senior_flag = df['SeniorCitizen'].astype(str)

# 'Yes' is accepted alongside '1' so that re-running this cell on an already
# recoded column leaves it unchanged instead of turning every row into 'No'.
df['SeniorCitizen'] = np.where(senior_flag.isin(['1', 'Yes']), 'Yes', 'No')

## Rename columns

In [8]:
# Only `gender` and `tenure` deviate from the capitalised naming used by every
# other column. Renaming them by name (rather than assigning a full positional
# list of 22 names) cannot mislabel a column if the column order or count ever
# changes, and is a harmless no-op when the cell is re-run.
df = df.rename(columns={'gender': 'Gender', 'tenure': 'Tenure'})

## Drop rows with tenure=0 (n=11)

In [9]:
# Keep only customers with a positive tenure. The explicit .copy() makes the
# result an independent frame, so the column assignments performed in later
# cells do not raise SettingWithCopyWarning on a filtered view.
df = df[df['Tenure'] > 0].copy()

## Convert `TotalCharges` column to `float64` type

In [10]:
# Parse the text column as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Re-check dtypes and non-null counts after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 22 columns):
Gender                   7032 non-null object
SeniorCitizen            7032 non-null object
Partner                  7032 non-null object
Dependents               7032 non-null object
Tenure                   7032 non-null int64
PhoneService             7032 non-null object
MultipleLines            7032 non-null object
InternetService          7032 non-null object
OnlineSecurity           7032 non-null object
OnlineBackup             7032 non-null object
DeviceProtection         7032 non-null object
TechSupport              7032 non-null object
StreamingTV              7032 non-null object
StreamingMovies          7032 non-null object
Contract                 7032 non-null object
PaperlessBilling         7032 non-null object
PaymentMethod            7032 non-null object
MonthlyCharges           7032 non-null float64
TotalCharges             7032 non-null float64
Churn             

## Set all categorical variable types to "category"

In [11]:
# Collect every column still stored with the generic object dtype ...
object_cols = [name for name in df.columns if df[name].dtypes == 'object']

# ... and cast them all to pandas' categorical dtype in a single call.
df = df.astype({name: 'category' for name in object_cols})

# Confirm the resulting dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 22 columns):
Gender                   7032 non-null category
SeniorCitizen            7032 non-null category
Partner                  7032 non-null category
Dependents               7032 non-null category
Tenure                   7032 non-null int64
PhoneService             7032 non-null category
MultipleLines            7032 non-null category
InternetService          7032 non-null category
OnlineSecurity           7032 non-null category
OnlineBackup             7032 non-null category
DeviceProtection         7032 non-null category
TechSupport              7032 non-null category
StreamingTV              7032 non-null category
StreamingMovies          7032 non-null category
Contract                 7032 non-null category
PaperlessBilling         7032 non-null category
PaymentMethod            7032 non-null category
MonthlyCharges           7032 non-null float64
TotalCharges             7032 non

## Combine sparse levels

In [12]:
# Columns in which the level 'No internet service' is folded into 'No'.
# Columns that do not contain that level pass through unchanged.
no_internet_cols = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# The replacement is done on plain object values and the column is then
# re-cast to category. Calling Series.replace directly on a categorical column
# to merge two levels is deprecated since pandas 2.2 and will stop dropping
# the merged-away level; the round trip always yields exactly the levels that
# remain in the data.
for col in no_internet_cols:
    df[col] = (df[col].astype(object)
                      .replace({'No internet service':'No'})
                      .astype('category'))

# Likewise fold 'No phone service' into 'No' for MultipleLines.
df['MultipleLines'] = (df['MultipleLines'].astype(object)
                                          .replace({'No phone service':'No'})
                                          .astype('category'))

# Export to file for survival analysis

In [13]:
# Show the first five cleaned records transposed, one row per column name.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Gender,Female,Male,Male,Male,Female
SeniorCitizen,No,No,No,No,No
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
Tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No,No,No,No,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No
OnlineBackup,Yes,No,Yes,No,No


In [14]:
# Print the frequency table of every column, in column order.
for _, column_values in df.items():
    print(column_values.value_counts())

Male      3549
Female    3483
Name: Gender, dtype: int64
No     5890
Yes    1142
Name: SeniorCitizen, dtype: int64
No     3639
Yes    3393
Name: Partner, dtype: int64
No     4933
Yes    2099
Name: Dependents, dtype: int64
1     613
72    362
2     238
3     200
4     176
71    170
5     133
7     131
8     123
70    119
9     119
12    117
10    116
6     110
13    109
68    100
15     99
11     99
67     98
18     97
69     95
24     94
22     90
66     89
35     88
17     87
23     85
16     80
64     80
52     80
     ... 
29     72
20     71
53     70
62     70
41     70
32     69
47     68
51     68
54     68
50     68
58     67
49     66
57     65
37     65
31     65
43     65
34     65
42     65
40     64
48     64
33     64
55     64
21     63
45     61
59     60
38     59
28     57
39     56
44     51
36     50
Name: Tenure, Length: 72, dtype: int64
Yes    6352
No      680
Name: PhoneService, dtype: int64
No     4065
Yes    2967
Name: MultipleLines, dtype: int64
Fiber optic   

In [15]:
# Output path for the cleaned data that keeps the original level names.
OG_LEVELS_CSV = './telco_cleaned_og_levels_Jun21.csv'

# Write the frame without the row index.
df.to_csv(OG_LEVELS_CSV, index=False)

# Rename levels of all categorical variables to reflect column name

To prepare the dataframe for factor analysis.

In [16]:
# Yes/No columns whose levels are renamed to '<column>' / 'No <column>' so that
# every level name identifies the column it belongs to.
col_list = ['SeniorCitizen', 'Partner', 'Dependents','PhoneService', 'DeviceProtection', 'MultipleLines', 'OnlineSecurity', 
            'OnlineBackup', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling', 'Churn']

for col in col_list:
    # A row is positive when it holds the raw 'Yes' or the already renamed
    # level (the column name itself). Accepting both keeps the cell idempotent:
    # re-running it no longer flips every row to 'No <column>'.
    is_positive = df[col].isin(['Yes', col])
    df[col] = np.where(is_positive, col, 'No'+' '+col)

# Rename the 'No' level of InternetService. The replacement is done on plain
# object values and re-cast to category, because Series.replace on a
# categorical column is deprecated (pandas 2.2) for changes that alter the
# set of categories.
df['InternetService'] = (df['InternetService'].astype(object)
                                              .replace({'No':'No internet service'})
                                              .astype('category'))

In [17]:
# Show the first five records with renamed levels, one row per column name.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Gender,Female,Male,Male,Male,Female
SeniorCitizen,No SeniorCitizen,No SeniorCitizen,No SeniorCitizen,No SeniorCitizen,No SeniorCitizen
Partner,Partner,No Partner,No Partner,No Partner,No Partner
Dependents,No Dependents,No Dependents,No Dependents,No Dependents,No Dependents
Tenure,1,34,2,45,2
PhoneService,No PhoneService,PhoneService,PhoneService,No PhoneService,PhoneService
MultipleLines,No MultipleLines,No MultipleLines,No MultipleLines,No MultipleLines,No MultipleLines
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No OnlineSecurity,OnlineSecurity,OnlineSecurity,OnlineSecurity,No OnlineSecurity
OnlineBackup,OnlineBackup,No OnlineBackup,OnlineBackup,No OnlineBackup,No OnlineBackup


In [18]:
# Print the frequency table of every column after the levels were renamed.
for _, column_values in df.items():
    print(column_values.value_counts())

Male      3549
Female    3483
Name: Gender, dtype: int64
No SeniorCitizen    5890
SeniorCitizen       1142
Name: SeniorCitizen, dtype: int64
No Partner    3639
Partner       3393
Name: Partner, dtype: int64
No Dependents    4933
Dependents       2099
Name: Dependents, dtype: int64
1     613
72    362
2     238
3     200
4     176
71    170
5     133
7     131
8     123
70    119
9     119
12    117
10    116
6     110
13    109
68    100
15     99
11     99
67     98
18     97
69     95
24     94
22     90
66     89
35     88
17     87
23     85
16     80
64     80
52     80
     ... 
29     72
20     71
53     70
62     70
41     70
32     69
47     68
51     68
54     68
50     68
58     67
49     66
57     65
37     65
31     65
43     65
34     65
42     65
40     64
48     64
33     64
55     64
21     63
45     61
59     60
38     59
28     57
39     56
44     51
36     50
Name: Tenure, Length: 72, dtype: int64
PhoneService       6352
No PhoneService     680
Name: PhoneService, d

# Export to file for factor analysis

In [19]:
# Output path for the cleaned data with column-specific level names.
RENAMED_LEVELS_CSV = './telco_cleaned_renamed_levels_Jun21.csv'

# Write the frame without the row index.
df.to_csv(RENAMED_LEVELS_CSV, index=False)