## Telco Customer Churn Prediction

This project aims to create a machine learning model in order to predict churn customers in Telco based on contract type, payment methods, monthly charges and total charges by using random forest.

## Import Library

In [58]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE 

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

## Load dataset

In [18]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


- The number of non-churn customer is significantly larger than churn customer.
- The number of non-churn customer is over 5000 while churn customer is below 2000

## Data Preprocessing

### Missing Value Handling and Feature Selection

In [22]:
missing_values = ['Na','N/A','--','na',' ']
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values = missing_values)
df.isnull().sum()
#There are 11 missing value in TotalCharges

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [23]:
#replacing missing value with mean
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())
df.isnull().sum()
#no missing value found

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
#Feature selection
df = df[['Contract','PaymentMethod','MonthlyCharges','TotalCharges','Churn']]

### Categorical Data Encoding (Frequency Encoding)

In [7]:
#Encoding contract variable
freq_con = df['Contract'].value_counts().reset_index()
freq_con.rename(columns={"index": "Contract", "Contract": "freq_contract"}, inplace = True)
freq_con['pct_contract'] = round((freq_con['freq_contract']/freq_con['freq_contract'].sum())*100,2)
freq_con


Unnamed: 0,Contract,freq_contract,pct_contract
0,Month-to-month,3875,55.02
1,Two year,1695,24.07
2,One year,1473,20.91


In [8]:
#comparing original data and encoded data
df = df.merge(freq_con[['Contract','pct_contract']], on='Contract', how='inner')
df[['Contract','pct_contract']]

Unnamed: 0,Contract,pct_contract
0,Month-to-month,55.02
1,Month-to-month,55.02
2,Month-to-month,55.02
3,Month-to-month,55.02
4,Month-to-month,55.02
...,...,...
7038,Two year,24.07
7039,Two year,24.07
7040,Two year,24.07
7041,Two year,24.07


In [9]:
#Encoding payment Method variable
freq_pay = df['PaymentMethod'].value_counts().reset_index()
freq_pay.rename(columns={"index": "PaymentMethod", "PaymentMethod": "freq_payment"}, inplace = True)
freq_pay['pct_payment'] = round((freq_pay['freq_payment']/freq_pay['freq_payment'].sum())*100,2)
freq_pay

Unnamed: 0,PaymentMethod,freq_payment,pct_payment
0,Electronic check,2365,33.58
1,Mailed check,1612,22.89
2,Bank transfer (automatic),1544,21.92
3,Credit card (automatic),1522,21.61


In [10]:
#comparing original data and encoded data
df = df.merge(freq_pay[['PaymentMethod','pct_payment']], on='PaymentMethod', how='inner')
df[['Contract','pct_contract']]

Unnamed: 0,Contract,pct_contract
0,Month-to-month,55.02
1,Month-to-month,55.02
2,Month-to-month,55.02
3,Month-to-month,55.02
4,Month-to-month,55.02
...,...,...
7038,Two year,24.07
7039,Two year,24.07
7040,Two year,24.07
7041,Two year,24.07


In [11]:
#Dropping unencoded variable 
df = df.drop(['Contract','PaymentMethod'],axis=1)

### Imbalanced Data Handling

In [12]:
target = df['Churn'].value_counts().reset_index()
target.rename(columns={'index':'Churn', 'Churn':'Freq'}, inplace=True)
target['Percentage'] = round((target['Freq']/target['Freq'].sum())*100,2)
target
#Data distribution is imbalanced

Unnamed: 0,Churn,Freq,Percentage
0,No,5174,73.46
1,Yes,1869,26.54


In [13]:
from imblearn import under_sampling, over_sampling

x = df.drop(['Churn'], axis=1)
y = df['Churn']

#Oversampling with smote
x_over_smote, y_over_smote = over_sampling.SMOTE().fit_resample(x,y)

In [14]:
#Merging oversampled data 
df = pd.concat([x_over_smote, y_over_smote], axis=1)
df['Churn'].value_counts()

Yes    5174
No     5174
Name: Churn, dtype: int64

## Modeling (Random Forest)

### Split Dataset

In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

### Modeling dengan Random Forest

In [16]:
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)
preds = model.predict(x_test)

## Evaluation

Because data is already balanced, using accuracy is valid

In [17]:
print('Accuracy',accuracy_score(y_test, preds))
#The model accuracy is 76.6 %

Accuracy 0.7666098807495741
