#Predicting Customer Churn

## Import Library

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE 

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

## Load dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('/content/drive/MyDrive/python/day 18/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


##Encoding

In [None]:
categoricals = ['customerID','gender','Partner','Dependents','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn']

numericals = ['SeniorCitizen','tenure','MonthlyCharges','TotalCharges']

In [None]:
# describing categorical data
df[categoricals].describe()

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,Churn
count,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043,7043
unique,7043,2,2,2,2,3,3,3,3,3,3,3,3,3,2,4,2
top,7590-VHVEG,Male,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,No
freq,1,3555,3641,4933,6361,3390,3096,3498,3088,3095,3473,2810,2785,3875,4171,2365,5174


In [None]:
#More specific description of categorical data.
for col in categoricals:
  print(f"Value counts of {col} column")
  print(df[col].value_counts(), '\n')

Value counts of customerID column
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: customerID, Length: 7043, dtype: int64 

Value counts of gender column
Male      3555
Female    3488
Name: gender, dtype: int64 

Value counts of Partner column
No     3641
Yes    3402
Name: Partner, dtype: int64 

Value counts of Dependents column
No     4933
Yes    2110
Name: Dependents, dtype: int64 

Value counts of PhoneService column
Yes    6361
No      682
Name: PhoneService, dtype: int64 

Value counts of MultipleLines column
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64 

Value counts of InternetService column
Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64 

Value counts of OnlineSecurity column
No                     3498
Yes                    2019
No i

In [None]:
#Convert the Dtype of TotalCharges to float
#There are some rows that invalid so it will be dropped

df = df.loc[~df['TotalCharges'].str.contains(' ')]
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [None]:
#Copying data
df1 = df.copy()

focusing to services from the company

In [None]:
#Label Encoding for columns which has 2 values

label_encoder = preprocessing.LabelEncoder()

df1['gender']= label_encoder.fit_transform(df1['gender'])
df1['Partner']= label_encoder.fit_transform(df1['Partner'])
df1['Dependents']= label_encoder.fit_transform(df1['Dependents'])
df1['PhoneService']= label_encoder.fit_transform(df1['PhoneService'])
df1['PaperlessBilling']= label_encoder.fit_transform(df1['PaperlessBilling'])
df1['Churn']= label_encoder.fit_transform(df1['Churn'])

In [None]:
#MultipleLines has 3 values but 'No phone service' is the same as 'No' in PhoneService
code = {'Yes':1, 'No':0, 'No phone service':0}
df1['MultipleLines'] = df1['MultipleLines'].map(code)


#some column has 3 values but 'No internet service' is the same as 'No' in InternetService
code2 = {'Yes':1, 'No':0, 'No internet service':0}
df1['OnlineSecurity'] = df1['OnlineSecurity'].map(code2)
df1['OnlineBackup'] = df1['OnlineBackup'].map(code2)
df1['DeviceProtection'] = df1['DeviceProtection'].map(code2)
df1['TechSupport'] = df1['TechSupport'].map(code2)
df1['StreamingTV'] = df1['StreamingTV'].map(code2)
df1['StreamingMovies'] = df1['StreamingMovies'].map(code2)

#Contract has hierarchy, so it is okay to use label encoding
code3 = {'Month-to-month':1, 'One year':2, 'Two year':3}
df1['Contract'] = df1['Contract'].map(code3)

In [None]:
#I decide to drop some column that doesn't include company's services
df1 = df1.drop('customerID', axis=1)
df1 = df1.drop('InternetService', axis=1)
df1 = df1.drop('PaymentMethod', axis=1)

In [None]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   int64  
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   int64  
 3   Dependents        7032 non-null   int64  
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   int64  
 6   MultipleLines     7032 non-null   int64  
 7   OnlineSecurity    7032 non-null   int64  
 8   OnlineBackup      7032 non-null   int64  
 9   DeviceProtection  7032 non-null   int64  
 10  TechSupport       7032 non-null   int64  
 11  StreamingTV       7032 non-null   int64  
 12  StreamingMovies   7032 non-null   int64  
 13  Contract          7032 non-null   int64  
 14  PaperlessBilling  7032 non-null   int64  
 15  MonthlyCharges    7032 non-null   float64
 16  TotalCharges      7032 non-null   float64


there is no missing value

## Data Preprocessing

In [None]:
df1.Churn.value_counts(normalize=True)*100

0    73.421502
1    26.578498
Name: Churn, dtype: float64

In [None]:
X = df1.drop('Churn', axis=1)
y = df1['Churn']

In [None]:
from imblearn import under_sampling, over_sampling

# Overampling
X_over, y_over = over_sampling.RandomOverSampler().fit_resample(X, y)
df1_oversampling = pd.concat([X_over, y_over], axis=1)
df1_oversampling.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,1,29.85,29.85,0
1,1,0,0,0,34,1,0,1,0,1,0,0,0,2,0,56.95,1889.5,0
2,1,0,0,0,2,1,0,1,1,0,0,0,0,1,1,53.85,108.15,1


In [None]:
df1_oversampling.Churn.value_counts(normalize=True)*100

0    50.0
1    50.0
Name: Churn, dtype: float64

In [None]:
X = df1_oversampling.drop('Churn', axis=1)
y = df1_oversampling['Churn']

In [None]:
#Split data
X_train, X_test,y_train,y_test = train_test_split(X,
                                                y,
                                                test_size = 0.3,
                                                random_state = 42)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=42)
logreg.fit(X_train, y_train)

LogisticRegression(random_state=42)

In [None]:
#accuracy
y_pred_reg = logreg.predict(X_test)
print('accuracy',accuracy_score(y_test, y_pred_reg))

accuracy 0.763395739186572


##Desicion Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train,y_train)

DecisionTreeClassifier(random_state=42)

In [None]:
#accuracy
y_pred_dt = dt.predict(X_test)
print('accuracy',accuracy_score(y_test, y_pred_dt))

accuracy 0.854744996772111


##Random Forest

In [None]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [None]:
#accuracy
print('accuracy',accuracy_score(y_test, preds))

accuracy 0.8779857972885733


## Evaluation

In [None]:
#accuracy
print('Accuracy of Ramdom Forest Model:',accuracy_score(y_test, preds))

Accuracy of Ramdom Forest Model: 0.8779857972885733


In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score
print('Precision ',precision_score(y_test, preds))
print('Recall ',recall_score(y_test, preds))

Precision  0.8338141950375072
Recall  0.9413680781758957


Conclusion:
1. The model taken is Random Forest because it has the greatest accuracy compares with the other two models, which is about 87%
2. The data already balanced so that we can evaluate the model using the level of accuracy
3. The model's ability to predict customer churn is 83%
4. The model's ability to predict customer non-churn is 94% 