## Import Library

In [45]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import BorderlineSMOTE
from imblearn import under_sampling, over_sampling
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

## Load dataset

In [46]:
df = pd.read_csv('telco_customer_churn.csv')
df.sample(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4325,2700-LUEVA,Male,0,No,No,1,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Credit card (automatic),20.75,20.75,No
5162,4801-KFYKL,Male,0,No,Yes,8,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Bank transfer (automatic),19.45,159.2,No
6864,5480-HPRRX,Female,1,No,No,3,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,No,Electronic check,25.3,77.75,Yes
1585,4522-XRWWI,Male,0,Yes,No,42,Yes,Yes,DSL,Yes,...,No,Yes,Yes,Yes,One year,No,Credit card (automatic),80.45,3375.9,No
1400,7521-YXVZY,Male,0,No,Yes,3,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,19.95,58.3,No


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data Understanding

In [48]:
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

tidak ada data null

In [6]:
df.duplicated().sum()

0

tidak ada data duplikat

In [49]:
# convert TotalCharges column to float type 
df['TotalCharges'] = df['TotalCharges'].replace(' ')
df['TotalCharges'] = df['TotalCharges'].astype('float')

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [51]:
categorikal = list(df.select_dtypes(include = 'object')) # filtering object column
categorikal

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [52]:
for i in categorikal:
    print(f"value count for {i} is: ")
    print(df[i].value_counts(),'\n')

value count for customerID is: 
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: customerID, Length: 7043, dtype: int64 

value count for gender is: 
Male      3555
Female    3488
Name: gender, dtype: int64 

value count for Partner is: 
No     3641
Yes    3402
Name: Partner, dtype: int64 

value count for Dependents is: 
No     4933
Yes    2110
Name: Dependents, dtype: int64 

value count for PhoneService is: 
Yes    6361
No      682
Name: PhoneService, dtype: int64 

value count for MultipleLines is: 
No                  3390
Yes                 2971
No phone service     682
Name: MultipleLines, dtype: int64 

value count for InternetService is: 
Fiber optic    3096
DSL            2421
No             1526
Name: InternetService, dtype: int64 

value count for OnlineSecurity is: 
No                     3498
Yes                    2019
No internet service 

## Data Preprocessing

In [53]:
# label encoding untuk value yang yes, dan no
# yes = 1, no = 0 
map_class = {'Yes': 1,
             'No': 0}
df['Partner'] = df['Partner'].map(map_class)
df['Dependents']= df['Dependents'].map(map_class)
df['PhoneService'] = df['PhoneService'].map(map_class)
df['PaperlessBilling'] = df['PaperlessBilling'].map(map_class)


In [54]:
map_gender = {'Male': 1,
             'Female': 0}
df['gender'] = df['gender'].map(map_gender)

In [55]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,Month-to-month,1,Electronic check,29.85,29.85,No
1,5575-GNVDE,1,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,One year,0,Mailed check,56.95,1889.50,No
2,3668-QPYBK,1,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,Month-to-month,1,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,1,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,0,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,1,0,1,1,24,1,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,1,Mailed check,84.80,1990.50,No
7039,2234-XADUH,0,0,1,1,72,1,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,1,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,0,0,1,1,11,0,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,1,Electronic check,29.60,346.45,No
7041,8361-LTMKD,1,1,1,0,4,1,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,1,Mailed check,74.40,306.60,Yes


Ordinal encoding kolom contract

In [56]:
map_contract = {'Month-to-month': 1,
             'One year': 2,
             'Two year': 3}
df['Contract'] = df['Contract'].map(map_contract)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,...,No,No,No,No,1,1,Electronic check,29.85,29.85,No
1,5575-GNVDE,1,0,0,0,34,1,No,DSL,Yes,...,Yes,No,No,No,2,0,Mailed check,56.95,1889.50,No
2,3668-QPYBK,1,0,0,0,2,1,No,DSL,Yes,...,No,No,No,No,1,1,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,1,0,0,0,45,0,No phone service,DSL,Yes,...,Yes,Yes,No,No,2,0,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,0,0,0,0,2,1,No,Fiber optic,No,...,No,No,No,No,1,1,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,1,0,1,1,24,1,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,2,1,Mailed check,84.80,1990.50,No
7039,2234-XADUH,0,0,1,1,72,1,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,2,1,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,0,0,1,1,11,0,No phone service,DSL,Yes,...,No,No,No,No,1,1,Electronic check,29.60,346.45,No
7041,8361-LTMKD,1,1,1,0,4,1,Yes,Fiber optic,No,...,No,No,No,No,1,1,Mailed check,74.40,306.60,Yes


Frekuensi Encoding

In [57]:
freq_multilines = df['MultipleLines'].value_counts().reset_index()
freq_multilines.rename(columns = {'index':'MultipleLines','MultipleLines':'freq_multi'}, inplace = True)
freq_multilines['persen_value_multi'] = round(freq_multilines['freq_multi']/freq_multilines['freq_multi'].sum()*100,2)
freq_multilines

Unnamed: 0,MultipleLines,freq_multi,persen_value_multi
0,No,3390,48.13
1,Yes,2971,42.18
2,No phone service,682,9.68


In [58]:
freq_inter_serve = df['InternetService'].value_counts().reset_index()
freq_inter_serve.rename(columns = {'index':'InternetService','InternetService':'freq_inter_serve'}, inplace = True)
freq_inter_serve['persen_inter_serve'] = round(freq_inter_serve['freq_inter_serve']/freq_inter_serve['freq_inter_serve'].sum()*100,2)
freq_inter_serve

Unnamed: 0,InternetService,freq_inter_serve,persen_inter_serve
0,Fiber optic,3096,43.96
1,DSL,2421,34.37
2,No,1526,21.67


In [59]:
freq_security = df['OnlineSecurity'].value_counts().reset_index()
freq_security.rename(columns = {'index':'OnlineSecurity','OnlineSecurity':'freq_security'}, inplace = True)
freq_security['persen_security'] = round(freq_security['freq_security']/freq_security['freq_security'].sum()*100,2)
freq_security

Unnamed: 0,OnlineSecurity,freq_security,persen_security
0,No,3498,49.67
1,Yes,2019,28.67
2,No internet service,1526,21.67


In [60]:
freq_backup = df['OnlineBackup'].value_counts().reset_index()
freq_backup.rename(columns = {'index':'OnlineBackup','OnlineBackup':'freq_backup'}, inplace = True)
freq_backup['persen_backup'] = round(freq_backup['freq_backup']/freq_backup['freq_backup'].sum()*100,2)
freq_backup

Unnamed: 0,OnlineBackup,freq_backup,persen_backup
0,No,3088,43.84
1,Yes,2429,34.49
2,No internet service,1526,21.67


In [61]:
freq_protect = df['DeviceProtection'].value_counts().reset_index()
freq_protect.rename(columns = {'index':'DeviceProtection','DeviceProtection':'freq_protect'}, inplace = True)
freq_protect['persen_protect'] = round(freq_protect['freq_protect']/freq_protect['freq_protect'].sum()*100,2)
freq_protect

Unnamed: 0,DeviceProtection,freq_protect,persen_protect
0,No,3095,43.94
1,Yes,2422,34.39
2,No internet service,1526,21.67


In [62]:
freq_tech_support = df['TechSupport'].value_counts().reset_index()
freq_tech_support.rename(columns = {'index':'TechSupport','TechSupport':'freq_tech_support'}, inplace = True)
freq_tech_support['persen_tech_support'] = round(freq_tech_support['freq_tech_support']/freq_tech_support['freq_tech_support'].sum()*100,2)
freq_tech_support

Unnamed: 0,TechSupport,freq_tech_support,persen_tech_support
0,No,3473,49.31
1,Yes,2044,29.02
2,No internet service,1526,21.67


In [63]:
freq_stream_tv = df['StreamingTV'].value_counts().reset_index()
freq_stream_tv.rename(columns = {'index':'StreamingTV','StreamingTV':'freq_stream_tv'}, inplace = True)
freq_stream_tv['persen_stream_tv'] = round(freq_stream_tv['freq_stream_tv']/freq_stream_tv['freq_stream_tv'].sum()*100,2)
freq_stream_tv

Unnamed: 0,StreamingTV,freq_stream_tv,persen_stream_tv
0,No,2810,39.9
1,Yes,2707,38.44
2,No internet service,1526,21.67


In [64]:
freq_stream_mv = df['StreamingMovies'].value_counts().reset_index()
freq_stream_mv.rename(columns = {'index':'StreamingMovies','StreamingMovies':'freq_stream_mv'}, inplace = True)
freq_stream_mv['persen_stream_mv'] = round(freq_stream_mv['freq_stream_mv']/freq_stream_mv['freq_stream_mv'].sum()*100,2)
freq_stream_mv

Unnamed: 0,StreamingMovies,freq_stream_mv,persen_stream_mv
0,No,2785,39.54
1,Yes,2732,38.79
2,No internet service,1526,21.67


In [65]:
df = df.merge(freq_multilines[['MultipleLines','persen_value_multi']], on='MultipleLines')
df = df.merge(freq_inter_serve[['InternetService','persen_inter_serve']], on='InternetService')
df = df.merge(freq_security[['OnlineSecurity','persen_security']], on='OnlineSecurity')
df = df.merge(freq_backup[['OnlineBackup','persen_backup']], on='OnlineBackup')
df = df.merge(freq_protect[['DeviceProtection','persen_protect']], on='DeviceProtection')
df = df.merge(freq_tech_support[['TechSupport','persen_tech_support']], on='TechSupport')
df = df.merge(freq_stream_tv[['StreamingTV','persen_stream_tv']], on='StreamingTV')
df = df.merge(freq_stream_mv[['StreamingMovies','persen_stream_mv']], on='StreamingMovies')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TotalCharges,Churn,persen_value_multi,persen_inter_serve,persen_security,persen_backup,persen_protect,persen_tech_support,persen_stream_tv,persen_stream_mv
0,7590-VHVEG,0,0,1,0,1,0,No phone service,DSL,No,...,29.85,No,9.68,34.37,49.67,34.49,43.94,49.31,39.90,39.54
1,8665-UTDHZ,1,0,1,1,1,0,No phone service,DSL,No,...,30.20,Yes,9.68,34.37,49.67,34.49,43.94,49.31,39.90,39.54
2,6317-YPKDH,0,0,0,0,1,0,No phone service,DSL,No,...,29.95,Yes,9.68,34.37,49.67,34.49,43.94,49.31,39.90,39.54
3,6235-VDHOM,0,1,0,0,5,0,No phone service,DSL,No,...,131.05,Yes,9.68,34.37,49.67,34.49,43.94,49.31,39.90,39.54
4,6543-CPZMK,1,0,1,1,9,0,No phone service,DSL,No,...,248.95,Yes,9.68,34.37,49.67,34.49,43.94,49.31,39.90,39.54
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,4707-MAXGU,1,0,1,0,72,1,Yes,No,No internet service,...,1872.20,No,42.18,21.67,21.67,21.67,21.67,21.67,21.67,21.67
7039,4534-WGCIR,0,0,1,1,58,1,Yes,No,No internet service,...,1509.90,No,42.18,21.67,21.67,21.67,21.67,21.67,21.67,21.67
7040,6976-BWGLQ,0,0,1,1,72,1,Yes,No,No internet service,...,1787.35,No,42.18,21.67,21.67,21.67,21.67,21.67,21.67,21.67
7041,2511-ALLCS,0,0,1,1,35,1,Yes,No,No internet service,...,821.60,No,42.18,21.67,21.67,21.67,21.67,21.67,21.67,21.67


In [66]:
df = df.drop(['MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies'], axis=1)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customerID           7043 non-null   object 
 1   gender               7043 non-null   int64  
 2   SeniorCitizen        7043 non-null   int64  
 3   Partner              7043 non-null   int64  
 4   Dependents           7043 non-null   int64  
 5   tenure               7043 non-null   int64  
 6   PhoneService         7043 non-null   int64  
 7   Contract             7043 non-null   int64  
 8   PaperlessBilling     7043 non-null   int64  
 9   PaymentMethod        7043 non-null   object 
 10  MonthlyCharges       7043 non-null   float64
 11  TotalCharges         7043 non-null   float64
 12  Churn                7043 non-null   object 
 13  persen_value_multi   7043 non-null   float64
 14  persen_inter_serve   7043 non-null   float64
 15  persen_security      7043 non-null   f

One hot encooding payment method

In [68]:
dumies_payment_method = pd.get_dummies(df['PaymentMethod'],prefix='PaymentMethod')
dumies_payment_method

Unnamed: 0,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,1,0
1,0,0,1,0
2,1,0,0,0
3,0,0,1,0
4,0,0,1,0
...,...,...,...,...
7038,0,1,0,0
7039,0,1,0,0
7040,1,0,0,0
7041,1,0,0,0


In [69]:
df = pd.concat([df,dumies_payment_method], axis =1)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,...,persen_security,persen_backup,persen_protect,persen_tech_support,persen_stream_tv,persen_stream_mv,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,0,1,0,1,0,1,1,Electronic check,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
1,8665-UTDHZ,1,0,1,1,1,0,1,0,Electronic check,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
2,6317-YPKDH,0,0,0,0,1,0,1,0,Bank transfer (automatic),...,49.67,34.49,43.94,49.31,39.90,39.54,1,0,0,0
3,6235-VDHOM,0,1,0,0,5,0,1,0,Electronic check,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
4,6543-CPZMK,1,0,1,1,9,0,1,1,Electronic check,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,4707-MAXGU,1,0,1,0,72,1,3,0,Credit card (automatic),...,21.67,21.67,21.67,21.67,21.67,21.67,0,1,0,0
7039,4534-WGCIR,0,0,1,1,58,1,3,0,Credit card (automatic),...,21.67,21.67,21.67,21.67,21.67,21.67,0,1,0,0
7040,6976-BWGLQ,0,0,1,1,72,1,3,0,Bank transfer (automatic),...,21.67,21.67,21.67,21.67,21.67,21.67,1,0,0,0
7041,2511-ALLCS,0,0,1,1,35,1,3,0,Bank transfer (automatic),...,21.67,21.67,21.67,21.67,21.67,21.67,1,0,0,0


In [70]:
df = df.drop(['PaymentMethod'], axis =1)
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,Contract,PaperlessBilling,MonthlyCharges,...,persen_security,persen_backup,persen_protect,persen_tech_support,persen_stream_tv,persen_stream_mv,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,0,1,0,1,0,1,1,29.85,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
1,8665-UTDHZ,1,0,1,1,1,0,1,0,30.20,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
2,6317-YPKDH,0,0,0,0,1,0,1,0,29.95,...,49.67,34.49,43.94,49.31,39.90,39.54,1,0,0,0
3,6235-VDHOM,0,1,0,0,5,0,1,0,28.45,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
4,6543-CPZMK,1,0,1,1,9,0,1,1,29.95,...,49.67,34.49,43.94,49.31,39.90,39.54,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,4707-MAXGU,1,0,1,0,72,1,3,0,25.85,...,21.67,21.67,21.67,21.67,21.67,21.67,0,1,0,0
7039,4534-WGCIR,0,0,1,1,58,1,3,0,25.15,...,21.67,21.67,21.67,21.67,21.67,21.67,0,1,0,0
7040,6976-BWGLQ,0,0,1,1,72,1,3,0,25.20,...,21.67,21.67,21.67,21.67,21.67,21.67,1,0,0,0
7041,2511-ALLCS,0,0,1,1,35,1,3,0,24.30,...,21.67,21.67,21.67,21.67,21.67,21.67,1,0,0,0


In [71]:
df = df.drop(['customerID'], axis =1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7043 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   gender                                   7043 non-null   int64  
 1   SeniorCitizen                            7043 non-null   int64  
 2   Partner                                  7043 non-null   int64  
 3   Dependents                               7043 non-null   int64  
 4   tenure                                   7043 non-null   int64  
 5   PhoneService                             7043 non-null   int64  
 6   Contract                                 7043 non-null   int64  
 7   PaperlessBilling                         7043 non-null   int64  
 8   MonthlyCharges                           7043 non-null   float64
 9   TotalCharges                             7043 non-null   float64
 10  Churn                                    7043 no

## Modeling (Gunakan lebih min 2 model dan bandingkan hasil evaluasinya)

Bebas menggunakan model, mau menggunakan decision tree, random forest, xgboost, dll juga boleh<br><br>
silahkan berekspresi :)

In [72]:
df['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

## imbalance data handling

In [116]:
# menghapus kolom MonthlyCharges, karena punya hubungan multicolinearity dengan totalcharges
x = df.drop(['Churn','MonthlyCharges'], axis=1)
y = df['Churn']

#Over sampling with SMOTE
X_over_smote, y_over_smote = over_sampling.SMOTE().fit_resample(x,y)

In [117]:
df_oversampling_smote = pd.concat([X_over_smote, y_over_smote], axis = 1)
df_oversampling_smote

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,Contract,PaperlessBilling,TotalCharges,persen_value_multi,...,persen_backup,persen_protect,persen_tech_support,persen_stream_tv,persen_stream_mv,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,0,0,1,0,1,0,1,1,29.850000,9.680000,...,34.490000,43.940000,49.310000,39.900000,39.54,0,0,1,0,No
1,1,0,1,1,1,0,1,0,30.200000,9.680000,...,34.490000,43.940000,49.310000,39.900000,39.54,0,0,1,0,Yes
2,0,0,0,0,1,0,1,0,29.950000,9.680000,...,34.490000,43.940000,49.310000,39.900000,39.54,1,0,0,0,Yes
3,0,1,0,0,5,0,1,0,131.050000,9.680000,...,34.490000,43.940000,49.310000,39.900000,39.54,0,0,1,0,Yes
4,1,0,1,1,9,0,1,1,248.950000,9.680000,...,34.490000,43.940000,49.310000,39.900000,39.54,0,0,1,0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10343,1,0,0,0,1,1,1,0,79.930604,48.130000,...,43.840000,43.940000,49.310000,39.900000,38.79,0,0,1,0,Yes
10344,0,0,0,0,1,1,1,1,45.616400,48.130000,...,43.840000,43.940000,49.310000,39.900000,39.54,0,0,0,1,Yes
10345,1,0,1,0,70,1,1,1,7537.987673,46.653483,...,34.490000,36.759872,34.055047,38.440000,38.79,0,0,0,0,Yes
10346,1,0,0,0,1,1,1,1,69.944554,48.130000,...,43.840000,43.940000,49.310000,39.900000,39.54,0,0,1,0,Yes


In [128]:
df_oversampling_smote['Churn'].value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

## feature and target

In [118]:
x = df_oversampling_smote.drop(['Churn'], axis =1)
y = df_oversampling_smote['Churn']

## scaling

In [119]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
X_std = scaler.transform(x)

## split data

In [120]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_std,
                                                  y,
                                                  test_size=0.3,
                                                  random_state = 42)

# Modeling

pemodelan kali ini, saya menggunakan Logregression, random forest dan decision tree. kemudian membandingkan performa ketiganya

#### Logistic Regression

In [121]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state = 42).fit(X_train,y_train)

In [122]:
y_pred_logreg = logreg.predict(X_test)

In [123]:
from sklearn.metrics import classification_report 

print(classification_report(y_test,y_pred_logreg))

              precision    recall  f1-score   support

          No       0.84      0.80      0.82      1584
         Yes       0.80      0.84      0.82      1521

    accuracy                           0.82      3105
   macro avg       0.82      0.82      0.82      3105
weighted avg       0.82      0.82      0.82      3105



#### Decision Tree

In [124]:
from sklearn.tree import DecisionTreeClassifier

de_tree = DecisionTreeClassifier(random_state = 42).fit(X_train,y_train)
y_pred_decision_tree = de_tree.predict(X_test)
y_pred_decision_tree

array(['Yes', 'No', 'No', ..., 'No', 'No', 'No'], dtype=object)

In [125]:
print(classification_report(y_test,y_pred_decision_tree))

              precision    recall  f1-score   support

          No       0.81      0.79      0.80      1584
         Yes       0.78      0.80      0.79      1521

    accuracy                           0.80      3105
   macro avg       0.80      0.80      0.80      3105
weighted avg       0.80      0.80      0.80      3105



#### Random Forest

In [126]:
random_forest = RandomForestClassifier().fit(X_train, y_train)
y_pred_forest = random_forest.predict(X_test)

In [127]:
print(classification_report(y_test,y_pred_forest))

              precision    recall  f1-score   support

          No       0.84      0.85      0.85      1584
         Yes       0.84      0.83      0.84      1521

    accuracy                           0.84      3105
   macro avg       0.84      0.84      0.84      3105
weighted avg       0.84      0.84      0.84      3105



## Evaluation

pilih model yang terbaik performannya kemudian beri pejelasan kenapa model tersebut lebih baik dibandingkan dengan yang lain

model yang saya pilih adalah random_forest, karena memiliki performa paling tinggi dari model yang lain, dimana nilai akurasi f1-scorenya 0.84 untuk prediksi customer yang churn (yes), dan memiliki nilai performa precision (yes) yang paling besar