# Import Library

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import imblearn as imb
%matplotlib inline

from imblearn.over_sampling import SMOTE 

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, f1_score, roc_auc_score, precision_score
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Load dataset

In [2]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
df = df.loc[~df['TotalCharges'].str.contains(' ')]

In [4]:
df['TotalCharges'] = df['TotalCharges'].astype(float)

In [5]:
dfc = df.copy()
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [7]:
dfc['Churn'].value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

In [8]:
1869/7032*100

26.578498293515356

# Data Preprocessing

In [9]:
le = preprocessing.LabelEncoder()
le.fit(dfc['gender'])
dfc['gender'] = le.transform(df['gender'])

In [10]:
# Fit Data
le.fit(dfc['Partner'])
le.fit(dfc['Dependents'])
le.fit(dfc['PhoneService'])
le.fit(dfc['PaperlessBilling'])
le.fit(dfc['Churn'])

In [11]:
# Transform Data
dfc['Partner'] = le.transform(df['Partner']) 
dfc['Dependents'] = le.transform(df['Dependents']) 
dfc['PhoneService'] = le.transform(df['PhoneService']) 
dfc['PaperlessBilling'] = le.transform(df['PaperlessBilling']) 
dfc['Churn'] = le.transform(df['Churn']) 

## Maping

In [12]:
# Categorical 
phone_service = {'Yes': 'Yes',
                 'No': 'No',
                 'No phone service': 'No'}

internet_service = {'Yes': 'Yes',
                    'No': 'No',
                    'No internet service': 'No'}
                    
contract = {'Month-to-month' :1,
            'One year' :2,
            'Two year' :3}

yesno = {'Yes':1,
         'No':0}

In [13]:
# Maping Categorical Data
dfc['Contract'] = dfc['Contract'].map(contract)
dfc['MultipleLines'] = dfc['MultipleLines'].map(phone_service)
dfc['MultipleLines'] = dfc['MultipleLines'].map(yesno)

In [14]:
# Maping "No internet service" as "No"
dfc['OnlineSecurity'] = dfc['OnlineSecurity'].map(internet_service)
dfc['OnlineBackup'] = dfc['OnlineBackup'].map(internet_service)
dfc['DeviceProtection'] = dfc['DeviceProtection'].map(internet_service)
dfc['TechSupport'] = dfc['TechSupport'].map(internet_service)
dfc['StreamingTV'] = dfc['StreamingTV'].map(internet_service)
dfc['StreamingMovies'] = dfc['StreamingMovies'].map(internet_service)

In [15]:
# Maping yes no
dfc['OnlineSecurity'] = dfc['OnlineSecurity'].map(yesno)
dfc['OnlineBackup'] = dfc['OnlineBackup'].map(yesno)
dfc['DeviceProtection'] = dfc['DeviceProtection'].map(yesno)
dfc['TechSupport'] = dfc['TechSupport'].map(yesno)
dfc['StreamingTV'] = dfc['StreamingTV'].map(yesno)
dfc['StreamingMovies'] = dfc['StreamingMovies'].map(yesno)

## One Hot Encoding

In [16]:
# One Hot Encoding
aa = pd.get_dummies(dfc['InternetService'], prefix='InternetService')
bb = pd.get_dummies(dfc['PaymentMethod'], prefix='PaymentMethod')

### Insert One Hot Encoding Columns

In [17]:
# Payment
dfc.insert(17, 'PaymentMethod_MailedCheck', bb['PaymentMethod_Mailed check'])
dfc.insert(17, 'PaymentMethod_ElectronicCheck', bb['PaymentMethod_Electronic check'])
dfc.insert(17, 'PaymentMethod_CreditCard', bb['PaymentMethod_Credit card (automatic)'])
dfc.insert(17, 'PaymentMethod_BankTransfer', bb['PaymentMethod_Bank transfer (automatic)'])

# Internet Service
dfc.insert(9, 'InternetService_No', aa['InternetService_No'])
dfc.insert(9, 'InternetService_Fiber', aa['InternetService_Fiber optic'])
dfc.insert(9, 'InternetService_DSL', aa['InternetService_DSL'])

In [18]:
dfc = dfc.drop(['customerID','InternetService', 'PaymentMethod'], axis=1)

### Imbalance Check

In [19]:
dfc['Churn'].value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

In [20]:
dfc['Churn'].value_counts(normalize=True)*100

0    73.421502
1    26.578498
Name: Churn, dtype: float64

### Balancing Data

In [21]:
X = dfc.drop(['Churn'],axis = 1)
Y = dfc['Churn']

In [22]:
xx, yy = over_sampling.SMOTE().fit_resample(X,Y)

## Split Data

In [23]:
(X_train, X_test,
Y_train, Y_test) = train_test_split(xx, yy, test_size=0.25, random_state=42)

# **Modeling** (Gunakan lebih min 2 model dan bandingkan hasil evaluasinya)

Bebas menggunakan model, mau menggunakan decision tree, random forest, xgboost, dll juga boleh<br><br>
silahkan berekspresi :)

## KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
pred_knn = knn.predict(X_test)

## Decision Tree

In [25]:
dt = DecisionTreeClassifier(random_state=42)
model2 = dt.fit(X_train, Y_train)
pred_dt = dt.predict(X_test)

## Random Forest

In [26]:
model = RandomForestClassifier(random_state=42)
model.fit(X_train, Y_train)
pred_rf = model.predict(X_test)

# **Evaluation**

pilih model yang terbaik performannya kemudian beri pejelasan kenapa model tersebut lebih baik dibandingkan dengan yang lain

## KNN

In [27]:
print('Akurasi',accuracy_score(Y_test, pred_knn))

Akurasi 0.7753679318357862


## Decision Tree

In [29]:
print('Akurasi',accuracy_score(Y_test, pred_dt))

Akurasi 0.790859798605732


## Random Forest

In [30]:
print('Akurasi',accuracy_score(Y_test, pred_rf))

Akurasi 0.8361735089078234


# **Kesimpulan**

Dari Ketiga Model yang digunakan pada penelitian ini, maka saya memilih random forest karena diketahui bahwa model random forest lah yang paling tinggi nilai akurasinya.