## Import Library

## Load dataset

In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from imblearn.over_sampling import SMOTE

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Bootcamp_Data_Science/Churn.csv')
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Data Preprocessing

In [None]:
# Mengonversi 'yes' menjadi 1 dan 'no' menjadi 0 pada kolom 'churn'
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1


### Split Data

In [None]:
X = df.drop(['Churn'],axis = 1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
X_train.shape, X_test.shape

((5634, 20), (1409, 20))

In [None]:
X_train.head(3)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2142,4223-BKEOR,Female,0,No,Yes,21,Yes,No,DSL,Yes,No,Yes,No,No,Yes,One year,No,Mailed check,64.85,1336.8
1623,6035-RIIOM,Female,0,No,No,54,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),97.2,5129.45
6074,3797-VTIDR,Male,0,Yes,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,23.45,23.45


### Categorical Encoding

In [None]:
# Mengambil kolom yang bukan numerik
kolom_non_numerik = X_train.select_dtypes(exclude=['number']).columns.tolist()
kolom_non_numerik

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'TotalCharges']

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
X_train['customerID'] = label_encoder.fit_transform(X_train['customerID'])
X_train['gender'] = label_encoder.fit_transform(X_train['gender'])
X_train['Partner'] = label_encoder.fit_transform(X_train['Partner'])
X_train['Dependents'] = label_encoder.fit_transform(X_train['Dependents'])
X_train['PhoneService'] = label_encoder.fit_transform(X_train['PhoneService'])
X_train['MultipleLines'] = label_encoder.fit_transform(X_train['MultipleLines'])
X_train['InternetService'] = label_encoder.fit_transform(X_train['InternetService'])
X_train['OnlineSecurity'] = label_encoder.fit_transform(X_train['OnlineSecurity'])
X_train['OnlineBackup'] = label_encoder.fit_transform(X_train['OnlineBackup'])
X_train['DeviceProtection'] = label_encoder.fit_transform(X_train['DeviceProtection'])
X_train['TechSupport'] = label_encoder.fit_transform(X_train['TechSupport'])
X_train['StreamingTV'] = label_encoder.fit_transform(X_train['StreamingTV'])
X_train['StreamingMovies'] = label_encoder.fit_transform(X_train['StreamingMovies'])
X_train['Contract'] = label_encoder.fit_transform(X_train['Contract'])
X_train['PaperlessBilling'] = label_encoder.fit_transform(X_train['PaperlessBilling'])
X_train['PaymentMethod'] = label_encoder.fit_transform(X_train['PaymentMethod'])
X_train['TotalCharges'] = label_encoder.fit_transform(X_train['TotalCharges'])

In [None]:
X_train.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
2142,2370,0,0,0,1,21,1,0,0,2,0,2,0,0,2,1,0,3,64.85,492
1623,3457,0,0,0,0,54,1,2,1,0,2,0,0,2,2,2,1,0,97.2,3511
6074,2149,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,2,23.45,1566
1362,1426,1,0,0,0,4,1,0,1,0,0,0,0,0,0,0,1,2,70.2,1627
6754,1545,1,0,0,1,0,1,2,0,2,2,0,2,0,0,2,1,0,61.9,0


In [None]:
'''
# Cek distribusi target
print(y_train.value_counts())

# Menggunakan SMOTE untuk mengatasi ketidakseimbangan
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Cek distribusi target setelah SMOTE
print(pd.Series(y_train_balanced).value_counts())
'''

'\n# Cek distribusi target\nprint(y_train.value_counts())\n\n# Menggunakan SMOTE untuk mengatasi ketidakseimbangan\nsmote = SMOTE(random_state=42)\nX_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)\n\n# Cek distribusi target setelah SMOTE\nprint(pd.Series(y_train_balanced).value_counts())\n'

In [None]:
X_train.shape, X_test.shape

((5634, 20), (1409, 20))

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_std = scaler.transform(X_train)

## Modeling (Gunakan lebih min 2 model dan bandingkan hasil evaluasinya)

Bebas menggunakan model, mau menggunakan decision tree, random forest, xgboost, dll juga boleh<br><br>
silahkan berekspresi :)

### Random Forest

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model = RandomForestClassifier()

# Inisialisasi KFold split dengan 5 kelompok
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store metrics
f1_scores = []
recall_scores = []
precision_scores = []

for train_index, test_index in kf.split(X_std):
  X_train_, X_val = X_std[train_index], X_std[test_index]
  y_train_, y_val = y_train.values[train_index], y_train.values[test_index]

  model.fit(X_train_, y_train_)
  y_pred = model.predict(X_val)


  f1_scores.append(f1_score(y_val, y_pred))
  recall_scores.append(precision_score(y_val, y_pred))
  precision_scores.append(f1_score(y_val, y_pred))

f1_mean, mae_std = np.mean(f1_scores), np.std(f1_scores)
recall_mean, recall_std = np.mean(recall_scores), np.std(recall_scores)
precision_mean,precision_std = np.mean(precision_scores), np.std(precision_scores)

In [None]:
X_std.shape, y_train.shape

((5634, 20), (5634,))

In [None]:
f1_mean, mae_std

(0.5427831324444133, 0.014871368066263208)

In [None]:
recall_mean, recall_std

(0.6382285452791898, 0.017838134530609933)

In [None]:
precision_mean,precision_std


(0.5427831324444133, 0.014871368066263208)

### Test

In [None]:
X_test.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
185,1024-GUALD,Female,0,Yes,No,1,No,No phone service,DSL,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,24.8,24.8
2715,0484-JPBRU,Male,0,No,No,41,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),25.25,996.45
3825,3620-EHIMZ,Female,0,Yes,Yes,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.35,1031.7
1807,6910-HADCM,Female,0,No,No,1,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,No,Electronic check,76.35,76.35
132,8587-XYZSF,Male,0,No,No,67,Yes,No,DSL,No,No,No,Yes,No,No,Two year,No,Bank transfer (automatic),50.55,3260.1


### Categorical Encoding

In [None]:
'''
X_test['customerID'] = label_encoder.transform(X_test['customerID'])
X_test['gender'] = label_encoder.transform(X_test['gender'])
X_test['Partner'] = label_encoder.transform(X_test['Partner'])
X_test['Dependents'] = label_encoder.transform(X_test['Dependents'])
X_test['PhoneService'] = label_encoder.transform(X_test['PhoneService'])
X_test['MultipleLines'] = label_encoder.transform(X_test['MultipleLines'])
X_test['InternetService'] = label_encoder.transform(X_test['InternetService'])
X_test['OnlineSecurity'] = label_encoder.transform(X_test['OnlineSecurity'])
X_test['OnlineBackup'] = label_encoder.transform(X_test['OnlineBackup'])
X_test['DeviceProtection'] = label_encoder.transform(X_test['DeviceProtection'])
X_test['TechSupport'] = label_encoder.transform(X_test['TechSupport'])
X_test['StreamingTV'] = label_encoder.transform(X_test['StreamingTV'])
X_test['StreamingMovies'] = label_encoder.transform(X_test['StreamingMovies'])
X_test['Contract'] = label_encoder.transform(X_test['Contract'])
X_test['PaperlessBilling'] = label_encoder.transform(X_test['PaperlessBilling'])
X_test['PaymentMethod'] = label_encoder.transform(X_test['PaymentMethod'])
X_test['TotalCharges'] = label_encoder.transform(X_test['TotalCharges'])
'''

"\nX_test['customerID'] = label_encoder.transform(X_test['customerID'])\nX_test['gender'] = label_encoder.transform(X_test['gender'])\nX_test['Partner'] = label_encoder.transform(X_test['Partner'])\nX_test['Dependents'] = label_encoder.transform(X_test['Dependents'])\nX_test['PhoneService'] = label_encoder.transform(X_test['PhoneService'])\nX_test['MultipleLines'] = label_encoder.transform(X_test['MultipleLines'])\nX_test['InternetService'] = label_encoder.transform(X_test['InternetService'])\nX_test['OnlineSecurity'] = label_encoder.transform(X_test['OnlineSecurity'])\nX_test['OnlineBackup'] = label_encoder.transform(X_test['OnlineBackup'])\nX_test['DeviceProtection'] = label_encoder.transform(X_test['DeviceProtection'])\nX_test['TechSupport'] = label_encoder.transform(X_test['TechSupport'])\nX_test['StreamingTV'] = label_encoder.transform(X_test['StreamingTV'])\nX_test['StreamingMovies'] = label_encoder.transform(X_test['StreamingMovies'])\nX_test['Contract'] = label_encoder.transfor

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Create an instance of OrdinalEncoder with handle_unknown
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Assuming categorical_columns is a list of your categorical columns
categorical_columns = ['customerID', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                     'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                     'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

# Fit the OrdinalEncoder on your training data
# Assuming X_train is your training data
ordinal_encoder.fit(X_train[categorical_columns])

# Transform both training and test data
X_train[categorical_columns] = ordinal_encoder.transform(X_train[categorical_columns])
X_test[categorical_columns] = ordinal_encoder.transform(X_test[categorical_columns])

In [None]:
X_test.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
185,-1.0,-1.0,0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.8,24.8
2715,-1.0,-1.0,0,-1.0,-1.0,41,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25.25,996.45
3825,-1.0,-1.0,0,-1.0,-1.0,52,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.35,1031.7
1807,-1.0,-1.0,0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,76.35,76.35
132,-1.0,-1.0,0,-1.0,-1.0,67,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,50.55,3260.1


In [None]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1409 entries, 185 to 1161
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        1409 non-null   float64
 1   gender            1409 non-null   float64
 2   SeniorCitizen     1409 non-null   int64  
 3   Partner           1409 non-null   float64
 4   Dependents        1409 non-null   float64
 5   tenure            1409 non-null   int64  
 6   PhoneService      1409 non-null   float64
 7   MultipleLines     1409 non-null   float64
 8   InternetService   1409 non-null   float64
 9   OnlineSecurity    1409 non-null   float64
 10  OnlineBackup      1409 non-null   float64
 11  DeviceProtection  1409 non-null   float64
 12  TechSupport       1409 non-null   float64
 13  StreamingTV       1409 non-null   float64
 14  StreamingMovies   1409 non-null   float64
 15  Contract          1409 non-null   float64
 16  PaperlessBilling  1409 non-null   float64
 17

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming 'TotalCharges' is the problematic column
# Replace empty strings or strings with just spaces with NaN
X_test['TotalCharges'] = pd.to_numeric(X_test['TotalCharges'], errors='coerce')

# Impute NaN values with the mean or median of the column
X_test['TotalCharges'] = X_test['TotalCharges'].fillna(X_test['TotalCharges'].mean()) # Or use median

# Fit the StandardScaler on your training data - assuming you have already done this previously
# scaler = StandardScaler()
# scaler.fit(X_train[['TotalCharges']])  # Only include the 'TotalCharges' column

# Now, you can apply the scaling
X_test_std = scaler.transform(X_test)

In [None]:
y_test_pred = model.predict(X_test_std)

### Evaluation Random Forest

pilih model yang terbaik performannya kemudian beri pejelasan kenapa model tersebut lebih baik dibandingkan dengan yang lain

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.76      0.99      0.86      1036
           1       0.87      0.12      0.21       373

    accuracy                           0.76      1409
   macro avg       0.81      0.56      0.54      1409
weighted avg       0.79      0.76      0.69      1409



### Decision Trees

In [None]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

dt_model = DecisionTreeClassifier()

# Inisialisasi KFold split dengan 5 kelompok
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store metrics
f1_scores = []
recall_scores = []
precision_scores = []

for train_index, test_index in kf.split(X_std):
  X_train_, X_val = X_std[train_index], X_std[test_index]
  y_train_, y_val = y_train.values[train_index], y_train.values[test_index]

  dt_model.fit(X_train_, y_train_)
  y_pred = dt_model.predict(X_val)


  f1_scores.append(f1_score(y_val, y_pred))
  recall_scores.append(precision_score(y_val, y_pred))
  precision_scores.append(f1_score(y_val, y_pred))

f1_mean, mae_std = np.mean(f1_scores), np.std(f1_scores)
recall_mean, recall_std = np.mean(recall_scores), np.std(recall_scores)
precision_mean,precision_std = np.mean(precision_scores), np.std(precision_scores)

In [None]:
X_std.shape, y_train.shape

((5634, 20), (5634,))

In [None]:
f1_mean, mae_std

(0.4788919845053935, 0.015625126559734644)

In [None]:
recall_mean, recall_std

(0.46610768268617286, 0.01572188646632053)

In [None]:
precision_mean,precision_std

(0.4788919845053935, 0.015625126559734644)

In [None]:
X_test.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
185,-1.0,-1.0,0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,24.8,24.8
2715,-1.0,-1.0,0,-1.0,-1.0,41,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,25.25,996.45
3825,-1.0,-1.0,0,-1.0,-1.0,52,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,19.35,1031.7
1807,-1.0,-1.0,0,-1.0,-1.0,1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,76.35,76.35
132,-1.0,-1.0,0,-1.0,-1.0,67,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,50.55,3260.1


In [None]:
y_test_preddt = dt_model.predict(X_test_std)

### Evaluation Decision Trees

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_preddt))

              precision    recall  f1-score   support

           0       0.74      0.78      0.76      1036
           1       0.29      0.25      0.27       373

    accuracy                           0.64      1409
   macro avg       0.51      0.51      0.51      1409
weighted avg       0.62      0.64      0.63      1409



## Overall Evaluation

dari hasil evaluasi kedua model:
1. akurasi Random Forest mencapai min 70% dimana lebih besar dari Decision Trees, respectively 76% dan 64%.
2. Precision value dipilih karena melihat dari False Positivenya dimana actual tidak churn tapi diprediksi churn, dimana hal ini lebih krusial untuk di analisa karena dengan begini, perusahaan bisa lebih preventive untuk pelanggan setianya.
3. dilihat dari hasil Precesionnya, model Random Forest juga memilih value lebih besar (87%) dari Decision Trees yang hanya (29%).

Kesimpulan: Model Random Forest lebih bagus dari Decision Trees untuk kasus ini.
