In [1]:
# Import library pengolahan struktur data
import pandas as pd

# Import library pengolahan angka
import numpy as np

## **Client Credit Card Analysis**
---

- Task : Classification
- Objective : Prediksi credit card client default / tidak dari data customer bank di Taiwan

### **Data description:**

Ada 30.000 data dengan 25 fitur. Fitur-fitur tersebut adalah 
- `ID`
- `LIMIT_BAL`: Limit balance (credit limit dalam dollar Taiwan)
- `SEX`: gender (1=male, 2-female)
- `EDUCATION`: level of education (1 = graduate school; 2 = university; 3 = high school; 4 = others)
- `MARRIAGE`: Marital status (1 = married; 2 = single; 3 = others)
- `AGE`: Age (in years)
- `PAY_0` - `PAY_6` indicate the payment status over the last 6 months. (-1 = pay duly; 1 = payment delay for one month; 2 = payment delay for two months; …; 8 = payment delay for eight months; 9 = payment delay for nine months and above)
- `BILL_AMT1` through `BILL_AMT6` (remaining amount in bill for each of the last six months)
- `PAY_AMT_1` through `PAY_AMT6` (amount paid in each bill over the last six months)


**Output variable (desired target)**
- `default.payment.next.month` - Indicator variable for default on the following month. (this is our variable of interest, which we will want to predict)


### Import Data and Data Preprocessing

In [2]:
# Load Data
# Simpan dengan nama bank_df
data = pd.read_csv("data/credit_data.csv",
                   sep = ";")

In [3]:
# Tampilkan seluruh data
data

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,,1
2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,0.0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0,0
4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000.0,1.0,3.0,1.0,39.0,0.0,0.0,0.0,0.0,0.0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,150000.0,1.0,,2.0,43.0,-1.0,-1.0,-1.0,-1.0,0.0,...,8979.0,5190.0,0.0,1837.0,3526.0,,129.0,0.0,0.0,0
29997,30000.0,,2.0,2.0,37.0,4.0,3.0,2.0,-1.0,0.0,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,,1
29998,80000.0,1.0,3.0,,41.0,1.0,-1.0,0.0,,0.0,...,52774.0,,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [4]:
data.shape

# Output
# (Jumlah observasi, jumlah kolom/fitur)

(30000, 24)

In [5]:
# cek data duplicate
duplicate_status = data.duplicated()
duplicate_status

0        False
1        False
2        False
3        False
4        False
         ...  
29995    False
29996    False
29997    False
29998    False
29999    False
Length: 30000, dtype: bool

In [6]:
# Cari jumlah data duplikatnya
duplicate_status.sum()

# FALSE = 0 --> kalo tidak duplikat 
# TRUE = 1 --> kalo duplikat
# Kalau ada yang duplikat, maka jumlahnya > 0

0

In [7]:
data = data.drop_duplicates()

# Tidak ada yang di-drop karena tidak ada duplikat

In [8]:
data.shape

# Selalu sanity check!
# Periksa ulang jumlah observasi

(30000, 24)

In [9]:
# Kita ingin membuat fungsi yang isi perintahnya sebagai berikut
data = pd.read_csv("data/credit_data.csv", sep=";")
print("Data asli            : ", data.shape, "- (#observasi, #kolom)")

data = data.drop_duplicates()
print("Data setelah di-drop : ", data.shape, "- (#observasi, #kolom)")

Data asli            :  (30000, 24) - (#observasi, #kolom)
Data setelah di-drop :  (30000, 24) - (#observasi, #kolom)


In [10]:
def importData(filename):
    """
    Fungsi untuk import data & hapus duplikat
    :param filename: <string> nama file input (format .csv)
    :return df: <pandas dataframe> sampel data
    """

    # read data
    df = pd.read_csv(filename, sep=";")
    print("Data asli            : ", df.shape, "- (#observasi, #kolom)")

    # drop duplicates
    df = df.drop_duplicates()
    print("Data setelah di-drop : ", df.shape, "- (#observasi, #kolom)")

    return df

# (filename) adalah argumen
# Argumen adalah sebuah variable. 
# Jika fungsi tsb. diberi argumen filename = "credit_data.csv", 
# maka semua variabel 'filename' di dalam fungsi 
# akan berubah menjadi "credit_data.csv"

In [11]:
# input
file_credit = "data/credit_data.csv"

# panggil fungsi
data = importData(filename = file_credit)

Data asli            :  (30000, 24) - (#observasi, #kolom)
Data setelah di-drop :  (30000, 24) - (#observasi, #kolom)


In [12]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,,1
2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,0.0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0,0
4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0,0


### <b><font color='blue'> 2. Data Preprocessing:</font></b>
---
    * Input-Output Split, Train-Test Split
    * Processing Categorical
    * Imputation, Normalization, Drop Duplicates

### **Input-Output Split**

- Fitur `y` adalah output variabel dari data marketing
- yang lainnya menjadi input

In [13]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,,1
2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,0.0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0,0
4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0,0


In [14]:
output_data = data["default.payment.next.month"]

# buat data yang berisi data target
# pilih data dengan nama kolom `default.payment.next.month`, lalu namakan sebagai output_data

In [15]:
output_data.head()

0    1
1    1
2    0
3    0
4    0
Name: default.payment.next.month, dtype: int64

In [16]:
# **Buat data input**

# - DATA = INPUT + OUTPUT
# - DATA - OUTPUT = INPUT
# - Jadi kalau dari data, kita drop VARIABLE OUTPUT, maka tersisa hanya variabel INPUT.

input_data = data.drop(["default.payment.next.month"], 
                       axis = 1)
input_data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,
2,,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,0.0,...,,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0
3,50000.0,2.0,,1.0,37.0,0.0,,0.0,0.0,0.0,...,49291.0,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,,1069.0,1000.0
4,50000.0,,,1.0,57.0,-1.0,0.0,-1.0,0.0,0.0,...,35835.0,20940.0,19146.0,19131.0,2000.0,,10000.0,9000.0,689.0,679.0


In [17]:
# Buat semuanya jadi fungsi

# buat output_data
# buat input_data
# return input_data dan output_data

# isi perintah yang akan dimasukkan ke dalam fungsi
output_data = data["default.payment.next.month"]
input_data = data.drop("default.payment.next.month",
                       axis = 1)

In [18]:
def extractInputOutput(data,
                       output_column_name,
                    #    column_to_drop
                       ):
    """
    Fungsi untuk memisahkan data input dan output
    :param data: <pandas dataframe> data seluruh sample
    :param output_column_name: <string> nama kolom output
    :return input_data: <pandas dataframe> data input
    :return output_data: <pandas series> data output
    """
    # drop data
    # data = data.drop(columns = column_to_drop)
    output_data = data[output_column_name]
    input_data = data.drop(output_column_name,
                           axis = 1)
    
    return input_data, output_data

# (data, output_column_name) adalah argumen
# Argumen adalah sebuah variable. 
# Jika fungsi tsb. diberi argumen data = credit_data, 
# maka semua variabel 'data' di dalam fungsi akan berubah menjadi credit_data

In [19]:
# Jangan sampai salah urutan dalam penempatan return
# column_to_drop = ["Unnamed: 0"]
output_column_name = ["default.payment.next.month"]

X, y = extractInputOutput(data = data,
                          output_column_name = output_column_name,
                        #   column_to_drop = column_to_drop
                          )

In [20]:
X.head(2)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,


In [21]:
y.head(2)

Unnamed: 0,default.payment.next.month
0,1
1,1


In [22]:
for i in X.columns:
    print(i, ":", len(X[i].value_counts()))

LIMIT_BAL : 81
SEX : 2
EDUCATION : 7
MARRIAGE : 4
AGE : 56
PAY_0 : 11
PAY_2 : 11
PAY_3 : 11
PAY_4 : 11
PAY_5 : 10
PAY_6 : 10
BILL_AMT1 : 21179
BILL_AMT2 : 20743
BILL_AMT3 : 20523
BILL_AMT4 : 20076
BILL_AMT5 : 19495
BILL_AMT6 : 19123
PAY_AMT1 : 7515
PAY_AMT2 : 7443
PAY_AMT3 : 7151
PAY_AMT4 : 6576
PAY_AMT5 : 6543
PAY_AMT6 : 6564


In [23]:
# Check repayment status
repayment_columns = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]

for i in repayment_columns:
    print(X[i].value_counts(normalize=True))

 0.0    0.490045
-1.0    0.190549
 1.0    0.122330
-2.0    0.092337
 2.0    0.089363
 3.0    0.010590
 4.0    0.002611
 5.0    0.000834
 8.0    0.000653
 6.0    0.000363
 7.0    0.000326
Name: PAY_0, dtype: float64
 0.0    0.523188
-1.0    0.201820
 2.0    0.131477
-2.0    0.126473
 3.0    0.011059
 4.0    0.003191
 1.0    0.000979
 5.0    0.000761
 7.0    0.000616
 6.0    0.000399
 8.0    0.000036
Name: PAY_2, dtype: float64
 0.0    0.525787
-1.0    0.199682
-2.0    0.135278
 2.0    0.126387
 3.0    0.007662
 4.0    0.002530
 7.0    0.000867
 6.0    0.000831
 5.0    0.000723
 1.0    0.000145
 8.0    0.000108
Name: PAY_3, dtype: float64
 0.0    0.548248
-1.0    0.190492
-2.0    0.144472
 2.0    0.105120
 3.0    0.006051
 4.0    0.002247
 7.0    0.001848
 5.0    0.001196
 6.0    0.000181
 1.0    0.000072
 8.0    0.000072
Name: PAY_4, dtype: float64
 0.0    0.564964
-1.0    0.184945
-2.0    0.151993
 2.0    0.086848
 3.0    0.005715
 4.0    0.002858
 7.0    0.001989
 5.0    0.000506
 6.0

In [24]:
# There are some classes of category that doesn't give information enough
# Based on these proportion below, we should grouping some classes:
# PAY_0 : class 4-8 into class 3
# PAY_2 : class 4-8 into class 3, class 1 into class 2
# PAY_3 : class 4-8 into class 3, class 1 into class 2
# PAY_4 : class 4-8 into class 3, class 1 into class 2
# PAY_5 : class 4-8 into class 3
# PAY_6 : class 4-8 into class 3

pay_list_to_3 = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
pay_list_to_2 = ["PAY_2", "PAY_3", "PAY_4"]

list_set_3 = [4.0, 5.0, 6.0, 7.0, 8.0]

# Convert to 3
for i in pay_list_to_3:
    for j in list_set_3:
        data.loc[data[i] == j, i] = 3.0

# Convert to 2
for i in pay_list_to_2:
    data.loc[data[i] == 1.0, i] = 2.0

In [25]:
for i in repayment_columns:
    print(data[i].value_counts(normalize=True))

 0.0    0.490045
-1.0    0.190549
 1.0    0.122330
-2.0    0.092337
 2.0    0.089363
 3.0    0.015377
Name: PAY_0, dtype: float64
 0.0    0.523188
-1.0    0.201820
 2.0    0.132456
-2.0    0.126473
 3.0    0.016063
Name: PAY_2, dtype: float64
 0.0    0.525787
-1.0    0.199682
-2.0    0.135278
 2.0    0.126531
 3.0    0.012722
Name: PAY_3, dtype: float64
 0.0    0.548248
-1.0    0.190492
-2.0    0.144472
 2.0    0.105193
 3.0    0.011595
Name: PAY_4, dtype: float64
 0.0    0.564964
-1.0    0.184945
-2.0    0.151993
 2.0    0.086848
 3.0    0.011249
Name: PAY_5, dtype: float64
 0.0    0.543062
-1.0    0.190831
-2.0    0.163244
 2.0    0.092704
 3.0    0.010160
Name: PAY_6, dtype: float64


In [26]:
# Education
# EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
data["EDUCATION"].value_counts(normalize=True)

2.0    0.466696
1.0    0.354689
3.0    0.163071
5.0    0.009261
4.0    0.004104
6.0    0.001707
0.0    0.000472
Name: EDUCATION, dtype: float64

In [27]:
# Terdapat kelas bernilai "0.0" yang tidak terdapat di Deskripsi Variabel
# kelas 4-6 pada kategori Education kurang dapat menjelaskan tingkat pendidikan dan sedikitnya informasi yang dapat diberikan
# kelas 0, 4, 5, dan 6 dijadikan satu kelas, yakni others atau kelas 4.0
edu_list_to_4 = [0.0, 5.0, 6.0]

for i in edu_list_to_4:
    data.loc[data['EDUCATION']== i, 'EDUCATION'] = 4.0

In [28]:
# redefine the input and output after change some values of class in repayment and education
X, y = extractInputOutput(data = data,
                          output_column_name = output_column_name,
                        #   column_to_drop = column_to_drop
                          )

In [29]:
X.head(2)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,,2.0,2.0,1.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,...,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0
1,120000.0,2.0,2.0,2.0,26.0,-1.0,2.0,,0.0,0.0,...,2682.0,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,


### **Train-Test Split**

- **Kenapa?**
  - Karena tidak mau overfit data training
  - Test data akan menjadi future data
  - Kita akan latih model ML di data training, dengan CV (Cross-validation)
  - Selanjutnya melakukan evaluasi di data testing

  **Train Test Split Function**
1. `X` adalah input
2. `y` adalah output (target)
3. `test_size` adalah seberapa besar proporsi data test dari keseluruhan data. Contoh `test_size = 0.2` artinya data test akan berisi 20% data.
4. `random_state` adalah kunci untuk random. Harus di-setting sama. Misal `random_state = 123`.
5. Output:
   - `X_train` = input dari data training
   - `X_test` = input dari data testing
   - `y_train` = output dari data training
   - `y_test` = output dari data testing
6. Urutan outputnya: `X_train, X_test, y_train, y_test`. Tidak boleh terbalik

> Readmore: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [30]:
# Import train-test splitting library dari sklearn (scikit learn)
from sklearn.model_selection import train_test_split

In [31]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.25,
                                                    random_state = 12)

In [32]:
# Sanity check hasil splitting
print(X_train.shape)
print(X_test.shape)

(22500, 23)
(7500, 23)


In [33]:
# Ratio
X_test.shape[0] / X.shape[0]

# Hasil 0.25 - sesuai dengan test_size kita

0.25

### **Data Imputation**

- Proses pengisian data yang kosong (NaN)
- Ada 2 hal yang diperhatikan:
  - Numerical Imputation
  - Categorical Imputation

In [34]:
X_train.isnull().sum()

# Output: nama variabel, True/False.
# Jika True, maka ada data yang kosong

# Ada 2500-2700 data yang kosong

LIMIT_BAL    1824
SEX          1740
EDUCATION    1878
MARRIAGE     1729
AGE          1829
PAY_0        1826
PAY_2        1798
PAY_3        1778
PAY_4        1810
PAY_5        1782
PAY_6        1765
BILL_AMT1    1790
BILL_AMT2    1837
BILL_AMT3    1774
BILL_AMT4    1771
BILL_AMT5    1861
BILL_AMT6    1898
PAY_AMT1     1875
PAY_AMT2     1829
PAY_AMT3     1731
PAY_AMT4     1754
PAY_AMT5     1818
PAY_AMT6     1776
dtype: int64

In [35]:
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
21804,120000.0,2.0,3.0,2.0,,0.0,0.0,0.0,0.0,0.0,...,107838.0,103273.0,101353.0,,4000.0,5013.0,4000.0,,3130.0,4000.0
29559,20000.0,1.0,2.0,1.0,38.0,,,0.0,0.0,0.0,...,11608.0,12313.0,12848.0,800.0,1477.0,1500.0,,1500.0,800.0,800.0
10281,60000.0,2.0,2.0,2.0,26.0,1.0,2.0,0.0,0.0,2.0,...,55709.0,54173.0,54065.0,57499.0,2000.0,2500.0,3200.0,1000.0,4500.0,0.0
6256,100000.0,2.0,,2.0,29.0,0.0,0.0,0.0,,0.0,...,19664.0,16407.0,15068.0,40794.0,1775.0,1407.0,,935.0,42000.0,1500.0
3040,60000.0,1.0,1.0,1.0,38.0,2.0,0.0,0.0,,2.0,...,32396.0,,35456.0,34736.0,2000.0,,2800.0,1500.0,0.0,1500.0


Data kategorikal:
- job
- marital
- education
- default
- housing
- loan
- contact
- month
- poutcome

Sisanya adalah numerical

In [36]:
X_train.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [37]:
#_get_numeric_data() hanya akan mengambil column berisikan integer dan float
# hati-hati dengan data kategoric yang berbentuk integer!!
X_train_numerical = X_train._get_numeric_data() 
X_train_numerical.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
21804,120000.0,2.0,3.0,2.0,,0.0,0.0,0.0,0.0,0.0,...,107838.0,103273.0,101353.0,,4000.0,5013.0,4000.0,,3130.0,4000.0
29559,20000.0,1.0,2.0,1.0,38.0,,,0.0,0.0,0.0,...,11608.0,12313.0,12848.0,800.0,1477.0,1500.0,,1500.0,800.0,800.0
10281,60000.0,2.0,2.0,2.0,26.0,1.0,2.0,0.0,0.0,2.0,...,55709.0,54173.0,54065.0,57499.0,2000.0,2500.0,3200.0,1000.0,4500.0,0.0
6256,100000.0,2.0,,2.0,29.0,0.0,0.0,0.0,,0.0,...,19664.0,16407.0,15068.0,40794.0,1775.0,1407.0,,935.0,42000.0,1500.0
3040,60000.0,1.0,1.0,1.0,38.0,2.0,0.0,0.0,,2.0,...,32396.0,,35456.0,34736.0,2000.0,,2800.0,1500.0,0.0,1500.0


In [38]:
# Buat kolom numerik
categorical_column = ["SEX", "EDUCATION", "MARRIAGE",
                      "PAY_0", "PAY_2", "PAY_3", "PAY_4",
                      "PAY_5", "PAY_6"]

numerical_column = ["LIMIT_BAL", "AGE", "BILL_AMT1", "BILL_AMT2",
                    "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6",
                    "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4",
                    "PAY_AMT5", "PAY_AMT6"]

In [39]:
# Seleksi dataframe numerik
X_train_numerical = X_train[numerical_column]
X_train_numerical.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
21804,120000.0,,106621.0,105893.0,107838.0,103273.0,101353.0,,4000.0,5013.0,4000.0,,3130.0,4000.0
29559,20000.0,38.0,9269.0,,11608.0,12313.0,12848.0,800.0,1477.0,1500.0,,1500.0,800.0,800.0
10281,60000.0,26.0,58603.0,58918.0,55709.0,54173.0,54065.0,57499.0,2000.0,2500.0,3200.0,1000.0,4500.0,0.0
6256,100000.0,29.0,17848.0,19032.0,19664.0,16407.0,15068.0,40794.0,1775.0,1407.0,,935.0,42000.0,1500.0
3040,60000.0,38.0,30013.0,31226.0,32396.0,,35456.0,34736.0,2000.0,,2800.0,1500.0,0.0,1500.0


In [40]:
X_train_numerical.isnull().any()

# Semua variabel numerical memiliki missing values

LIMIT_BAL    True
AGE          True
BILL_AMT1    True
BILL_AMT2    True
BILL_AMT3    True
BILL_AMT4    True
BILL_AMT5    True
BILL_AMT6    True
PAY_AMT1     True
PAY_AMT2     True
PAY_AMT3     True
PAY_AMT4     True
PAY_AMT5     True
PAY_AMT6     True
dtype: bool

In [41]:
from sklearn.impute import SimpleImputer

In [42]:
imputer = SimpleImputer(missing_values = np.nan,
                        strategy = "median")

# namakan function SimpleImputer menjadi imputer, jangan lupa tanda kurung ()
# missing_values adalah tanda missing values dalam data.
#   - bisa NaN, bisa 999, bisa "KOSONG"
# Strategy median adalah strategy imputasi, 
# jika data kosong, diganti dengan median target
# Strategi lainnya adalah: mean

In [43]:
# Isi perintah yang akan dibuat dalam fungsi

# Fit imputer
imputer.fit(X_train_numerical)

# Transform
imputed_data = imputer.transform(X_train_numerical)
X_train_numerical_imputed = pd.DataFrame(imputed_data)

X_train_numerical_imputed.columns = X_train_numerical.columns
X_train_numerical_imputed.index = X_train_numerical.index

In [44]:
X_train_numerical_imputed.isnull().any()

LIMIT_BAL    False
AGE          False
BILL_AMT1    False
BILL_AMT2    False
BILL_AMT3    False
BILL_AMT4    False
BILL_AMT5    False
BILL_AMT6    False
PAY_AMT1     False
PAY_AMT2     False
PAY_AMT3     False
PAY_AMT4     False
PAY_AMT5     False
PAY_AMT6     False
dtype: bool

In [45]:
from sklearn.impute import SimpleImputer

def numericalImputation(data, numerical_column):
    """
    Fungsi untuk melakukan imputasi data numerik
    :param data: <pandas dataframe> sample data input
    :param numerical_column: <list> list kolom numerik data
    :return X_train_numerical: <pandas dataframe> data numerik
    :return imputer_numerical: numerical imputer method
    """
    # Filter data numerik
    numerical_data = data[numerical_column]

    # Buat imputer
    imputer_numerical = SimpleImputer(missing_values = np.nan,
                                      strategy = "median")
    imputer_numerical.fit(numerical_data)

    # Transform
    imputed_data = imputer_numerical.transform(numerical_data)
    numerical_data_imputed = pd.DataFrame(imputed_data)

    numerical_data_imputed.columns = numerical_column
    numerical_data_imputed.index = numerical_data.index

    return numerical_data_imputed, imputer_numerical

In [46]:
# Imputation Numeric
X_train_numerical, imputer_numerical = numericalImputation(data = X_train,
                                                           numerical_column = numerical_column)
X_train_numerical.isnull().any()

LIMIT_BAL    False
AGE          False
BILL_AMT1    False
BILL_AMT2    False
BILL_AMT3    False
BILL_AMT4    False
BILL_AMT5    False
BILL_AMT6    False
PAY_AMT1     False
PAY_AMT2     False
PAY_AMT3     False
PAY_AMT4     False
PAY_AMT5     False
PAY_AMT6     False
dtype: bool

In [47]:
categorical_column

['SEX',
 'EDUCATION',
 'MARRIAGE',
 'PAY_0',
 'PAY_2',
 'PAY_3',
 'PAY_4',
 'PAY_5',
 'PAY_6']

In [48]:
# Periksa lagi missing value
categorical_data = X_train[categorical_column]
categorical_data.isnull().sum()

SEX          1740
EDUCATION    1878
MARRIAGE     1729
PAY_0        1826
PAY_2        1798
PAY_3        1778
PAY_4        1810
PAY_5        1782
PAY_6        1765
dtype: int64

In [49]:
# Kita isi kolom kategorik dengan "KOSONG"
categorical_data = X_train[categorical_column]
categorical_data = categorical_data.fillna(value="KOSONG")

In [50]:
categorical_data.isnull().sum()

SEX          0
EDUCATION    0
MARRIAGE     0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
dtype: int64

In [51]:
def categoricalImputation(data, categorical_column):
    """
    Fungsi untuk melakukan imputasi data kategorik
    :param data: <pandas dataframe> sample data input
    :param categorical_column: <list> list kolom kategorikal data
    :return categorical_data: <pandas dataframe> data kategorikal
    """
    # seleksi data
    categorical_data = data[categorical_column]

    # lakukan imputasi
    categorical_data = categorical_data.fillna(value="KOSONG")

    return categorical_data


In [52]:
X_train_categorical = categoricalImputation(data = X_train,
                                            categorical_column = categorical_column)

In [53]:
X_train_categorical.isnull().sum()

SEX          0
EDUCATION    0
MARRIAGE     0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
dtype: int64

In [54]:
categorical_ohe = pd.get_dummies(X_train_categorical)

In [55]:
categorical_ohe.head(2)

Unnamed: 0,SEX_1.0,SEX_2.0,SEX_KOSONG,EDUCATION_1.0,EDUCATION_2.0,EDUCATION_3.0,EDUCATION_4.0,EDUCATION_KOSONG,MARRIAGE_0.0,MARRIAGE_1.0,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
21804,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
29559,1,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0


In [56]:
def extractCategorical(data, categorical_column):
    """
    Fungsi untuk ekstrak data kategorikal dengan One Hot Encoding
    :param data: <pandas dataframe> data sample
    :param categorical_column: <list> list kolom kategorik
    :return categorical_ohe: <pandas dataframe> data sample dengan ohe
    """
    data_categorical = categoricalImputation(data = data,
                                             categorical_column = categorical_column)
    categorical_ohe = pd.get_dummies(data_categorical)

    return categorical_ohe

In [57]:
X_train_categorical_ohe = extractCategorical(data = X_train,
                                             categorical_column = categorical_column)
X_train_categorical_ohe.head()

Unnamed: 0,SEX_1.0,SEX_2.0,SEX_KOSONG,EDUCATION_1.0,EDUCATION_2.0,EDUCATION_3.0,EDUCATION_4.0,EDUCATION_KOSONG,MARRIAGE_0.0,MARRIAGE_1.0,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
21804,0,1,0,0,0,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
29559,1,0,0,0,1,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
10281,0,1,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
6256,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
3040,1,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0


In [58]:
# Simpan kolom OHE untuk diimplementasikan dalam testing data
# Agar shape-nya konsisten
ohe_columns = X_train_categorical_ohe.columns
ohe_columns

Index(['SEX_1.0', 'SEX_2.0', 'SEX_KOSONG', 'EDUCATION_1.0', 'EDUCATION_2.0',
       'EDUCATION_3.0', 'EDUCATION_4.0', 'EDUCATION_KOSONG', 'MARRIAGE_0.0',
       'MARRIAGE_1.0', 'MARRIAGE_2.0', 'MARRIAGE_3.0', 'MARRIAGE_KOSONG',
       'PAY_0_-2.0', 'PAY_0_-1.0', 'PAY_0_0.0', 'PAY_0_1.0', 'PAY_0_2.0',
       'PAY_0_3.0', 'PAY_0_KOSONG', 'PAY_2_-2.0', 'PAY_2_-1.0', 'PAY_2_0.0',
       'PAY_2_2.0', 'PAY_2_3.0', 'PAY_2_KOSONG', 'PAY_3_-2.0', 'PAY_3_-1.0',
       'PAY_3_0.0', 'PAY_3_2.0', 'PAY_3_3.0', 'PAY_3_KOSONG', 'PAY_4_-2.0',
       'PAY_4_-1.0', 'PAY_4_0.0', 'PAY_4_2.0', 'PAY_4_3.0', 'PAY_4_KOSONG',
       'PAY_5_-2.0', 'PAY_5_-1.0', 'PAY_5_0.0', 'PAY_5_2.0', 'PAY_5_3.0',
       'PAY_5_KOSONG', 'PAY_6_-2.0', 'PAY_6_-1.0', 'PAY_6_0.0', 'PAY_6_2.0',
       'PAY_6_3.0', 'PAY_6_KOSONG'],
      dtype='object')

In [59]:
# Join data Numerical dan Categorical
# Data numerik & kategorik harus disatukan kembali
# Penyatuan dengan pd.concat

X_train_concat = pd.concat([X_train_numerical,
                            X_train_categorical_ohe],
                           axis = 1)
X_train_concat.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
21804,120000.0,34.0,106621.0,105893.0,107838.0,103273.0,101353.0,17270.0,4000.0,5013.0,...,1,0,0,0,0,0,1,0,0,0
29559,20000.0,38.0,9269.0,21848.0,11608.0,12313.0,12848.0,800.0,1477.0,1500.0,...,1,0,0,0,0,1,0,0,0,0
10281,60000.0,26.0,58603.0,58918.0,55709.0,54173.0,54065.0,57499.0,2000.0,2500.0,...,0,1,0,0,0,0,0,1,0,0
6256,100000.0,29.0,17848.0,19032.0,19664.0,16407.0,15068.0,40794.0,1775.0,1407.0,...,1,0,0,0,0,1,0,0,0,0
3040,60000.0,38.0,30013.0,31226.0,32396.0,19225.0,35456.0,34736.0,2000.0,2019.0,...,0,1,0,0,0,0,0,1,0,0


In [60]:
X_train_concat.shape

(22500, 64)

In [61]:
X_train_concat.isnull().any()

LIMIT_BAL       False
AGE             False
BILL_AMT1       False
BILL_AMT2       False
BILL_AMT3       False
                ...  
PAY_6_-1.0      False
PAY_6_0.0       False
PAY_6_2.0       False
PAY_6_3.0       False
PAY_6_KOSONG    False
Length: 64, dtype: bool

### **Standardizing Variables**

- Menyamakan skala dari variabel input
- `fit`: imputer agar mengetahui mean dan standar deviasi dari setiap kolom
- `transform`: isi data dengan value yang sudah dinormalisasi
- output dari transform berupa pandas dataframe
- normalize dikeluarkan karena akan digunakan pada data test

In [62]:
from sklearn.preprocessing import StandardScaler

# Buat fungsi
def standardizerData(data):
    """
    Fungsi untuk melakukan standarisasi data
    :param data: <pandas dataframe> sampel data
    :return standardized_data: <pandas dataframe> sampel data standard
    :return standardizer: method untuk standardisasi data
    """
    data_columns = data.columns  # agar nama kolom tidak hilang
    data_index = data.index  # agar index tidak hilang

    # buat (fit) standardizer
    standardizer = StandardScaler()
    standardizer.fit(data)

    # transform data
    standardized_data_raw = standardizer.transform(data)
    standardized_data = pd.DataFrame(standardized_data_raw)
    standardized_data.columns = data_columns
    standardized_data.index = data_index

    return standardized_data, standardizer

In [63]:
X_train_clean, standardizer = standardizerData(data = X_train_concat)
X_train_clean.head()

Unnamed: 0,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,...,PAY_5_0.0,PAY_5_2.0,PAY_5_3.0,PAY_5_KOSONG,PAY_6_-2.0,PAY_6_-1.0,PAY_6_0.0,PAY_6_2.0,PAY_6_3.0,PAY_6_KOSONG
21804,-0.360265,-0.154808,0.801379,0.842062,0.91754,0.981687,1.048785,-0.346323,-0.088803,-0.026074,...,0.956756,-0.292831,-0.103397,-0.293278,-0.41502,-0.463859,0.993091,-0.304397,-0.09683,-0.291756
29559,-1.164155,0.297348,-0.561229,-0.369602,-0.500832,-0.471209,-0.43836,-0.62905,-0.240299,-0.178741,...,0.956756,-0.292831,-0.103397,-0.293278,-0.41502,2.155827,-1.006958,-0.304397,-0.09683,-0.291756
10281,-0.842599,-1.05912,0.129285,0.164831,0.14919,0.197417,0.254207,0.344256,-0.208895,-0.135284,...,-1.045198,3.414936,-0.103397,-0.293278,-0.41502,-0.463859,-1.006958,3.285187,-0.09683,-0.291756
6256,-0.521043,-0.720003,-0.441151,-0.410199,-0.382091,-0.405816,-0.401058,0.057495,-0.222405,-0.182783,...,0.956756,-0.292831,-0.103397,-0.293278,-0.41502,2.155827,-1.006958,-0.304397,-0.09683,-0.291756
3040,-0.842599,0.297348,-0.270881,-0.2344,-0.194429,-0.360805,-0.058479,-0.046498,-0.208895,-0.156187,...,-1.045198,3.414936,-0.103397,-0.293278,-0.41502,-0.463859,-1.006958,3.285187,-0.09683,-0.291756


### <b><font color='blue'> Training Machine Learning:</font></b>
---
    * Choose Score to optimize and Hyperparameter Space
    * Cross-Validation: Random vs Grid Search CV
    * Kita harus mengalahkan benchmark

### **Benchmark / Baseline**

- Baseline untuk evaluasi nanti
- Karena ini klasifikasi, bisa kita ambil dari proporsi kelas target yang terbesar
- Dengan kata lain, menebak hasil output marketing response dengan nilai "no" semua tanpa modeling

In [64]:
y_train.value_counts(normalize = True)

# baseline akurasi = 78%

default.payment.next.month
0                             0.779511
1                             0.220489
dtype: float64

In [65]:
# 1. Import Model
# Misal kita gunakan 3 model ML untuk klasifikasi:
# K-nearest neighbor (K-NN)
# Logistic Regression
# Random Forest

# Import dari sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [66]:
# 2. Fitting Model
# Cara fitting/training model mengikuti yang dokumentasi model

# Model K nearest neighbor
knn = KNeighborsClassifier()
knn.fit(X_train_clean, y_train)

  return self._fit(X, y)


In [67]:
# Model Logistic Regression
logreg = LogisticRegression(random_state = 123)
logreg.fit(X_train_clean, y_train)

  y = column_or_1d(y, warn=True)


In [68]:
# Model Random Forest Classifier
random_forest = RandomForestClassifier(random_state = 123)
random_forest.fit(X_train_clean, y_train)

  random_forest.fit(X_train_clean, y_train)


In [69]:
# Model Random Forest Classifier 1
# Mari kita ubah hyperparameter dari random forest --> n_estimator
# Maksud & tujuan akan dijelaskan pada kelas Random Forest
# Tambahkan n_estimator = 500

random_forest_1 = RandomForestClassifier(random_state = 123,
                                         n_estimators = 500)
random_forest_1.fit(X_train_clean, y_train)

  random_forest_1.fit(X_train_clean, y_train)


In [70]:
# 3. Prediction
# Saatnya melakukan prediksi

# Prediksi Logistic Regression
logreg.predict(X_train_clean)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [71]:
predicted_logreg = pd.DataFrame(logreg.predict(X_train_clean))
predicted_logreg

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,1
...,...
22495,0
22496,0
22497,0
22498,0


In [72]:
predicted_knn = pd.DataFrame(knn.predict(X_train_clean))
predicted_knn.head()

Unnamed: 0,0
0,0
1,0
2,1
3,0
4,1


In [73]:
predicted_rf = pd.DataFrame(random_forest.predict(X_train_clean))
predicted_rf.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,1


In [74]:
predicted_rf_1 = pd.DataFrame(random_forest_1.predict(X_train_clean))
predicted_rf_1.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,1


In [75]:
# 4. Cek performa model di data training
benchmark = y_train.value_counts(normalize=True)[0]
benchmark

0.7795111111111112

In [76]:
# akurasi knn
knn.score(X_train_clean, y_train)

0.8420888888888889

In [77]:
# akurasi logistic regression
logreg.score(X_train_clean, y_train)

0.8182222222222222

In [78]:
# akurasi random forest
random_forest.score(X_train_clean, y_train)

1.0

In [79]:
# akurasi random forest 1
random_forest_1.score(X_train_clean, y_train)

1.0

In [80]:
# 5. Simpan model ke file pickle

import joblib

# Simpan model logreg ke dalam folder yang sama dengan notebook
# dengan nama logreg.pkl
joblib.dump(logreg, "logreg.pkl")

joblib.dump(knn, "knn.pkl")
joblib.dump(random_forest, "random_forest.pkl")
joblib.dump(random_forest_1, "random_forest_1.pkl")

['random_forest_1.pkl']

In [81]:
# 6. Test Prediction
# Siapkan file test dataset
# Lakukan preprocessing yang sama dengan yang dilakukan di train dataset
# gunakan imputer_numerical dan standardizer yang telah di-fit di train dataset

def extractTest(data,
                numerical_column, categorical_column, ohe_column,
                imputer_numerical, standardizer):
    """
    Fungsi untuk mengekstrak & membersihkan test data 
    :param data: <pandas dataframe> sampel data test
    :param numerical_column: <list> kolom numerik
    :param categorical_column: <list> kolom kategorik
    :param ohe_column: <list> kolom one-hot-encoding dari data kategorik
    :param imputer_numerical: <sklearn method> imputer data numerik
    :param standardizer: <sklearn method> standardizer data
    :return cleaned_data: <pandas dataframe> data final
    """
    # Filter data
    numerical_data = data[numerical_column]
    categorical_data = data[categorical_column]

    # Proses data numerik
    numerical_data = pd.DataFrame(imputer_numerical.transform(numerical_data))
    numerical_data.columns = numerical_column
    numerical_data.index = data.index

    # Proses data kategorik
    categorical_data = categorical_data.fillna(value="KOSONG")
    categorical_data.index = data.index
    categorical_data = pd.get_dummies(categorical_data)
    categorical_data.reindex(index = categorical_data.index, 
                             columns = ohe_column)

    # Gabungkan data
    concat_data = pd.concat([numerical_data, categorical_data],
                             axis = 1)
    cleaned_data = pd.DataFrame(standardizer.transform(concat_data))
    cleaned_data.columns = concat_data.columns

    return cleaned_data


In [82]:
def testPrediction(X_test, y_test, classifier, compute_score):
    """
    Fungsi untuk mendapatkan prediksi dari model
    :param X_test: <pandas dataframe> input
    :param y_test: <pandas series> output/target
    :param classifier: <sklearn method> model klasifikasi
    :param compute_score: <bool> True: menampilkan score, False: tidak
    :return test_predict: <list> hasil prediksi data input
    :return score: <float> akurasi model
    """
    if compute_score:
        score = classifier.score(X_test, y_test)
        print(f"Accuracy : {score:.4f}")

    test_predict = classifier.predict(X_test)

    return test_predict, score

In [83]:
X_test_clean = extractTest(data = X_test,
                           numerical_column = numerical_column,
                           categorical_column = categorical_column,
                           ohe_column = ohe_columns,
                           imputer_numerical = imputer_numerical,
                           standardizer = standardizer)
X_test_clean.shape

(7500, 64)

In [84]:
# Logistic Regression Performance
logreg_test_predict, score = testPrediction(X_test = X_test_clean,
                                            y_test = y_test,
                                            classifier = logreg,
                                            compute_score = True)

Accuracy : 0.8201


In [85]:
# K nearest neighbor Performance
knn_test_predict, score = testPrediction(X_test = X_test_clean,
                                         y_test = y_test,
                                         classifier = knn,
                                         compute_score = True)

Accuracy : 0.7921


In [86]:
# Random Forest Performance
rf_test_predict, score = testPrediction(X_test = X_test_clean,
                                        y_test = y_test,
                                        classifier = random_forest,
                                        compute_score = True)

Accuracy : 0.8139


In [87]:
# Random Forest 1 Performance
rf_1_test_predict, score = testPrediction(X_test = X_test_clean,
                                          y_test = y_test,
                                          classifier = random_forest_1,
                                          compute_score = True)  

Accuracy : 0.8171
