# Tugas Besar Machine Learning 2

## Libraries  

In [41]:
# Import pandas
import pandas as pd

# Import numpy
import numpy as np

# Import scikit-learn
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

# Import imbalance-learn
from collections import Counter
from imblearn.over_sampling import SMOTE

## Import Data

In [42]:
# Melakukan download dataset dari google drive
!gdown --id 1o5q4R7U67sa9ltjjLd7cOxWrAVoVRVFR
!gdown --id 1hBBtVEpkfk-A9gzmKiFnO-MEIrmnsMk1

Downloading...
From: https://drive.google.com/uc?id=1o5q4R7U67sa9ltjjLd7cOxWrAVoVRVFR
To: /content/kendaraan_train.csv
100% 15.2M/15.2M [00:00<00:00, 133MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hBBtVEpkfk-A9gzmKiFnO-MEIrmnsMk1
To: /content/kendaraan_test.csv
100% 2.31M/2.31M [00:00<00:00, 70.7MB/s]


In [43]:
# Read the downloaded train and test CSV files into DataFrames
data_train = pd.read_csv('kendaraan_train.csv')
data_test = pd.read_csv('kendaraan_test.csv')

## Exploratory Data Analysis

In [44]:
# Inspect the training data
data_train

Unnamed: 0,id,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
0,1,Wanita,30.0,1.0,33.0,1.0,< 1 Tahun,Tidak,28029.0,152.0,97.0,0
1,2,Pria,48.0,1.0,39.0,0.0,> 2 Tahun,Pernah,25800.0,29.0,158.0,0
2,3,,21.0,1.0,46.0,1.0,< 1 Tahun,Tidak,32733.0,160.0,119.0,0
3,4,Wanita,58.0,1.0,48.0,0.0,1-2 Tahun,Tidak,2630.0,124.0,63.0,0
4,5,Pria,50.0,1.0,35.0,0.0,> 2 Tahun,,34857.0,88.0,194.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
285826,285827,Wanita,23.0,1.0,4.0,1.0,< 1 Tahun,Tidak,25988.0,152.0,217.0,0
285827,285828,Wanita,21.0,1.0,46.0,1.0,< 1 Tahun,Tidak,44686.0,152.0,50.0,0
285828,285829,Wanita,23.0,1.0,50.0,1.0,< 1 Tahun,Tidak,49751.0,152.0,226.0,0
285829,285830,Pria,68.0,1.0,7.0,1.0,1-2 Tahun,Tidak,30503.0,124.0,270.0,0


In [45]:
# Count the missing values in every column
data_train.isna().sum()

id                       0
Jenis_Kelamin        14440
Umur                 14214
SIM                  14404
Kode_Daerah          14306
Sudah_Asuransi       14229
Umur_Kendaraan       14275
Kendaraan_Rusak      14188
Premi                14569
Kanal_Penjualan      14299
Lama_Berlangganan    13992
Tertarik                 0
dtype: int64

In [46]:
# Drop every row that has at least one missing value.
# data_train is rebound here, so the raw frame is only recoverable by re-reading the CSV.
data_train = data_train.dropna(how = 'any')

In [47]:
# Re-check that no missing values remain
data_train.isna().sum()

id                   0
Jenis_Kelamin        0
Umur                 0
SIM                  0
Kode_Daerah          0
Sudah_Asuransi       0
Umur_Kendaraan       0
Kendaraan_Rusak      0
Premi                0
Kanal_Penjualan      0
Lama_Berlangganan    0
Tertarik             0
dtype: int64

In [48]:
# Show column dtypes and non-null counts
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 171068 entries, 0 to 285830
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 171068 non-null  int64  
 1   Jenis_Kelamin      171068 non-null  object 
 2   Umur               171068 non-null  float64
 3   SIM                171068 non-null  float64
 4   Kode_Daerah        171068 non-null  float64
 5   Sudah_Asuransi     171068 non-null  float64
 6   Umur_Kendaraan     171068 non-null  object 
 7   Kendaraan_Rusak    171068 non-null  object 
 8   Premi              171068 non-null  float64
 9   Kanal_Penjualan    171068 non-null  float64
 10  Lama_Berlangganan  171068 non-null  float64
 11  Tertarik           171068 non-null  int64  
dtypes: float64(7), int64(2), object(3)
memory usage: 17.0+ MB


In [49]:
# Inspect data_train after the rows with missing values were removed
data_train

Unnamed: 0,id,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
0,1,Wanita,30.0,1.0,33.0,1.0,< 1 Tahun,Tidak,28029.0,152.0,97.0,0
1,2,Pria,48.0,1.0,39.0,0.0,> 2 Tahun,Pernah,25800.0,29.0,158.0,0
3,4,Wanita,58.0,1.0,48.0,0.0,1-2 Tahun,Tidak,2630.0,124.0,63.0,0
5,6,Pria,21.0,1.0,35.0,1.0,< 1 Tahun,Tidak,22735.0,152.0,171.0,0
8,9,Wanita,20.0,1.0,8.0,1.0,< 1 Tahun,Tidak,30786.0,160.0,31.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
285826,285827,Wanita,23.0,1.0,4.0,1.0,< 1 Tahun,Tidak,25988.0,152.0,217.0,0
285827,285828,Wanita,21.0,1.0,46.0,1.0,< 1 Tahun,Tidak,44686.0,152.0,50.0,0
285828,285829,Wanita,23.0,1.0,50.0,1.0,< 1 Tahun,Tidak,49751.0,152.0,226.0,0
285829,285830,Pria,68.0,1.0,7.0,1.0,1-2 Tahun,Tidak,30503.0,124.0,270.0,0


In [50]:
# Remove duplicated rows.
# NOTE(review): the `id` column is still part of the frame at this point and looks
# unique in the displayed rows, so full-row duplicates can presumably never occur
# and this call is likely a no-op — confirm, and consider comparing without `id`.
data_train = data_train.drop_duplicates()
data_train

Unnamed: 0,id,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
0,1,Wanita,30.0,1.0,33.0,1.0,< 1 Tahun,Tidak,28029.0,152.0,97.0,0
1,2,Pria,48.0,1.0,39.0,0.0,> 2 Tahun,Pernah,25800.0,29.0,158.0,0
3,4,Wanita,58.0,1.0,48.0,0.0,1-2 Tahun,Tidak,2630.0,124.0,63.0,0
5,6,Pria,21.0,1.0,35.0,1.0,< 1 Tahun,Tidak,22735.0,152.0,171.0,0
8,9,Wanita,20.0,1.0,8.0,1.0,< 1 Tahun,Tidak,30786.0,160.0,31.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
285826,285827,Wanita,23.0,1.0,4.0,1.0,< 1 Tahun,Tidak,25988.0,152.0,217.0,0
285827,285828,Wanita,21.0,1.0,46.0,1.0,< 1 Tahun,Tidak,44686.0,152.0,50.0,0
285828,285829,Wanita,23.0,1.0,50.0,1.0,< 1 Tahun,Tidak,49751.0,152.0,226.0,0
285829,285830,Pria,68.0,1.0,7.0,1.0,1-2 Tahun,Tidak,30503.0,124.0,270.0,0


In [51]:
# Drop the identifier and the numeric columns so only the
# categorical/binary features and the target remain
train_columns_to_drop = [
    'id',
    'Umur',
    'Premi',
    'Lama_Berlangganan',
    'Kanal_Penjualan',
    'Kode_Daerah',
]
data_train = data_train.drop(columns=train_columns_to_drop)
data_train

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
0,Wanita,1.0,1.0,< 1 Tahun,Tidak,0
1,Pria,1.0,0.0,> 2 Tahun,Pernah,0
3,Wanita,1.0,0.0,1-2 Tahun,Tidak,0
5,Pria,1.0,1.0,< 1 Tahun,Tidak,0
8,Wanita,1.0,1.0,< 1 Tahun,Tidak,0
...,...,...,...,...,...,...
285826,Wanita,1.0,1.0,< 1 Tahun,Tidak,0
285827,Wanita,1.0,1.0,< 1 Tahun,Tidak,0
285828,Wanita,1.0,1.0,< 1 Tahun,Tidak,0
285829,Pria,1.0,1.0,1-2 Tahun,Tidak,0


In [52]:
# Inspect the column dtypes
data_train.dtypes

Jenis_Kelamin       object
SIM                float64
Sudah_Asuransi     float64
Umur_Kendaraan      object
Kendaraan_Rusak     object
Tertarik             int64
dtype: object

In [53]:
 # Merubah tipe data float64 menjadi int64
# Cast both flag columns in one call; the displayed values are whole numbers
# (0.0 / 1.0), so the integer cast is expected to be lossless
data_train = data_train.astype({'SIM': 'int64', 'Sudah_Asuransi': 'int64'})
data_train

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
0,Wanita,1,1,< 1 Tahun,Tidak,0
1,Pria,1,0,> 2 Tahun,Pernah,0
3,Wanita,1,0,1-2 Tahun,Tidak,0
5,Pria,1,1,< 1 Tahun,Tidak,0
8,Wanita,1,1,< 1 Tahun,Tidak,0
...,...,...,...,...,...,...
285826,Wanita,1,1,< 1 Tahun,Tidak,0
285827,Wanita,1,1,< 1 Tahun,Tidak,0
285828,Wanita,1,1,< 1 Tahun,Tidak,0
285829,Pria,1,1,1-2 Tahun,Tidak,0


In [54]:
# Encode the three text columns as integer codes so the models can process them
le = preprocessing.LabelEncoder()
for text_column in ('Jenis_Kelamin', 'Umur_Kendaraan', 'Kendaraan_Rusak'):
    # The shared encoder is refitted for every column, so after the loop it
    # only remembers the categories of the last one
    data_train[text_column] = le.fit_transform(data_train[text_column])
data_train

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
0,1,1,1,1,1,0
1,0,1,0,2,0,0
3,1,1,0,0,1,0
5,0,1,1,1,1,0
8,1,1,1,1,1,0
...,...,...,...,...,...,...
285826,1,1,1,1,1,0
285827,1,1,1,1,1,0
285828,1,1,1,1,1,0
285829,0,1,1,0,1,0


In [55]:
# Check the dtypes before further processing (all columns should now be integers)
data_train.dtypes

Jenis_Kelamin      int64
SIM                int64
Sudah_Asuransi     int64
Umur_Kendaraan     int64
Kendaraan_Rusak    int64
Tertarik           int64
dtype: object

In [56]:
# Compute the pairwise correlation between all variables
data_train.corr()

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
Jenis_Kelamin,1.0,0.018742,0.082809,0.11597,0.092064,-0.053709
SIM,0.018742,1.0,0.016459,0.029913,0.019149,0.008675
Sudah_Asuransi,0.082809,0.016459,1.0,0.175825,0.826157,-0.340711
Umur_Kendaraan,0.11597,0.029913,0.175825,1.0,0.176561,-0.103575
Kendaraan_Rusak,0.092064,0.019149,0.826157,0.176561,1.0,-0.353388
Tertarik,-0.053709,0.008675,-0.340711,-0.103575,-0.353388,1.0


In [57]:
# Inspect the test data
data_test

Unnamed: 0,Jenis_Kelamin,Umur,SIM,Kode_Daerah,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Premi,Kanal_Penjualan,Lama_Berlangganan,Tertarik
0,Wanita,49,1,8,0,1-2 Tahun,Pernah,46963,26,145,0
1,Pria,22,1,47,1,< 1 Tahun,Tidak,39624,152,241,0
2,Pria,24,1,28,1,< 1 Tahun,Tidak,110479,152,62,0
3,Pria,46,1,8,1,1-2 Tahun,Tidak,36266,124,34,0
4,Pria,35,1,23,0,1-2 Tahun,Pernah,26963,152,229,0
...,...,...,...,...,...,...,...,...,...,...,...
47634,Pria,61,1,46,0,> 2 Tahun,Pernah,31039,124,67,0
47635,Pria,41,1,15,0,1-2 Tahun,Pernah,2630,157,232,0
47636,Pria,24,1,29,1,< 1 Tahun,Tidak,33101,152,211,0
47637,Pria,59,1,30,0,1-2 Tahun,Pernah,37788,26,239,1


In [58]:
# # # Menghapus data yang duplikat
# data_test = data_test.drop_duplicates()

In [59]:
# Drop the same numeric columns that were removed from the training set
# (the test set has no `id` column to drop)
test_columns_to_drop = [
    'Umur',
    'Premi',
    'Lama_Berlangganan',
    'Kanal_Penjualan',
    'Kode_Daerah',
]
data_test = data_test.drop(columns=test_columns_to_drop)
data_test

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
0,Wanita,1,0,1-2 Tahun,Pernah,0
1,Pria,1,1,< 1 Tahun,Tidak,0
2,Pria,1,1,< 1 Tahun,Tidak,0
3,Pria,1,1,1-2 Tahun,Tidak,0
4,Pria,1,0,1-2 Tahun,Pernah,0
...,...,...,...,...,...,...
47634,Pria,1,0,> 2 Tahun,Pernah,0
47635,Pria,1,0,1-2 Tahun,Pernah,0
47636,Pria,1,1,< 1 Tahun,Tidak,0
47637,Pria,1,0,1-2 Tahun,Pernah,1


In [60]:
# Inspect the column dtypes of the test set
data_test.dtypes

Jenis_Kelamin      object
SIM                 int64
Sudah_Asuransi      int64
Umur_Kendaraan     object
Kendaraan_Rusak    object
Tertarik            int64
dtype: object

In [61]:
# Cast SIM and Sudah_Asuransi to int64.
# NOTE(review): the dtypes output above already lists both columns as int64 for the
# test set, so this cast looks like a no-op kept for symmetry with the train cell.
data_test.SIM = data_test.SIM.astype('int64')
data_test.Sudah_Asuransi = data_test.Sudah_Asuransi.astype('int64')
data_test

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
0,Wanita,1,0,1-2 Tahun,Pernah,0
1,Pria,1,1,< 1 Tahun,Tidak,0
2,Pria,1,1,< 1 Tahun,Tidak,0
3,Pria,1,1,1-2 Tahun,Tidak,0
4,Pria,1,0,1-2 Tahun,Pernah,0
...,...,...,...,...,...,...
47634,Pria,1,0,> 2 Tahun,Pernah,0
47635,Pria,1,0,1-2 Tahun,Pernah,0
47636,Pria,1,1,< 1 Tahun,Tidak,0
47637,Pria,1,0,1-2 Tahun,Pernah,1


In [62]:
# Inspect the dtypes after the cast
data_test.dtypes

Jenis_Kelamin      object
SIM                 int64
Sudah_Asuransi      int64
Umur_Kendaraan     object
Kendaraan_Rusak    object
Tertarik            int64
dtype: object

In [63]:
# Encode the text columns with the same integer codes the training cell produced.
# LabelEncoder assigns codes from the sorted categories of the data it is fitted
# on, so refitting it on the test set only matches the training codes when both
# sets contain exactly the same categories.  Explicit tables remove that
# dependency; the codes below mirror the encoded training output shown earlier.
category_codes = {
    'Jenis_Kelamin': {'Pria': 0, 'Wanita': 1},
    'Umur_Kendaraan': {'1-2 Tahun': 0, '< 1 Tahun': 1, '> 2 Tahun': 2},
    'Kendaraan_Rusak': {'Pernah': 0, 'Tidak': 1},
}
for column, codes in category_codes.items():
    # A numeric column has already been encoded (the cell was re-run); leave it alone.
    if pd.api.types.is_numeric_dtype(data_test[column]):
        continue
    encoded = data_test[column].map(codes)
    # An unmapped category would silently turn into NaN; fail loudly instead.
    if encoded.isna().any():
        unknown = sorted(set(data_test.loc[encoded.isna(), column].astype(str)))
        raise ValueError(f"Unexpected categories in {column}: {unknown}")
    data_test[column] = encoded.astype('int64')
data_test

Unnamed: 0,Jenis_Kelamin,SIM,Sudah_Asuransi,Umur_Kendaraan,Kendaraan_Rusak,Tertarik
0,1,1,0,0,0,0
1,0,1,1,1,1,0
2,0,1,1,1,1,0
3,0,1,1,0,1,0
4,0,1,0,0,0,0
...,...,...,...,...,...,...
47634,0,1,0,2,0,0
47635,0,1,0,0,0,0
47636,0,1,1,1,1,0
47637,0,1,0,0,0,1


In [64]:
# Inspect the dtypes of the encoded test set
data_test.dtypes

Jenis_Kelamin      int64
SIM                int64
Sudah_Asuransi     int64
Umur_Kendaraan     int64
Kendaraan_Rusak    int64
Tertarik           int64
dtype: object

In [65]:
# Save the preprocessed training data to CSV (the index is not written)
data_train.to_csv(r'kendaraan_train_preprocessed.csv', index = False)

## Finding the Model

In [66]:
# Split both datasets into independent variables (X) and the dependent variable (y).
# Columns are selected by name instead of by position so the split cannot
# silently pick different columns if the two frames ever differ in column order.
feature_columns = ['Sudah_Asuransi', 'Umur_Kendaraan', 'Kendaraan_Rusak']
target_column = 'Tertarik'

X_train = data_train[feature_columns]
y_train = data_train[target_column]

X_test = data_test[feature_columns]
y_test = data_test[target_column]

In [67]:
# Oversample the minority class with SMOTE.
# A fixed random_state makes the synthetic samples, and therefore every score
# reported in the following cells, reproducible across runs.
sm = SMOTE(random_state=42)
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)

# Show the class distribution after resampling
counter = Counter(y_train_over)
print(counter)



Counter({0: 150270, 1: 150270})


In [68]:
# Try a k-nearest-neighbours classifier (k = 5) trained on the oversampled data
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_over, y_train_over)
Y_pred_knn = knn.predict(X_test)
# Accuracy on the test set; the classification report further below gives the per-class view
knn.score(X_test, y_test)

0.8769705493398267

In [69]:
# Try scikit-learn's Gaussian Naive Bayes trained on the oversampled data
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_over, y_train_over)
Y_pred_gnb = gnb.predict(X_test)
# Accuracy on the test set
gnb.score(X_test, y_test)

0.6384055080921094

In [70]:
# Try logistic regression (default settings) trained on the oversampled data
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train_over, y_train_over)
Y_pred_lr = lr.predict(X_test)
# Accuracy on the test set
lr.score(X_test, y_test)

0.6384055080921094

In [71]:
# Try a decision tree (default settings) trained on the oversampled data
from sklearn.tree import DecisionTreeClassifier

dc = DecisionTreeClassifier()
dc.fit(X_train_over, y_train_over)
Y_pred_dc = dc.predict(X_test)
# Accuracy on the test set
dc.score(X_test, y_test)

0.6384055080921094

In [72]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for each baseline, printed in this order:
# KNN, Gaussian Naive Bayes, logistic regression, decision tree
print(classification_report(y_test, Y_pred_knn))
print(classification_report(y_test,Y_pred_gnb))
print(classification_report(y_test, Y_pred_lr))
print(classification_report(y_test, Y_pred_dc))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.88      1.00      0.93     41778
           1       0.00      0.00      0.00      5861

    accuracy                           0.88     47639
   macro avg       0.44      0.50      0.47     47639
weighted avg       0.77      0.88      0.82     47639

              precision    recall  f1-score   support

           0       0.99      0.59      0.74     41778
           1       0.25      0.98      0.40      5861

    accuracy                           0.64     47639
   macro avg       0.62      0.78      0.57     47639
weighted avg       0.90      0.64      0.70     47639

              precision    recall  f1-score   support

           0       0.99      0.59      0.74     41778
           1       0.25      0.98      0.40      5861

    accuracy                           0.64     47639
   macro avg       0.62      0.78      0.57     47639
weighted avg       0.90      0.64      0.70     47639

              preci

## Modelling

In [73]:
# Scratch checks on the oversampled data.  Only the last expression of the cell
# (y_test.shape) is displayed; the two len(...) results are computed and discarded.
len(X_train_over.columns)

# Re-attach the target to the features to count rows with Kendaraan_Rusak == 0 and Tertarik == 0
test1 = pd.concat([X_train_over, y_train_over], axis = 1)
len(test1.Kendaraan_Rusak[(test1.Kendaraan_Rusak == 0) & (test1.Tertarik == 0)])

y_test.shape

(47639,)

In [74]:
# Membuat model naive bayes untuk 3 fitur

class Naive_Bayes:
  """Categorical Naive Bayes classifier over three integer-coded features.

  The model reads the columns ``Sudah_Asuransi`` (codes 0/1),
  ``Umur_Kendaraan`` (codes 0/1/2) and ``Kendaraan_Rusak`` (codes 0/1) from
  ``X`` and a binary target ``y`` (0 = "n", 1 = "y").  Values outside those
  codes are ignored while counting and contribute a neutral factor of 1.0 for
  that feature while predicting.

  After ``fit`` every ``prob_*`` table holds, per category code, the raw counts
  (``"n"`` / ``"y"``) and the conditional probabilities (``"prob_n"`` /
  ``"prob_y"``).  After ``predict`` the ``prob_class`` table holds the raw and
  normalised class scores of the most recent call only.

  All rows are addressed by position, so ``X`` and ``y`` may carry any pandas
  index; they only have to be aligned row by row.
  """

  def __init__(self):
    # Category labels are listed in the order of their integer codes.
    self.prob_asuransi = {
        "categories" : ["belum asuransi", "sudah asuransi"],
        "y" : [0,0],
        "n" : [0,0],
        "prob_y" : [0,0],
        "prob_n" : [0,0]
    }

    self.prob_umur_kend = {
        "categories" : ["satu sampai dua tahun", "kurang dari satu tahun", "lebih dari dua tahun"],
        "y" : [0,0,0],
        "n" : [0,0,0],
        "prob_y" : [0,0,0],
        "prob_n" : [0,0,0]
    }

    # The notebook's label encoding maps 'Pernah' to 0 and 'Tidak' to 1,
    # so "pernah rusak" has to come first.
    self.prob_kend_rusak = {
        "categories" : ["pernah rusak", "belum pernah rusak"],
        "y" : [0,0],
        "n" : [0,0],
        "prob_y" : [0,0],
        "prob_n" : [0,0]
    }

    self.prob_tertarik = {
        "y" : 0,
        "n" : 0,
        "prob_y" : 0,
        "prob_n" : 0
    }

    self.prob_class = {
        "prob_y" : [],
        "prob_n" : [],
        "prob_y_norm" : [],
        "prob_n_norm" : []
    }

  def _feature_tables(self):
    """Pair every feature column name with the table that stores its statistics."""
    return (
        ("Sudah_Asuransi", self.prob_asuransi),
        ("Umur_Kendaraan", self.prob_umur_kend),
        ("Kendaraan_Rusak", self.prob_kend_rusak),
    )

  @staticmethod
  def _positional(values):
    """Return ``values`` as a NumPy array so rows are addressed by position, not by index label."""
    return np.asarray(values)

  @staticmethod
  def _fill_table(table, values, labels):
    """Overwrite ``table`` with per-class category counts and conditional probabilities."""
    n_categories = len(table["categories"])
    for label_code, key in ((0, "n"), (1, "y")):
      class_values = values[labels == label_code]
      counts = [int(np.count_nonzero(class_values == code)) for code in range(n_categories)]
      total = sum(counts)
      table[key] = counts
      # A class without usable rows gets all-zero likelihoods instead of a ZeroDivisionError.
      table["prob_" + key] = [count / total if total else 0.0 for count in counts]

  @staticmethod
  def _likelihood(values, probabilities):
    """Map every category code in ``values`` to its probability (1.0 for unknown codes)."""
    factors = np.ones(len(values), dtype=float)
    for code, probability in enumerate(probabilities):
      factors[values == code] = probability
    return factors

  def calc_label_proba(self, y):
    """Estimate the class priors from the target ``y``.

    Labels other than 0 and 1 are ignored.

    Raises:
      ValueError: if ``y`` contains no label equal to 0 or 1.
    """
    labels = self._positional(y)
    zeros = int(np.count_nonzero(labels == 0))
    ones = int(np.count_nonzero(labels == 1))
    total = zeros + ones
    if total == 0:
      raise ValueError("y must contain at least one label equal to 0 or 1")

    self.prob_tertarik["n"] = zeros
    self.prob_tertarik["y"] = ones
    self.prob_tertarik["prob_n"] = zeros / total
    self.prob_tertarik["prob_y"] = ones / total

  def calc_attr_proba(self, X, y):
    """Estimate P(feature value | class) for every feature.

    The tables are overwritten, not accumulated, so refitting starts from scratch.
    """
    labels = self._positional(y)
    for column, table in self._feature_tables():
      self._fill_table(table, self._positional(X[column]), labels)

  def classifying_label(self):
    """Turn the stored normalised scores into labels.

    Returns:
      pandas.Series named "Tertarik" with one label per scored row: 1 when the
      "y" score is strictly larger than the "n" score, otherwise 0 (ties
      included, so the output always has one entry per row).
    """
    pred = [
        1 if score_y > score_n else 0
        for score_y, score_n in zip(self.prob_class["prob_y_norm"], self.prob_class["prob_n_norm"])
    ]
    return pd.Series(data = pred, name = "Tertarik")

  def predict(self, X):
    """Predict the label of every row of ``X``.

    The scores stored in ``prob_class`` are replaced on every call, so calling
    ``predict`` repeatedly never mixes results of different inputs.

    Returns:
      pandas.Series named "Tertarik" (default integer index) with len(X) labels.
    """
    n_rows = len(X)
    score_n = np.full(n_rows, self.prob_tertarik["prob_n"], dtype=float)
    score_y = np.full(n_rows, self.prob_tertarik["prob_y"], dtype=float)

    # Multiply the prior by the likelihood of each feature, in column order.
    for column, table in self._feature_tables():
      values = self._positional(X[column])
      score_n = score_n * self._likelihood(values, table["prob_n"])
      score_y = score_y * self._likelihood(values, table["prob_y"])

    # Normalise; rows whose two scores are both zero get 0.5 / 0.5 instead of dividing by zero.
    total = score_n + score_y
    has_mass = total > 0
    safe_total = np.where(has_mass, total, 1.0)
    norm_n = np.where(has_mass, score_n / safe_total, 0.5)
    norm_y = np.where(has_mass, score_y / safe_total, 0.5)

    self.prob_class["prob_n"] = score_n.tolist()
    self.prob_class["prob_y"] = score_y.tolist()
    self.prob_class["prob_n_norm"] = norm_n.tolist()
    self.prob_class["prob_y_norm"] = norm_y.tolist()

    return self.classifying_label()

  def fit(self, X, y):
    """Fit the model: estimate the class priors, then the per-feature likelihoods."""
    # Class priors
    self.calc_label_proba(y)

    # Per-feature conditional probabilities
    self.calc_attr_proba(X, y)

  
  


In [75]:
# Main program: train the hand-written Naive Bayes on the oversampled data and predict the test set

nb = Naive_Bayes()
nb.fit(X_train_over, y_train_over)
Y_pred = nb.predict(X_test)

In [76]:
# Show accuracy together with per-class precision, recall and F1

print(classification_report(y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.99      0.59      0.74     41778
           1       0.25      0.98      0.40      5861

    accuracy                           0.64     47639
   macro avg       0.62      0.78      0.57     47639
weighted avg       0.90      0.64      0.70     47639



In [77]:
# Save the predicted labels to CSV (the index is not written)
Y_pred.to_csv(r'labelled_y.csv', index = False)

## Experiment

In [78]:
# Membuat model naive bayes untuk 2 fitur

class Naive_Bayes_exp:
  """Categorical Naive Bayes classifier over two integer-coded features.

  Experimental variant that reads only the columns ``Sudah_Asuransi``
  (codes 0/1) and ``Kendaraan_Rusak`` (codes 0/1) from ``X``; any other column
  is ignored.  The target ``y`` is binary (0 = "n", 1 = "y").  Values outside
  those codes are ignored while counting and contribute a neutral factor of
  1.0 for that feature while predicting.

  After ``fit`` every ``prob_*`` table holds, per category code, the raw counts
  (``"n"`` / ``"y"``) and the conditional probabilities (``"prob_n"`` /
  ``"prob_y"``).  After ``predict`` the ``prob_class`` table holds the raw and
  normalised class scores of the most recent call only.

  All rows are addressed by position, so ``X`` and ``y`` may carry any pandas
  index; they only have to be aligned row by row.
  """

  def __init__(self):
    # Category labels are listed in the order of their integer codes.
    self.prob_asuransi = {
        "categories" : ["belum asuransi", "sudah asuransi"],
        "y" : [0,0],
        "n" : [0,0],
        "prob_y" : [0,0],
        "prob_n" : [0,0]
    }

    # The notebook's label encoding maps 'Pernah' to 0 and 'Tidak' to 1,
    # so "pernah rusak" has to come first.
    self.prob_kend_rusak = {
        "categories" : ["pernah rusak", "belum pernah rusak"],
        "y" : [0,0],
        "n" : [0,0],
        "prob_y" : [0,0],
        "prob_n" : [0,0]
    }

    self.prob_tertarik = {
        "y" : 0,
        "n" : 0,
        "prob_y" : 0,
        "prob_n" : 0
    }

    self.prob_class = {
        "prob_y" : [],
        "prob_n" : [],
        "prob_y_norm" : [],
        "prob_n_norm" : []
    }

  def _feature_tables(self):
    """Pair every feature column name with the table that stores its statistics."""
    return (
        ("Sudah_Asuransi", self.prob_asuransi),
        ("Kendaraan_Rusak", self.prob_kend_rusak),
    )

  @staticmethod
  def _positional(values):
    """Return ``values`` as a NumPy array so rows are addressed by position, not by index label."""
    return np.asarray(values)

  @staticmethod
  def _fill_table(table, values, labels):
    """Overwrite ``table`` with per-class category counts and conditional probabilities."""
    n_categories = len(table["categories"])
    for label_code, key in ((0, "n"), (1, "y")):
      class_values = values[labels == label_code]
      counts = [int(np.count_nonzero(class_values == code)) for code in range(n_categories)]
      total = sum(counts)
      table[key] = counts
      # A class without usable rows gets all-zero likelihoods instead of a ZeroDivisionError.
      table["prob_" + key] = [count / total if total else 0.0 for count in counts]

  @staticmethod
  def _likelihood(values, probabilities):
    """Map every category code in ``values`` to its probability (1.0 for unknown codes)."""
    factors = np.ones(len(values), dtype=float)
    for code, probability in enumerate(probabilities):
      factors[values == code] = probability
    return factors

  def calc_label_proba(self, y):
    """Estimate the class priors from the target ``y``.

    Labels other than 0 and 1 are ignored.

    Raises:
      ValueError: if ``y`` contains no label equal to 0 or 1.
    """
    labels = self._positional(y)
    zeros = int(np.count_nonzero(labels == 0))
    ones = int(np.count_nonzero(labels == 1))
    total = zeros + ones
    if total == 0:
      raise ValueError("y must contain at least one label equal to 0 or 1")

    self.prob_tertarik["n"] = zeros
    self.prob_tertarik["y"] = ones
    self.prob_tertarik["prob_n"] = zeros / total
    self.prob_tertarik["prob_y"] = ones / total

  def calc_attr_proba(self, X, y):
    """Estimate P(feature value | class) for both features.

    The tables are overwritten, not accumulated, so refitting starts from scratch.
    """
    labels = self._positional(y)
    for column, table in self._feature_tables():
      self._fill_table(table, self._positional(X[column]), labels)

  def classifying_label(self):
    """Turn the stored normalised scores into labels.

    Returns:
      pandas.Series named "Tertarik" with one label per scored row: 1 when the
      "y" score is strictly larger than the "n" score, otherwise 0 (ties
      included, so the output always has one entry per row).
    """
    pred = [
        1 if score_y > score_n else 0
        for score_y, score_n in zip(self.prob_class["prob_y_norm"], self.prob_class["prob_n_norm"])
    ]
    return pd.Series(data = pred, name = "Tertarik")

  def predict(self, X):
    """Predict the label of every row of ``X``.

    The scores stored in ``prob_class`` are replaced on every call, so calling
    ``predict`` repeatedly never mixes results of different inputs.

    Returns:
      pandas.Series named "Tertarik" (default integer index) with len(X) labels.
    """
    n_rows = len(X)
    score_n = np.full(n_rows, self.prob_tertarik["prob_n"], dtype=float)
    score_y = np.full(n_rows, self.prob_tertarik["prob_y"], dtype=float)

    # Multiply the prior by the likelihood of each feature, in column order.
    for column, table in self._feature_tables():
      values = self._positional(X[column])
      score_n = score_n * self._likelihood(values, table["prob_n"])
      score_y = score_y * self._likelihood(values, table["prob_y"])

    # Normalise; rows whose two scores are both zero get 0.5 / 0.5 instead of dividing by zero.
    total = score_n + score_y
    has_mass = total > 0
    safe_total = np.where(has_mass, total, 1.0)
    norm_n = np.where(has_mass, score_n / safe_total, 0.5)
    norm_y = np.where(has_mass, score_y / safe_total, 0.5)

    self.prob_class["prob_n"] = score_n.tolist()
    self.prob_class["prob_y"] = score_y.tolist()
    self.prob_class["prob_n_norm"] = norm_n.tolist()
    self.prob_class["prob_y_norm"] = norm_y.tolist()

    return self.classifying_label()

  def fit(self, X, y):
    """Fit the model: estimate the class priors, then the per-feature likelihoods."""
    # Class priors
    self.calc_label_proba(y)

    # Per-feature conditional probabilities
    self.calc_attr_proba(X, y)

  
  


In [79]:
# Experiment: train and evaluate the two-feature Naive Bayes variant.
# Note that `nb` and `Y_pred` are rebound here, replacing the three-feature model and its predictions.

nb = Naive_Bayes_exp()
nb.fit(X_train_over, y_train_over)
Y_pred = nb.predict(X_test)

In [None]:
# Show accuracy together with per-class precision, recall and F1 for the experiment

print(classification_report(y_test,Y_pred))

In [None]:
# Inspect the oversampled training features
X_train_over