In [190]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

penguin = pd.read_csv("penguins.csv", delimiter = ';')

X = penguin.drop(['species', 'island', 'sex'], 1)
Y = penguin['species'].values

In [191]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y, test_Y = train_test_split(X, Y,  test_size = 0.10, random_state=1)

In [192]:
def e_distance (a, b):
  dim = len(a)

  distance = 0

  for dis in range(dim):
     distance += abs(a[dis]-b[dis])**2
  
  distance = distance**(1/2)

  return distance

In [193]:
def KNN_Predict(X_train, X_test, y_train, k):
    
    from collections import Counter

    output = []

    for p_test in X_test:
        data_jarak = []

        for p_train in X_train:
            jarak = e_distance(p_test, p_train)
            data_jarak.append(jarak)
        
        df_jarak = pd.DataFrame(data=data_jarak, columns=['jarak'])

        df_sh = df_jarak.sort_values(by=['jarak'], axis=0)[:k]

        counter = Counter(y_train[df_sh.index])

        predik = counter.most_common()[0][0]

        output.append(predik)
        
    return output

In [194]:
from sklearn.preprocessing import LabelEncoder as le

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = le().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = le().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

#  K-Nearest Neighbor Method


In [195]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

enc = preprocessing.LabelEncoder()
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(train_X)
X_test_sc = scaler.transform(test_X)

In [196]:
pred = KNN_Predict(X_train_sc, X_test_sc, train_Y, k=1)
print(pred)

['Adelie', 'Chinstrap', 'Gentoo', 'Gentoo', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap', 'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie']


In [197]:
# Merapikan hasil confusion matrix
y_actual = pd.Series(test_Y, name = "actual")
y_pred = pd.Series(pred, name = "prediction")
df_confusion = pd.crosstab(y_actual, y_pred)
df_confusion

prediction,Adelie,Chinstrap,Gentoo
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,14,0,0
Chinstrap,1,4,0
Gentoo,0,0,15


In [198]:
from sklearn import metrics

print("Akurasi model: ", metrics.accuracy_score(test_Y, pred)*100, "%")
print("Presisi model: ", metrics.precision_score(test_Y, pred, average="micro")*100, "%")
print("Recall: ", metrics.recall_score(test_Y, pred, average="micro"))
print("Skor F1: ", metrics.f1_score(test_Y, pred, average="micro"))

Akurasi model:  97.05882352941177 %
Presisi model:  97.05882352941177 %
Recall:  0.9705882352941176
Skor F1:  0.9705882352941176


In [199]:
print(metrics.classification_report(test_Y, pred))

              precision    recall  f1-score   support

      Adelie       0.93      1.00      0.97        14
   Chinstrap       1.00      0.80      0.89         5
      Gentoo       1.00      1.00      1.00        15

    accuracy                           0.97        34
   macro avg       0.98      0.93      0.95        34
weighted avg       0.97      0.97      0.97        34



# Naive Bayes Classifier Method


In [200]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(train_X, train_Y)

MultinomialNB()

In [201]:
penguin_test_pred = mnb.predict(test_X)
penguin_test_pred

array(['Adelie', 'Chinstrap', 'Gentoo', 'Gentoo', 'Adelie', 'Gentoo',
       'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo',
       'Gentoo', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap', 'Gentoo',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Gentoo'], dtype='<U9')

In [202]:
b_actual = pd.Series(test_Y, name = "actual")
b_pred = pd.Series(penguin_test_pred, name = "prediction")
df_NB_confusion = pd.crosstab(b_actual, b_pred)
df_NB_confusion

prediction,Adelie,Chinstrap,Gentoo
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,11,1,2
Chinstrap,1,4,0
Gentoo,1,0,14


In [203]:
from sklearn import metrics

print("Akurasi model: ", metrics.accuracy_score(test_Y, penguin_test_pred)*100, "%")
print("Presisi model: ", metrics.precision_score(test_Y, penguin_test_pred, average="micro")*100, "%")
print("Recall: ", metrics.recall_score(test_Y, penguin_test_pred, average="micro"))
print("Skor F1: ", metrics.f1_score(test_Y, penguin_test_pred, average="micro"))

Akurasi model:  85.29411764705883 %
Presisi model:  85.29411764705883 %
Recall:  0.8529411764705882
Skor F1:  0.8529411764705882


In [204]:
print(metrics.classification_report(test_Y, penguin_test_pred))

              precision    recall  f1-score   support

      Adelie       0.85      0.79      0.81        14
   Chinstrap       0.80      0.80      0.80         5
      Gentoo       0.88      0.93      0.90        15

    accuracy                           0.85        34
   macro avg       0.84      0.84      0.84        34
weighted avg       0.85      0.85      0.85        34



# Regression Linear Method

In [205]:
# Dataset yang diperlukan dalam bentuk numerik sehingga hanya perlu mengambil data dari K-Nearest Neighbor Method

I_train = X_train_sc
I_test = X_test_sc

J_train = train_Y
J_test = test_Y

In [206]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver = 'liblinear')
#solver as linear equation
model.fit(train_X, train_Y)

LogisticRegression(solver='liblinear')

In [207]:
pred_penguin_regression = model.predict(test_X)
pred_penguin_regression

array(['Adelie', 'Chinstrap', 'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo',
       'Adelie', 'Gentoo', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie',
       'Adelie', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Adelie', 'Adelie', 'Adelie',
       'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Chinstrap', 'Gentoo',
       'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie'], dtype=object)

In [208]:
i_actual = pd.Series(J_test, name = "actual")
i_pred = pd.Series(pred_penguin_regression, name = "prediction")
df_RL_confusion = pd.crosstab(i_actual, i_pred)
df_RL_confusion

prediction,Adelie,Chinstrap,Gentoo
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Adelie,14,0,0
Chinstrap,0,5,0
Gentoo,0,0,15


In [209]:
from sklearn import metrics

print("Akurasi model: ", metrics.accuracy_score(J_test, pred_penguin_regression)*100, "%")
print("Presisi model: ", metrics.precision_score(J_test, pred_penguin_regression, average="micro")*100, "%")
print("Recall: ", metrics.recall_score(J_test, pred_penguin_regression, average="micro"))
print("Skor F1: ", metrics.f1_score(J_test, pred_penguin_regression, average="micro"))

Akurasi model:  100.0 %
Presisi model:  100.0 %
Recall:  1.0
Skor F1:  1.0


In [210]:
print(metrics.classification_report(J_test, pred_penguin_regression))

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00        14
   Chinstrap       1.00      1.00      1.00         5
      Gentoo       1.00      1.00      1.00        15

    accuracy                           1.00        34
   macro avg       1.00      1.00      1.00        34
weighted avg       1.00      1.00      1.00        34



# Ensemble Techniques 

In [211]:
pred_enc = preprocessing.LabelEncoder()
pred_enc.fit_transform(train_Y)

knn_pred = pred_enc.transform(pred)
nb_pred = pred_enc.transform(penguin_test_pred)
lr_pred = pred_enc.transform(pred_penguin_regression)

In [220]:
averaged_preds = (knn_pred + nb_pred + lr_pred)//3
averaged_preds = pred_enc.inverse_transform(averaged_preds)

acc = metrics.accuracy_score(test_Y, averaged_preds)

print("Akurasi model: ", metrics.accuracy_score(test_Y, averaged_preds)*100, "%")
print("Presisi model: ", metrics.precision_score(test_Y, averaged_preds, average="micro")*100, "%")
print("Recall: ", metrics.recall_score(test_Y, averaged_preds, average="micro"))
print("Skor F1: ", metrics.f1_score(test_Y, averaged_preds, average="micro"))

Akurasi model:  94.11764705882352 %
Presisi model:  94.11764705882352 %
Recall:  0.9411764705882353
Skor F1:  0.9411764705882353


In [213]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors = 1)
knn_pred_fit = neigh.fit(train_X, train_Y)
nb_pred_fit = mnb.fit(train_X, train_Y)
lr_pred_fit = model.fit(train_X, train_Y)

In [215]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


def evaluate(model, X_train, X_test, y_train, y_test):
    y_test_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print("TRAINIG RESULTS: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_train, y_train_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_train, y_train_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_train, y_train_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")

    print("TESTING RESULTS: \n===============================")
    clf_report = pd.DataFrame(classification_report(y_test, y_test_pred, output_dict=True))
    print(f"CONFUSION MATRIX:\n{confusion_matrix(y_test, y_test_pred)}")
    print(f"ACCURACY SCORE:\n{accuracy_score(y_test, y_test_pred):.4f}")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")

In [216]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[('KNN', knn_pred_fit), ('NB', nb_pred_fit), ('LR', lr_pred_fit)], voting='hard')
voting_clf.fit(train_X, train_Y)
voting_preds = voting_clf.predict(test_X)

evaluate(voting_clf, train_X, test_X, train_Y, test_Y)

TRAINIG RESULTS: 
CONFUSION MATRIX:
[[132   0   0]
 [  1  62   0]
 [  0   0 104]]
ACCURACY SCORE:
0.9967
CLASSIFICATION REPORT:
               Adelie  Chinstrap  Gentoo  accuracy   macro avg  weighted avg
precision    0.992481   1.000000     1.0  0.996656    0.997494      0.996681
recall       1.000000   0.984127     1.0  0.996656    0.994709      0.996656
f1-score     0.996226   0.992000     1.0  0.996656    0.996075      0.996648
support    132.000000  63.000000   104.0  0.996656  299.000000    299.000000
TESTING RESULTS: 
CONFUSION MATRIX:
[[14  0  0]
 [ 1  4  0]
 [ 0  0 15]]
ACCURACY SCORE:
0.9706
CLASSIFICATION REPORT:
              Adelie  Chinstrap  Gentoo  accuracy  macro avg  weighted avg
precision   0.933333   1.000000     1.0  0.970588   0.977778      0.972549
recall      1.000000   0.800000     1.0  0.970588   0.933333      0.970588
f1-score    0.965517   0.888889     1.0  0.970588   0.951469      0.969461
support    14.000000   5.000000    15.0  0.970588  34.000000     34.