#Inisialisasi Library

In [1]:
import re
import numpy as np
import pandas as pd
import operator
from random import randrange
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

#Load Traning Dataset

In [2]:
df = pd.read_csv('Data_Train_Preprocessed.csv')

In [3]:
df

Unnamed: 0,Label,Content
0,0,"['jakarta', 'ketua', 'dprd', 'kabupaten', 'bog..."
1,0,"['jakarta', 'vaksin', 'sinovac', 'vaksin', 'mu..."
2,0,"['bandung', 'pemkot', 'bandung', 'isolasi', 'm..."
3,0,"['makassar', 'aktif', 'covid', 'kota', 'makass..."
4,0,"['kabupaten', 'bandung', 'polisi', 'terap', 'g..."
...,...,...
235,1,"['foto', 'sertifikat', 'vaksinasi', 'covid', '..."
236,1,"['narasi', 'dokter', 'malaysia', 'dari', 'chai..."
237,1,"['foto', 'surat', 'kop', 'dinas', 'sehat', 'pe..."
238,1,"['narasi', 'ragu', 'parah', 'gelombang', 'covi..."


#Information Gain

In [4]:
class TextFeatureSelection():

    def __init__(self,target,input_doc_list):
        self.target=target
        self.input_doc_list=input_doc_list

    def _InformationGain(self,A,B,C,D,N):
        with np.errstate(divide='ignore', invalid='ignore'):
            return (-((A+C)/N)*np.log((A+C)/N))+((A+B)/N)*(A/(A+B))*np.log(A/(A+B))+((C+D)/N)*(C/(C+D))*np.log(C/(C+D)) + (-((B+D)/N)*np.log((B+D)/N))+((A+B)/N)*(B/(A+B))*np.log(B/(A+B))+((C+D)/N)*(D/(C+D))*np.log(D/(C+D))

    def _get_binary_label(self,label_array):
        #numpy array
        label_array=np.array(label_array)
        unique_label=np.unique(label_array)
        if 0 in unique_label and 1 in unique_label:
            pass
        else:
            label_array=np.where(label_array==unique_label[0],1,0)
        return label_array

    def _get_term_binary_matrix(self,input_doc_list):

        #inisialisasi vectorizer
        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_doc_list)
        word_list = vectorizer.get_feature_names_out()

        #binary word document matrix
        vectorizer = CountVectorizer(binary=True)
        X = vectorizer.fit_transform(input_doc_list)
        word_binary_matrix = X.toarray()
        count_list = word_binary_matrix.sum(axis=0)
        
        return word_list,count_list,word_binary_matrix

    def _get_ABCD(self,word_binary_matrix,label_array):

        A=[]
        B=[]
        C=[]
        D=[]
        print("This is word_binary_matrix.shape")
        print(word_binary_matrix.shape[1])
        for i in range(word_binary_matrix.shape[1]):
            computed_result=Counter(label_array * 2 + word_binary_matrix[:,i])
            A.append(computed_result[1])
            B.append(computed_result[3])
            C.append(computed_result[0])
            D.append(computed_result[2])

        A=np.array(A)
        B=np.array(B)
        C=np.array(C)
        D=np.array(D)
        N=A+B+C+D
        return A,B,C,D,N
    
    
    def _getvalues(self):
                
        #binary labels
        label_array=self._get_binary_label(self.target)

        #word, count, binary matrix
        word_list,count_list,word_binary_matrix=self._get_term_binary_matrix(self.input_doc_list)

        #nilai ABCDN
        A,B,C,D,N=self._get_ABCD(word_binary_matrix,label_array)
        
        #bentuk dataframe
        out_df=pd.DataFrame({'word list':word_list,'word occurence count':count_list})
        out_df['Information Gain']=self._InformationGain(A,B,C,D,N)
        out_df['Information Gain'].replace(np.nan,0,inplace=True)

        return out_df
    
    def getScore(self):
        
        values_df=self._getvalues()
        return values_df

In [6]:
input_doc_list = df['Content'].values
target= df['Label'].values
seleksi_fitur = TextFeatureSelection(target=target,input_doc_list=input_doc_list)
information_gain_df = seleksi_fitur.getScore()

This is word_binary_matrix.shape
4934


In [7]:
# save to csv
information_gain_df.to_csv("information_gain.csv", header=["word list", "word occurence count", "Information Gain"], index=False, encoding='utf-8')

#Euclidean Distance

In [8]:
class distanceMetrics:

#Class untuk menghitung jarak

    def __init__(self):
        
        #Inisialisasi/Constructor function

        pass
        
    def euclideanDistance(self, vector1, vector2):
        
        #Function untuk menghitung jarak Euclidean
                
        self.vectorA, self.vectorB = vector1, vector2
        if len(self.vectorA) != len(self.vectorB):
            raise ValueError("Panjang vektor tidak sama")
        distance = 0.0
        for i in range(len(self.vectorA)-1):
            distance += (self.vectorA[i] - self.vectorB[i])**2
        return (distance)**0.5

#KNN Model Building

In [10]:
class kNNClassifier:
    
#Class untuk membangun model KNN
    
    def __init__(self):
        
        #KNN constructor
        
        pass
    
    def fit(self, xTrain, yTrain):
        
        #Train KNN model dengan x Data
        assert len(xTrain) == len(yTrain)
        self.trainData = xTrain
        self.trainLabels = yTrain

    def getNeighbors(self, testRow):
        
        #Train KNN model dengan x Data dan menghitung jarak dengan memanggil function Euclidean

        calcDM = distanceMetrics()
        distances = []
        for i, trainRow in enumerate(self.trainData):
            distances.append([trainRow, calcDM.euclideanDistance(testRow, trainRow), i, self.trainLabels[i]])
            distances.sort(key=operator.itemgetter(1))

        neighbors = []
        for index in range(self.k):
            neighbors.append(distances[index])  
        return neighbors
        
    def predict(self, xTest, k, distanceMetric='euclidean'):
            
        #Menggunakan model KNN pada data testing
        
        self.testData = xTest
        self.k = k
        self.distanceMetric = distanceMetric
        predictions = []
        
        for i, testCase in enumerate(self.testData):
            neighbors = self.getNeighbors(testCase)
            output= [row[-1] for row in neighbors]
            prediction = max(set(output), key=output.count)
            predictions.append(prediction)
        
        return predictions

#TF-IDF

In [11]:
#Menghitung TF-IDF dari setiap fitur pada data training

from sklearn.feature_extraction.text import TfidfVectorizer

#Inisialisasi TF-IDF

tf = TfidfVectorizer()
 
#melakukan perhitungan IDF pada fitur

tfidf_vectors = tf.fit(df['Content'].values) 
 
#mapping array dari fitur index integer ke fitur nama

tfidf_vectors.get_feature_names_out() 

array(['ab', 'abai', 'abang', ..., 'zubair', 'zuckerbeg', 'zulpan'],
      dtype=object)

In [12]:
tfidf_matrix = tf.fit_transform(df['Content'].values) 
 
#mengetahui ukuran matriks TF-IDF

tfidf_matrix.shape 

(240, 4934)

In [13]:
matriks = tfidf_matrix.todense()
matriks

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
df_new = pd.DataFrame(
    matriks, 
    columns=tf.get_feature_names_out(),
    index=df.index
)
df_new

Unnamed: 0,ab,abai,abang,abc,abdi,abdul,abdullah,abjad,about,absah,...,zinc,zincvit,zionis,zona,zonasi,zoom,zoonotik,zubair,zuckerbeg,zulpan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#Feature Selection

In [15]:
information_gain_df_sorted = information_gain_df.sort_values(by=['Information Gain'], ascending=False)
information_gain_df_sorted

Unnamed: 0,word list,word occurence count,Information Gain
2238,klaim,98,0.317083
1849,jakarta,80,0.274745
1145,edar,100,0.263967
4352,tangkap,98,0.252137
4686,unggah,86,0.230610
...,...,...,...
1814,isyarat,1,0.000000
1815,ita,9,0.000000
1816,itb,1,0.000000
1817,ite,1,0.000000


In [16]:
# save to csv
information_gain_df_sorted.to_csv("information_gain_sorted.csv", header=["word list", "word occurence count", "Information Gain"], index=False, encoding='utf-8')

In [17]:
threshold = round((len(information_gain_df_sorted)*0.005))
threshold

25

In [19]:
selected_feature_df = information_gain_df_sorted[:threshold]
selected_feature_list = selected_feature_df['word list'].values

In [20]:
feature_list = df_new.columns
feature_list

Index(['ab', 'abai', 'abang', 'abc', 'abdi', 'abdul', 'abdullah', 'abjad',
       'about', 'absah',
       ...
       'zinc', 'zincvit', 'zionis', 'zona', 'zonasi', 'zoom', 'zoonotik',
       'zubair', 'zuckerbeg', 'zulpan'],
      dtype='object', length=4934)

In [21]:
unselected_feature_list = [feature for feature in feature_list if feature not in selected_feature_list]
unselected_feature_list[:5]

['ab', 'abai', 'abang', 'abc', 'abdi']

In [22]:
df_with_feature_selection = df_new.drop(columns=unselected_feature_list)
df_with_feature_selection

Unnamed: 0,akun,artikel,atur,bagi,barat,capai,dasar,data,detik,edar,...,klaim,kota,laku,muat,positif,pusat,satgas,tambah,tangkap,unggah
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.024869,0.000000,0.000000,...,0.000000,0.000000,0.024292,0.000000,0.022905,0.0,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.015684,0.000000,0.000000,0.016286,0.000000,0.014530,0.012049,0.010394,...,0.000000,0.000000,0.028385,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.077173,0.000000,0.038587,0.072141,0.000000,0.000000,...,0.000000,0.214715,0.000000,0.000000,0.000000,0.0,0.041666,0.073316,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.135702,0.043172,0.000000,0.000000,0.000000,...,0.000000,0.080077,0.039421,0.000000,0.037171,0.0,0.046618,0.123042,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.096950,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,0.077667,0.053030,0.000000,0.045570,0.000000,0.000000,0.000000,0.000000,0.000000,0.035144,...,0.000000,0.000000,0.000000,0.053030,0.000000,0.0,0.000000,0.000000,0.000000,0.075897
236,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031299,...,0.031634,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.044465,0.031634,0.067594
237,0.082282,0.000000,0.000000,0.048278,0.000000,0.000000,0.000000,0.000000,0.000000,0.037232,...,0.037631,0.103272,0.000000,0.000000,0.000000,0.0,0.000000,0.052894,0.037631,0.040204
238,0.125723,0.042921,0.000000,0.073766,0.000000,0.000000,0.000000,0.000000,0.000000,0.056889,...,0.114995,0.000000,0.000000,0.085842,0.000000,0.0,0.000000,0.000000,0.028749,0.122858


In [23]:
label = df["Label"]
df_with_feature_selection = df_with_feature_selection.join(label)

#Evaluasi Model dengan KFCV

In [25]:
knn = kNNClassifier()

In [26]:
#kfold Cross Validation

def kFCVEvaluate(dataset, n, k, distanceMetrics='euclidean'):
  from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
  from sklearn.model_selection import KFold

  acc = []
  precission = []
  recall = []
  f1score = []

  kf = KFold(n_splits=n, random_state = 42, shuffle=True)

  for train_index, val_index in kf.split(dataset):
    train_df = dataset.iloc[train_index]
    val_df = dataset.iloc[val_index]
    
    textFeatures = train_df.values.tolist()
    textLabels = [text[-1] for text in textFeatures] 

    trainSet = [train[:-1] for train in textFeatures]

    knn.fit(trainSet, textLabels)

    testFeatures = val_df.values.tolist() 
    
    actual = [row[-1] for row in testFeatures]
    valSet = [test[:-1] for test in testFeatures]
              
    prediction = knn.predict(valSet, k, 'euclidean')

    val_acc = accuracy_score(actual, prediction)
    acc.append(val_acc)

    val_precission = precision_score(actual, prediction)
    precission.append(val_precission)

    val_recall = recall_score(actual, prediction)
    recall.append(val_recall)

    val_f1score = f1_score(actual, prediction)
    f1score.append(val_f1score)

    print('-' * 10)
    print(f'accuracy {val_acc}')
    print(f'precision {val_precission}')
    print(f'recall {val_recall}')
    print(f'f1-score {val_f1score}')


  mean_acc = sum(acc) / len(acc)
  mean_precission = sum(precission) / len(precission)
  mean_recall = sum(recall) / len(recall)
  mean_f1score = sum(f1score) / len(f1score)

  print('-' * 10)
  print(f"Mean-Accuracy: {mean_acc}")
  print(f"Mean-Precision: {mean_precission}")
  print(f"Mean-Recall: {mean_recall}")
  print(f"Mean-F1score: {mean_f1score}")

In [27]:
#Skenario untuk k = 3
kFCVEvaluate(df_with_feature_selection, 10, 3, 'euclidean')

----------
accuracy 0.9166666666666666
precision 1.0
recall 0.8181818181818182
f1-score 0.9
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9090909090909091
f1-score 0.9523809523809523
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9166666666666666
f1-score 0.9565217391304348
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9230769230769231
f1-score 0.9600000000000001
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9285714285714286
f1-score 0.962962962962963
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
Mean-Accuracy: 0.975
Mean-Precision: 1.0
Mean-Recall: 0.9495587745587747
Mean-F1score: 0.973186565447435


In [28]:
#Skenario untuk k = 5
kFCVEvaluate(df_with_feature_selection, 10, 5, 'euclidean')

----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9090909090909091
f1-score 0.9523809523809523
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9090909090909091
f1-score 0.9523809523809523
----------
accuracy 0.9166666666666666
precision 0.9166666666666666
recall 0.9166666666666666
f1-score 0.9166666666666666
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9230769230769231
f1-score 0.9600000000000001
----------
accuracy 0.9583333333333334
precision 0.9230769230769231
recall 1.0
f1-score 0.9600000000000001
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9285714285714286
f1-score 0.962962962962963
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
Mean-Accuracy: 0.9708333333333334
Mean-Precision: 0.9839743589743591
Mean-Recall: 0.9

In [29]:
#Skenario untuk k = 7
kFCVEvaluate(df_with_feature_selection, 10, 7, 'euclidean')

----------
accuracy 0.875
precision 0.8333333333333334
recall 0.9090909090909091
f1-score 0.8695652173913043
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9166666666666666
precision 1.0
recall 0.8181818181818182
f1-score 0.9
----------
accuracy 0.9166666666666666
precision 0.9166666666666666
recall 0.9166666666666666
f1-score 0.9166666666666666
----------
accuracy 0.9166666666666666
precision 0.9230769230769231
recall 0.9230769230769231
f1-score 0.9230769230769231
----------
accuracy 0.9583333333333334
precision 0.9230769230769231
recall 1.0
f1-score 0.9600000000000001
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9285714285714286
f1-score 0.962962962962963
----------
accuracy 0.9583333333333334
precision 0.9166666666666666
recall 1.0
f1-score 0.9565217391304348
----------
Mean-Accuracy: 0.95
Mean-Precision: 0.

In [30]:
#Skenario untuk k = 9
kFCVEvaluate(df_with_feature_selection, 10, 9, 'euclidean')

----------
accuracy 0.8333333333333334
precision 0.7692307692307693
recall 0.9090909090909091
f1-score 0.8333333333333333
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9166666666666666
precision 0.8666666666666667
recall 1.0
f1-score 0.9285714285714286
----------
accuracy 0.9166666666666666
precision 1.0
recall 0.8181818181818182
f1-score 0.9
----------
accuracy 0.9166666666666666
precision 0.8571428571428571
recall 1.0
f1-score 0.923076923076923
----------
accuracy 0.9166666666666666
precision 0.9230769230769231
recall 0.9230769230769231
f1-score 0.9230769230769231
----------
accuracy 0.9583333333333334
precision 0.9230769230769231
recall 1.0
f1-score 0.9600000000000001
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9285714285714286
f1-score 0.962962962962963
----------
accuracy 0.9583333333333334
precision 0.9166666666666666
recall 1.0
f1-score 0.9565217391304348
-------

In [31]:
#Skenario untuk k = 11
kFCVEvaluate(df_with_feature_selection, 10, 11, 'euclidean')

----------
accuracy 0.8333333333333334
precision 0.7333333333333333
recall 1.0
f1-score 0.846153846153846
----------
accuracy 1.0
precision 1.0
recall 1.0
f1-score 1.0
----------
accuracy 0.9166666666666666
precision 0.8666666666666667
recall 1.0
f1-score 0.9285714285714286
----------
accuracy 0.875
precision 0.9
recall 0.8181818181818182
f1-score 0.8571428571428572
----------
accuracy 0.9166666666666666
precision 0.8571428571428571
recall 1.0
f1-score 0.923076923076923
----------
accuracy 0.8333333333333334
precision 0.8
recall 0.9230769230769231
f1-score 0.8571428571428571
----------
accuracy 0.9583333333333334
precision 0.9230769230769231
recall 1.0
f1-score 0.9600000000000001
----------
accuracy 0.9583333333333334
precision 0.9166666666666666
recall 1.0
f1-score 0.9565217391304348
----------
accuracy 0.9583333333333334
precision 1.0
recall 0.9285714285714286
f1-score 0.962962962962963
----------
accuracy 0.9166666666666666
precision 0.8461538461538461
recall 1.0
f1-score 0.91666666

#Load Testing Dataset

In [32]:
data_test = pd.read_csv('Data_Test_Preprocessed.csv')

In [33]:
test_tfidf_matrix = tf.transform(data_test['Content'].values) 

#mengetahui ukuran matriks TF-IDF
test_tfidf_matrix.shape

(60, 4934)

In [34]:
matriks_test = test_tfidf_matrix.todense()
matriks_test

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
data_test_new = pd.DataFrame(
    matriks_test, 
    columns=tf.get_feature_names_out(),
    index=data_test.index
)

In [37]:
df_test_with_feature_selection = data_test_new.drop(columns=unselected_feature_list)

In [38]:
label_test = data_test["Label"]
df_test_with_feature_selection = df_test_with_feature_selection.join(label_test)

In [39]:
df_test_with_feature_selection

Unnamed: 0,akun,artikel,atur,bagi,barat,capai,dasar,data,detik,edar,...,kota,laku,muat,positif,pusat,satgas,tambah,tangkap,unggah,Label
0,0.0,0.0,0.0,0.0,0.0,0.028402,0.0,0.050679,0.021012,0.0,...,0.0,0.024752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011915,0.0,0.0,0.0,0.01409,0.0,0.0,0.0,0
2,0.0,0.0,0.122139,0.01312,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013027,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.015925,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.049237,0.01444,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.318168,0.0,0.022726,0.084977,0.0,0.0,...,0.0,0.041503,0.0,0.039134,0.0,0.0,0.064771,0.0,0.0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.016953,0.015847,0.013141,0.011336,...,0.015722,0.0,0.0,0.0,0.0,0.018306,0.0,0.0,0.0,0
6,0.0,0.0,0.131906,0.0,0.0,0.034241,0.0,0.0,0.025333,0.0,...,0.030309,0.059682,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
7,0.0,0.0,0.058988,0.0,0.0,0.0,0.0,0.218586,0.0,0.0,...,0.0,0.106758,0.0,0.251662,0.0,0.0,0.111073,0.0,0.0,0
8,0.0,0.0,0.0,0.0,0.023871,0.0,0.0,0.044628,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.12888,0.0,0.0,0.0,0
9,0.0,0.0,0.0,0.0,0.031867,0.0,0.063733,0.0,0.0,0.02131,...,0.147768,0.0,0.0,0.054874,0.0,0.0,0.060547,0.0,0.0,0


#Evaluasi Model dengan Data Testing

In [40]:
X_test = df_test_with_feature_selection.iloc[:, :-1].values

In [41]:
predictions = knn.predict(X_test, 3, 'euclidean')

In [42]:
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(label_test, predictions), index=[0, 1], columns=[0,1])

Unnamed: 0,0,1
0,29,2
1,1,28


In [43]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

test_acc = accuracy_score(label_test, predictions)

test_precision = precision_score(label_test, predictions)
    
test_recall = recall_score(label_test, predictions)
    
test_f1score = f1_score(label_test, predictions)

print(test_acc)
print(test_precision)
print(test_recall)
print(test_f1score)

0.95
0.9333333333333333
0.9655172413793104
0.9491525423728815


#Save Model

In [44]:
import joblib
joblib.dump(knn, 'my_model_knn.pkl')

['my_model_knn.pkl']

In [45]:
knn_from_joblib = joblib.load('my_model_knn.pkl')

In [46]:
joblib.dump(tf, 'tfidf.pkl')

['tfidf.pkl']