## CountVectorizer

In [214]:
import pandas

In [215]:
# Toy corpus: each sentence is paired with its class. The first four sentences
# are labelled training data; the fifth is the unlabelled test sentence ('?').
labelled_sentences = (
    ('belajar di rumah sangat menyenangkan', 'positif'),  # index 0: sentence 1 -- training
    ('belajar dan belajar di rumah', 'positif'),          # index 1: sentence 2 -- training
    ('bosan belajar dari rumah', 'negatif'),              # index 2: sentence 3 -- training
    ('bingung belajar sendiri rumah', 'negatif'),         # index 3: sentence 4 -- training
    ('belajar sendiri lagi sendiri lagi', '?'),           # index 4: sentence 5 -- test
)
# Split the pairs into two index-aligned lists: the raw texts and their labels.
text_list = [sentence for sentence, _label in labelled_sentences]
kelas_teks = [label for _sentence, label in labelled_sentences]

In [216]:
# Flatten the corpus into one list of whitespace-separated tokens,
# keeping duplicates and the original sentence/word order.
text_word = [token for sentence in text_list for token in sentence.split()]

print(text_word)

['belajar', 'di', 'rumah', 'sangat', 'menyenangkan', 'belajar', 'dan', 'belajar', 'di', 'rumah', 'bosan', 'belajar', 'dari', 'rumah', 'bingung', 'belajar', 'sendiri', 'rumah', 'belajar', 'sendiri', 'lagi', 'sendiri', 'lagi']


In [217]:
# Find the unique words (the features)
def get_uniqueWords(text_word):
    """Return the distinct words of ``text_word`` in order of first appearance.

    Args:
        text_word: iterable of words (hashable values, e.g. strings); may
            contain duplicates.

    Returns:
        A new list holding each word exactly once, ordered by the position of
        its first occurrence in ``text_word``. An empty input gives ``[]``.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # a single pass instead of scanning a growing list for every word.
    return list(dict.fromkeys(text_word))

# Vocabulary of the whole corpus (training and test sentences together).
unique_words = get_uniqueWords(text_word)
print(unique_words)

['belajar', 'di', 'rumah', 'sangat', 'menyenangkan', 'dan', 'bosan', 'dari', 'bingung', 'sendiri', 'lagi']


In [218]:
# Build an all-zero matrix: one row per sentence, one column per unique word
def create_zeroVector(n_rows=None, n_cols=None):
    """Return an ``n_rows`` x ``n_cols`` matrix of zeros as a list of lists.

    Args:
        n_rows: number of rows. Defaults to ``len(text_list)`` (one row per
            sentence of the notebook's corpus).
        n_cols: number of columns. Defaults to ``len(unique_words)`` (one
            column per vocabulary word).

    Returns:
        A list of ``n_rows`` independent row lists, each holding ``n_cols``
        zeros.
    """
    # The notebook globals are only consulted when a dimension is not given,
    # so the function can also be used on its own with explicit sizes.
    if n_rows is None:
        n_rows = len(text_list)
    if n_cols is None:
        n_cols = len(unique_words)
    # A fresh list per row: writing to one row must never affect another.
    return [[0] * n_cols for _ in range(n_rows)]

# All-zero count matrix with one row per sentence and one column per unique word.
vector_list = create_zeroVector()
vector_list

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [219]:
# Build the list of count vectors
def create_vectorList(text_list, vocabulary=None):
    """Turn each text into a vector of word counts (bag of words).

    Args:
        text_list: iterable of strings; every string is split on whitespace.
        vocabulary: sequence of distinct words defining the column order.
            Defaults to the notebook-level ``unique_words``.

    Returns:
        A new list with one row per text. ``row[j]`` is the number of times
        ``vocabulary[j]`` occurs in that text. Words that are not in the
        vocabulary are ignored.

    The result is built from scratch on every call, so running the cell again
    yields the same counts instead of adding to the previous ones.
    """
    if vocabulary is None:
        vocabulary = unique_words
    # Word -> column lookup, so each word is placed without scanning the
    # whole vocabulary.
    column_of = {word: column for column, word in enumerate(vocabulary)}
    vectors = []
    for text in text_list:
        counts = [0] * len(vocabulary)
        for word in text.split():
            column = column_of.get(word)
            if column is not None:
                counts[column] += 1
        vectors.append(counts)
    return vectors

# Word-count vectors for every sentence (training and test rows alike).
vector_list = create_vectorList(text_list)
vector_list

[[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
 [2, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2]]

In [332]:
# Tabulate the count vectors: one named row per sentence, one column per
# vocabulary word, plus a LABEL column holding each sentence's class.
row_labels = [f'kalimat ke-{number}' for number in range(1, 6)]
data_frame = pandas.DataFrame(
    vector_list,
    index=row_labels,
    columns=unique_words,
).assign(LABEL=kelas_teks)

data_frame

Unnamed: 0,belajar,di,rumah,sangat,menyenangkan,dan,bosan,dari,bingung,sendiri,lagi,LABEL
kalimat ke-1,1,1,1,1,1,0,0,0,0,0,0,positif
kalimat ke-2,2,1,1,0,0,1,0,0,0,0,0,positif
kalimat ke-3,1,0,1,0,0,0,1,1,0,0,0,negatif
kalimat ke-4,1,0,1,0,0,0,0,0,1,1,0,negatif
kalimat ke-5,1,0,0,0,0,0,0,0,0,2,2,?


## KNN

In [333]:
# Training data: the count vectors of the first four (labelled) sentences.
data_latih = vector_list[:4]
data_latih

[[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
 [2, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0],
 [1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0]]

In [334]:
# Test data: the count vector of the fifth sentence, whose label is unknown ('?').
data_tes = vector_list[4]
data_tes

[1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2]

### Euclidean distance: $d(x, y) = \sqrt{\sum_i (x_i - y_i)^2}$

In [335]:
from math import sqrt

In [336]:
# Container for the neighbours: maps each training row's index to the
# Euclidean distance between that row and the test vector.
neighbors = {
    row_index: sqrt(
        sum((data_tes[column] - sample[column]) ** 2 for column in range(len(data_tes)))
    )
    for row_index, sample in enumerate(data_latih)
}

neighbors

{0: 3.4641016151377544,
 1: 3.4641016151377544,
 2: 3.3166247903554,
 3: 2.6457513110645907}

In [337]:
import itertools

In [341]:
K = 3  # number of nearest neighbours that take part in the vote

# Find the K nearest neighbours
def get_nearestNeighbors(neighbors):
    """Return the ``K`` entries of ``neighbors`` with the smallest distances.

    Args:
        neighbors: dict mapping a training-row index to its distance from the
            test vector.

    Returns:
        A dict of at most ``K`` items ordered from the smallest distance to
        the largest; entries with equal distances keep their original
        relative order.
    """
    ranked = sorted(neighbors.items(), key=lambda pair: pair[1])
    return dict(itertools.islice(ranked, K))

# The K training rows closest to the test sentence, as {row index: distance}.
nearest_neighbors = get_nearestNeighbors(neighbors)
nearest_neighbors

{3: 2.6457513110645907, 2: 3.3166247903554, 0: 3.4641016151377544}

In [345]:
# Row indices of the nearest training sentences (the keys of nearest_neighbors).
index_nearestNeighbors = [*nearest_neighbors]

# Look up the class label of each of those neighbours.
sentiment_nearestNeighbors = [
    kelas_teks[neighbor_index] for neighbor_index in index_nearestNeighbors
]

sentiment_nearestNeighbors

['negatif', 'negatif', 'positif']

In [351]:
# Tally the neighbours' votes; every label other than 'positif' counts as negative.
total_votes = len(sentiment_nearestNeighbors)
count_positif = sentiment_nearestNeighbors.count('positif')
count_negatif = total_votes - count_positif

# Share of the votes won by each class.
probabilitas_positif = count_positif / total_votes
probabilitas_negatif = count_negatif / total_votes

probabilitas_positif, probabilitas_negatif

(0.3333333333333333, 0.6666666666666666)

In [352]:
# Majority vote: predict 'positif' only when it strictly out-votes 'negatif';
# a tie falls to 'negatif'.
sentiment = 'positif' if probabilitas_positif > probabilitas_negatif else 'negatif'

sentiment

'negatif'

In [357]:
# Summary table: every sentence next to its class, with the test sentence's
# '?' placeholder replaced by the predicted sentiment.
data_frame = pandas.DataFrame(text_list, columns=['dokumen'], index=['kalimat ke-1','kalimat ke-2','kalimat ke-3','kalimat ke-4','kalimat ke-5'])
data_frame['LABEL'] = kelas_teks
# Write the prediction with one .loc call addressed by row label. The previous
# chained form (data_frame['LABEL'][4] = [...]) used a positional integer on a
# string index, may write to a temporary copy, and stored a one-element list
# instead of the plain label string.
data_frame.loc['kalimat ke-5', 'LABEL'] = sentiment

data_frame

Unnamed: 0,dokumen,LABEL
kalimat ke-1,belajar di rumah sangat menyenangkan,positif
kalimat ke-2,belajar dan belajar di rumah,positif
kalimat ke-3,bosan belajar dari rumah,negatif
kalimat ke-4,bingung belajar sendiri rumah,negatif
kalimat ke-5,belajar sendiri lagi sendiri lagi,[negatif]
