In [1]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

In [2]:
# Mapping from category name to its numeric class label.
cate_2_label = dict(positive=1, negative=0)

# Toy training corpus: six short Vietnamese phrases, one document per entry.
corpus = [
    "góp gió gặt bão",
    "có làm mới có ăn",
    "đất lành chim đậu",
    "ăn cháo đá bát",
    "gậy ông đập lưng ông",
    "qua cầu rút ván",
]

# Class label of each document, aligned by position with `corpus`
# (1: positive - 0: negative).
labels = [1, 1, 1, 0, 0, 0]

# Number of documents in the corpus.
n_doc = len(corpus)

In [3]:
def label_2_cate(labels):
    """Translate numeric class labels into their category names.

    Parameters
    ----------
    labels : iterable
        Numeric labels; each one must be a value of the module-level
        ``cate_2_label`` mapping.

    Returns
    -------
    numpy.ndarray
        Category names, one per entry of ``labels``, in the same order.

    Raises
    ------
    ValueError
        If a label is not among the values of ``cate_2_label``.
    """
    categories = np.array(list(cate_2_label.keys()))
    codes = list(cate_2_label.values())

    # For every label, find the position of its code; keys and values of a
    # dict share the same ordering, so that position also selects the name.
    indices = []
    for label in labels:
        indices.append(codes.index(label))
    return categories[indices]

In [4]:
# Wrap the raw documents and their labels in NumPy arrays for the steps below.
X, y = np.array(corpus), np.array(labels)

# Show both arrays, one per line.
print(X, y, sep="\n")

['góp gió gặt bão' 'có làm mới có ăn' 'đất lành chim đậu' 'ăn cháo đá bát'
 'gậy ông đập lưng ông' 'qua cầu rút ván']
[1 1 1 0 0 0]


## Convert text to vectors using the TF-IDF transform

In [5]:
def caculate_tfidf(X_vectorized):
    """Compute smoothed IDF weights, log-scaled TF and their product.

    Parameters
    ----------
    X_vectorized : array-like of shape (n_documents, n_terms)
        Raw term counts, one row per document and one column per term.

    Returns
    -------
    idf : numpy.ndarray of shape (n_terms,)
        Smoothed inverse document frequency,
        ``log((n_documents + 1) / (df + 1)) + 1``.
    tf : numpy.ndarray of shape (n_documents, n_terms)
        Log-scaled term frequency, ``log(count + 1)``.
    tfidf : numpy.ndarray of shape (n_documents, n_terms)
        ``tf * idf``; rows are NOT normalised.
    """
    counts = np.asarray(X_vectorized)
    # Take the document count from the matrix itself rather than from a
    # notebook-level global, so the function is self-contained.
    n_documents = counts.shape[0]

    tf = np.log(counts + 1)

    # Document frequency is the number of documents that CONTAIN a term, not
    # the total number of occurrences: a word repeated inside one document
    # must still count only once.
    df = np.count_nonzero(counts, axis=0)
    idf = np.log((n_documents + 1) / (df + 1)) + 1

    tfidf = tf * idf

    return idf, tf, tfidf

In [6]:
def compute_norm(tfidf_vec):
    """Scale every row of ``tfidf_vec`` to unit L2 norm, in place.

    Parameters
    ----------
    tfidf_vec : numpy.ndarray of shape (n_documents, n_terms)
        Floating-point matrix to normalise. It is modified in place.

    Returns
    -------
    None
        The result is written back into ``tfidf_vec``.
    """
    norm = np.linalg.norm(tfidf_vec, axis=1, keepdims=True)
    # An all-zero row (e.g. a document with no in-vocabulary term) has norm 0;
    # dividing by it would fill the row with NaN. Divide by 1 instead so the
    # row simply stays all zeros.
    norm[norm == 0] = 1.0
    # In-place division so the caller's array is updated.
    tfidf_vec /= norm

In [7]:
# Learn the vocabulary from the corpus, then encode every document as a dense
# row of raw term counts.
vectorizer = CountVectorizer().fit(X)
X_vectorized = vectorizer.transform(X).toarray()
print("Vocab: ", vectorizer.get_feature_names_out())

Vocab:  ['bát' 'bão' 'chim' 'cháo' 'có' 'cầu' 'gió' 'góp' 'gậy' 'gặt' 'làm' 'lành'
 'lưng' 'mới' 'qua' 'rút' 'ván' 'ông' 'ăn' 'đá' 'đất' 'đập' 'đậu']


In [8]:
# Unpack, in order: the per-term IDF weights, the log-scaled TF matrix and the
# (not yet normalised) TF-IDF matrix of the training documents.
(X_idf, x_tf, X_tfidf) = caculate_tfidf(X_vectorized=X_vectorized)

### Normalize TF-IDF values by L2 norm

In [9]:
# compute_norm rescales the rows of X_tfidf in place and returns nothing.
compute_norm(tfidf_vec=X_tfidf)

## Train a KNN model with 3 neighbors

In [10]:
# Fit a 3-nearest-neighbour classifier on the normalised TF-IDF vectors
# (`fit` returns the estimator itself, so construction and fitting chain).
knn_cls = KNeighborsClassifier(n_neighbors=3).fit(X_tfidf, y)

# Sanity check: predict the labels of the training documents themselves.
preds = knn_cls.predict(X_tfidf)
print(preds)

[1 0 1 1 1 1]


### Inference

In [12]:
# One unseen sentence, encoded with the already-fitted vectorizer; only words
# present in the learned vocabulary contribute to the counts.
test_text = np.array(["không làm cạp đất mà ăn"])
test_vec = vectorizer.transform(raw_documents=test_text).toarray()

In [13]:
# Apply the same log scaling as for the training counts, then weight each term
# by the IDF computed from the training corpus.
test_tf = np.log(1 + test_vec)
test_tfidf = X_idf * test_tf

In [14]:
# Normalise the test vector in place, as was done for the training matrix.
compute_norm(tfidf_vec=test_tfidf)

In [15]:
# Classify the test sentence and print the category name instead of the
# numeric label.
pred = knn_cls.predict(test_tfidf)
print(label_2_cate(labels=pred))

['positive']
