In [21]:
# Toy corpus: six short Vietnamese proverbs, one document per string.
corpus = [
    "góp gió gặt bão",
    "có làm mới có ăn",
    "đất lành chim đậu",
    "ăn cháo đá bát",
    "gậy ông đập lưng ông",
    "qua cầu rút ván",
]

# One label per document, aligned with `corpus` by position
# (1 = positive, 0 = negative).
labels = [1, 1, 1, 0, 0, 0]

# Number of documents in the corpus.
n_doc = len(corpus)

In [4]:
import numpy as np

# Wrap the raw documents and their labels in NumPy arrays for scikit-learn.
X, y = np.array(corpus), np.array(labels)

In [14]:
def caculate_tfidf(X_vectorized):
    """Compute smoothed IDF, log-scaled TF and their product from term counts.

    Parameters
    ----------
    X_vectorized : array-like of shape (n_documents, n_terms)
        Raw term counts, one row per document and one column per term.

    Returns
    -------
    idf : ndarray of shape (n_terms,)
        ``ln((1 + n_documents) / (1 + df)) + 1`` where ``df`` is the number
        of documents in which the term has a non-zero count.
    tf : ndarray of shape (n_documents, n_terms)
        ``ln(1 + count)`` for every entry of the count matrix.
    tfidf : ndarray of shape (n_documents, n_terms)
        ``tf * idf``; rows are NOT normalised here.
    """
    counts = np.asarray(X_vectorized)

    # Take the document count from the matrix itself so the function does not
    # depend on a notebook-level global being defined (and being in sync).
    n_documents = counts.shape[0]

    tf = np.log1p(counts)

    # Document frequency counts documents that contain the term, not total
    # occurrences: a term repeated inside a single document still has df == 1.
    df = np.count_nonzero(counts, axis=0)
    idf = np.log((n_documents + 1) / (df + 1)) + 1

    tfidf = tf * idf
    return idf, tf, tfidf

def compute_norm(tfidf_vec):
    """Scale every row of ``tfidf_vec`` to unit L2 norm, in place.

    Parameters
    ----------
    tfidf_vec : ndarray of shape (n_documents, n_terms), floating dtype
        Matrix to normalise. It is modified in place.

    Returns
    -------
    None
        The result is written back into ``tfidf_vec``.

    Notes
    -----
    Rows whose norm is zero (documents with no known term) are left as all
    zeros instead of being turned into NaN by a division by zero.
    """
    # keepdims=True gives shape (n_documents, 1) so the division broadcasts
    # across each row without an explicit Python loop.
    row_norms = np.linalg.norm(tfidf_vec, axis=1, keepdims=True)

    # Dividing an all-zero row by 1 keeps it at zero and avoids 0/0.
    row_norms[row_norms == 0] = 1.0

    # Augmented assignment on an ndarray writes into the caller's buffer.
    tfidf_vec /= row_norms

In [7]:
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [9]:
# Learn the vocabulary from the corpus, then encode every document as a dense
# row of raw term counts (one column per vocabulary entry).
vectorizer = CountVectorizer().fit(X)
X_vectorized = vectorizer.transform(X).toarray()

print("Vocab: ", vectorizer.get_feature_names_out())
X_vectorized

Vocab:  ['bát' 'bão' 'chim' 'cháo' 'có' 'cầu' 'gió' 'góp' 'gậy' 'gặt' 'làm' 'lành'
 'lưng' 'mới' 'qua' 'rút' 'ván' 'ông' 'ăn' 'đá' 'đất' 'đập' 'đậu']


array([[0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0],
       [0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1,
        0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
        0]])

In [15]:
# Hand-rolled TF-IDF: derive the idf vector, the tf matrix and the tf-idf
# matrix from the raw count matrix.
X_idf, X_tf, X_tfidf = caculate_tfidf(X_vectorized)
# compute_norm works in place and returns nothing: after this call X_tfidf
# itself holds the rows divided by their L2 norm.
compute_norm(X_tfidf)

In [16]:
# Fit a 3-nearest-neighbour classifier on the hand-rolled TF-IDF features and
# predict on the very same rows it was trained on (no held-out data here).
knn_cls = KNeighborsClassifier(n_neighbors=3).fit(X_tfidf, y)
preds = knn_cls.predict(X_tfidf)

# Predictions on the first line, ground-truth labels on the second.
print(preds, y, sep="\n")

[1 0 1 1 1 1]
[1 1 1 0 0 0]


In [22]:
# Using sklearn pipeline: raw text -> term counts -> TF-IDF -> 1-NN classifier.
text_clf_model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier(n_neighbors=1)),
    ]
).fit(X, y)

# Classify a single proverb that is not part of the training corpus.
# Note: this rebinds `preds`.
test_text = np.array(["cái nết đánh chết cái đẹp"])
preds = text_clf_model.predict(test_text)
print(preds)

[0]
