# Import libraries

In [25]:
import sklearn 
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Preprocess data

In [13]:
corpus = [
    "góp gió gặt bão", 
    "có làm mới có ăn", 
    "đất lành chim đậu", 
    "ăn cháo đá bát", 
    "gậy ông đập lưng ông", 
    "qua cầu rút ván"
]

# length document
n_doc = len(corpus)

In [14]:
# 1: positive - 0: negative
labels = [1, 1, 1, 0, 0, 0]

In [15]:
# dict stores categories
categories_label = {
    "positive": 1, 
    "negative": 0
}

In [16]:
def label_to_categories(labels): 
    key_list = list(categories_label.keys())
    val_list = list(categories_label.values())
    position = [val_list.index(label) for label in labels]
    return np.array(key_list)[position]

In [17]:
X = np.array(corpus)
y = np.array(labels)

In [18]:
X, y

(array(['góp gió gặt bão', 'có làm mới có ăn', 'đất lành chim đậu',
        'ăn cháo đá bát', 'gậy ông đập lưng ông', 'qua cầu rút ván'],
       dtype='<U20'),
 array([1, 1, 1, 0, 0, 0]))

# TF-IDF transform

In [19]:
# calculate tf-idf function
def calculate_tf_idf(X_vectorized, n_doc):
    
    # Term Frequency (TF) calculation
    tf = np.log(X_vectorized + 1)  # Applying log transformation to term frequencies
    
    # Document Frequency (DF) calculation
    df = np.sum(X_vectorized, axis=0)  # Summing up the occurrence of each term across all documents
    
    # Inverse Document Frequency (IDF) calculation
    idf = np.log((n_doc + 1) / (df + 1)) + 1  # Adding 1 to avoid division by zero, and applying log transformation
    
    # TF-IDF calculation
    tf_idf = tf * idf  # Multiplying TF with IDF
    
    return idf, tf, tf_idf

In [20]:
# calculate norm function
def compute_norm(tf_idf_vec): 
    # Calculate the Euclidean norm of each TF-IDF vector
    norm = np.linalg.norm(tf_idf_vec, axis=1)

    # Get the number of documents
    n_doc = tf_idf_vec.shape[0]

    # Normalize each TF-IDF vector by its corresponding norm
    normalized_tf_idf_vec = np.empty_like(tf_idf_vec)
    for idx in range(n_doc):
        normalized_tf_idf_vec[idx] = tf_idf_vec[idx] / norm[idx]

    return normalized_tf_idf_vec

In [21]:
# change to vector 
vectorizer = CountVectorizer()

# fit to normalization
X_vectorized = vectorizer.fit_transform(X).toarray()

# print it out
print("Vocab: ", vectorizer.get_feature_names_out())

Vocab:  ['bát' 'bão' 'chim' 'cháo' 'có' 'cầu' 'gió' 'góp' 'gậy' 'gặt' 'làm' 'lành'
 'lưng' 'mới' 'qua' 'rút' 'ván' 'ông' 'ăn' 'đá' 'đất' 'đập' 'đậu']


In [22]:
X_idf, X_tf, X_tf_idf = calculate_tf_idf(X_vectorized, n_doc)
X_idf, X_tf, X_tf_idf

(array([2.25276297, 2.25276297, 2.25276297, 2.25276297, 1.84729786,
        2.25276297, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
        2.25276297, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
        2.25276297, 2.25276297, 1.84729786, 1.84729786, 2.25276297,
        2.25276297, 2.25276297, 2.25276297]),
 array([[0.        , 0.69314718, 0.        , 0.        , 0.        ,
         0.        , 0.69314718, 0.69314718, 0.        , 0.69314718,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.        , 1.09861229,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.69314718, 0.        , 0.        , 0.69314718, 0.        ,
         0.        , 0.        , 0.        , 0.69314718, 0.        ,
         0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.693

In [24]:
# change to L2 norm
compute_norm(X_tf_idf)

array([[0.        , 0.5       , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.5       , 0.        , 0.5       ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.62232376,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.47882405, 0.        , 0.        , 0.47882405, 0.        ,
        0.        , 0.        , 0.        , 0.39264257, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.5       , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.5       , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.5       ],
       [0.52182349, 0.        , 0.        , 0.521

# Training

In [26]:
# get model 
knn_cls = KNeighborsClassifier(n_neighbors=3)
knn_cls.fit(X_tf_idf, y)
preds = knn_cls.predict(X_tf_idf)
print(preds)

[1 1 1 1 0 0]


## Inference

In [29]:
# change to vector
test_text = np.array(["không làm cạp đất mà ăn"])
test_vec = vectorizer.transform(test_text).toarray()
test_vec

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0]], dtype=int64)

In [31]:
test_tf = np.log(test_vec + 1)
test_tf_idf = test_tf * X_idf
test_tf_idf

array([[0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 1.5614963, 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        1.2804493, 0.       , 1.5614963, 0.       , 0.       ]])

In [32]:
compute_norm(test_tf_idf)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.61171251, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.50161301, 0.        ,
        0.61171251, 0.        , 0.        ]])

In [34]:
# change to label
pred = knn_cls.predict(test_tf_idf)

print(label_to_categories(pred))

['positive']
