In [1]:
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

#  KMeans

In [2]:
def load_data(data_path):
    """Load the TF-IDF corpus stored under ``data_path`` as dense vectors.

    Two files are read; their names are appended to ``data_path`` by plain
    string concatenation, so ``data_path`` must end with a path separator:

    * ``words_idfs.txt`` -- only its line count is used; it fixes the
      vocabulary size, i.e. the length of every document vector.
    * ``data_tf_idf.txt`` -- one document per line, formatted as
      ``<label><fff><doc_id><fff><index>:<tfidf> <index>:<tfidf> ...``.
      The ``doc_id`` field is ignored. Blank lines are skipped.

    Args:
        data_path: Prefix (directory ending with a separator) of both files.

    Returns:
        A tuple ``(data, labels)``. ``data`` is a list holding one 1-D float
        ``np.ndarray`` of length ``vocab_size`` per document; ``labels`` is a
        1-D ``np.ndarray`` with the integer label of each document, in file
        order.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a label, index or tf-idf value cannot be parsed.
        IndexError: If a line has fewer than three ``<fff>`` fields, an entry
            has no ``:``, or an index lies outside the vocabulary.
    """
    def sparse_to_dense(sparse_r_d, vocab_size):
        """Expand an ``"index:tfidf index:tfidf ..."`` string to a dense vector."""
        # Allocate the zero vector directly instead of building a Python list
        # of vocab_size floats and converting it afterwards.
        r_d = np.zeros(vocab_size)
        for index_tfidf in sparse_r_d.split():
            # Split once and reuse both halves.
            parts = index_tfidf.split(':')
            index = int(parts[0])
            tfidf = float(parts[1])
            r_d[index] = tfidf
        return r_d

    # Read the documents and derive the vocabulary size.
    with open(data_path + "data_tf_idf.txt") as f:
        d_lines = f.read().splitlines()
    with open(data_path + "words_idfs.txt") as f:
        vocab_size = len(f.read().splitlines())

    data, labels = [], []
    for d in d_lines:
        # A stray empty line (e.g. at the end of the file) carries no document.
        if not d.strip():
            continue
        features = d.split('<fff>')
        labels.append(int(features[0]))
        data.append(sparse_to_dense(sparse_r_d=features[2], vocab_size=vocab_size))
    return data, np.array(labels)

In [None]:
# Location of data_tf_idf.txt and words_idfs.txt; load_data appends the file
# names directly, so the trailing slash matters.
link_path = "/Users/nguyennamhai/HUST/Training phase Machine Learning lab 2023/Phase 1/Session 2/Datasets/"
X, y = load_data(link_path)

# Hold out a test partition; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=28,
)
# Store the training vectors in CSR (compressed sparse row) form before fitting.
X_train = csr_matrix(X_train)

In [None]:
# evaluate accuracy
def compute_accuracy(prediced_y, expected_y):
    """Return the fraction of predictions that equal the expected labels.

    Args:
        prediced_y: Predicted labels (array-like). The misspelled parameter
            name is kept because existing callers pass it by keyword.
        expected_y: Ground-truth labels (array-like).

    Returns:
        The number of element-wise matches divided by the number of expected
        labels, as a float.
    """
    # Coerce both inputs so plain lists work too; `.size` only exists on arrays.
    predicted = np.asarray(prediced_y)
    expected = np.asarray(expected_y)
    matches = np.equal(predicted, expected)
    accuracy = np.sum(matches.astype(float)) / expected.size
    return accuracy

In [None]:
# KMeans
# Cluster ids produced by KMeans are arbitrary integers, so comparing them
# with class labels directly does not measure anything. Fit one cluster per
# class, name each cluster after the majority class of its training members,
# and score the test set with those mapped labels.
y_train_arr = np.asarray(y_train)
class_values, class_counts = np.unique(y_train_arr, return_counts=True)
kmeans = KMeans(
    n_clusters=class_values.size, random_state=42, tol=1e-3, n_init=5
).fit(X_train)

# Default for a cluster without training members: the overall majority class.
cluster_to_label = np.full(kmeans.n_clusters, class_values[class_counts.argmax()])
for cluster_id in range(kmeans.n_clusters):
    member_labels = y_train_arr[kmeans.labels_ == cluster_id]
    if member_labels.size:
        values, counts = np.unique(member_labels, return_counts=True)
        cluster_to_label[cluster_id] = values[counts.argmax()]

# Translate predicted cluster ids into class labels before scoring.
predicted_y_kmeans = cluster_to_label[kmeans.predict(X_test)]
accuracy_kmeans = compute_accuracy(prediced_y=predicted_y_kmeans,expected_y=y_test)
print("Accuracy: {}".format(accuracy_kmeans))

# Linear SVM

In [None]:
# Folder with data_tf_idf.txt and words_idfs.txt (load_data concatenates the
# file names onto this string, hence the trailing slash).
link_path = "/Users/nguyennamhai/HUST/Training phase Machine Learning lab 2023/Phase 1/Session 2/Datasets/"
X, y = load_data(link_path)
# Train/test partition with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=28,
)
# Convert the training vectors to a CSR sparse matrix before fitting.
X_train = csr_matrix(X_train)

In [None]:
# Linear SVM
# random_state pins the solver's internal data shuffling so repeated runs give
# the same model, in line with the seeded train/test split and KMeans.
classifier = LinearSVC(
    C=10.0, tol=0.001, verbose=True, random_state=42
).fit(X_train, y_train)

In [None]:
# Score the linear SVM on the held-out test split.
predicted_y_svm = classifier.predict(X_test)
accuracy_linearsvms = compute_accuracy(predicted_y_svm, y_test)
print("Accuracy: {}".format(accuracy_linearsvms))

# Kernel SVM

In [None]:
# Dataset folder holding data_tf_idf.txt and words_idfs.txt; the trailing
# slash is needed because load_data appends the file names to this string.
link_path = "/Users/nguyennamhai/HUST/Training phase Machine Learning lab 2023/Phase 1/Session 2/Datasets/"
X, y = load_data(link_path)
# Seeded train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=28,
)
# Keep the training vectors as a CSR sparse matrix for fitting.
X_train = csr_matrix(X_train)

In [None]:
# Kernel SVM with an RBF kernel.
# Other accepted kernel values: 'linear', 'poly', 'sigmoid', 'precomputed'.
svc_params = dict(
    C=50.0,
    kernel='rbf',
    gamma=0.1,
    tol=0.001,
    verbose=True,
)
classifier_ker = SVC(**svc_params).fit(X_train, y_train)

In [None]:
# compute accuracy
# Predict with the kernel SVM (`classifier_ker`); `classifier` is the linear
# model fitted in the previous section and would just repeat its score here.
predicted_y_svc = classifier_ker.predict(X_test)
accuracy_svc = compute_accuracy(prediced_y=predicted_y_svc, expected_y=y_test)
print("Accuracy: {}".format(accuracy_svc))