# K-MEANS & SVM
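A notebook that clusters and classifies TF-IDF document vectors with K-Means (a from-scratch implementation and scikit-learn's) and with SVMs (linear and RBF kernel).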

In [1]:
# Python ≥3.6 is required (f-strings are used further down in this notebook)
import sys
assert sys.version_info >= (3, 6)

# Disable warnings
# NOTE: this hides every warning for the rest of the session; narrow or remove
# the filter when debugging unexpected results.
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: comparing the raw version strings is
# lexicographic, so e.g. "0.9" would wrongly pass as >= "0.20".
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "unparseable scikit-learn version: %r" % sklearn.__version__
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

# Common imports
import copy
import pandas as pd
import numpy as np
import os
# Parent directory of the current working directory (os.path.abspath('') resolves to the cwd)
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
# (this seeds NumPy's global random state)
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and for x/y tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [2]:
# Locations of the input data and of the generated figures, all resolved
# against the current working directory.
CHAPTER_ID = "01_kmeans_svm"
NOTE_ROOT_DIR = os.path.abspath('')
DATA_DIR = os.path.join(NOTE_ROOT_DIR, "data", "20news-bydate")
IMAGES_PATH = os.path.join(NOTE_ROOT_DIR, "images", CHAPTER_ID)

# Create the figure directory up front; exist_ok makes re-running the cell safe.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension); also echoed to stdout.
        tight_layout: When true, ``plt.tight_layout()`` is applied before saving.
        fig_extension: File extension, also passed to ``savefig`` as ``format``.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

## 1 - KMeans from Scratch
The source code of the from-scratch KMeans implementation is in `./models/`.

In [4]:
# Exercise the from-scratch KMeans implementation and report clustering quality
from models import KMeans

kmeans = KMeans(num_clusters=8)
kmeans.load_data()
kmeans.run(seed_value=42, criterion='similarity', threshold=1e-3)

# Purity is evaluated before NMI; the two strings are passed as separate
# print() arguments, so a space separates them in the output.
purity = kmeans.compute_purity()
nmi = kmeans.compute_NMI()
print(f"Purity: {purity}", f"\nNMI: {nmi}")

Purity: 0.1069723018147087 
NMI: 0.002836080655787284


## 2 - Sklearn Implementation

In [3]:
# Evaluation metric
def compute_accuracy(y_pred, y):
    """Return the fraction of predictions that exactly match the true labels.

    Args:
        y_pred: Sequence or array of predicted labels.
        y: Sequence or array of true labels, with the same shape as ``y_pred``.

    Returns:
        Number of matching positions divided by ``len(y)``.

    Raises:
        ValueError: If the two inputs have different shapes or are empty.
            Without the shape check NumPy would silently broadcast (e.g. a
            length-1 prediction against many labels) and report a misleading
            score; without the emptiness check the result would be NaN.

    Note:
        The score is only meaningful when both inputs use the same label
        vocabulary. Cluster indices produced by an unsupervised model are
        arbitrary ids and must be mapped to classes before being compared.
    """
    y_pred = np.asarray(y_pred)
    y = np.asarray(y)
    if y_pred.shape != y.shape:
        raise ValueError(
            f"shape mismatch: y_pred has shape {y_pred.shape}, y has shape {y.shape}"
        )
    if y.size == 0:
        raise ValueError("cannot compute accuracy on empty inputs")

    # Boolean mask of matches; summing it as floats counts the matches.
    matches = y_pred == y
    accuracy = np.sum(matches, dtype=float) / len(y)
    return accuracy

### 2.1 - Load the Data

In [5]:
# Load the data
def load_data(path: str = DATA_DIR):
    """Load the TF-IDF document vectors and their labels from ``path``.

    Two files are read from the directory:

    * ``words_idfs.txt`` - only its number of lines is used, as the
      vocabulary size (the length of every document vector).
    * ``data_tf_idf.txt`` - one document per line, in the form
      ``<label><fff><doc_id><fff><index>:<tfidf> <index>:<tfidf> ...``.
      The ``doc_id`` field is not used. Blank lines are skipped.

    Args:
        path: Directory containing the two files.

    Returns:
        A ``(data, labels)`` tuple. ``data`` is a list holding one dense
        ``np.ndarray`` of length ``vocab_size`` per document; ``labels`` is an
        ``np.ndarray`` of the integer labels, in the same order.
    """
    def sparse_to_dense(sparse_r_d, vocab_size):
        """Expand an ``"index:tfidf index:tfidf ..."`` string into a dense vector."""
        r_d = np.zeros(vocab_size)
        for index_and_tfidf in sparse_r_d.split():
            # Split each token once and reuse both halves.
            parts = index_and_tfidf.split(':')
            index = int(parts[0])
            tfidf = float(parts[1])
            r_d[index] = tfidf
        return r_d

    with open(os.path.join(path, "data_tf_idf.txt")) as f:
        data_lines = f.read().splitlines()
    with open(os.path.join(path, "words_idfs.txt")) as f:
        vocab_size = len(f.read().splitlines())

    data, labels = [], []
    for d in data_lines:
        if not d.strip():
            # Tolerate blank lines instead of failing on int('').
            continue
        features = d.split('<fff>')
        label = int(features[0])
        r_d = sparse_to_dense(sparse_r_d=features[2], vocab_size=vocab_size)
        data.append(r_d)
        labels.append(label)
    return data, np.array(labels)

# Extract the data: X is a list of dense TF-IDF vectors (one per document),
# y is the array of their labels in the same order
X, y = load_data()

In [6]:
# Split the data into a train set (80%) and a test set (20%); the fixed
# random_state keeps the split reproducible across runs
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2.2 - K-Means

In [19]:
# Run with K-Means
def clustering_with_KMeans(X_train, n_clusters=8, random_state=42):
    """Cluster ``X_train`` with scikit-learn's K-Means and return the cluster ids.

    Args:
        X_train: Samples to cluster; anything ``scipy.sparse.csr_matrix``
            accepts (e.g. a list of dense vectors or a 2-D array).
        n_clusters: Number of clusters to form.
        random_state: Seed for the random centroid initialisation.

    Returns:
        The fitted estimator's ``labels_``: one cluster index per sample.
        These indices are arbitrary identifiers, not class labels.
    """
    from sklearn import cluster
    from scipy.sparse import csr_matrix

    # Fit on a CSR matrix rather than on the dense input.
    X_train = csr_matrix(X_train)
    print("========")
    kmeans = cluster.KMeans(
        n_clusters=n_clusters,
        init="random",
        n_init=10,
        tol=1e-3,
        random_state=random_state,
    ).fit(X_train)
    return kmeans.labels_

y_pred = clustering_with_KMeans(X_train)
# NOTE(review): K-Means cluster indices are arbitrary ids, not class labels, so
# comparing them element-wise with y_train does not measure clustering quality.
# A permutation-invariant score such as purity or NMI (the metrics reported for
# the from-scratch model) would be appropriate here.
compute_accuracy(y_pred, y_train)

0.04046166091801539

### 2.3 - Linear SVM

In [21]:
# Run with Linear SVM
def classifying_with_linear_SVM(X_train, y_train, X_test, y_test):
    """Fit a ``LinearSVC`` on the training split and print its test accuracy.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        X_test: Samples to predict after fitting.
        y_test: Labels the predictions are scored against.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    from sklearn.svm import LinearSVC

    linear_svm = LinearSVC(C=10.0, tol=0.001, verbose=True)
    linear_svm.fit(X_train, y_train)

    test_accuracy = compute_accuracy(linear_svm.predict(X_test), y_test)
    print("\nAccuracy:", test_accuracy)
    return linear_svm

# Train and evaluate; the returned classifier is the cell's last expression
classifying_with_linear_SVM(X_train, y_train, X_test, y_test)

[LibLinear]
Accuracy: 0.9153846153846154


LinearSVC(C=10.0, tol=0.001, verbose=True)

In [1]:
# Run with Kernel SVM
def classifying_with_kernel_SVM(X_train, y_train, X_test, y_test):
    """Fit an RBF-kernel ``SVC`` on the training split and print its test accuracy.

    Args:
        X_train: Training samples.
        y_train: Training labels.
        X_test: Samples to predict after fitting.
        y_test: Labels the predictions are scored against.

    Returns:
        The fitted ``SVC`` instance.
    """
    from sklearn.svm import SVC

    rbf_svm = SVC(C=50.0, kernel="rbf", gamma=0.1, tol=0.001, verbose=True)
    rbf_svm.fit(X_train, y_train)

    test_predictions = rbf_svm.predict(X_test)
    test_accuracy = compute_accuracy(test_predictions, y_test)
    print("\nAccuracy:", test_accuracy)
    return rbf_svm

# Train and evaluate; the returned classifier is the cell's last expression
classifying_with_kernel_SVM(X_train, y_train, X_test, y_test)

[LibSVM]
Accuracy: 0.9044991511035654
