In [1]:
from collections import defaultdict
import numpy as np
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.svm import LinearSVC

In [7]:
def load_data(data_path):
    def sparse_to_dense(sparse_r_d, vocab_size):
        r_d = [0.0 for _ in range(vocab_size)]
        
        indices_and_tfidfs = sparse_r_d.split()
        for index_and_tfidf in indices_and_tfidfs:
            index = int(index_and_tfidf.split(':')[0])
            tfidf = float(index_and_tfidf.split(':')[1])
            r_d[index] = tfidf
        return np.array(r_d)    
            
    with open(data_path) as f:
        d_lines = f.read().splitlines()
    with open("../datasets/20news-bydate/words_idfs.txt") as f:
        vocab_size = len(f.read().splitlines())

    labels = []
    data = []
    label_count = defaultdict(int)
    for data_id, d in enumerate(d_lines):
        features = d.split('<fff>')
        label, doc_id = int(features[0]), int(features[1])
        label_count[label] += 1
        r_d = sparse_to_dense(sparse_r_d=features[2], vocab_size=vocab_size)
        data.append(r_d)
        labels.append(label)
    
    return data, labels

In [15]:
def compute_accuracy(predicted_y, expected_y):
    matches = np.equal(predicted_y, expected_y)
    accuracy = np.sum(matches.astype(float))/predicted_y.size
    return accuracy

In [11]:
def clustering_with_KMeans():
    train_X, train_y = load_data(data_path='../datasets/20news-bydate/train_tf_idf.txt')
    #use csr_matrix to create a sparse matrix with efficient row slicing
    X = csr_matrix(train_X)
    print ('=========')
    kmeans = KMeans(
        n_clusters=20,
        init='random',
        n_init=5,
        tol=1e-3,
        random_state=2018
    ).fit(X)
    labels = kmeans.labels_
    training_accuracy = compute_accuracy(predicted_y=labels, expected_y=train_y)
    print("Training accuracy: ", training_accuracy)

    test_X, test_y = load_data(data_path='../datasets/20news-bydate/test_tf_idf.txt')
    predicted_y = kmeans.predict(test_X)
    accuracy = compute_accuracy(predicted_y=predicted_y, expected_y=test_y)
    print("Accuracy: ",accuracy)
    

In [18]:
clustering_with_KMeans()

Training accuracy:  0.06213540745978434
Accuracy:  0.05164630908125332


In [13]:
def classifying_with_linear_SVMs():
    train_X, train_y = load_data(data_path='../datasets/20news-bydate/train_tf_idf.txt')
    classifier = LinearSVC(
        C=10.0, #penalty coefficients
        tol=0.001, #tolerance for stopping data
        verbose=True #whether to print out logs or not 
    )
    classifier.fit(train_X,train_y)
    
    test_X, test_y = load_data(data_path='../datasets/20news-bydate/test_tf_idf.txt')
    predicted_y = classifier.predict(test_X)
    accuracy = compute_accuracy(predicted_y=predicted_y, expected_y=test_y)
    print("Accuracy: ",accuracy)

In [17]:
classifying_with_linear_SVMs()

[LibLinear]Accuracy:  0.048194370685077
