In [38]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict, Counter
import string
import re
from itertools import groupby
from operator import itemgetter
from nltk.stem import PorterStemmer 
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import f1_score

In [39]:
# Definition of preprocessing functions

# Functional functions
def compose2(f, g):
    return lambda *a, **kw: f(g(*a, **kw))

def compose (*functions):
    def inner(arg):
        for f in reversed(functions):
            arg = f(arg)
        return arg
    return inner

# Filtering
has_valid_len = lambda w: len(w) >= 4
is_nonstop_word = lambda w: w not in ENGLISH_STOP_WORDS
is_valid_word = lambda w: has_valid_len(w) and is_nonstop_word(w)

# Mapping
to_lower = lambda w: w.lower()
remove_punc = lambda w: w.translate(str.maketrans('', '', string.punctuation))
remove_num = lambda w: re.sub(r'\d+', '', w)

map_word = compose(to_lower, remove_punc, remove_num)

# This mapping is expensive, so do it after filtering
ps = PorterStemmer() 
stem = lambda w: ps.stem(w)

def preprocess(docs):
    docs = [ [map_word(t) for t in d ] for d in docs ]
    docs = [ [t for t in d if is_valid_word(t)] for d in docs ]
    docs = [ [stem(t) for t in d ] for d in docs ]
    return docs

# def preprocess2(docs):
#     docs = preprocess(docs)
#     return [ [stem(t) for t in d ] for d in docs ]

In [40]:
# Create sparse matrices, which requires aggregration of term IDs for both training and test data.

def calc_term_ids(docs):
    r""" The docs should be the combination of training set and test set."""
    term_ids = {}
    curr_term_id = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in term_ids:
                term_ids[w] = curr_term_id
                curr_term_id += 1
    return (term_ids, nnz)

def build_sparse_matrix(docs, term_ids = {}, nnz = 0):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    nrows = len(docs)
    ncols = len(term_ids)
    assert(ncols != 0)

    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=np.int)
    row_id = 0  # document ID / row counter
    acc = 0  # non-zero counter

    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k, _ in cnt.most_common())
        curr_doc_len = len(keys)
        for i, key in enumerate(keys):
            ind[acc + i] = term_ids[key]
            val[acc + i] = cnt[key]
        ptr[row_id + 1] = ptr[row_id] + curr_doc_len
        acc += curr_doc_len
        row_id += 1

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat

In [41]:
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf. 
    Returns scaling factors as dict. If copy is True, 
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat, copy=False):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat
    
def csr_normalize(mat, copy=False, **kargs):
    r""" combination of 'csr_idf' and 'csr_l2normalize' """
    if copy is True:
        mat = mat.copy()
    csr_idf(mat)
    csr_l2normalize(mat)
    if copy is True:
        return mat

In [48]:
def predict(test_data, train_data, labels, k=1, verbose=False):
    r""" Predict the label of test data based on the given
    labeled training data using k nearest neighbor algorithm.
    """
    nrows = test_data.shape[0]
    predictions = []
    for i, data in enumerate(test_data):
        if verbose:
            print( "- Starting test data %s..." % (i) )
        predictions.append(knn(data, train_data, labels, k=k, verbose=verbose))
        
    return predictions

def norm(mat):
    r""" L2-norm """
    return np.sqrt(mat.power(2).sum())

def dist(a, b):
    r""" Cosine similarity """
    dp = a.dot(b.T).todense().item()
    return dp / ( norm(a) * norm(b))

def knn(point, cluster, cluster_labels, k, verbose=False):
    r""" Implementation of K-nearest-neighbor """
    dists = []
    
    # Calculate distance b/ the given node to all other nodes
    for i, node in enumerate(cluster):
        d = dist(point, node)
        if verbose:
            print( "- Compared with train data %s. Dist: %s" % (i, d) )
        dists.append({"dist": d, "label": cluster_labels[i]})
  
    # Sort and truncate by K
    dists = sorted(dists, reverse=True, key = lambda i: i["dist"])[:k]
    if verbose:
        print( "- Dist truncated by K=%d:" % (k), dists )

    # Add weight for classification
    weights = {}
    for label, v in groupby(dists, key = lambda i: i["label"]):
        weights[label] = sum(item["dist"] for item in list(v))
    
    return max(weights.items(), key=itemgetter(1))[0]

In [43]:
def load_data_as_lines(path):
    r""" Open text file by path and read all lines """
    with open(path, "r") as fh:
        lines = fh.readlines()
        
    # transform docs into lists of words
    raw_lines = [l.split() for l in lines]

    return raw_lines

In [44]:
def to_data_and_label(raw_lines):
    r""" Split training data and label from raw lines """
    y = list(map(lambda x: int(x[0]), raw_lines))
    x = list(map(lambda x: x[1:], raw_lines))
    return (x, y)

In [45]:
def save_result_as_file(prediction, file_name="prediction.dat"):
    r""" Save the predicted result as a new file """
    file_content = "\n".join(list(map(str, prediction)))
    with open(file_name, "w") as fd:
        fd.write(file_content) 

In [49]:
def main():
    # Read files
    train_lines = load_data_as_lines("train.dat")
    x_test = load_data_as_lines("test.dat")
    
    # Split 
    x_train, y_train = to_data_and_label(train_lines)
    print(y_train)
    # Text preprocessing
    docs_train = preprocess(x_train)
    docs_test = preprocess(x_test)

    # Build shared term ID dictionary for classification
    term_ids, nnz = calc_term_ids([*docs_train, *docs_test])

    # Build sparse matrices
    mat_train = build_sparse_matrix(docs_train, term_ids, nnz)
    mat_test = build_sparse_matrix(docs_test, term_ids, nnz)

    # Normalize CSR matrices
    mat_test = csr_normalize(mat_test, copy=True)
    mat_train = csr_normalize(mat_train, copy=True)
    
    # Train
    y_test = predict(mat_test, mat_train, y_train, k=6, verbose=False)
    save_result_as_file(y_test)

In [None]:
if __name__ == "__main__":
    main()

[4, 5, 2, 5, 4, 1, 5, 5, 3, 5, 5, 5, 2, 5, 5, 5, 1, 5, 1, 3, 1, 3, 1, 5, 3, 5, 5, 5, 3, 5, 3, 5, 1, 5, 4, 3, 1, 4, 3, 1, 1, 1, 5, 4, 2, 5, 5, 3, 1, 3, 1, 5, 5, 5, 5, 3, 5, 4, 3, 1, 3, 3, 5, 5, 3, 5, 1, 3, 4, 5, 4, 4, 3, 1, 1, 2, 4, 5, 4, 5, 2, 1, 3, 1, 2, 5, 5, 5, 4, 1, 5, 5, 3, 2, 4, 1, 5, 1, 3, 4, 1, 1, 2, 5, 1, 2, 1, 5, 3, 2, 3, 5, 4, 1, 4, 1, 3, 5, 5, 4, 5, 3, 4, 5, 4, 2, 5, 1, 3, 2, 1, 4, 4, 4, 4, 4, 5, 3, 4, 1, 4, 5, 4, 5, 1, 2, 1, 3, 3, 1, 1, 1, 4, 3, 5, 4, 5, 5, 1, 5, 5, 1, 3, 4, 1, 3, 5, 3, 4, 1, 1, 5, 1, 4, 1, 1, 3, 5, 5, 1, 4, 2, 2, 4, 2, 3, 5, 5, 4, 2, 5, 5, 3, 3, 4, 5, 5, 5, 4, 4, 5, 5, 3, 4, 1, 1, 3, 4, 5, 4, 5, 5, 5, 2, 1, 5, 5, 1, 4, 1, 3, 4, 2, 1, 5, 5, 3, 5, 4, 3, 4, 5, 4, 4, 2, 3, 5, 1, 1, 4, 2, 5, 5, 1, 5, 3, 3, 3, 1, 4, 5, 4, 5, 4, 3, 5, 1, 2, 1, 4, 5, 5, 1, 4, 3, 1, 4, 5, 5, 5, 5, 1, 4, 4, 5, 4, 2, 5, 1, 5, 2, 1, 5, 5, 5, 4, 5, 4, 5, 4, 1, 4, 1, 3, 2, 4, 1, 4, 3, 1, 4, 5, 2, 5, 3, 5, 1, 5, 5, 5, 5, 3, 5, 5, 2, 5, 4, 5, 5, 3, 5, 4, 3, 4, 5, 1, 5, 1, 3, 4, 4, 4, 4, 