In [7]:
from collections import defaultdict, Counter
import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

  from numpy.core.umath_tests import inner1d


## File

In [6]:
def load_data_as_lines(path):
    r""" Open text file by path and read all lines """
    with open(path, "r") as fh:
        lines = fh.readlines()
        
    # transform docs into lists of words
    raw_lines = [l.split() for l in lines]

    return raw_lines

def to_data_and_label(raw_lines):
    r""" Split training data and label from raw lines """
    y = list(map(lambda x: int(x[0]), raw_lines))
    x = list(map(lambda x: x[1:], raw_lines))
    return (x, y)

def save_result_as_file(prediction, file_name="prediction.dat"):
    r""" Save the predicted result as a new file """
    file_content = "\n".join(list(map(str, prediction)))
    with open(file_name, "w") as fd:
        fd.write(file_content) 

## Sparse Matrix Relevant

In [42]:
def calc_term_ids(docs):
    r""" The docs should be the combination of training set and test set."""
    term_ids = {}
    curr_term_id = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in term_ids:
                term_ids[w] = curr_term_id
                curr_term_id += 1
    return (term_ids, nnz)

def build_sparse_matrix(docs, term_ids = {}, nnz = 0):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    nrows = len(docs)
    ncols = len(term_ids)
    assert(ncols != 0)

    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=np.int)
    row_id = 0  # document ID / row counter
    acc = 0  # non-zero counter

    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k, _ in cnt.most_common())
        curr_doc_len = len(keys)
        for i, key in enumerate(keys):
            ind[acc + i] = term_ids[key]
            val[acc + i] = cnt[key]
        ptr[row_id + 1] = ptr[row_id] + curr_doc_len
        acc += curr_doc_len
        row_id += 1

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat

In [43]:
def csr_l2normalize(mat, copy=False):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat

## K-mer

In [8]:
def kmer(name, k=3):
    r""" Given a name and parameter k, return the vector of k-mers associated with the name
    """
    name = name.lower()
    name_len = len(name)
    v = []
    if name_len < k:
        return [name]

    for i in range(name_len - k + 1):
        v.append(name[i:i+k])
    
    return v

## KNN with K-Fold

In [45]:
def splitData(mat, cls, fold=1, d=10):
    r""" Split the matrix and class info into train and test data using d-fold hold-out
    """
    n = mat.shape[0]
    r = int(np.ceil(n*1.0/d))
    mattr = []
    clstr = []
    # split mat and cls into d folds
    for f in range(d):
        if f+1 != fold:
            mattr.append( mat[f*r: min((f+1)*r, n)] )
            clstr.extend( cls[f*r: min((f+1)*r, n)] )
    # join all fold matrices that are not the test matrix
    train = sp.vstack(mattr, format='csr')
    # extract the test matrix and class values associated with the test rows
    test = mat[(fold-1)*r: min(fold*r, n), :]
    clste = cls[(fold-1)*r: min(fold*r, n)]

    return train, clstr, test, clste

def classifyNames(x_train, cls, c=3, k=3, folds=10):
    def classify(x, train, clstr):
        r""" Classify vector x using kNN and majority vote rule given training data and associated classes
        """
        # find nearest neighbors for x
        dots = x.dot(train.T)
        sims = list(zip(dots.indices, dots.data))
        if len(sims) == 0:
            # could not find any neighbors
            return 1 if np.random.rand() > 0.5 else -1
        sims.sort(key=lambda x: x[1], reverse=True)
        tc = Counter(clstr[s[0]] for s in sims[:k]).most_common(2)
        if len(tc) < 2 or tc[0][1] > tc[1][1]:
            # majority vote
            return tc[0][0]
        # tie break
        tc = defaultdict(float)
        for s in sims[:k]:
            tc[clstr[s[0]]] += s[1]
        return sorted(tc.items(), key=lambda x: x[1], reverse=True)[0][0]
        
    macc = 0.0
    for f in range(folds):
        # split data into training and testing
        train, clstr, test, clste = splitData(x_train, cls, f+1, folds)
        # predict the class of each test sample
        clspr = [ classify(test[i,:], train, clstr) for i in range(test.shape[0]) ]
        
        macc += matthews_corrcoef(clste, clspr)
        
    return macc/d

In [46]:
def test_knn():
    # Read files
    train_lines = load_data_as_lines("train.dat")
    x_test = load_data_as_lines("test.dat")
    # Split 
    x_train, y_train = to_data_and_label(train_lines)
    
    # Text preprocessing
    print("Preprocessing documents...")
    docs_train = list(map(lambda x: kmer(x[0], k=3), x_train))
    docs_test = list(map(lambda x: kmer(x[0], k=3), x_test))
    
    # Build sparse matrix
    term_ids, nnz = calc_term_ids([*docs_train])
    mat_train = build_sparse_matrix(docs_train, term_ids, nnz)
    csr_l2normalize(mat_train)
    
    # Find best k
    max_k = 500
    curr_max_acc = 0
    max_acc_k = 0
    history = [[] for i in range(max_k)]

    for k in range(max_k):
        accuracy = classifyNames(mat_train, y_train, k=k+1)
        history[k].append(accuracy)
        if accuracy > curr_max_acc:
            curr_max_acc = accuracy
            max_acc_k = k + 1

    print("k=%d, accuracy: %f" % (max_acc_k, curr_max_acc))
    
    # Create prediction file
    term_ids2, nnz2 = calc_term_ids([*docs_train, *docs_test])
    mat_train2 = build_sparse_matrix(docs_train, term_ids2, nnz2)
    mat_test2 = build_sparse_matrix(docs_test, term_ids2, nnz2)
    csr_l2normalize(mat_train2)
    csr_l2normalize(mat_test2)
    
    y_test = predict(mat_test2, mat_train2, y_train, k=max_acc_k)
    file_name = "knn%d-kmer3.dat" % max_acc_k
    save_result_as_file(y_test, file_name)
    print("File %s created." % file_name)
    
def predict(test_data, train_data, labels, k=1):
    r""" Predict the label of test data based on the given
    labeled training data using k-nearest neighbor classifier.
    """
    predictions = []
    
    similarity = train_data.dot(test_data.T).todense()
    top_k_idx = np.argpartition(similarity, -k, axis=0)[-k:,:]
    to_labels = np.vectorize(lambda idx: labels[idx])
    top_k_label = to_labels(top_k_idx)
    
    n_test_col = similarity.shape[1]

    for col in range(n_test_col):
        train_tags = top_k_label[:,col].flatten().tolist()[0]
        train_indices = top_k_idx[:,col].flatten().tolist()[0]

        # Select the maximum aggregated similarity
        weights = {}
        for tag, row in zip(train_tags, train_indices):
            if tag in weights:
                weights[tag]["count"] += 1
                weights[tag]["value"] += similarity[row, col]
            else:
                weights[tag] = {
                    "count": 1,
                    "value": similarity[row, col]
                }
                
        result = max(weights.items(), key=lambda x: (x[1]["count"], x[1]["value"]))[0]
        predictions.append(result)

    return predictions

## Test Other Models (SVC, Logictic Regression, Naive Bayes, etc.)

In [32]:
def load_data_as_cv(x_train, x_test=None):
    count_vect = CountVectorizer()
    docs_train_joined = list(map(lambda x: " ".join(x), x_train))
    docs_test_joined = None if x_test is None else \
        list(map(lambda x: " ".join(x), x_test))

    x_train_cv = count_vect.fit_transform(docs_train_joined)
    x_test_cv = None if x_test is None else \
        count_vect.transform(docs_test_joined)
        
    return x_train_cv, x_test_cv

In [33]:
def get_pca_data(x_train):
    def percvar(v):
        r"""Transform eigen/singular values into percents.
        Return: vector of percents, prefix vector of percents
        """
        # sort values
        s = np.sort(np.abs(v))
        # reverse sorting order
        s = s[::-1]
        # normalize
        s = s/np.sum(s)
        return s, np.cumsum(s)

    def perck(s, p):
        return next(i+1 for i,v in enumerate(s) if v >= p)

    x_train_d = x_train.todense()
    X_std = StandardScaler().fit_transform(x_train_d)
    means = np.mean(X_std, axis=0)
    X_sm = X_std - means
    
    U,s,V = np.linalg.svd(X_sm)
    _, pv = percvar(s**2/(X_sm.shape[0]-1))

    percentage_explained = 95
    n_components = perck(pv, percentage_explained * 0.01)
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(x_train_d)

    return svd.transform(x_train_d), svd

In [42]:
def get_models_accuracy(data, y_train, use_pca=False, verbose=False):
    models = [
        LinearSVC(class_weight="balanced"),
        LinearSVC(),
        LogisticRegression(class_weight="balanced"),
        LogisticRegression(random_state=0),
        None if use_pca else MultinomialNB()
    ]
    models = list(filter(lambda x: x is not None, models))
    folds = 10
    scorer = make_scorer(matthews_corrcoef)
    
    data = data
    res = []
    for i, model in enumerate(models):
        model_name = model.__class__.__name__
        if model_name == "LinearSVC" and i == 0 or \
            model_name == "LogisticRegression" and i == 2:
            model_name += "(balanced)"
            
        accuracies = cross_val_score(model, data, y_train, scoring=scorer, cv=folds)
        result = (model_name, round(np.average(accuracies), 6))
        if verbose:
            print(result)
        res.append(result)
        
    return res

In [43]:
def get_model(model_name):
    if model_name == "LinearSVC":
        return LinearSVC()
    if model_name == "LinearSVC(balanced)":
        return LinearSVC(class_weight="balanced")
    if model_name == "LogisticRegression":
        return LogisticRegression(random_state=0)
    if model_name == "LogisticRegression(balanced)":
        return LogisticRegression(class_weight="balanced")
    if model_name == "MultinomialNB":
        return MultinomialNB()
    return None

In [44]:
def test_models(x_train, y_train, use_pca=False, verbose=False):
    results = []
    for k in range(2, 4):
        print("k = %d" % k)
        
        # Text preprocessing
        if verbose:
            print("Preprocessing documents...")
        docs_train = list(map(lambda x: kmer(x[0], k=k), x_train))

        x_train_cv, _ = load_data_as_cv(docs_train)
        if use_pca:
            x_train_cv, _ = get_pca_data(x_train_cv)

        if verbose:
            print("Evaluating...")
        model_acc = get_models_accuracy(x_train_cv, y_train, use_pca, verbose)
        model_acc = list(map(lambda x: (k, x[0], x[1]), model_acc))
        
        results = [*results, *model_acc]
    
    results = sorted(results, key=lambda x: x[2], reverse=True)
    
    return results

In [49]:
def main():
    # Read files
    train_lines = load_data_as_lines("train.dat")
    x_test = load_data_as_lines("test.dat")

    # Split 
    x_train, y_train = to_data_and_label(train_lines)
    
    # Get the most accurate models
    res = test_models(x_train, y_train)[:10]
    # Create files for accurate models
    for k, model_name, _ in res:
        docs_train = list(map(lambda x: kmer(x[0], k=k), x_train))
        docs_test = list(map(lambda x: kmer(x[0], k=k), x_test))
        x_train_cv, x_test_cv = load_data_as_cv(docs_train, docs_test)
    
        m = get_model(model_name)
        m.fit(x_train_cv, y_train)
        
        y_test = m.predict(x_test_cv)
        save_result_as_file(y_test, "%s-%d.dat" % (model_name, k))
    
    # Get the most accurate models with PCA
    res_pca = test_models(x_train, y_train, use_pca=True)[:6]
    # Create files for accurate PCA models
    for k, model_name, _ in res_pca:
        docs_train = list(map(lambda x: kmer(x[0], k=k), x_train))
        docs_test = list(map(lambda x: kmer(x[0], k=k), x_test))
        
        x_train_cv, x_test_cv = load_data_as_cv(docs_train, docs_test)
        x_train_cv, svd = get_pca_data(x_train_cv)
        
        x_test_cv = svd.transform(x_test_cv.todense())
    
        m = get_model(model_name)
        m.fit(x_train_cv, y_train)
        
        y_test = m.predict(x_test_cv)
        save_result_as_file(y_test, "%s-%d_pca.dat" % (model_name, k))

In [50]:
if __name__ == "__main__":
    main()

k = 2
k = 3
k = 2




k = 3




In [30]:
test_models()

k = 2
k = 3
k = 4


[(2, 'LogisticRegression', 0.853508),
 (2, 'LogisticRegression(balanced)', 0.828579),
 (3, 'LogisticRegression(balanced)', 0.827817),
 (3, 'LinearSVC(balanced)', 0.822402),
 (3, 'LinearSVC', 0.814257),
 (2, 'MultinomialNB', 0.812943),
 (2, 'LinearSVC', 0.803411),
 (2, 'LinearSVC(balanced)', 0.777075),
 (3, 'LogisticRegression', 0.772422),
 (4, 'LogisticRegression(balanced)', 0.73815),
 (4, 'LinearSVC', 0.723501),
 (4, 'LinearSVC(balanced)', 0.723244),
 (4, 'LogisticRegression', 0.651025),
 (3, 'MultinomialNB', 0.571121),
 (4, 'MultinomialNB', 0.106831)]

In [22]:
test_models(True)

k = 2




k = 3




k = 4




[(2, 'LogisticRegression', 0.832498),
 (3, 'LinearSVC', 0.802863),
 (3, 'LinearSVC(balanced)', 0.793694),
 (2, 'LinearSVC', 0.779962),
 (3, 'LogisticRegression', 0.772219),
 (2, 'LinearSVC(balanced)', 0.757175),
 (4, 'LogisticRegression', 0.683727),
 (4, 'LinearSVC', 0.663933),
 (4, 'LinearSVC(balanced)', 0.58965)]