In [98]:
import numpy as np
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict
from pyparsing import anyOpenTag, anyCloseTag
from xml.sax.saxutils import unescape as unescape

In [99]:
# Read the raw training and test documents, one document per line.
with open("data/train.dat", "r") as train_file:
    train = list(train_file)
with open("data/test.dat", "r") as test_file:
    test = list(test_file)
# Training lines come first, test lines after them.
lines = train + test

In [100]:
# transform docs into lists of words (split on whitespace)
docs_X = [l.split() for l in train]
print(len(docs_X))
docs = [l.split() for l in lines]
print(len(docs))
# The first token of every training line is taken as that document's label.
# NOTE(review): the label token also stays inside `docs`; it only disappears
# later if the length filter applied to `docs` happens to drop it.
labels = [d[0] for d in docs_X]

25000
50000


In [107]:
def filterLen(docs, minlen):
    r""" Drop every term that is shorter than minlen characters.
    docs is a list of documents, each one given as a list of words
    minlen is the smallest word length that is kept
    Returns a new list of lists; docs itself is not modified.
    """
    kept = []
    for doc in docs:
        kept.append([term for term in doc if not len(term) < minlen])
    return kept
# Keep only terms with at least 4 characters, then show the first 50 tokens
# of document 0 before and after filtering.
docs1 = filterLen(docs, 4)
print(docs[0][:50])
print(docs1[0][:50])

['-1', 'Although', 'a', 'film', 'with', 'Bruce', 'Willis', 'is', 'always', 'worth', 'watching,', 'you', 'better', 'skip', 'this', 'one.', 'I', 'watched', 'this', 'one', 'on', 'television,', 'so', 'I', "didn't", 'have', 'to', 'plunk', 'down', 'cash', 'for', 'it.', 'Lucky', 'me.<br', '/><br', '/>The', 'plot', 'develops', 'slowly,', 'very', 'slowly.', 'Although', 'the', 'first', '30', 'minutes', 'or', 'so', 'are', 'quite']
['Although', 'film', 'with', 'Bruce', 'Willis', 'always', 'worth', 'watching,', 'better', 'skip', 'this', 'one.', 'watched', 'this', 'television,', "didn't", 'have', 'plunk', 'down', 'cash', 'Lucky', 'me.<br', '/><br', '/>The', 'plot', 'develops', 'slowly,', 'very', 'slowly.', 'Although', 'first', 'minutes', 'quite', 'believable,', 'gets', 'more', 'more', 'unbelievable', 'towards', 'end.', 'highly', 'questionable,', 'seasoned', 'soldier', 'like', 'Waters', 'would', 'disobey', 'direct', 'orders.']


In [108]:
from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs):
    r""" Build a sparse term-count matrix from a list of documents,
    each of which is a list of words/terms in the document.

    Row i holds the term counts of docs[i]. Column ids are handed out in
    order of first appearance while scanning docs front to back.
    Returns a scipy.sparse.csr_matrix of shape (len(docs), n_distinct_terms)
    with double-precision values and sorted column indices.
    """
    nrows = len(docs)
    idx = {}  # term -> column id
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = len(idx)
    ncols = len(idx)

    # set up memory
    # (the np.int alias was removed in NumPy 1.24, so use an explicit width)
    ind = np.zeros(nnz, dtype=np.int64)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows + 1, dtype=np.int64)
    n = 0  # non-zero counter
    # transfer values; order inside a row does not matter because the
    # indices are sorted once the matrix has been assembled
    for i, d in enumerate(docs):
        for w, c in Counter(d).items():
            ind[n] = idx[w]
            val[n] = c
            n += 1
        ptr[i + 1] = n

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix: its shape and its number
    of stored values. With non_empy set, the summary additionally gives how
    many rows and how many columns hold at least one stored value.
    """
    n_rows, n_cols = mat.shape[0], mat.shape[1]
    stored = len(mat.data)
    if not non_empy:
        print("%s [nrows %d, ncols %d, nnz %d]" % (name, n_rows, n_cols, stored))
        return
    # a row is non-empty when its indptr interval is not empty
    used_rows = 0
    for r in range(n_rows):
        if mat.indptr[r + 1] > mat.indptr[r]:
            used_rows += 1
    # a column is non-empty when its id shows up among the stored indices
    used_cols = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
        name, n_rows, used_rows, n_cols, used_cols, stored))

In [109]:
# Build the term-count matrix for the length-filtered documents and print its size.
mat1 = build_matrix(docs1)
csr_info(mat1)

 [nrows 50000, ncols 431191, nnz 5847209]


In [110]:
# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf (inverse document frequency).

    Every stored value is multiplied by log(nrows / df), where df is the
    number of stored entries found in that value's column.
    If copy is True, the input is left untouched and the scaled copy is
    returned. Otherwise mat is scaled in place and the scaling factors are
    returned as a dict mapping column id -> idf.
    Extra keyword arguments are accepted and ignored.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ind, val = mat.indices, mat.data
    # document frequency: number of stored entries per column
    counts = np.bincount(ind, minlength=mat.shape[1])
    cols = np.flatnonzero(counts)
    # inverse document frequency; columns without entries keep a factor of 0
    # and are never looked up by the scaling step below
    factors = np.zeros(len(counts), dtype=np.double)
    factors[cols] = np.log(nrows / counts[cols].astype(np.double))
    # scale by idf; slice assignment writes through to mat.data
    val[:] = val * factors[ind]

    if copy is True:
        return mat
    idf = defaultdict(int)
    idf.update(zip(cols.tolist(), factors[cols].tolist()))
    return idf

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm.

    Rows whose squared values sum to zero (empty rows included) are left
    unchanged. If copy is True, the input is left untouched and a
    normalized copy is returned; otherwise mat is normalized in place and
    nothing is returned. Extra keyword arguments are accepted and ignored.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    ptr = mat.indptr
    # view on the stored values covered by the row pointers
    stored = mat.data[ptr[0]:ptr[nrows]]
    # row id of every stored value, in storage order
    row_of = np.repeat(np.arange(nrows), np.diff(ptr))
    # sum of squares per row
    sq_sums = np.bincount(row_of, weights=stored * stored, minlength=nrows)
    # reciprocal norm per row; zero-sum rows keep a factor of 1 so that
    # they are not modified
    scale = np.ones(nrows, dtype=np.double)
    nonzero = sq_sums != 0.0
    scale[nonzero] = 1.0 / np.sqrt(sq_sums[nonzero])
    # slice assignment writes through to mat.data
    stored[:] = stored * scale[row_of]

    if copy is True:
        return mat
# Weight the raw counts by idf, then normalize each row by its L-2 norm.
# copy=True makes each call work on (and return) a copy of its input.
mat2 = csr_idf(mat1, copy=True)
mat3 = csr_l2normalize(mat2, copy=True)

In [111]:
%reload_ext autoreload 
%matplotlib inline
import time
from lsh import clsh, jlsh, generateSamples, findNeighborsBrute, recall

In [112]:
# divide training set and test set
# Rows of mat3 follow `lines`, where the training documents come first,
# so the split point is the number of training lines.
n_train = len(train)
X = mat3[:n_train, :]
Y = mat3[n_train:, :]

In [113]:
# compute cosine sim
# mat3 came out of csr_l2normalize, so the dot product of a row of Y with a
# row of X is their cosine similarity; sims[i, j] pairs row i of Y with
# row j of X.
sims = Y.dot(X.T)

In [117]:
# find k neighbours
import operator
k = 100
y_labels = list()
for i in range(sims.shape[0]):
    # dense similarities of row i of sims against every training document
    row = sims.getrow(i).toarray()[0].ravel()
    # argsort is ascending, so the last k positions are the k largest values
    top_indices = row.argsort()[-k:]
    # majority vote among the neighbours actually found; a tie goes to '-1'
    count1 = sum(1 for j in top_indices if labels[j] == '+1')
    if 2 * count1 > len(top_indices):
        y_labels.append('+1')
    else:
        y_labels.append('-1')

print('over!')

over!


In [118]:
# print result to text file
# One predicted label per line, in the order the predictions were made;
# the with-block closes the file even if a write fails.
with open("data/format.dat", "w") as text_file:
    for i in y_labels:
        text_file.write(i + '\n')