In [1]:
import numpy as np
import scipy as sp
%matplotlib inline
import matplotlib.pyplot as plt
from collections import defaultdict
import re
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
with open("train.dat", "r") as fh:
    lines = fh.readlines()  

In [3]:
with open("english", "r") as fh:
    lines22 = fh.readlines() 

stop_words = []
for l in lines22:
    l = l.replace('\n','')
    if len(l)<3:
        continue
    stop_words.append(l.lower())

In [4]:
def clean(raw):
  cleanr = re.compile('<.*?>')
  cleantext = re.sub(cleanr, ' ', raw)
  #cleanr = re.compile('n\'t')
  #cleantext = re.sub(cleanr, ' not', cleantext)
  #cleanr = re.compile('\'ll')
  #cleantext = re.sub(cleanr, ' will', cleantext)
  #cleanr = re.compile('\'s')
  #cleantext = re.sub(cleanr, '', cleantext)
  cleanr = re.compile('[^a-zA-Z0-9]')
  cleantext = re.sub(cleanr, ' ', cleantext)
  for w in stop_words:
    cleanr = re.compile(w)
    cleantext = re.sub(cleanr, '', cleantext)
  cleanr = re.compile(r'\W*\b\w{1,2}\b')
  cleantext = re.sub(cleanr, '', cleantext)
  return cleantext.lower()

In [5]:

def group(inp, n = 2):
    for i in xrange(len(inp) - (n - 1)):
        yield inp[i:i+n]

def group2words(inp):
    comb_2_words = []
    for f, s in group(inp, 2):
        comb = f + " "+s
        comb_2_words.append(comb)
    return comb_2_words

def group3words(inp):
    comb_3_words = []
    for f, s, t in group(inp, 3):
        comb = f + " "+s + " "+t
        comb_3_words.append(comb)
    return comb_3_words

def getKmers(inp):
    kmers=[]
    comb_2_words = group2words(inp)
    for comb in comb_2_words:
        kmers.append(comb)
    comb_3_words = group3words(inp)
    for comb in comb_3_words:
        kmers.append(comb)
    return kmers

In [6]:
labels = [int(l[:2]) for l in lines]
docs = [clean(l[2:]).split() for l in lines]

In [7]:
for d in docs:
        kmers = getKmers(d)
        d.extend(kmers)

In [8]:
length_training = len(docs)
print length_training
print (docs[0])

25000
['although', 'film', 'bruce', 'willis', 'always', 'worth', 'watching', 'better', 'skip', 'one', 'watched', 'one', 'television', 'plunk', 'cash', 'lucky', 'the', 'plot', 'develops', 'slowly', 'slowly', 'although', 'first', 'minutes', 'quite', 'believable', 'gets', 'unbelievable', 'towards', 'end', 'highly', 'questionable', 'seasoned', 'soldier', 'like', 'waters', 'would', 'disobey', 'direct', 'orders', 'and', 'even', 'would', 'rest', 'plan', 'would', 'they', 'puts', 'direct', 'danger', 'certly', 'die', 'follow', 'heck', 'let', 'says', 'despite', 'direct', 'orders', 'remember', 'still', 'nice', 'scenes', 'movie', 'they', 'save', 'village', 'total', 'population', 'massacred', 'rebels', 'well', 'save', 'dozen', 'villagers', 'rest', 'already', 'killed', 'the', 'strange', 'part', 'take', 'trucks', 'rebels', 'left', 'behind', 'they', 'rat', 'foot', 'maybe', 'roads', 'unsafe', 'explanation', 'anyway', 'think', 'earned', 'movie', 'one', 'point', 'gave', 'what', 'made', 'movie', 'insult', 

In [9]:
# open docs file and read its lines
with open("test.dat", "r") as fh:
    lines = fh.readlines() 

docs_test = [clean(l).split() for l in lines]

for d in docs_test:
        kmers = getKmers(d)
        d.extend(kmers)

In [10]:
length_testing = len(docs_test)
print length_testing

25000


In [11]:
docs.extend(docs_test)

In [12]:
from collections import Counter
from scipy.sparse import csr_matrix

def build_matrix(docs):
    r""" Build sparse matrix from a list of documents,
    each of which is a list of word/terms in the document.
    """
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    # Remove all ratings
    for d in docs:
        #d = d[1:]
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)

    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        #d = d[1:]
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat


# scale matrix and normalize its rows
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf. 
    Returns scaling factors as dict. If copy is True, 
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat

In [13]:
csr_mat = build_matrix(docs)
mat1 = csr_idf(csr_mat, copy=True)
mat = csr_l2normalize(mat1, copy=True)

In [14]:
cos_sim_sparse = cosine_similarity(mat,dense_output=False)

In [24]:
results = []
k=1799
zero_counts = 0
possible_range = [-1,+1]
for i in range(length_training, length_training+length_testing):
    if i%1000==1:
        print('Staring ' + str(i-25000))
    sim_1 = cos_sim_sparse[i, :length_training]
    sim_arr = sim_1.toarray()
    sim_list = sim_arr.tolist()
    sim = sim_list[0]
    r = range(len(labels))
    sim_labels = zip(sim, labels, r)
    sim_sort_labels = sorted(sim_labels, key=lambda (v, k, l): v, reverse=True)
    sum = 0
    for j in range(k):
        if sim_sort_labels[j][0] != 0:
            sum = sum + int(sim_sort_labels[j][1])
        if sum == 0:
            sum = np.random.choice(possible_range, 1, p=[0.3, 0.7])[0]
    if sum > 0:
        results.append(1)
    else:
        results.append(-1)


Staring 1
Staring 1001
Staring 2001
Staring 3001
Staring 4001
Staring 5001
Staring 6001
Staring 7001
Staring 8001
Staring 9001
Staring 10001
Staring 11001
Staring 12001
Staring 13001
Staring 14001
Staring 15001
Staring 16001
Staring 17001
Staring 18001
Staring 19001
Staring 20001
Staring 21001
Staring 22001
Staring 23001
Staring 24001


In [25]:
l = len(results)
print(l)
print(results[0])
print(results[l-1])

25000
1
-1


In [26]:
op_file = open('output_k_1799.dat', 'w')
for r in results:
    if r == 1:
        op_file.write("+1\n")
    else:
        op_file.write("-1\n")
op_file.close()