In [1]:
from collections import Counter
import math
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import sys
import pdb
import scipy
import itertools
import math

In [2]:
movies_filename = "movies.csv"
rating_train_filename = "ratings_train.csv"
rating_test_filename = "ratings_test.csv"
rating_test_truth_filename = "ratings_test_truth.csv"

In [3]:
ratings_train = pd.read_csv(rating_train_filename)
ratings_test = pd.read_csv(rating_test_filename)
ratings_test_truth = pd.read_csv(rating_test_truth_filename)
movies = pd.read_csv(movies_filename)

In [10]:
def get_shingle_hash(hash_size, filename):
    shingles = []
    movieDict = {}
    movieDictInv = {}
    
    with open(filename, 'r') as f:
        next(f)
        i = 0
        for line in f:
            if line[-1] == '\n':
                line = line[0:-1]
                print(line)
            lineRead = [t for t in line[:-1].split(',') if  t != '']
            print(lineRead)
            tokens = [g for g in lineRead[-1].split('|') if g != '']
            print(tokens)
            movieDict[int(lineRead[0])] = i
            print(movieDict)
            movieDictInv[i] = int(lineRead[0])
            print(movieDictInv)
            shingle = set()
            for ngram in tokens:
                ngram = frozenset(ngram)
                ngram_hash = hash(ngram) % hash_size
                shingle.add(ngram_hash)
            print(shingle)
            shingles.append(shingle)
            i += 1
        
        # return None to keep the same signature as q1
    return shingles, movieDict, movieDictInv

In [5]:
def get_hash_coeffs(br):
    rnds = np.random.choice(2**10, (2, br), replace=False)
    c = 1048583
    return rnds[0], rnds[1], c

In [6]:
def min_hashing(shingles, hash_coeffs, br):
    count = len(shingles)
    (a, b, c) = hash_coeffs
    a = a.reshape(1, -1)
    M = np.zeros((br, count), dtype=int) 
    for i, s in enumerate(shingles):
        
        row_idx = np.asarray(list(s)).reshape(-1, 1)
        m = (np.matmul(row_idx, a) + b) % c
        m_min = np.min(m, axis=0) 
        M[:, i] = m_min

    return M

In [7]:
def LSH(M, b, r, band_hash_size):
    count = M.shape[1]
    bucket_list = []
    for band_index in range(b):
        # The hash table for each band is stored as a sparse matrix! Learn basics about sparse matrix here: https://docs.scipy.org/doc/scipy/reference/sparse.html
        # However, I did a few benchmark, lil_matrix, dok_matrix are claimed to be efficient for incremental consturction, they are not as efficient as store indices of non-zero entries in a list
        # So instead of having a matrix, just two arrays to store the indices
        # But we need to image there is a matrix. Its layout same as slide 56. Cols are documents, rows are hash of signature index
        row_idx = []
        col_idx = []

        row_start = band_index * r
        for c in range(count):
            v = M[row_start:(row_start+r), c]
            v_hash = hash(tuple(v.tolist())) % band_hash_size
            row_idx.append(v_hash)
            col_idx.append(c)

        # It's a binary matrix. Set to True at these indices.
        data_ary = [True] * len(row_idx)

        # Convert to row based sparse matrix for fast processing later
        m = scipy.sparse.csr_matrix((data_ary, (row_idx, col_idx)), shape=(band_hash_size, count), dtype=bool)
        bucket_list.append(m)

    return bucket_list

In [8]:
def find_similiar(shingles, query_index, threshold, bucket_list, M, b, r, band_hash_size, verify_by_signature, movieDictInv):
    # Step 1: Find candidates
    candidates = set()
    for band_index in range(b):
        row_start = band_index * r
        v = M[row_start:(row_start+r), query_index]
        v_hash = hash(tuple(v.tolist())) % band_hash_size

        m = bucket_list[band_index]
        bucket = m[v_hash].indices #Sparse sparse matrix method: get indices of nonzero elements
        #print(f'Band: {band_index}, candidates: {bucket}')
        candidates = candidates.union(bucket)

   # print(f'Found {len(candidates)} candidates')

    # Step 2: Verify similarity of candidates
    sims = {}
    simsSet = set()
    # Since the candidates size is small, we just evaluate it on k-shingles matrix, or signature matrix for greater efficiency
    if verify_by_signature:
        query_vec = M[:, query_index]
        for col_idx in candidates:
            col = M[:, col_idx]
            sim = np.mean(col == query_vec) # Jaccard Similarity is proportional to the fraction of the minhashing signature they agree
            if sim >= threshold:
                sims[movieDictInv[col_idx]] = sim
                simsSet.add(movieDictInv[col_idx])
                
    else:
        query_set = shingles[query_index]
        for col_idx in candidates:
            col_set = shingles[col_idx]

            sim = len(query_set & col_set) / len(query_set | col_set) # Jaccard Similarity
            if sim >= threshold:
                sims[movieDictInv[col_idx]] = sim
                simsSet.add(movieDictInv[col_idx])

    return sims, simsSet

In [8]:
threshold = 0.8
hash_size = 2**20
band_hash_size = 2**8
verify_by_signature = False

b = 7
r = 2
br = b*r

In [11]:
shingles, movieDict, movieDictInv  = get_shingle_hash(hash_size, movies_filename)
hash_coeffs = get_hash_coeffs(br)
M = min_hashing(shingles, hash_coeffs, br)
bucket_list = LSH(M, b, r, band_hash_size)


1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantas']
['Adventure', 'Animation', 'Children', 'Comedy', 'Fantas']
{1: 0}
{0: 1}
{245537, 719907, 524070, 403956, 604986}
2,Jumanji (1995),Adventure|Children|Fantasy
['2', 'Jumanji (1995)', 'Adventure|Children|Fantas']
['Adventure', 'Children', 'Fantas']
{1: 0, 2: 1}
{0: 1, 1: 2}
{245537, 604986, 403956}
3,Grumpier Old Men (1995),Comedy|Romance
['3', 'Grumpier Old Men (1995)', 'Comedy|Romanc']
['Comedy', 'Romanc']
{1: 0, 2: 1, 3: 2}
{0: 1, 1: 2, 2: 3}
{835091, 719907}
4,Waiting to Exhale (1995),Comedy|Drama|Romance
['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romanc']
['Comedy', 'Drama', 'Romanc']
{1: 0, 2: 1, 3: 2, 4: 3}
{0: 1, 1: 2, 2: 3, 3: 4}
{835091, 719907, 378229}
5,Father of the Bride Part II (1995),Comedy
['5', 'Father of the Bride Part II (1995)', 'Comed']
['Comed']
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
{0: 1, 1: 2, 2: 3, 3: 4, 4: 5}
{523707}
6,Heat (1995

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 34: 32, 36: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 45: 41, 46: 42, 47: 43, 48: 44, 49: 45, 50: 46, 52: 47, 53: 48, 54: 49, 55: 50, 57: 51, 58: 52, 60: 53, 61: 54, 62: 55, 63: 56, 64: 57, 65: 58, 66: 59, 68: 60, 69: 61, 70: 62, 71: 63, 72: 64, 73: 65, 74: 66, 75: 67, 76: 68, 77: 69, 78: 70, 79: 71, 80: 72, 81: 73, 82: 74, 83: 75, 85: 76, 86: 77, 87: 78, 88: 79, 89: 80, 92: 81, 93: 82, 94: 83, 95: 84, 96: 85, 97: 86, 99: 87, 100: 88, 101: 89, 102: 90, 103: 91, 104: 92, 105: 93, 106: 94, 107: 95, 108: 96, 110: 97, 111: 98, 112: 99, 113: 100, 116: 101, 117: 102, 118: 103, 119: 104, 121: 105, 122: 106, 123: 107, 125: 108, 126: 109, 128: 110, 129: 111, 132: 112, 135: 113, 137: 114, 140: 115, 141: 116, 144: 117, 145: 118, 146: 119, 147: 1

361,It Could Happen to You (1994),Comedy|Drama|Romance
['361', 'It Could Happen to You (1994)', 'Comedy|Drama|Romanc']
['Comedy', 'Drama', 'Romanc']
{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 34: 32, 36: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 45: 41, 46: 42, 47: 43, 48: 44, 49: 45, 50: 46, 52: 47, 53: 48, 54: 49, 55: 50, 57: 51, 58: 52, 60: 53, 61: 54, 62: 55, 63: 56, 64: 57, 65: 58, 66: 59, 68: 60, 69: 61, 70: 62, 71: 63, 72: 64, 73: 65, 74: 66, 75: 67, 76: 68, 77: 69, 78: 70, 79: 71, 80: 72, 81: 73, 82: 74, 83: 75, 85: 76, 86: 77, 87: 78, 88: 79, 89: 80, 92: 81, 93: 82, 94: 83, 95: 84, 96: 85, 97: 86, 99: 87, 100: 88, 101: 89, 102: 90, 103: 91, 104: 92, 105: 93, 106: 94, 107: 95, 108: 96, 110: 97, 111: 98, 112: 99, 113: 100, 116: 101, 117: 102, 118: 103, 119: 104, 121: 10

{0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15, 15: 16, 16: 17, 17: 18, 18: 19, 19: 20, 20: 21, 21: 22, 22: 23, 23: 24, 24: 25, 25: 26, 26: 27, 27: 28, 28: 29, 29: 30, 30: 31, 31: 32, 32: 34, 33: 36, 34: 38, 35: 39, 36: 40, 37: 41, 38: 42, 39: 43, 40: 44, 41: 45, 42: 46, 43: 47, 44: 48, 45: 49, 46: 50, 47: 52, 48: 53, 49: 54, 50: 55, 51: 57, 52: 58, 53: 60, 54: 61, 55: 62, 56: 63, 57: 64, 58: 65, 59: 66, 60: 68, 61: 69, 62: 70, 63: 71, 64: 72, 65: 73, 66: 74, 67: 75, 68: 76, 69: 77, 70: 78, 71: 79, 72: 80, 73: 81, 74: 82, 75: 83, 76: 85, 77: 86, 78: 87, 79: 88, 80: 89, 81: 92, 82: 93, 83: 94, 84: 95, 85: 96, 86: 97, 87: 99, 88: 100, 89: 101, 90: 102, 91: 103, 92: 104, 93: 105, 94: 106, 95: 107, 96: 108, 97: 110, 98: 111, 99: 112, 100: 113, 101: 116, 102: 117, 103: 118, 104: 119, 105: 121, 106: 122, 107: 123, 108: 125, 109: 126, 110: 128, 111: 129, 112: 132, 113: 135, 114: 137, 115: 140, 116: 141, 117: 144, 118: 145, 119: 146, 120: 1

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 827: character maps to <undefined>

In [12]:
result = []
for index, row in ratings_test.iterrows():
    mlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['movieId'])
    mrlist = list(ratings_train.loc[ratings_train['userId'] == row['userId']]['rating'])
    mlistSet = set(mlist)
    movieID = row['movieId']
    print(movieID)
    movieIDIdx = movieDict[movieID]
    meanRatingByUser = np.mean(mrlist)    
    sims, simsSet = find_similiar(shingles, movieIDIdx, threshold, bucket_list, M, b, r, band_hash_size, verify_by_signature, movieDictInv)
    
    intersection = mlistSet.intersection(simsSet)
    
    if len(intersection)==0:
        result.append(meanRatingByUser)
    
    else:
        ratingCommon = 0
        for i in range(len(mlist)):
            if mlist[i] in intersection:
                ratingCommon += (mrlist[i]*sims[mlist[i]])
                
        ratingCommon = ratingCommon/len(intersection)
        if ratingCommon < 1.0:
            ratingCommon = 1.0
        elif ratingCommon > 5.0:
            ratingCommon = 5.0
        result.append(ratingCommon)
        



        
        

    

1


NameError: name 'movieDict' is not defined

In [208]:
from sklearn.metrics import mean_squared_error

rmseError = np.sqrt(mean_squared_error(ratings_test_truth['rating'],result))

print(rmseError)



1.0091969002719883


In [None]:
0.9968427290744207 7 , 2
0.9969012605414767 7,2, 0.84
1.0091969002719883