In [None]:
# Environment setup via IPython shell escapes: install PySpark, PyDrive and a
# headless Java 8 JDK. NOTE(review): PyDrive is not imported by any cell visible
# in this notebook, and none of the packages are version-pinned.
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
# Export JAVA_HOME for the JDK installed above.
# Presumably this path matches where the apt package lands -- TODO confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating
import math 
import datetime
import sys
import random
import itertools as it

In [None]:
# Obtain the SparkContext (reusing an existing one if present). Despite its
# name, `lsh` is the context object itself; later cells call lsh.textFile() on it.
lsh = SparkContext.getOrCreate()

Map each company's user ids to their integer row numbers

In [None]:
def UsertoRowMapping(company, dictionary_users):
    """Translate the user ids attached to one company into row numbers.

    Args:
        company: a pair ``(company_id, user_ids)`` where ``user_ids`` is an
            iterable of keys present in ``dictionary_users``.
        dictionary_users: mapping from user id to its row number.

    Returns:
        ``(company_id, row_numbers)`` with ``row_numbers`` a list in the same
        order as ``user_ids``.

    Raises:
        KeyError: if a user id is missing from ``dictionary_users``.
    """
    reviewers = company[1]
    row_numbers = [dictionary_users[reviewer] for reviewer in reviewers]
    return (company[0], row_numbers)

Function to collect parameters of all hash functions

In [None]:
def HashParameters(no_of_hash_fns, max_row_number):
    """Draw random coefficients for a family of linear hash functions.

    Each entry describes one function of the form ``(a * x + b) % m`` (this is
    how GenerateHashFunctions applies them).

    Args:
        no_of_hash_fns: number of hash functions to generate.
        max_row_number: modulus ``m`` stored with every parameter triple.

    Returns:
        A list of ``[a, b, max_row_number]`` lists, one per hash function.
        Values come from the global ``random`` generator, so seed it beforehand
        for reproducible output.
    """
    all_parameters = []
    for _ in range(no_of_hash_fns):
        # a must be non-zero: with a == 0 the hash collapses to the constant b,
        # so every input would receive the same min-hash value.
        a = random.randint(1, 1000)
        b = random.randint(0, 1000)
        all_parameters.append([a, b, max_row_number])
    return all_parameters

Generate min-hash signatures using the parameters generated by the function above

In [None]:
def GenerateHashFunctions(company, hash_params, max_row_number):
    """Build the min-hash signature of one company.

    For every parameter entry ``[a, b, ...]`` in ``hash_params`` each row
    number ``y`` of the company is hashed as ``(a * y + b) % max_row_number``
    and the smallest hashed value is kept.

    Args:
        company: pair ``(company_id, row_numbers)``.
        hash_params: sequence of parameter entries; only positions 0 and 1
            (``a`` and ``b``) are used, the modulus comes from
            ``max_row_number``.
        max_row_number: modulus applied to every hash value.

    Returns:
        ``(company_id, signature)`` where ``signature`` has one minimum per
        entry of ``hash_params``.

    Raises:
        ValueError: if ``row_numbers`` is empty (``min`` of an empty sequence).
    """
    def smallest_hashed_row(coeffs):
        slope, offset = coeffs[0], coeffs[1]
        return min(((slope * row) + offset) % max_row_number for row in company[1])

    signature = [smallest_hashed_row(coeffs) for coeffs in hash_params]
    return (company[0], signature)

In [None]:
def CreateBands(business, rows, n):
    """Split a signature into consecutive bands of ``rows`` values each.

    Args:
        business: pair ``(business_id, signature)`` where ``signature`` is a
            sliceable sequence.
        rows: number of signature entries per band; must be positive.
        n: signature length to cover; bands start at offsets 0, rows, 2*rows,
            ... below ``n``.

    Returns:
        ``(business_id, signature, bands)`` where ``bands`` is a list of
        slices of ``signature``. The last band is shorter when ``n`` is not a
        multiple of ``rows``.

    Raises:
        ValueError: if ``rows`` is not positive (the previous loop never
            terminated in that case).
    """
    if rows <= 0:
        raise ValueError("rows must be a positive integer, got %r" % (rows,))
    business_id = business[0]
    signature = business[1]
    band_list = [signature[start:start + rows] for start in range(0, n, rows)]
    return (business_id, signature, band_list)

In [None]:
def BandManipulation(business):
    """Re-key a banded record by the contents of its band.

    Args:
        business: triple ``(business_id, signature, band)``.

    Returns:
        ``(band_as_tuple, (business_id, signature))``; the band is converted
        to a tuple so it can serve as a hashable grouping key.
    """
    band_key = tuple(business[2])
    payload = (business[0], business[1])
    return (band_key, payload)

In [None]:
def RemoveSigMat(candidate):
    """Keep only the business ids of a candidate bucket.

    Args:
        candidate: pair ``(key, members)`` where each member is a sequence
            whose first element is a business id.

    Returns:
        List of the members' business ids, in member order.
    """
    return [member[0] for member in candidate[1]]


In [None]:
def Transform(band, business_dict):
    """Attach the stored data of each business to the pairs of a bucket.

    Args:
        band: sequence of business ids that share a bucket.
        business_dict: mapping from business id to its data.

    Returns:
        * exactly two ids: a single pair
          ``((id1, data1), (id2, data2))`` (a tuple);
        * any other length: a list with one such pair per 2-combination of
          ``band`` (empty for fewer than two ids).
        Callers must handle both shapes.

    Raises:
        KeyError: if an id is missing from ``business_dict``.
    """
    def with_data(business_id):
        return (business_id, business_dict[business_id])

    if len(band) != 2:
        return [
            (with_data(first), with_data(second))
            for first, second in it.combinations(band, 2)
        ]
    return (with_data(band[0]), with_data(band[1]))

In [None]:
# Read the training CSV as an RDD of raw text lines.
train_RDD = lsh.textFile('/content/drive/MyDrive/yelp_sample/yelp_train.csv')
# The first line is taken to be the header row.
header = train_RDD.first()
# Drops every line equal to the header text, not only the first one.
train_RDD = train_RDD.filter(lambda x: x!= header) #remove header
# Plain comma split: quoted fields containing commas are not handled.
train_initial = train_RDD.map(lambda x:x.split(','))
# (column 0, column 1) pairs. Judging by the variable names, column 0 is
# presumably the user id and column 1 the business id -- confirm against the file.
user_as_key = train_initial.map(lambda x:(x[0],x[1]))
# Same two columns with the key/value roles swapped.
business_as_key = train_initial.map(lambda x:(x[1],x[0]))

In [None]:
# Group the values of user_as_key under each key.
user_combos = user_as_key.groupByKey()
# Pull the grouped result to the driver; `user_combos` is rebound from an RDD
# to a plain list here. The grouped values are still lazy ResultIterable objects.
user_combos = user_combos.collect()

In [None]:
# Sanity check: number of grouped keys, then the first three entries.
print(len(user_combos))
user_combos[:3]

11270


[('o0p-iTC5yTBV5Yab_7es4g',
  <pyspark.resultiterable.ResultIterable at 0x7f7814f00c18>),
 ('-qj9ouN0bzMXz1vfEslG-A',
  <pyspark.resultiterable.ResultIterable at 0x7f7814f611d0>),
 ('4bQqil4770ey8GfhBgEGuw',
  <pyspark.resultiterable.ResultIterable at 0x7f7814f61588>)]

In [None]:
# Rebuild the grouping as an RDD (rebinding `user_combos` again) and materialise
# each group's values into a list so they display readably.
user_combos = user_as_key.groupByKey().map(lambda x : (x[0], list(x[1])))
user_combos.take(1)

[('o0p-iTC5yTBV5Yab_7es4g',
  ['iAuOpYDfOTuzQ6OPpEiGwA',
   'e_Nf4zAA1KZq80aoy3v8Ng',
   'qFZ7ReYNpjHuvSxOqXqHMg',
   'LwNZ1AR4_5iukgQYuhmTOg',
   'ZWcXoh_RO7rT2DdcyOzyfQ',
   'BORPULADzZV6_LJZzcsywQ',
   '3cHL1r5w2oLYH-veijtweA',
   'XXW_OFaYQkkGOGniujZFHg',
   'G0EJjh-dqJBmSEhQRog-hQ',
   'lfXfxBms5z1nwzkxxLFBWg',
   'i60v66vwf1A13YqcIqGD_A',
   'A37JfEzlKQygyzA-5p2dWw',
   'nVAJZ6BJ9PPlxDXn976R6A',
   'v3AS5LGeV2Si4nOHZ7lgxQ',
   'FaHADZARwnY4yvlvpnsfGA',
   'fQt9QVAZM9PM0wFDdROvWw',
   'pzUm_jLzVa9JAf4sC6tPLg',
   'mLjDh0vQYBCY6g9T1HczVw',
   'meXjqyhTNLFmknY39y2sMg',
   'ZwKUXfZguhBD1zsfb0RLPg',
   'SLJX2dksO_BeNMQUe3Nr8Q'])]

In [None]:
# Attach a running index to every element: ((key, values), index).
# NOTE: this cell rebinds `user_combos` from its own previous value, so running
# it twice nests the index a second time; re-run the preceding cell first.
user_combos = user_combos.zipWithIndex()
user_combos.take(1)

[(('o0p-iTC5yTBV5Yab_7es4g',
   ['iAuOpYDfOTuzQ6OPpEiGwA',
    'e_Nf4zAA1KZq80aoy3v8Ng',
    'qFZ7ReYNpjHuvSxOqXqHMg',
    'LwNZ1AR4_5iukgQYuhmTOg',
    'ZWcXoh_RO7rT2DdcyOzyfQ',
    'BORPULADzZV6_LJZzcsywQ',
    '3cHL1r5w2oLYH-veijtweA',
    'XXW_OFaYQkkGOGniujZFHg',
    'G0EJjh-dqJBmSEhQRog-hQ',
    'lfXfxBms5z1nwzkxxLFBWg',
    'i60v66vwf1A13YqcIqGD_A',
    'A37JfEzlKQygyzA-5p2dWw',
    'nVAJZ6BJ9PPlxDXn976R6A',
    'v3AS5LGeV2Si4nOHZ7lgxQ',
    'FaHADZARwnY4yvlvpnsfGA',
    'fQt9QVAZM9PM0wFDdROvWw',
    'pzUm_jLzVa9JAf4sC6tPLg',
    'mLjDh0vQYBCY6g9T1HczVw',
    'meXjqyhTNLFmknY39y2sMg',
    'ZwKUXfZguhBD1zsfb0RLPg',
    'SLJX2dksO_BeNMQUe3Nr8Q']),
  0)]

In [None]:
# Full pipeline rebuilt from user_as_key: group by key, index each group with
# zipWithIndex, then keep only (key, index) -- the index serves as the row number.
user_combos = user_as_key.groupByKey().map(lambda x : (x[0], list(x[1]))).zipWithIndex().map(lambda x: (x[0][0],x[1]))
# Collect the (key, row_number) pairs to the driver as a list.
users_row_numbers = user_combos.collect()
users_row_numbers[:10]

[('o0p-iTC5yTBV5Yab_7es4g', 0),
 ('-qj9ouN0bzMXz1vfEslG-A', 1),
 ('4bQqil4770ey8GfhBgEGuw', 2),
 ('zsZBYWYEmLLs81_f-HHM8w', 3),
 ('Fv0e9RIV9jw5TX3ctA1WbA', 4),
 ('z4RytucxI_XfcMFaEI2DRg', 5),
 ('BeoFmNoFuz-h8uso-J2_lg', 6),
 ('pMefTWo6gMdx8WhYSA2u3w', 7),
 ('Ez8ifxrtOUT5TM4817scOQ', 8),
 ('UYcmGbelzRa0Q6JqzLoguw', 9)]

In [None]:
# Lookup table from user_id to its integer row number, built from the
# (user_id, row_number) pairs collected above.
user_dict = {user_id: row_number for user_id, row_number in users_row_numbers}

In [None]:
# Group business_as_key by key and materialise each group's values as a list.
business_combos = business_as_key.groupByKey().map(lambda x : (x[0], list(x[1])))
business_combos.take(1)

[('3MntE_HWbNNoyiLGxywjYA',
  ['T13IBpJITI32a1k41rc-tg',
   'xhlcoVm3FOKcxZ0phkdO6Q',
   '4o0KkpAkyO6r0NHXmobTeQ',
   'TQXtrSpsUyvHMriX8hvNWQ',
   'kjaUSiRWhR9bF9KxOMbVvg'])]

In [None]:
# Replace every id in each business's list with its row number from user_dict.
# user_dict is a driver-side dict captured by the lambda's closure.
business_combos_with_rows = business_combos.map(lambda x: UsertoRowMapping(x,user_dict))
# Collect to the driver; the result is used to build business_dict below.
bus_comb = business_combos_with_rows.collect()
print(bus_comb[:5])

[('3MntE_HWbNNoyiLGxywjYA', [5628, 791, 722, 7074, 2988]), ('YXohNvMTCmGhFMSQsDZq1g', [2, 25, 6283, 1514, 7139, 2350, 8823, 3339, 5723, 282, 9706, 7950, 10051, 3200, 975, 2834, 2841, 1069, 6774, 5104, 490, 1391, 4588, 2435, 7577, 2187, 1075, 508, 4442, 9550, 9, 3472, 587, 3495, 3153, 5645, 8596, 7783, 3848, 9375, 51, 7047, 1843, 3734, 601, 153, 1279, 303, 265, 910, 5647, 1794, 11212, 8899, 3820, 424, 10752, 6276, 643, 1232, 437, 3127, 1587, 10390, 6224, 756, 7092, 3888, 1669, 6425, 9438, 7342, 4747, 1660, 3390, 7061, 6072, 8736, 1431, 1735, 6523, 2536, 5740, 2332, 6615, 1390, 1050, 6477, 7153, 1137, 6501, 308, 10472, 10882, 4, 8827, 2824, 3666, 10730, 8564, 8029, 2936, 5948, 3171, 7892, 1944, 403, 2179, 3694, 373, 7703, 5737, 3255, 5286, 3559, 2963, 9919, 8328, 11060, 7564, 4602, 1116, 3733, 6894, 5247, 6327, 2679, 3374, 6460, 8550, 544, 6443, 8927, 345, 10243, 1698, 512, 4374, 7148, 3649, 6314, 5932, 4763, 646, 8558, 5940, 7903, 6458, 1293, 1751, 3131, 2438, 1292, 2568, 5255, 4733, 10

In [None]:
# Lookup table from business id to its list of row numbers, built from the
# collected (business_id, row_numbers) pairs.
business_dict = {business_id: row_numbers for business_id, row_numbers in bus_comb}

In [None]:
# LSH configuration.
# Seed the global generator so HashParameters draws the same coefficients on
# every run; without it the signatures (and candidate pairs) change per run.
random.seed(42)

max_row_number = len(users_row_numbers)  # modulus for the hash functions
n = 60      # number of hash functions = signature length
bands = 20  # number of bands the signature is split into
rows = 3    # signature entries per band
# The banding below slices the signature in steps of `rows`; keep the three
# settings consistent so every band has exactly `rows` entries.
assert n == bands * rows, "n must equal bands * rows"
hparams = HashParameters(n, max_row_number)

In [None]:
# Compute one min-hash value per entry of hparams for every business, giving
# (business_id, signature) records. hparams is captured by the lambda's closure.
sig_mat = business_combos_with_rows.map(lambda x: GenerateHashFunctions(x, hparams,max_row_number))
print(sig_mat.take(1))

[('3MntE_HWbNNoyiLGxywjYA', [1784, 2927, 2558, 113, 5753, 2482, 4844, 5301, 2549, 1059, 171, 3110, 1223, 2632, 2796, 3967, 1664, 2243, 161, 2135, 2016, 1205, 2057, 5290, 4046, 5636, 4094, 1561, 1238, 4558, 173, 164, 264, 3822, 2123, 1158, 363, 42, 1115, 2043, 5918, 2695, 909, 4538, 3364, 1061, 1025, 432, 1366, 2369, 1700, 132, 743, 696, 580, 3778, 1524, 4257, 1248, 2189])]


In [None]:
# Split each signature into bands of `rows` entries, giving
# (business_id, signature, list_of_bands) records.
banded_sig_mat = sig_mat.map(lambda x: CreateBands(x,rows,n))
print(banded_sig_mat.take(1))

[('3MntE_HWbNNoyiLGxywjYA', [1784, 2927, 2558, 113, 5753, 2482, 4844, 5301, 2549, 1059, 171, 3110, 1223, 2632, 2796, 3967, 1664, 2243, 161, 2135, 2016, 1205, 2057, 5290, 4046, 5636, 4094, 1561, 1238, 4558, 173, 164, 264, 3822, 2123, 1158, 363, 42, 1115, 2043, 5918, 2695, 909, 4538, 3364, 1061, 1025, 432, 1366, 2369, 1700, 132, 743, 696, 580, 3778, 1524, 4257, 1248, 2189], [[1784, 2927, 2558], [113, 5753, 2482], [4844, 5301, 2549], [1059, 171, 3110], [1223, 2632, 2796], [3967, 1664, 2243], [161, 2135, 2016], [1205, 2057, 5290], [4046, 5636, 4094], [1561, 1238, 4558], [173, 164, 264], [3822, 2123, 1158], [363, 42, 1115], [2043, 5918, 2695], [909, 4538, 3364], [1061, 1025, 432], [1366, 2369, 1700], [132, 743, 696], [580, 3778, 1524], [4257, 1248, 2189]])]


In [None]:
# For every band: key each business by the contents of that band, group
# businesses that share identical band values, and keep only buckets holding at
# least two businesses (the candidate groups for that band).
b1_to_b20_amended = []

for i in range(bands):
    # Bind the band index as a default argument so the lambda does not depend
    # on the value of the loop variable at the moment Spark serialises it.
    single_band = banded_sig_mat.map(
        lambda x, band_index=i: (x[0], x[1], x[2][band_index])
    )
    band_buckets = (
        single_band
        .map(BandManipulation)
        .groupByKey()
        .map(lambda x: (x[0], list(x[1])))
        .filter(lambda x: len(x[1]) >= 2)
    )
    b1_to_b20_amended.append(band_buckets)

# One RDD with the candidate buckets of all bands, in band order.
candidates = lsh.union(b1_to_b20_amended)

In [None]:
# Reduce every bucket to the list of business ids it contains.
candidates_cleaned = candidates.map(lambda x: RemoveSigMat(x))
# Expand each bucket into pair(s) of (business_id, row_numbers) using the
# driver-side business_dict captured by the lambda's closure.
candidates_original = candidates_cleaned.map(lambda x: Transform(x,business_dict)) #add original user_id
# Collect to the driver. Each element is either a single pair (tuple) or a list
# of pairs, mirroring Transform's two return shapes.
candidate_sim = candidates_original.collect()

In [None]:
# Exact Jaccard similarity for every candidate pair:
# |rows_a & rows_b| / |rows_a | rows_b|.
jac_sim = []
for cand in candidate_sim:
    # A two-member bucket arrives as one bare pair (tuple); larger buckets
    # arrive as a list of pairs. Normalise to "iterable of pairs".
    pair_list = [cand] if isinstance(cand, tuple) else cand
    for (id_a, rows_a), (id_b, rows_b) in pair_list:
        set_a = set(rows_a)
        set_b = set(rows_b)
        jac = len(set_a & set_b) / float(len(set_a | set_b))
        jac_sim.append(((id_a, id_b), jac))

In [None]:
# Keep pairs whose Jaccard similarity is at least 0.5. Each pair is stored once
# under a key ordered as (smaller_id, larger_id); the first similarity seen for
# a key wins.
similar = {}
for (a, b), jaccard in jac_sim:
    if jaccard >= 0.5:
        ordered_key = (a, b) if a < b else (b, a)
        similar.setdefault(ordered_key, jaccard)


In [None]:
# Flatten the dict into ((id_1, id_2), similarity) entries and sort them by the
# id pair (first id, then second id).
similar_pairs2 = list(similar.items())
sorted_similar_pair2 = sorted(similar_pairs2, key=lambda entry: entry[0])

In [None]:
# Show the last ten entries of the sorted result.
sorted_similar_pair2[-10:]

[(('uhRh3jq8blzxZLJH3xks_Q', 'vwpsor_zVUTJJkp301Dpog'), 0.5),
 (('uwO1mCtTOyzQN96eRtfk_g', 'y0maJiW69rMQRbrfXtmssw'), 0.5),
 (('uzpdcxvlKx5BuoWU3NRHbA', 'xeN57OX3lJfY96nfEx63fg'), 1.0),
 (('vCopginiBsz1STNhQPb8XA', 'yUB8CDsYMES3cN-xcHXCkQ'), 0.5555555555555556),
 (('vVdY2qDO7dkUrU8LXYZ1uA', 'zrtRx2bm55zVeYIpzLLm3w'), 0.6666666666666666),
 (('vadTcg3fvqiXj0Bh9Lv3zg', 'wvV-mcM-Djgv6Z29rnwlVg'), 0.5),
 (('w9CWjQW-tiAdWSPBzmXLvA', 'xD0qY7CyI2PCcr0lvVp9-A'), 0.5),
 (('wfsCX4M-uf_l-hQA1sSWyQ', 'yFKNTMyCQZ92M91A_yavhw'), 0.5),
 (('wpDJUu8Ty_eyQJENUy7fIw', 'z3hD4GX5gSJtuqNhN-o2cg'), 0.5),
 (('wtE6O9u1ni81T_TW27uYgQ', 'zi0HXFQuzMeozBwQ4iWs7w'), 0.5)]

In [None]:
# Write the similar pairs as comma-separated text: a header line followed by
# one "id_1,id_2,similarity" line per pair, in sorted order.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('/content/drive/MyDrive/yelp_sample/lsh_results','w') as outfile:
    outfile.write('business_id_1,business_id_2,similarity\n')
    for (business_id_1, business_id_2), similarity in sorted_similar_pair2:
        fields = (str(business_id_1), str(business_id_2), str(similarity))
        outfile.write(",".join(fields) + "\n")