In [1]:
import os
import sys
from pyspark import SparkContext, SparkConf
import json
import itertools
import math
import numpy as np
import time
from itertools import combinations
import random

In [2]:
appName = 'assignment3'
master = 'local[*]'
conf = SparkConf().setAppName(appName).setMaster(master)
sc = SparkContext(conf=conf)
sc.setLogLevel("INFO")

sc

In [34]:

def lsh(train_data_path):
    # Total number of users 11270
    p = 13591
    m = 11270
    number_of_hashes = 128
    a = random.sample(range(1, p), number_of_hashes)
    b = random.sample(range(0, p), number_of_hashes)


    def read_csv_line(line):
        line = line.split(',')
        return (line[0].strip(), line[1].strip(), line[2].strip())

    def read_business(data_path):
        rdd = sc.textFile(data_path)
        header = rdd.first()
        rdd = rdd.filter(lambda x: x != header).map(read_csv_line)
        unique_users = rdd.map(lambda x: x[0]).distinct().collect()    
        userid_index = {}
        for i, uid in enumerate(unique_users):
            userid_index[uid] = i

        business_users = rdd.map(lambda x: (x[1], userid_index[x[0]])).groupByKey().map(lambda x: (x[0], list(x[1])))

        return business_users

    def calculate_hash(a, b, p, m, vec):
        return min([((a*x)+b)%p%m for x in vec])

    def minhash(vector, a, b, p, m):
        return [calculate_hash(ai, bi, p, m, vector) for ai, bi in zip(a, b)]
    #     return list(np.min((a.reshape(1, a.shape[0]) * vector.reshape(vector.shape[0], 1) + b) %p %m, axis=0))

    def hash_bands(x):
        doc_id, sig = x
        b = 64
        r = 2
        output = []
        for i in range(0, b):
            output.append(((i, hash(tuple(sig[r*i:(i+1)*r]))), doc_id))
        return output


    def jaccard_sim(x, y):
        x = set(x)
        y = set(y)
        return len(x.intersection(y)) / len(x.union(y))


    def index_signatures(signatures):
        index = {}
        for x, y in signatures:
            index[x] = y

        return index
    
    
    rdd = read_business(train_data_path)
    signatures = rdd.collect()
    index = index_signatures(signatures)
    business_sig = sc.parallelize(signatures).map(lambda x: (x[0], minhash(x[1], a, b, p, m)))
    candidates = business_sig.flatMap(hash_bands).groupByKey().map(lambda x: (x[0], list(x[1]))).filter(lambda x: len(x[1]) > 1).collect()
    reverse_index = {}
    candidates_index = {}

    for bid, c in candidates:
        candidates_index[bid] = c

    for bucket_id, bucket in candidates:
        for bid in bucket:
            reverse_index[bid] = bucket_id
            
    return candidates_index, reverse_index

In [35]:
candidates_index, reverse_index = lsh('./data/yelp_train.csv')

In [36]:
len(candidates_index)

143615

In [37]:
def read_csv_line(line):
    line = line.split(',')
    return (line[0].strip(), line[1].strip(), line[2].strip())

def avg(item):
    vec = item[1]
    s = 0
    temp = {}
    for x in vec:
        s += float(x[1])
        temp[x[0]] = x[1]
        
    return (item[0], {'vec': temp, 'avg': s/len(vec)})

def read_business(data_path):
    rdd = sc.textFile(data_path)
    header = rdd.first()
    rdd = rdd.filter(lambda x: x != header).map(read_csv_line)
    
    business_users = rdd.map(lambda x: (x[1], (x[0], x[2]))).groupByKey().map(avg).collect()
    temp = {}
    for b, u in business_users:
        temp[b] = u
    return temp


def read_user(data_path):
    rdd = sc.textFile(data_path)
    header = rdd.first()
    rdd = rdd.filter(lambda x: x != header).map(read_csv_line)
    
    business_users = rdd.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().map(avg).collect()
    temp = {}
    for u, b in business_users:
        temp[u] = b
    return temp


def pearson_correlation(business1, business2):
    vec_set1 = set()
    b1_sum = 0
    for user, rating in business1['vec'].items():
        b1_sum += float(rating)
        vec_set1.add(user)
    
    b2_sum = 0
    vec_set2 = set()
    for user, rating in business2['vec'].items():
        b2_sum += float(rating)
        vec_set2.add(user)
            
    vec_set = vec_set1.union(vec_set2) 
    if(len(vec_set) == 0):
        return 0.0
    b1_sum += (len(vec_set)-len(business1['vec'])) * 3.5
    b2_sum += (len(vec_set)-len(business2['vec'])) * 3.5
    b1_avg = b1_sum/len(vec_set)
    b2_avg = b2_sum/len(vec_set)

    corr_numerator = 0
    corr_denominator_p1 = 0
    corr_denominator_p2 = 0
    
    for user_id in vec_set:
        v1 = None
        if(user_id in business1['vec']):
            v1 = float(business1['vec'][user_id])-b1_avg
        else:
            v1 = business1['avg']-b1_avg
            
            
        if(user_id in business2['vec']):
            v2 = float(business2['vec'][user_id])-b2_avg
        else:
            v2 = business2['avg']-b2_avg
        
        corr_numerator += v1*v2
        corr_denominator_p1 += math.pow(v1, 2)
        corr_denominator_p2 += math.pow(v2, 2)
    
    denominator = math.pow(corr_denominator_p1, 0.5) * math.pow(corr_denominator_p2, 0.5)
    
    if(denominator != 0):
        return corr_numerator / denominator
    else:
        return 0.0



def calculate_rating(train_data, user_avg, reverse_index, candidates_index, business_id, user_id):
    if(not business_id in train_data and not user_id in user_avg):
        return "{},{},{}\n".format(user_id, business_id, 4.0)
    if(not business_id in train_data and user_id in user_avg):
        return "{},{},{}\n".format(user_id, business_id, user_avg[user_id]['avg'])
    if(business_id in train_data and not user_id in user_avg):
        return "{},{},{}\n".format(user_id, business_id, train_data[business_id]['avg'])
    
    business_data = train_data[business_id]
    similar_items = []
    for bid in candidates_index[reverse_index['FaHADZARwnY4yvlvpnsfGA']]:
        b = train_data[bid]
        if(not user_id in b['vec']):
            continue
        sim = pearson_correlation(business_data, b)
        if(sim > 0.0):
            similar_items.append((sim, float(b['vec'][user_id])))
#     similar_item = sorted(similar_items, key=lambda x: x[0], reverse=True)
    numerator = 0
    denominator = 0
    
    for similarity, rating in similar_items:
        weight = similarity# * math.pow(abs(similarity), rho-1)
        numerator += weight*rating
        denominator += weight
        
    if(denominator != 0):
        return "{},{},{}\n".format(user_id, business_id, numerator/denominator)
    else:
        return "{},{},{}\n".format(user_id, business_id, train_data[business_id]['avg'])

    
    
# def calculate_rating_partition(train_data, user_avg, test_data):
    
#     for (business_id, user_id) in test_data:
    
#         if(not business_id in train_data and not user_id in user_avg):
#             yield "{},{},{}\n".format(user_id, business_id, 4.0)
#             continue
#         if(not business_id in train_data and user_id in user_avg):
#             yield "{},{},{}\n".format(user_id, business_id, user_avg[user_id]['avg'])
#             continue
#         if(business_id in train_data and not user_id in user_avg):
#             yield "{},{},{}\n".format(user_id, business_id, train_data[business_id]['avg'])
#             continue

#         business_data = train_data[business_id]
#         similar_items = []
#         for bid, rating in user_avg[user_id]['vec'].items():
#             b = train_data[bid]
#             sim = pearson_correlation(business_data, b)
#             if(sim > 0.0):
#                 similar_items.append((sim, float(b['vec'][user_id])))
#     #     similar_item = sorted(similar_items, key=lambda x: x[0], reverse=True)
#         numerator = 0
#         denominator = 0

#         for similarity, rating in similar_items:
#             weight = similarity# * math.pow(abs(similarity), rho-1)
#             numerator += weight*rating
#             denominator += weight

#         if(denominator != 0):
#             yield "{},{},{}\n".format(user_id, business_id, numerator/denominator)
            
#         else:
#             yield "{},{},{}\n".format(user_id, business_id, train_data[business_id]['avg'])
            
    
    
def save_output(output, path):
    output_file = open(path, 'wt')
    output_file.write('user_id, business_id, prediction\n')
    for line in output:
        output_file.write(line)
    output_file.close()
    return
    
def read_test_data(test_path):
    def read_test_line(line):
        line = line.split(',')
        return (line[0].strip(), line[1].strip())
    
    rdd = sc.textFile(test_path)
    header = rdd.first()
    rdd = rdd.filter(lambda x: x != header).map(read_test_line).map(lambda x: (x[1], x[0]))#.join(train_data).map(lambda x: (x[0], x[1][0], x[1][1]))
    
    return rdd.collect()



def rmse(test_path, pred_path):
    test_data = {}
    count = 0
    with open(test_path, 'rt') as file:
        line = file.readline().strip()
        line = file.readline().strip()
        while(line):
            count+=1
            test_data[tuple(line.split(',')[:2])] = float(line.split(',')[2])
            line = file.readline().strip()
            
    sum_error_square = 0
    with open(pred_path, 'rt') as file:
        line = file.readline().strip()
        line = file.readline().strip()
        while(line):
            if(line == 'Null'):
                line = file.readline().strip()
                continue
            sum_error_square+=math.pow((test_data[tuple(line.split(',')[:2])]-float(line.split(',')[2])), 2)
            line = file.readline().strip()
            
    return math.pow(sum_error_square/count, 0.5)


# def expand(train_data, user_avg, user_id, business_id):
#     if(not user_id in user_avg or not business_id in train_data):
#         return []
#     output = []
#     for bid, r in user_avg[user_id]['vec'].items():
#         output.append(((business_id, user_id), (bid, r)))
        
#     return output

# def pearson_wrapper(train_data, bid1, bid2, bucket_index, candidate_index):
#     return pearson_correlation(train_data[bid1], train_data[bid2])

In [38]:
st = time.time()
train_data = read_business('./data/yelp_train.csv')
test_data = read_test_data('./data/yelp_val.csv')
user_avg = read_user('./data/yelp_train.csv')
print(time.time()-st)

5.4780943393707275


In [39]:
st = time.time()
output = sc.parallelize(test_data).map(lambda x: calculate_rating(train_data, user_avg, reverse_index, candidates_index, x[0], x[1])).collect()
print(time.time()-st)
save_output(output, 'output2_1.csv')
rmse('./data/yelp_val.csv', 'output2_1.csv')

14.764833927154541


1.0787236408418601

In [48]:
st = time.time()
def predict(x):
    numerator = 0
    denominator = 0
    for n,d in x:
        numerator += n
        denominator += d
    if(denominator == 0):
        return 0.0
    else:
        return numerator/denominator
output = sc.parallelize(test_data).flatMap(lambda x: expand(train_data, user_avg, x[1], x[0])).map(lambda x: (x[0], (pearson_wrapper(train_data, x[0][0], x[1][0]), x[1][1]))).filter(lambda x: x[1][0]>0).map(lambda x: (x[0], (x[1][0]*float(x[1][1]), x[1][0]))).groupByKey().map(lambda x: (x[0], predict(x[1]))).collect()
print(time.time()-st)

53.18922662734985


In [25]:
output[:10]

['wf1GqnKQuvH-V3QN80UOOQ,fThrN4tfupIGetkrz18JOg,3.75\n',
 '39FT2Ui8KUXwmUt6hnwy-g,uW6UHfONAmm8QttPkbMewQ,4.403292181069959\n',
 '7weuSPSSqYLUFga6IYP4pg,IhNASEZ3XnBHmuuVnWdIwA,4.835106382978723\n',
 'CqaIzLiWaa-lMFYBAsYQxw,G859H6xfAmVLxbzQgipuoA,4.2\n',
 'yy7shAsNWRbGg-8Y67Dzag,rS39YnrhoXmPqHLzCBjeqw,3.090909090909091\n',
 'Uk1UKBIAwOqhjZdLm3r9zg,5CJL_2-XwCGBmOav4mFdYg,3.5208333333333335\n',
 'x-8ZMKKNycT3782Kqf9loA,jgtWfJCJZty_Nctqpdtp3g,4.0125\n',
 '0FVcoJko1kfZCrJRfssfIA,JVK8szNDoy9MNiYSz_MiAA,3.5714285714285716\n',
 'LcCRMIDz1JgshpPGYfLDcA,t19vb_4ML2dg5HZ-MF3muA,3.380952380952381\n',
 'C__1BHWTGBNA5s2ZPH289g,h_UvnQfe1cuVICly_kIqHg,3.642857142857143\n']

In [26]:
save_output(output, 'output2_1.csv')

In [27]:
rmse('./data/yelp_val.csv', 'output2_1.csv')

1.0794448834300172

In [None]:
# step1: generate the pairs ((bid_in, uid_in), (bid1, rating for uid))(all the bids in user vector)
# step2: calculate similarity using pearson
# step3: groupby first2 and 



In [None]:
rmse(sys.argv[2].strip(), sys.argv[3].strip())

In [6]:
def read_csv_line(line):
    line = line.split(',')
    return (line[0].strip(), line[1].strip(), line[2].strip())

def avg(item):
    vec = item[1]
    s = 0
    temp = {}
    for x in vec:
        s += float(x[1])
        temp[x[0]] = x[1]
        
    return (item[0], {'vec': temp, 'avg': s/len(vec)})

def read_business(data_path):
    rdd = sc.textFile(data_path)
    header = rdd.first()
    rdd = rdd.filter(lambda x: x != header).map(read_csv_line)
    
    business_users = rdd.map(lambda x: (x[1], (x[0], x[2]))).groupByKey().map(avg).collect()
    temp = {}
    for b, u in business_users:
        temp[b] = u
    return temp


def read_user(data_path):
    rdd = sc.textFile(data_path)
    header = rdd.first()
    rdd = rdd.filter(lambda x: x != header).map(read_csv_line)
    
    business_users = rdd.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().map(avg).collect()
    temp = {}
    for u, b in business_users:
        temp[u] = b
    return temp
    
def is_rated(x, user_y):
    for (user, _) in x:
        if(user == user_y):
            return True
        
    return False

def stage1(x, user_y):
    for (user, rating) in x[2]['vec']:
        if(user == user_y):
            return [(float(rating)*x[1], x[1])]
        
    return []



In [7]:
train_data = read_business('./data/yelp_train.csv')
user_avg = read_user('./data/yelp_train.csv')
# test_data = read_test_data(train_data, './data/yelp_val.csv')

In [8]:
user_avg['IlZKVEmRztQtja-6y3rOaA']

{'vec': {'fThrN4tfupIGetkrz18JOg': '3.0',
  '_OimUEl0DzPg6GtJXYeoJw': '3.0',
  'RqwuaHAybTqbkmGe1RZLXQ': '4.0',
  'WHlbUb5SNdXDKYfkNDo89A': '4.0',
  'zlZQM-cJPVW7FHJsYTvyYg': '3.0',
  'ZTVWHEAl-1jA_pLyL21s4g': '4.0',
  'as_fyujgsdwtV56n31z-Sw': '3.0',
  'meXjqyhTNLFmknY39y2sMg': '2.0',
  'IqXDpeNbePsW1jR7SSO57w': '2.0',
  'tsx0bZbS4ciySrHZr8EFmw': '4.0',
  'Bd13s20et9wY2sL72lgWow': '2.0',
  'NN6HpdvQlGZUVjBwzhgUWQ': '3.0',
  '-BBqiQQCweN7ePwTlY163Q': '4.0',
  'V3A7tvyfSX9pmy6L2ohqDg': '3.0',
  'MBpTl3womA9RRX4ruce78w': '4.0',
  'ZLVWlYQQxzZoJbg4w0gsMA': '4.0',
  's1k42BjFaV47wXzh1Ub5ZA': '3.0',
  'wT7Y-dJxyKIME09o37DE0A': '4.0',
  '0qsarzXaZau8REdE0rToxA': '3.0',
  'gNER9lE1Ma9FLm9MBsvYgg': '4.0',
  'mlAUV9SKTENbzhWTdl4l4w': '5.0',
  'rcoAajAVBBXsBNfgKYaH-w': '4.0',
  'MN2oRaV4bZEbBIy_w7fNEQ': '2.0',
  'hiceY83htPGn513-IcYWVA': '4.0',
  '7D-ATDP9Ezr1ZehszsotOQ': '4.0',
  'LNsZJP6jZ11e0tDljOLPiQ': '3.0',
  'DBGf3gaJ30ObzQ5fo1SSLg': '3.0',
  'kROYR_6mVfg9WgJ7z1HOSA': '4.0',
  'T_CniZSYWv

In [71]:
train_data['fThrN4tfupIGetkrz18JOg']

{'vec': {'IlZKVEmRztQtja-6y3rOaA': '3.0',
  '6P7o2yz4-B8LZDgoqU_Cgg': '4.0',
  'dvwFC5u09dbG_16AeNKBmQ': '4.0',
  'ALNUwFXAHXmFeQ9V2gCTfA': '4.0',
  'wQiyjBi8q_Q0dWLPSnQUqQ': '5.0',
  'PIethXohNwpHCMVCfIqcfg': '4.0',
  'PEjS7EYRg1cfHyx3r-w0qg': '3.0',
  'CNAEooBIpYLNnnugWceDcg': '4.0',
  'tcFmCEOQj8BqZ4VZheKCrw': '4.0',
  'QaN-nccbLZPWzownQYgTVQ': '4.0',
  'ncCeaGkpEQMgzSfBHnatWw': '2.0',
  'alxetHC3mXR2PtG8CeCN6Q': '4.0'},
 'avg': 3.75}

In [108]:
def pearson_correlation(business1, business2, user_avg):
    vec_set1 = set()
    b1_sum = 0
    for user, rating in business1['vec'].items():
        b1_sum += float(rating)
        vec_set1.add(user)
    
    b2_sum = 0
    vec_set2 = set()
    for user, rating in business2['vec'].items():
        b2_sum += float(rating)
        vec_set2.add(user)
            
    vec_set = vec_set1.union(vec_set2) 
    if(len(vec_set) == 0):
        return 0.0
    b1_sum += (len(vec_set)-len(business1['vec'])) * 3.5
    b2_sum += (len(vec_set)-len(business2['vec'])) * 3.5
    b1_avg = b1_sum/len(vec_set)
    b2_avg = b2_sum/len(vec_set)

    corr_numerator = 0
    corr_denominator_p1 = 0
    corr_denominator_p2 = 0
    
    for user_id in vec_set:
        v1 = None
        if(user_id in business1['vec']):
            v1 = float(business1['vec'][user_id])-b1_avg
        else:
            v1 = business1['avg']-b1_avg
            
            
        if(user_id in business2['vec']):
            v2 = float(business2['vec'][user_id])-b2_avg
        else:
            v2 = business2['avg']-b2_avg
        
        corr_numerator += v1*v2
        corr_denominator_p1 += math.pow(v1, 2)
        corr_denominator_p2 += math.pow(v2, 2)
    
    denominator = math.pow(corr_denominator_p1, 0.5) * math.pow(corr_denominator_p2, 0.5)
    
    if(denominator != 0):
        return corr_numerator / denominator
    else:
        return 0.0



def calculate_rating(train_data, user_avg, business_id, user_id):
    if(not business_id in train_data and not user_id in user_avg):
        return "{},{},{}\n".format(user_id, business_id, 4.0)
    if(not business_id in train_data and user_id in user_avg):
        return "{},{},{}\n".format(user_id, business_id, user_avg[user_id])
    if(business_id in train_data and not user_id in user_avg):
        return "{},{},{}\n".format(user_id, business_id, train_data[business_id]['avg'])
    
    business_data = train_data[business_id]
    similar_items = []
    for bid, b in train_data.items():
        if(user_id in b['vec']):
            sim = pearson_correlation(business_data, b, user_avg)
            if(sim > 0.0):
                similar_items.append((sim, float(b['vec'][user_id])))
#     similar_item = sorted(similar_items, key=lambda x: x[0], reverse=True)
    numerator = 0
    denominator = 0
    
    for similarity, rating in similar_items:
        weight = similarity# * math.pow(abs(similarity), rho-1)
        numerator += weight*rating
        denominator += weight
        
    if(denominator != 0):
        return "{},{},{}\n".format(user_id, business_id, numerator/denominator)
    else:
        return "{},{},{}\n".format(user_id, business_id, train_data[business_id]['avg'])


calculate_rating(train_data, user_avg, 'fThrN4tfupIGetkrz18JOg', 'wf1GqnKQuvH-V3QN80UOOQ')
    

'wf1GqnKQuvH-V3QN80UOOQ,fThrN4tfupIGetkrz18JOg,4.536925698145147\n'

In [109]:
def save_output(output, path):
    output_file = open(path, 'wt')
    output_file.write('user_id, business_id, prediction\n')
    for line in output:
        output_file.write(line)
    output_file.close()
    return
    
def read_test_data(test_path):
    rdd = sc.textFile(test_path)
    header = rdd.first()
    rdd = rdd.filter(lambda x: x != header).map(read_csv_line).map(lambda x: (x[1], x[0]))#.join(train_data).map(lambda x: (x[0], x[1][0], x[1][1]))
    
    return rdd.collect()

st = time.time()
test_data = read_test_data('./data/yelp_val.csv')
output = sc.parallelize(test_data).map(lambda x: calculate_rating(train_data, user_avg, x[0], x[1])).collect()
print(time.time()-st)
save_output(output, 'output2-1.csv')

66.02546644210815


In [110]:
def rmse(test_path, pred_path):
    test_data = {}
    count = 0
    with open(test_path, 'rt') as file:
        line = file.readline().strip()
        line = file.readline().strip()
        while(line):
            count+=1
            test_data[tuple(line.split(',')[:2])] = float(line.split(',')[2])
            line = file.readline().strip()
            
    sum_error_square = 0
    with open(pred_path, 'rt') as file:
        line = file.readline().strip()
        line = file.readline().strip()
        while(line):
            if(line == 'Null'):
                line = file.readline().strip()
                continue
            sum_error_square+=math.pow((test_data[tuple(line.split(',')[:2])]-float(line.split(',')[2])), 2)
            line = file.readline().strip()
            
    return math.pow(sum_error_square/count, 0.5)

rmse('./data/yelp_val.csv', './output2-1.csv')

1.0634362508560673

In [None]:
# train_data.take(2)[0]

In [None]:
# a = read_business('./data/yelp_train.csv').filter(lambda x: x[0] == 'cYwJA2A6I12KNkm2rtXd5g').take(1)

In [None]:
# test_data[1]

In [None]:
# sc.parallelize(nearest_neighbor_items).flatMap(lambda x: stage1(x, test_data[1][1][0])).reduce(lambda x, y: (x[0]+y[0], x[1]+y[1]))

In [None]:
# nearest_neighbor_items[:1]

In [None]:
# mod_n = 100

# nearest_neighbor_items = read_business('./data/yelp_train.csv').filter(lambda x: is_rated(x, 'wf1GqnKQuvH-V3QN80UOOQ')).map(lambda x: (x[0], pearson_correlation(x, b[0]), x[1])).filter(lambda x: x[1] > 0).collect()#.takeOrdered(mod_n, lambda x: -x[1])

In [None]:
train_data = read_business('./data/yelp_train.csv')
test_data = read_test_data(train_data, './data/yelp_val.csv')

In [None]:
# cp = test_data.cartesian(train_data)

In [None]:
# cp.take(1)

In [None]:
# def get_rating(vec, user):
#     for u, r in vec:
#         if(u == user):
#             return r
#     return 0.0

# st = time.time()
# x = cp.repartition(16).\
#     map(lambda x: (x[0][0], (x[1][0], pearson_correlation(x[0][2], x[1][1])))).collect()
# #     filter(lambda x: x[1][1]>0).collect()#.repartition(16).groupByKey().take(1)


In [9]:
def calculate_rating(train_data, business, user_id):
    nearest_neighbor_items = train_data.filter(lambda x: is_rated(x[1]['vec'], user_id)).map(lambda x: (x[0], pearson_correlation(x[1], business), x[1])).filter(lambda x: x[1] > 0).collect()#.takeOrdered(5, lambda x: -x[1])
    if(len(nearest_neighbor_items) == 0):
        return 0.0
    n, d = sc.parallelize(nearest_neighbor_items).flatMap(lambda x: stage1(x, user_id)).reduce(lambda x, y: (x[0]+y[0], x[1]+y[1]))
    return n/d 

def main(train_path, test_path, output_path):
    train_data = read_business(train_path).persist()
    test_data = read_test_data(train_data, test_path).collect()
    output = []
    print(test_data[:2])
    for item in test_data:
        rating_pred = calculate_rating(train_data, item[1][1], item[1][0])
        output.append("{},{},{}\n".format(item[1][0], item[0], rating_pred))
        break
    print(output)
#     with open(test_path, 'rt') as file:
#         line = file.readline().strip()
#         line = file.readline().strip()
#         while(line):
#             user, business, rating = line.split(',')
#             rating_pred = calculate_rating(train_data, business.strip(), user.strip())
#             print(rating_pred)
#             output_file.write("{},{},{}\n".format(user, business, rating_pred))
#             line = file.readline().strip()
    return

# main('./data/yelp_train.csv', './data/yelp_val.csv', 'output2-1.csv')

st = time.time()
# train_data = train_data.map(lambda x: (x[0], x[0][0], x[0][1])).persist()
print()
print(time.time()-st)

[('fThrN4tfupIGetkrz18JOg', 'wf1GqnKQuvH-V3QN80UOOQ', {'vec': [('IlZKVEmRztQtja-6y3rOaA', '3.0'), ('6P7o2yz4-B8LZDgoqU_Cgg', '4.0'), ('dvwFC5u09dbG_16AeNKBmQ', '4.0'), ('ALNUwFXAHXmFeQ9V2gCTfA', '4.0'), ('wQiyjBi8q_Q0dWLPSnQUqQ', '5.0'), ('PIethXohNwpHCMVCfIqcfg', '4.0'), ('PEjS7EYRg1cfHyx3r-w0qg', '3.0'), ('CNAEooBIpYLNnnugWceDcg', '4.0'), ('tcFmCEOQj8BqZ4VZheKCrw', '4.0'), ('QaN-nccbLZPWzownQYgTVQ', '4.0'), ('ncCeaGkpEQMgzSfBHnatWw', '2.0'), ('alxetHC3mXR2PtG8CeCN6Q', '4.0')], 'avg': 3.75}), ('fThrN4tfupIGetkrz18JOg', 'EtxsD-Jbyxh7TsaWHtGCew', {'vec': [('IlZKVEmRztQtja-6y3rOaA', '3.0'), ('6P7o2yz4-B8LZDgoqU_Cgg', '4.0'), ('dvwFC5u09dbG_16AeNKBmQ', '4.0'), ('ALNUwFXAHXmFeQ9V2gCTfA', '4.0'), ('wQiyjBi8q_Q0dWLPSnQUqQ', '5.0'), ('PIethXohNwpHCMVCfIqcfg', '4.0'), ('PEjS7EYRg1cfHyx3r-w0qg', '3.0'), ('CNAEooBIpYLNnnugWceDcg', '4.0'), ('tcFmCEOQj8BqZ4VZheKCrw', '4.0'), ('QaN-nccbLZPWzownQYgTVQ', '4.0'), ('ncCeaGkpEQMgzSfBHnatWw', '2.0'), ('alxetHC3mXR2PtG8CeCN6Q', '4.0')], 'avg': 3.75})]
['w

NameError: name 'output_file' is not defined

In [None]:
test_data[0]