In [1]:
import findspark
findspark.init()

In [2]:
import sys
import time
import json
from math import sqrt
from projectsupport import *

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel

In [None]:
# Input reviews (one JSON object per line); read by getData() via sc.textFile().
# NOTE(review): absolute, machine-specific path - consider making it configurable.
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
# Destination of the similarity model written by outputModelToFileUserBased().
model_file = "../model/model_user.json"
# Not referenced by any cell visible in this notebook.
cf_type = "user_based"
# Not referenced by any cell visible in this notebook; presumably per-business
# average ratings - TODO confirm. NOTE(review): absolute, machine-specific path.
avg_business_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [21]:
# Only referenced by commented-out code in pc(); presumably a fallback average
# rating for unknown entries - TODO confirm.
UNK = 3.823989
# pc() returns None when two profiles share fewer co-rated keys than this.
CORATED_LIMIT = 10
# Length of each Min-Hash signature (number of hash functions applied in minHash()).
NUM_HASHS = 30
# Number of LSH bands; getCandidates() uses int(NUM_HASHS / NUM_BANDS) rows per band.
NUM_BANDS = 15
# Initial value of every signature slot before minima are taken in minHash().
LARGE_NUMBER = sys.maxsize
# A pair is kept in the model only when its Jaccard similarity is >= this value.
JACCARD_SIMILARITY_THRESHOLD = 0.01 # >=

In [6]:
def meanList(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Raises ZeroDivisionError when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def averageRating(x):
    """Collapse all ratings recorded for one key pair into their mean.

    x - ((outer_key, inner_key), [star, ...])

    Returns (outer_key, [(inner_key, mean_star)]); the one-element list lets
    the caller merge entries of the same outer key with list concatenation.

    In transformDataForUserBased the key pair is (uid, bid), so the result is
    keyed by user id.
    """
    key_pair = x[0]
    star_values = x[1]  # list
    return (key_pair[0], [(key_pair[1], meanList(star_values))])

def convertToDict(l):
    """Turn a list of (key, star) pairs into a {key: star} dict.

    When a key occurs more than once, the last pair wins.
    """
    return {key: star for key, star in l}

In [7]:
def pc(b1, b2):
    """Pearson correlation of two rating profiles over their co-rated keys.

    b1, b2 - (id, {key: star, ...}). In this notebook the caller passes user
             profiles, i.e. (uid, {bid: star, ...}).

    Returns (id_1, id_2, w), in the order the profiles were passed, when the
    profiles share at least CORATED_LIMIT keys and the correlation w is
    strictly positive. Returns None otherwise (too few co-rated keys, a zero
    numerator or denominator, or w <= 0).

    Ratings are centred on the mean of the co-rated ratings only. Alternatives
    that were tried and then disabled in earlier revisions: padding short
    co-rated lists with average ratings up to CORATED_LIMIT, centring on the
    overall average of each profile, and case amplification (w ** 2.5).
    """
    id_1, ratings_1 = b1[0], b1[1]
    id_2, ratings_2 = b2[0], b2[1]

    # keys rated in both profiles
    shared_keys = list(set(ratings_1.keys()).intersection(set(ratings_2.keys())))
    if len(shared_keys) < CORATED_LIMIT:
        return None

    stars_1 = [ratings_1[key] for key in shared_keys]
    stars_2 = [ratings_2[key] for key in shared_keys]

    # centre each side on its co-rated average
    mean_1 = meanList(stars_1)
    mean_2 = meanList(stars_2)
    centred_1 = [star - mean_1 for star in stars_1]
    centred_2 = [star - mean_2 for star in stars_2]

    n = sum(a * b for a, b in zip(centred_1, centred_2))
    d1 = sum(a * a for a in centred_1)
    d2 = sum(b * b for b in centred_2)
    if n == 0 or d1 == 0 or d2 == 0:
        # undefined or zero correlation carries no usable signal
        return None

    w = n / sqrt(d1 * d2)
    # only positively correlated pairs are kept in the model
    return (id_1, id_2, w) if w > 0 else None

In [8]:
def updateSignature(s_cur, s_new, length):
    """Lower ``s_cur`` in place to the element-wise minimum with ``s_new``.

    Only the first ``length`` positions are examined. Nothing is returned.
    """
    for pos in range(length):
        candidate = s_new[pos]
        if candidate < s_cur[pos]:
            s_cur[pos] = candidate

def minHash(x, hashs):
    """Compute the Min-Hash signature of one user's set of businesses.

    x     - (u_id, iterable of b_id)
    hashs - indexable collection of at least NUM_HASHS callables, each
            mapping a b_id to a comparable hash value

    Returns (u_id, signature) where signature[i] is the smallest value of
    hashs[i] over the user's businesses, or LARGE_NUMBER if there are none.
    """
    user_id, business_ids = x[0], x[1]
    signature = [LARGE_NUMBER for _ in range(NUM_HASHS)]
    for business_id in business_ids:
        hashed = [hashs[k](business_id) for k in range(NUM_HASHS)]
        updateSignature(signature, hashed, NUM_HASHS)
    return (user_id, signature)

def LSH(x, b, r, hash_lsh):
    """Hash each band of a signature into a bucket key.

    x        - (u_id, signature)
    b        - number of bands
    r        - number of signature rows per band
    hash_lsh - callable applied to the slice of the signature for one band

    Returns a list of b entries ((bucket_hash, band_index), u_id). The band
    index is part of the key so equal hashes from different bands do not mix.
    """
    u_id, signature = x[0], x[1]
    return [
        ((hash_lsh(signature[band * r:(band + 1) * r]), band), u_id)
        for band in range(b)
    ]

def generatePairs(l):
    """Return every unordered pair of elements of ``l`` as a sorted tuple.

    ``l`` is sorted in place first, so in each returned pair
    pair[0] <= pair[1] (strictly smaller when the elements are distinct).
    """
    l.sort()
    pairs = []
    for pos, first in enumerate(l):
        for second in l[pos + 1:]:
            pairs.append((first, second))
    return pairs

def computeJaccardSimilarity(l1, l2):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    l1, l2 - iterables of hashable items. Duplicates are ignored because both
             inputs are converted to sets before counting.

    Returns a float in [0, 1]. When both inputs are empty the union is empty
    and the ratio is undefined; 0.0 is returned in that case so the pair is
    treated as dissimilar instead of raising ZeroDivisionError.
    """
    set_1 = set(l1)
    set_2 = set(l2)
    union_len = len(set_1 | set_2)
    if union_len == 0:
        return 0.0
    return len(set_1 & set_2) / union_len

In [9]:
def getData(sc):
    """Load the training reviews as an RDD of parsed JSON objects.

    sc - SparkContext used to read ``train_file`` line by line; every line is
         decoded with json.loads.
    """
    review_lines = sc.textFile(train_file)
    return review_lines.map(json.loads)

In [10]:
def getRenameData(raw_data):
    """Replace the original user and business ids with the values given by Rename.

    raw_data - RDD of review records with 'user_id', 'business_id' and 'stars'

    Returns (data_renamed, bid_rename, uid_rename):
      data_renamed - persisted RDD of (new_uid, new_bid, stars)
      bid_rename   - Rename built from the distinct business ids
      uid_rename   - Rename built from the distinct user ids
    """
    # business ids first, then user ids; both lists are collected to the driver
    distinct_bids = raw_data.map(lambda rec: rec['business_id']).distinct().collect()
    bid_rename = Rename(distinct_bids)

    distinct_uids = raw_data.map(lambda rec: rec['user_id']).distinct().collect()
    uid_rename = Rename(distinct_uids)

    def renameRecord(rec):
        # pull out the three fields, then translate the two ids
        uid, bid, stars = rec['user_id'], rec['business_id'], rec['stars']
        return (uid_rename.getNewValue(uid), bid_rename.getNewValue(bid), stars)

    data_renamed = raw_data.map(renameRecord).persist(StorageLevel.MEMORY_AND_DISK)

    return data_renamed, bid_rename, uid_rename

In [22]:
# for user based model
def getCandidates(data_renamed, bid_rename, uid_rename):
    """Find candidate user pairs with Min-Hash signatures and LSH banding.

    data_renamed - RDD of (uid, bid, star)
    bid_rename   - Rename for business ids; its values_length sizes the
                   Min-Hash functions
    uid_rename   - Rename for user ids; only used for the progress print

    Returns a list, collected to the driver, of distinct (uid_a, uid_b) pairs
    whose signatures fall into the same bucket in at least one band.
    """
    # one (uid, bid) entry per user/business combination, e.g.
    # [(22368, 6616), (22369, 4431), (11224, 2238), (9325, 759), (1829, 4435)]
    user_business = data_renamed.map(lambda row: (row[0], row[1])) \
        .distinct() \
        .persist(StorageLevel.MEMORY_AND_DISK)

    # hash functions for Min-Hash
    num_bid = bid_rename.values_length
    print('num_uid =', uid_rename.values_length) # 91,730 can generate 4.207e9 pairs
    print('num_bid =', num_bid) # 13,167
    hashs_minhash = generateHashs(NUM_HASHS, num_bid)

    # banding parameters and hash function for LSH
    b = NUM_BANDS
    r = int(NUM_HASHS / NUM_BANDS)
    hash_lsh = generateHashForLSH(r)

    # stage 1: one signature per user
    signatures = user_business.groupByKey() \
        .map(lambda entry: minHash(entry, hashs_minhash))

    # stage 2: users grouped by (bucket hash, band index)
    buckets = signatures.flatMap(lambda entry: LSH(entry, b, r, hash_lsh)) \
        .groupByKey()

    # stage 3: buckets holding more than one user yield candidate pairs
    candidates = buckets.filter(lambda bucket: len(bucket[1]) > 1) \
        .map(lambda bucket: list(bucket[1])) \
        .flatMap(generatePairs) \
        .distinct() \
        .collect()
    print('length of candidates:', len(candidates)) # 19300205

    return candidates

In [12]:
def transformDataForUserBased(data_renamed):
    """Build one rating dictionary per user for the user-based model.

    data_renamed - RDD of (uid, bid, star)

    Returns a list, collected to the driver, of (uid, {bid: star, ...}).
    Repeated ratings of the same (uid, bid) are averaged, and users with
    two or fewer rated businesses are dropped.
    """
    # gather every star given to the same (uid, bid)
    stars_per_pair = data_renamed.map(lambda row: ((row[0], row[1]), [row[2]])) \
        .reduceByKey(lambda left, right: left + right)

    # average them, then gather the (bid, star) entries of each user
    ratings_per_user = stars_per_pair.map(averageRating) \
        .reduceByKey(lambda left, right: left + right)

    data_groupby_uid = ratings_per_user.filter(lambda entry: len(entry[1]) > 2) \
        .map(lambda entry: (entry[0], convertToDict(entry[1]))) \
        .collect()

    print('data_groupby_uid[:5]:', data_groupby_uid[:5])
    # print(len(data_groupby_uid)) # 26178

    return data_groupby_uid

In [13]:
def computePearsonCorrelationAndJaccardSimilarityUserBased(data, candidates):
    """Score candidate user pairs and keep those that pass both tests.

    data       - [(uid, {bid: star, ...}), ...]
    candidates - iterable of (uid_1, uid_2)

    Returns [{'u1': uid, 'u2': uid, 'sim': w}, ...] for every candidate pair
    where both users appear in ``data``, pc() yields a result, and the Jaccard
    similarity of their rated businesses is >= JACCARD_SIMILARITY_THRESHOLD.
    """
    print('data[:3]:', data[:3])
    # {uid: (uid, {bid: star, ...}), ...}
    profiles = {uid: (uid, ratings) for uid, ratings in data}

    uid_v = []
    for u1, u2 in candidates:
        profile_1 = profiles.get(u1)
        profile_2 = profiles.get(u2)
        if profile_1 is None or profile_2 is None:
            # at least one user has no profile in ``data``
            continue

        res = pc(profile_1, profile_2)
        if res is None:
            continue

        js_ = computeJaccardSimilarity(list(profile_1[1].keys()),
                                       list(profile_2[1].keys()))
        if js_ >= JACCARD_SIMILARITY_THRESHOLD:
            uid_v.append({'u1': res[0], 'u2': res[1], 'sim': res[2]})

    print('uid_v[:5] =', uid_v[:5])
    print('len(uid_v) =', len(uid_v)) # 542141
    return uid_v

In [14]:
def outputModelToFileUserBased(model, rename):
    """Write the model to ``model_file`` as one JSON object per line.

    model  - [{'u1': uid, 'u2': uid, 'sim': w}, ...] using renamed user ids
    rename - object whose getOriginalValue() maps a renamed id back to the
             original id

    The ids are translated on a copy of each entry, so ``model`` itself is
    left unchanged and the function can safely be called again with the same
    list (e.g. when the notebook cell is re-run).
    """
    with open(model_file, 'w', encoding='utf-8') as fp:
        for item in model:
            # copy keeps the key order (u1, u2, sim) of the original entry
            record = dict(item)
            record['u1'] = rename.getOriginalValue(item['u1'])
            record['u2'] = rename.getOriginalValue(item['u2'])
            fp.write(json.dumps(record))
            fp.write('\n')

In [16]:
# Spark configuration: application name "task3train", master URL "local[*]",
# and a requested driver memory of "4g".
conf = SparkConf() \
    .setAppName("task3train") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
# SparkContext passed to getData() in the next cell.
sc = SparkContext(conf=conf)

In [17]:
# Load the training reviews as an RDD of parsed JSON records.
raw_data = getData(sc)

In [18]:
# Replace the original user/business ids with renamed values; keep both Rename
# objects (uid_rename is needed again when the model is written out).
data_renamed, bid_rename, uid_rename = getRenameData(raw_data)

In [None]:
# Find the most likely similar user pairs using Min-Hash & LSH
# (the result is a list of uid pairs collected to the driver).
candidates = getCandidates(data_renamed, bid_rename, uid_rename)

num_uid = 91730
num_bid = 13167


In [None]:
# Build the dataset for the user-based model: one {bid: star} dict per user.
data_groupby_uid = transformDataForUserBased(data_renamed)

In [None]:
# Compute the Pearson correlation w for each candidate pair and keep the pairs
# that also pass the Jaccard similarity threshold.
model = computePearsonCorrelationAndJaccardSimilarityUserBased(data_groupby_uid, candidates)

In [None]:
# Map the user ids back to their original values and write the model to file.
outputModelToFileUserBased(model, uid_rename)