In [1]:
import findspark
findspark.init()

In [2]:
import sys
import time
import json
from math import sqrt
from projectsupport import *

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel

In [4]:
# Input: the training reviews that getData() reads with sc.textFile and parses line by line as JSON.
# NOTE(review): this is an absolute, machine-specific path; it must be edited before running elsewhere.
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
# Output: the file outputModelToFile() writes, one JSON object per line.
model_file = "../model/model_itemCF.json"
# NOTE(review): cf_type is not read by any code visible in this notebook.
cf_type = "item_based"
# avg_business_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [5]:
# NOTE(review): of these constants only CORATED_LIMIT is used by active code visible in this
# notebook; UNK appears only in commented-out code inside pc(), and the rest are not referenced here.
UNK = 3.823989 # presumably a fallback average star rating - TODO confirm
CORATED_LIMIT = 20 # pc() returns None for business pairs with fewer co-rating users than this
NUM_HASHS = 50
LARGE_NUMBER = sys.maxsize
NUM_BANDS = NUM_HASHS
JACCARD_SIMILARITY_THRESHOLD = 0.01 # >= (presumably an inclusive lower bound - TODO confirm)

In [6]:
def meanList(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Raises ZeroDivisionError when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

In [7]:
def averageRating(x):
    """Collapse all ratings of one (business, user) pair into their mean.

    x - ((bid, uid), [star, ...])
    Returns (bid, [(uid, average_star)]); the value is a one-element list so
    that values sharing a bid can be merged by list concatenation.
    """
    key = x[0]
    bid, uid = key[0], key[1]
    return (bid, [(uid, meanList(x[1]))])

In [8]:
def convertToDict(l):
    """Turn [(uid, star), ...] into {uid: star, ...}.

    When a uid occurs more than once, the last pair wins.
    """
    return {uid: star for uid, star in l}

In [9]:
def pc(b1, b2, corated_limit=None):
    """Pearson correlation of two businesses over the users who rated both.

    Args:
        b1, b2: ``(bid, {uid: star, ...})`` pairs.
        corated_limit: minimum number of users that must have rated both
            businesses. ``None`` (the default) uses the module-level
            ``CORATED_LIMIT``.

    Returns:
        ``(b1_bid, b2_bid, w)`` with the ids in argument order when the
        correlation ``w`` is strictly positive. ``None`` when fewer than
        ``corated_limit`` users rated both businesses, when either business
        has zero variance over the co-rated users, or when ``w <= 0``.
    """
    if corated_limit is None:
        corated_limit = CORATED_LIMIT
    b1_bid, b1_d = b1[0], b1[1]
    b2_bid, b2_d = b2[0], b2[1]

    # Dict key views support set intersection directly, so no per-call set
    # copies are needed just to find out that a pair is below the limit.
    common = b1_d.keys() & b2_d.keys()
    if len(common) < corated_limit:
        return None

    # Walk b1's dict instead of the set: the order is then fixed by dict
    # insertion order rather than by hash order, which keeps the
    # floating-point sums below independent of string hash randomisation.
    u_intersect = [uid for uid in b1_d if uid in common]
    if not u_intersect:
        # Only reachable with corated_limit <= 0; avoids dividing by zero.
        return None
    b1_corated = [b1_d[uid] for uid in u_intersect]
    b2_corated = [b2_d[uid] for uid in u_intersect]

    # Centre each business on its mean over the co-rated users only
    # (not on its overall average rating).
    count = len(u_intersect)
    b1_avg = sum(b1_corated) / count
    b2_avg = sum(b2_corated) / count
    b1_dev = [x - b1_avg for x in b1_corated]
    b2_dev = [x - b2_avg for x in b2_corated]

    n = sum(x * y for x, y in zip(b1_dev, b2_dev))
    d1 = sum(x * x for x in b1_dev)
    d2 = sum(x * x for x in b2_dev)
    if n == 0 or d1 == 0 or d2 == 0:
        return None
    w = n / sqrt(d1 * d2)
    if w > 0:
        return (b1_bid, b2_bid, w)
    return None

In [10]:
def getData(sc):
    """Return an RDD holding one parsed JSON record per line of ``train_file``.

    sc - the SparkContext used to read the file
    """
    lines = sc.textFile(train_file)
    return lines.map(json.loads)

In [11]:
# for itembased model
def transformDataForItemBased(raw_data):
    """Group the review records by business and collect the result.

    raw_data - RDD of records with 'business_id', 'user_id' and 'stars' keys
    Returns [(bid, {uid: star, ...}), ...], where star is the mean of every
    rating that user gave that business. Prints the number of businesses.
    """
    # ((bid, uid), [star, ...]) - all ratings one user gave one business
    stars_by_pair = raw_data \
        .map(lambda r: ((r['business_id'], r['user_id']), [r['stars']])) \
        .reduceByKey(lambda x, y: x + y)

    # (bid, [(uid, average_star), ...])
    users_by_bid = stars_by_pair \
        .map(averageRating) \
        .reduceByKey(lambda x, y: x + y)

    # (bid, {uid: average_star, ...}), brought back as a plain list
    data_groupby_bid = users_by_bid \
        .map(lambda x: (x[0], convertToDict(x[1]))) \
        .collect()

    print(len(data_groupby_bid)) # number of businesses

    return data_groupby_bid

In [12]:
def computePearsonCorrelationItemBased(data):
    """Run pc() on every unordered pair of businesses and keep the hits.

    data - [(bid, {uid: star, ...}), ...]
    Returns [{'b1': bid, 'b2': bid, 'sim': w}, ...], one entry per pair for
    which pc() returned a result. Pairs are visited in sorted bid order, so
    b1 < b2 in every entry. Prints the number of entries.
    """
    by_bid = {bid: (bid, ratings) for bid, ratings in data}
    ordered_bids = sorted(by_bid)
    total = len(ordered_bids)

    model = []
    for left in range(total):
        first = by_bid[ordered_bids[left]]
        for right in range(left + 1, total):
            pair = pc(first, by_bid[ordered_bids[right]])
            if pair is None:
                continue
            model.append({'b1': pair[0], 'b2': pair[1], 'sim': pair[2]})

    print(len(model)) # number of similar pairs kept
    return model

In [13]:
def outputModelToFile(model):
    """Write ``model`` to ``model_file``, one JSON document per line.

    model - iterable of JSON-serialisable items
    """
    with open(model_file, 'w', encoding='utf-8') as fp:
        fp.writelines(json.dumps(item) + '\n' for item in model)

In [14]:
# def computePearsonCorrelationItemBased(data, avg_business):
#     # data - [(bid, {uid: star, ...}), ...]
#     # avg_business - {bid: avg, ...}
#     data_dict = {} # {bid: (bid, {uid: star, ...}), ...}
#     for bid, d in data:
#         data_dict[bid] = (bid, d)
#     bid_list = list(data_dict.keys())
#     bid_list.sort()
#     bid_length = len(bid_list)
#     # bid_pairs = ((bid_list[i], bid_list[j]) for i in range(bid_length) for j in range(i+1, bid_length))
#     bid_l = []
#     for i in range(bid_length):
#         x = bid_list[i]
# #         print(i)
#         for j in range(i+1, bid_length):
#             y = bid_list[j]
#             res = pc(data_dict[x], data_dict[y], avg_business)
#             if res != None:
#                 bid_l.append(res)
#     # print(bid_v[:5])
#     print(len(bid_l)) # 214796
#     return bid_l

In [15]:
# def outputModelToFile(model):
#     with open(model_file, 'w', encoding='utf-8') as fp:
#         json.dump(model, fp)

In [16]:
# Run Spark locally on all available cores with 4g of driver memory.
conf = SparkConf() \
    .setAppName("project") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
# getOrCreate returns the kernel's live SparkContext when one already exists,
# so re-running this cell does not fail with "Cannot run multiple
# SparkContexts at once"; on a fresh kernel it creates one from conf.
sc = SparkContext.getOrCreate(conf=conf)

In [17]:
# Build the RDD of parsed training-review records from train_file.
raw_data = getData(sc)

In [18]:
# # get avg_business
# with open(avg_business_file, 'r', encoding='utf-8') as fp:
#     avg_business = json.load(fp)

In [19]:
# Aggregate the reviews per business for the item-based model and collect them
# as [(bid, {uid: star, ...}), ...]; the call prints the number of businesses.
data_groupby_bid = transformDataForItemBased(raw_data)

13167


In [20]:
# Compute the Pearson correlation w for every pair of businesses; only pairs for
# which pc() returns a result are kept, and the call prints how many there are.
model = computePearsonCorrelationItemBased(data_groupby_bid)

29172


In [21]:
# Write the model to model_file, one JSON object per line.
outputModelToFile(model)