In [1]:
import findspark
findspark.init()

In [2]:
import sys
import time
import json
from math import sqrt
from decimal import Decimal, ROUND_HALF_UP

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel

In [4]:
# Number of most-similar businesses used for one item-based prediction.
# computeStarsItembased returns None when fewer neighbors are available,
# and the value is also embedded in the output file name.
N_NEIGHBORS_ITEMBASED = 10
# NOTE(review): presumably the neighbor count for a user-based variant;
# it is not referenced in the visible cells -- confirm before relying on it.
N_NEIGHBORS_USERBASED = 5
# NOTE(review): presumably a fallback star rating (e.g. a global mean) for
# unknown users/businesses; not referenced in the visible cells -- confirm.
UNK = 3.823989
# NOTE(review): not referenced in the visible cells; presumably the value to
# emit when no prediction can be computed -- confirm.
DEFAULT_OUTPUT = None

In [5]:
# Input/output locations used by the cells below.
# NOTE(review): the file:// URIs are absolute paths on one specific machine;
# they must be adjusted before the notebook can run anywhere else.
# Training reviews, read line by line as JSON in getData().
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
# Test reviews (the user/business pairs to predict), read in getData().
test_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/test_review.json"
# Item-item similarity model, read line by line as JSON in getModelItembased().
model_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/model_itemCF.json"
# Prediction output, opened with the built-in open() and therefore resolved
# relative to the current working directory; the name carries the neighbor count.
output_file = "../predict/prediction_cf_none_n%d.json" % (N_NEIGHBORS_ITEMBASED)

In [6]:
def meanList(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Raises ZeroDivisionError when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def averageRating(x):
    """Collapse all ratings of one (user, business) pair into their mean.

    x - ((uid, bid), [star, ...])
    Returns (uid, [(bid, mean_star)]); the one-element list lets a later
    reduceByKey concatenate every business of the same user.
    """
    key, star_list = x[0], x[1]
    user_id, business_id = key[0], key[1]
    return (user_id, [(business_id, meanList(star_list))])

def convertToDict(l):
    """Turn a list of (bid, star) pairs into a {bid: star} dict.

    When a bid appears more than once, the last pair wins.
    """
    return {bid: star for bid, star in l}

In [7]:
def computeStarsItembased(corated, target_bid, model, n_neighbors=None):
    """Predict one user's rating of ``target_bid`` with item-based CF.

    The prediction is the similarity-weighted average of the user's ratings
    over the ``n_neighbors`` rated businesses most similar to ``target_bid``.

    Args:
        corated: {bid: star, ...} - all ratings of one user, or None when the
            user is unknown. The dict is NOT modified (it is shared with the
            caller's per-user table, so mutating it would corrupt that data).
        target_bid: business id whose rating is predicted.
        model: {(bid1, bid2): sim, ...} with bid1 < bid2.
        n_neighbors: number of neighbors to use; defaults to
            N_NEIGHBORS_ITEMBASED when omitted.

    Returns:
        The predicted rating, or None when ``corated`` is None, when fewer
        than ``n_neighbors`` rated businesses have a similarity with
        ``target_bid`` in the model, or when the selected weights sum to 0.
    """
    if corated is None:
        return None
    if n_neighbors is None:
        n_neighbors = N_NEIGHBORS_ITEMBASED

    candidates = []  # (similarity, star) for every usable rated business
    for bid, star in corated.items():
        if bid == target_bid:
            # the user's own rating of the target is never a neighbor
            continue
        # model keys are ordered pairs with the smaller id first
        pair = (bid, target_bid) if bid < target_bid else (target_bid, bid)
        w = model.get(pair)
        if w is not None:
            # pair may not have a value in the model
            candidates.append((w, star))

    if len(candidates) < n_neighbors:
        return None

    # stable sort: ties keep the iteration order of ``corated``
    candidates.sort(key=lambda c: c[0], reverse=True)
    neighbors = candidates[:n_neighbors]

    sum_w = 0
    n = 0
    for w, star in neighbors:
        n += star * w
        sum_w += w
    if sum_w == 0:
        return None
    return n / sum_w
#         predict_stars = n / sum_w
#         origin_n = Decimal(str(predict_stars))
#         ans_n = origin_n.quantize(Decimal('0'), rounding=ROUND_HALF_UP)
#         return float(ans_n)

In [8]:
def getData(sc):
    """Load the train and test review files as RDDs of parsed JSON records.

    sc - SparkContext used to read ``train_file`` and ``test_file``.
    Returns (train_raw_data, test_raw_data); every element is the object
    produced by json.loads for one input line.
    """
    def load_json_lines(path):
        # one JSON document per text line
        return sc.textFile(path).map(json.loads)

    return load_json_lines(train_file), load_json_lines(test_file)

In [9]:
def getModelItembased(sc):
    """Load the item-item similarity model into a driver-side dict.

    Every line of ``model_file`` is a JSON object with keys 'b1', 'b2'
    and 'sim'.
    Returns {(bid1, bid2): sim, ...}; the lookup code expects bid1 < bid2.
    """
    records = sc.textFile(model_file).map(json.loads)
    weighted_pairs = records.map(lambda rec: ((rec['b1'], rec['b2']), rec['sim']))
    return weighted_pairs.collectAsMap()

In [10]:
# def getModelItembased():
#     with open(model_file, 'r', encoding='utf-8') as fp:
#         model_l = json.load(fp)
#     model = {}
#     for x in model_l:
#         pair = (x[0], x[1])
#         sim = x[2]
#         model[pair] = sim
#     # model - {(bid1, bid2): sim, ...}  ps: bid1 < bid2
#     return model

In [11]:
def transformTrainDataGroupByUid(train_raw_data):
    """Build the per-user rating table from the training reviews.

    train_raw_data - RDD of review records with 'user_id', 'business_id'
    and 'stars'.
    Returns u_d - {uid: {bid: star, ...}, ...}, where star is the mean of
    all ratings the user gave that business.
    """
    # ((uid, bid), [star, ...]) - gather repeated ratings of the same pair
    stars_per_pair = train_raw_data \
        .map(lambda r: ((r['user_id'], r['business_id']), [r['stars']])) \
        .reduceByKey(lambda a, b: a + b)

    # (uid, {bid: mean_star, ...}) - average each pair, then regroup by user
    ratings_per_user = stars_per_pair \
        .map(averageRating) \
        .reduceByKey(lambda a, b: a + b) \
        .map(lambda kv: (kv[0], convertToDict(kv[1])))

    # uids are unique after reduceByKey, so the pairs map directly to a dict
    return dict(ratings_per_user.collect())

In [12]:
def predictItembased(model, u_d, test_raw_data):
    """Predict a rating for every (user, business) pair in the test set.

    model - {(bid1, bid2): sim, ...}
    u_d - {uid: {bid: star, ...}, ...}
    test_raw_data - RDD of records with 'user_id' and 'business_id'.
    Returns [((uid, bid), star), ...]; star is whatever
    computeStarsItembased returns for that pair (possibly None).
    """
    # The pairs are collected first, so the predictions themselves are
    # computed in plain Python on the driver rather than inside Spark.
    pairs = test_raw_data.map(lambda r: (r['user_id'], r['business_id'])).collect()
    return [
        (pair, computeStarsItembased(u_d.get(pair[0]), pair[1], model))
        for pair in pairs
    ]

In [13]:
def outputResultToFileItembased(prediction):
    """Write the predictions to ``output_file``, one JSON object per line.

    prediction - [((uid, bid), star), ...]
    Each line holds the keys 'user_id', 'business_id' and 'stars'.
    """
    with open(output_file, 'w', encoding='utf-8') as fp:
        for entry in prediction:
            key, stars = entry[0], entry[1]
            record = {
                'user_id': key[0],
                'business_id': key[1],
                'stars': stars
            }
            fp.write(json.dumps(record) + '\n')

In [14]:
# Spark setup: a local master using all available cores ("local[*]"),
# with the driver memory option set to 4g.
conf = SparkConf()
conf.setAppName("project")
conf.setMaster("local[*]")
conf.set("spark.driver.memory", "4g")

sc = SparkContext(conf=conf)
# only report errors so Spark logging does not flood the cell output
sc.setLogLevel("ERROR")

In [15]:
# Load the train and test review files as RDDs of parsed JSON records.
train_raw_data, test_raw_data = getData(sc)

In [16]:
# Load the item-item similarity model as a dict: {(bid1, bid2): sim, ...}.
model = getModelItembased(sc)

In [17]:
# Build the per-user rating table used by the item-based model:
# u_d - {uid: {bid: star, ...}, ...}
u_d = transformTrainDataGroupByUid(train_raw_data)

In [18]:
# Predict a star rating for every (user, business) pair of the test set:
# prediction - [((uid, bid), star), ...]
prediction = predictItembased(model, u_d, test_raw_data)

In [19]:
# Write the predictions to output_file, one JSON object per line.
outputResultToFileItembased(prediction)