In [1]:
import findspark
findspark.init()

In [2]:
import json

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel

In [4]:
from projectsupport import *

In [5]:
import support

In [6]:
# Data and model locations used by the cells below.
# The "file://" URIs are the ones handed to Spark's sc.textFile; the plain path
# (als_model_file) is opened with Python's built-in open().
# NOTE(review): train_file is only referenced by a commented-out getData in the
# cells visible here - confirm whether it is still needed.
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
test_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/test_review.json"
als_model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als.json"
itemcf_model_file = 'file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/model_itemCF.json'
# Destination of the final predictions; a relative path, so it is resolved
# against the kernel's current working directory.
output_file = "../predict/prediction_combine.json"

In [7]:
# Training ratings table; getData loads it through support.readRenameTable.
agm_train_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/agm_train.json"

In [8]:
# JSON files parsed by getAvg. dealwithNone looks ids up in the parsed objects
# and falls back to their 'UNK' entry, so each file is expected to contain one.
user_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/user_avg.json"
business_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [9]:
def getAvg(avg_file):
    """Parse the UTF-8 JSON document at ``avg_file`` and return the result.

    avg_file - path of a JSON file readable with the built-in ``open``.
    Returns whatever object ``json.load`` produces for that file.
    """
    with open(avg_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [10]:
# Load both average tables into driver memory; they are passed to dealwithNone
# when the final predictions are computed.
user_avg = getAvg(user_avg_file)
business_avg = getAvg(business_avg_file)

In [11]:
# Number of most-similar co-rated businesses predictICF requires (and uses)
# before it returns an item-based prediction.
N_NEIGHBORS_ITEMBASED = 7
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
DEFAULT_OUTPUT = None
# Share of the item-based CF score when decidePrediction blends it with the
# ALS score; the ALS score receives (1 - WEIGHT).
WEIGHT = 0.3

In [12]:
def meanList(l):
    """Return the arithmetic mean of the numbers in ``l``.

    Raises ZeroDivisionError when ``l`` is empty.
    """
    total = sum(l)
    count = len(l)
    return total / count

def averageRating(x):
    """Collapse all stars of one (uid, bid) pair into a single averaged entry.

    x - ((uid, bid), [star, ...])
    Returns (uid, [(bid, mean_star)]); the one-element list lets a later
    reduceByKey concatenate the entries that belong to the same user.
    """
    key, stars = x[0], x[1]
    uid, bid = key[0], key[1]
    return (uid, [(bid, meanList(stars))])

def convertToDict(l):
    """Turn ``[(bid, star), ...]`` into ``{bid: star, ...}``.

    When a bid occurs more than once, the last star listed for it wins.
    """
    return {bid: star for bid, star in l}

In [13]:
def predictICF(corated, target_bid, model, n_neighbors=None):
    """Predict a rating for ``target_bid`` with item-based collaborative filtering.

    corated     - {bid: star, ...}, the ratings of one user, or None when the
                  user is unknown. The dict is NOT modified: it is shared
                  between predictions for the same user, and removing the
                  target from it would make later predictions for that user
                  depend on the order in which records are processed.
    target_bid  - business whose rating is being predicted.
    model       - {(bid1, bid2): similarity, ...} with bid1 < bid2.
    n_neighbors - number of most-similar rated businesses to use; defaults to
                  the module-level N_NEIGHBORS_ITEMBASED.

    Returns the similarity-weighted mean of the user's stars over the
    ``n_neighbors`` most similar businesses, or None when ``corated`` is None,
    fewer than ``n_neighbors`` rated businesses have a similarity in the
    model, or the selected similarities sum to zero.
    """
    if corated is None:
        return None
    if n_neighbors is None:
        n_neighbors = N_NEIGHBORS_ITEMBASED

    candidates = []
    for bid, star in corated.items():
        if bid == target_bid:
            # The user's own rating of the target must never act as a neighbor.
            continue
        # Model keys are stored with the smaller business id first.
        pair = (bid, target_bid) if bid < target_bid else (target_bid, bid)
        w = model.get(pair)
        if w is not None:
            # Pairs without a similarity in the model are simply ignored.
            candidates.append((w, star))

    if len(candidates) < n_neighbors:
        return None

    # Stable sort: ties keep the iteration order of ``corated``.
    candidates.sort(key=lambda c: c[0], reverse=True)

    weighted_sum = 0
    sum_w = 0
    for w, star in candidates[:n_neighbors]:
        weighted_sum += star * w
        sum_w += w
    if sum_w == 0:
        return None
    return weighted_sum / sum_w
#         predict_stars = n / sum_w
#         origin_n = Decimal(str(predict_stars))
#         ans_n = origin_n.quantize(Decimal('0'), rounding=ROUND_HALF_UP)
#         return float(ans_n)

In [14]:
# def getData(sc):
#     train_raw_data = sc.textFile(train_file) \
#         .map(json.loads)
    
#     test_raw_data = sc.textFile(test_file) \
#         .map(json.loads) \
#         .map(lambda x: (x['user_id'], x['business_id'])) \
#         .persist(StorageLevel.MEMORY_AND_DISK)

#     return train_raw_data, test_raw_data

In [15]:
def getData(sc):
    """Build the training and test RDDs.

    sc - active SparkContext.

    Returns (train_raw_data, test_raw_data):
      train_raw_data - RDD parallelized from whatever
                       support.readRenameTable(agm_train_file) returns.
      test_raw_data  - RDD of (user_id, business_id) tuples parsed from the
                       JSON lines of test_file, persisted with MEMORY_AND_DISK.
    """
    train_rows = support.readRenameTable(agm_train_file)
    train_raw_data = sc.parallelize(train_rows)

    parsed_reviews = sc.textFile(test_file).map(json.loads)
    id_pairs = parsed_reviews.map(
        lambda review: (review['user_id'], review['business_id'])
    )
    test_raw_data = id_pairs.persist(StorageLevel.MEMORY_AND_DISK)

    return train_raw_data, test_raw_data

In [16]:
def getModelItembased(sc, model_file):
    """Load the item-based CF similarity table into a dict.

    sc         - active SparkContext.
    model_file - location of a JSON-lines file whose records carry the keys
                 'b1', 'b2' and 'sim'.

    Returns {(b1, b2): sim, ...}, collected from the RDD with collectAsMap.
    """
    records = sc.textFile(model_file).map(json.loads)
    keyed_similarities = records.map(
        lambda rec: ((rec['b1'], rec['b2']), rec['sim'])
    )
    # model - {(bid1, bid2): sim, ...}  ps: bid1 < bid2
    return keyed_similarities.collectAsMap()

In [17]:
def transformTrainDataGroupByUid(train_raw_data):
    """Group the training ratings by user.

    train_raw_data - RDD whose records are indexed as (key, star); the key is
                     converted with tuple() and read by averageRating as
                     (uid, bid).

    Returns {uid: {bid: star, ...}, ...}, where star is the mean of all stars
    recorded for that (uid, bid) pair.
    """
    # ((uid, bid), [star, ...]) - every star of a pair gathered in one list
    stars_per_pair = train_raw_data \
        .map(lambda rec: (tuple(rec[0]), [rec[1]])) \
        .reduceByKey(lambda left, right: left + right)

    # (uid, [(bid, mean_star), ...]) - one averaged entry per rated business
    entries_per_user = stars_per_pair \
        .map(averageRating) \
        .reduceByKey(lambda left, right: left + right)

    ratings_per_user = entries_per_user.map(
        lambda kv: (kv[0], convertToDict(kv[1]))
    )
    return ratings_per_user.collectAsMap()

In [18]:
# def predictItembased(model, u_d, test_raw_data):
#     prediction = test_raw_data.map(lambda x: (x, computeStarsItembased(u_d.get(x[0]), x[1], model)))
#     # RDD - [((uid, bid), star), ...]
#     return prediction

In [19]:
def outputResultToFileItembased(prediction, path=None):
    """Write predictions as JSON lines.

    prediction - iterable of ((user_id, business_id), stars).
    path       - destination file; defaults to the module-level output_file
                 so existing one-argument calls keep their behavior.

    Each record becomes one line of the form
    {"user_id": ..., "business_id": ..., "stars": ...}. An existing file is
    overwritten.
    """
    if path is None:
        path = output_file
    with open(path, 'w', encoding='utf-8') as fp:
        for item in prediction:
            record = {
                'user_id': item[0][0],
                'business_id': item[0][1],
                'stars': item[1]
            }
            fp.write(json.dumps(record) + '\n')

In [20]:
def loadAlsModel(model_file):
    """Rebuild an Als model from the JSON file at ``model_file``.

    The parsed JSON is indexed as [user_feature, product_feature]; both parts
    are handed to Als.setModel on a freshly constructed Als instance, which is
    then returned.
    """
    with open(model_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    user_part, product_part = payload[0], payload[1]

    als_model = Als()
    als_model.setModel(user_part, product_part)
    return als_model

In [21]:
# def predictAls(als_model, test_raw_data):
#     prediction = test_raw_data.map(lambda x: (x, als_model.predict(x[0], x[1]))) \
#         .collect()

In [22]:
# Configure Spark one setting at a time (each setter updates ``conf`` itself),
# then start the context and keep only ERROR-level log output.
conf = SparkConf()
conf.setAppName("project")
conf.setMaster("local[*]")
conf.set("spark.driver.memory", "4g")

sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

In [23]:
# Load the training ratings RDD and the persisted RDD of
# (user_id, business_id) test pairs.
train_raw_data, test_raw_data = getData(sc)

In [24]:
# Load the item-based CF similarities as a dict {(b1, b2): sim}.
itemcf_model = getModelItembased(sc, itemcf_model_file)

In [25]:
# Build the per-user rating lookup {uid: {bid: mean star}} that the
# item-based predictor reads.
u_d = transformTrainDataGroupByUid(train_raw_data)

In [26]:
# Load the ALS model (user and product features) stored in als_model_file.
als_model = loadAlsModel(als_model_file)

In [27]:
# def adjustAlsPrediction(prediction, uid, bid, user_avg, business_avg):
#     if prediction == None:
#         return None
#     u_avg = user_avg.get(uid, user_avg['UNK'])
#     b_avg = business_avg.get(bid, business_avg['UNK'])
#     _avg = (u_avg + b_avg) / 2
#     _stars = prediction + _avg
#     return _stars

In [28]:
def fitRange(x):
    """Clamp ``x`` to the closed interval [0.0, 5.0].

    Values above 5.0 become 5.0, values below 0.0 become 0.0, and anything
    else (including a value that compares neither greater nor smaller, such
    as NaN) is returned unchanged.
    """
    if x > 5.0:
        return 5.0
    if x < 0.0:
        return 0.0
    return x

In [35]:
def decidePrediction(p_cf, p_als, weight):
    """Combine the item-based CF and ALS predictions into one rating.

    p_cf   - item-based CF prediction, or None when unavailable.
    p_als  - ALS prediction, or None when unavailable.
    weight - share given to p_cf when both predictions exist; p_als then
             receives (1 - weight).

    Returns None when both inputs are None. Otherwise returns the weighted
    blend (both present) or the single available prediction, passed through
    fitRange.
    """
    # Identity checks: a prediction of 0.0 is a real value, only None means
    # "no prediction".
    if p_cf is None and p_als is None:
        return None

    if p_cf is None:
        res = p_als
    elif p_als is None:
        res = p_cf
    else:
        res = p_cf * weight + p_als * (1 - weight)
    return fitRange(res)

In [36]:
def getMean(v1, v2):
    """Return the arithmetic mean of ``v1`` and ``v2``."""
    total = v1 + v2
    return total / 2

In [37]:
def dealwithNone(v, uid, bid, user_avg, business_avg):
    """Replace a missing prediction with the business's average rating.

    v            - prediction, or None when no model produced one.
    uid          - user id (kept for interface compatibility; not used).
    bid          - business id used for the fallback lookup.
    user_avg     - user average table (kept for interface compatibility;
                   not used).
    business_avg - {bid: average, ..., 'UNK': default average}.

    Returns ``v`` unchanged when it is not None; otherwise
    business_avg[bid], or business_avg['UNK'] for an unknown business.
    Raises KeyError if a fallback is needed and business_avg has no 'UNK'
    entry.

    NOTE(review): the previous version also computed the mean of the user
    and business averages but discarded it and returned the business
    average. The returned value is kept as it was; confirm whether the
    blended mean was the intended fallback.
    """
    if v is not None:
        return v
    return business_avg.get(bid, business_avg['UNK'])

In [38]:
# Score every test pair with both models, blend the two scores, then replace
# any score that is still None with a fallback average.
# model_scores   - ((uid, bid), (item-based CF score, ALS score))
model_scores = test_raw_data.map(
    lambda pair: (pair, (predictICF(u_d.get(pair[0]), pair[1], itemcf_model),
                         als_model.predict(pair[0], pair[1])))
)
# blended_scores - ((uid, bid), blended score or None)
blended_scores = model_scores.map(
    lambda rec: (rec[0], decidePrediction(rec[1][0], rec[1][1], WEIGHT))
)
# filled_scores  - ((uid, bid), final score)
filled_scores = blended_scores.map(
    lambda rec: (rec[0], dealwithNone(rec[1], rec[0][0], rec[0][1], user_avg, business_avg))
)
prediction = filled_scores.collect()

In [39]:
# Echo the collected list of ((user_id, business_id), stars).
# NOTE(review): this displays every prediction; a slice such as
# prediction[:10] would keep the saved notebook small.
prediction

[(('1JEXL5K6VTx01tAs6Jskkg', 'M30I1NPl5JuHthxo1IXPGg'), 3.9069896769530033),
 (('ocC1kDdcWg6hVaTDJssM-Q', 'nUpz0YiBsOK7ff9k3vUJ3A'), 3.499354635616214),
 (('3kKfcfYKpNjQAOhhB5l7Vw', '9-cVW8wSRoE6Nmnc7v0uUA'), 2.733933882833526),
 (('5jZCm98_09LCqJL7WULqgg', '2sx52lDoiEtef7xgPCaoBw'), 4.7628614105616975),
 (('jZN2u4UF_i9cPFicdqF9dg', 'jKmAswXvFVRHN4VP-88zOA'), 5.0),
 (('gUJE1jwzhCuYIYjyXHdqoQ', 'lF4pEu4_55SSFEo6Q58ftQ'), 4.380134038089523),
 (('Aov5VTeaEv-DWGNaT2iEvQ', 'piGQNN6ECbSC0agHhvoVeg'), 4.3289160785513445),
 (('xGfzdTImX2pWnGum0Io8Zw', 'dEAk-gE-5Q95a7p91gNn8A'), 4.311102865187877),
 (('YM053Px-v4iuuYL4F6iIIg', '03e_aZlpTBiIva4SlBE1nw'), 2.97172175467972),
 (('43ifr6xOE2zpnBrqHS-E2A', 'iocG2xLxxHDmjsnGz9oikQ'), 3.651996576367594),
 (('crX_X0iuDcy2LuzKyXExHA', 'mmbXj7CmjGbH-wR3j6cD0w'), 3.1688963244827733),
 (('gK9x80p6d-3c_gpMO5SxJQ', 'S_JZst0IGCads_KhoueMCw'), 4.787484554882724),
 (('u74MmTY_0_qZUTSHzEJ3oQ', 'LSPobCZ-hkxWGpPQH550hw'), 2.79467631453349),
 (('66M62k1x9QWDErms7hgp

In [40]:
# Write the predictions to output_file, one JSON object per line.
outputResultToFileItembased(prediction)