In [1]:
import findspark
findspark.init()

In [2]:
import json

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel

In [4]:
import support

In [43]:
# Single place to point the notebook at a different checkout: every absolute
# path below is derived from this prefix instead of repeating it.
PROJECT_DIR = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project"

# Review files are read with sc.textFile, hence the explicit file:// scheme.
train_file = "file://" + PROJECT_DIR + "/data/train_review.json"
test_file = "file://" + PROJECT_DIR + "/data/test_review.json"
# Defaults only: model_file and output_file are reassigned in the prediction
# cells further down.
model_file = PROJECT_DIR + "/model/als.json"
output_file = "../predict/prediction_als_none.json"

# Alternative saved ALS models, in the format loadAlsModel reads.
als_not_lonely_model_file = PROJECT_DIR + "/model/als_not_lonely.json"
als_lonely_model_file = PROJECT_DIR + "/model/als_lonely.json"

# Table loaded through support.readRenameTable.
agm_train_file = PROJECT_DIR + "/model/agm_train.json"

In [6]:
# Path handed to support.getAvg below to load the per-business fallback values.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
business_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [17]:
def loadAlsModel(model_file):
    """Rebuild a ``support.Als`` model from a saved JSON file.

    The file is parsed as JSON; element 0 of the parsed value is passed to
    ``setModel`` as the user features and element 1 as the product features.

    Args:
        model_file: path of the JSON file, opened as UTF-8 text.

    Returns:
        The ``support.Als`` instance after ``setModel`` has been called on it.
    """
    with open(model_file, 'r', encoding='utf-8') as model_fp:
        saved = json.load(model_fp)
    # Pull both parts out before constructing the model, so a malformed file
    # fails before any model object exists.
    user_part, product_part = saved[0], saved[1]
    restored = support.Als()
    restored.setModel(user_part, product_part)
    return restored

In [79]:
def outputResultToFile(prediction, output_file):
    """Write predictions to ``output_file``, one JSON object per line.

    Args:
        prediction: iterable of ``((user_id, business_id), stars)`` items.
        output_file: destination path; opened for writing as UTF-8, so any
            existing content is replaced.

    Each line has the keys ``user_id``, ``business_id`` and ``stars``.
    """
    with open(output_file, 'w', encoding='utf-8') as out_fp:
        for entry in prediction:
            pair = entry[0]
            record = {
                'user_id': pair[0],
                'business_id': pair[1],
                'stars': entry[1]
            }
            out_fp.write(json.dumps(record) + '\n')

In [9]:
# Spark context for this notebook: app name "task", master "local[*]",
# and 4g requested for the driver.
conf = (
    SparkConf()
    .setAppName("task")
    .setMaster("local[*]")
    .set("spark.driver.memory", "4g")
)
sc = SparkContext(conf=conf)
# Keep Spark's own logging down to errors so cell output stays readable.
sc.setLogLevel("ERROR")

In [10]:
# Loaded via the project-local support module. Used below (dealwithNone) as a
# mapping from business id to a fallback value, with an 'UNK' entry for ids
# that are not in the mapping.
business_avg = support.getAvg(business_avg_file)

In [77]:
# Training ratings as a driver-side dict keyed by (user_id, business_id).
# Each line of train_file is parsed as JSON; the 'stars' values of records
# sharing a key are gathered into one list and reduced with support.meanList.
train_data = (
    sc.textFile(train_file)
    .map(json.loads)
    .map(lambda review: ((review['user_id'], review['business_id']), [review['stars']]))
    .reduceByKey(lambda left, right: left + right)
    .mapValues(lambda stars: support.meanList(stars))
    .collectAsMap()
)

In [69]:
# Load the table stored at agm_train_file through the project-local support
# module. The displayed value below shows entries shaped like
# [[id, id], rating] — presumably [user_id, business_id]; confirm in support.
agm_train_l = support.readRenameTable(agm_train_file)

In [70]:
# Inspect the loaded table.
# NOTE(review): this displays the whole list; a slice such as agm_train_l[:5]
# would keep the notebook output small.
agm_train_l

[[['XCNi6raOHuxmI66Cg2Er2Q', 'Rky0LFjOHUrzypZuOkEhBg'], 4.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', '_pBXtjN43eqMV0XZTz7nmw'], 5.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'PjchgHDQwjepIbg4w-DCyw'], 3.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'azIZRqX_1qbp0_6GKSgSAA'], 3.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'YF4QJ_j-PM7cMZja6K1MwQ'], 5.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'cf6rtw3f05DlnJ7muIAEgA'], 3.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', '3Mx4renubPRnjHUw1n2UkA'], 5.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'CiYLq33nAyghFkUR15pP-Q'], 4.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', '3iu_3Ip1m4QLTutHdRmc4w'], 4.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'VyjyHoBg3KC5BSFRlD0ZPQ'], 4.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'YJ8ljUhLsz6CtT_2ORNFmg'], 4.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', '5LNZ67Yw9RD6nf4_UhXOjw'], 5.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'ECOkEVUodMLUxvI0PMI4gQ'], 4.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'ZZPLE9A6r-Wszc8EdYoVEg'], 3.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'iCQpiavjjPzJ5_3gPD5Ebg'], 3.0],
 [['XCNi6raOHuxmI66Cg2Er2Q', 'NvKNe9DnQavC9GstglcBJQ'], 5.0],
 [['XCNi

In [71]:
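# Disabled alternative: build train_data from the in-memory agm_train_l table
# instead of train_file. Left commented out; the active definition is the
# sc.textFile(train_file) pipeline.
# train_data = sc.parallelize(agm_train_l).map(lambda x: (tuple(x[0]), [x[1]])) \
#     .reduceByKey(lambda x, y: x + y) \
#     .mapValues(lambda vs: support.meanList(vs)) \
#     .collectAsMap()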

In [72]:
def dealwithNone(v, bid, business_avg):
    """Return ``v``, or a per-business fallback when ``v`` is None.

    Args:
        v: the predicted value, or None when no prediction is available.
        bid: business id used as the lookup key in ``business_avg``.
        business_avg: mapping from business id to a fallback value; its
            ``'UNK'`` entry is used for ids that are not in the mapping.

    Returns:
        ``v`` when it is not None (including 0.0); otherwise
        ``business_avg[bid]`` if present, else ``business_avg['UNK']``.

    Raises:
        KeyError: only when a fallback is needed, ``bid`` is not in the
            mapping, and the mapping has no ``'UNK'`` entry.
    """
    if v is not None:
        return v
    # Look 'UNK' up lazily. Passing business_avg['UNK'] as the default of
    # .get() evaluates it on every call, which raises KeyError for a mapping
    # without 'UNK' even when bid itself is present.
    if bid in business_avg:
        return business_avg[bid]
    return business_avg['UNK']

In [73]:
def normalX(x):
    """Clamp a prediction to the closed interval [0.0, 5.0].

    Args:
        x: a numeric prediction, or None when no prediction exists.

    Returns:
        None when ``x`` is None; 5.0 when ``x`` is above 5.0; 0.0 when ``x``
        is below 0.0; otherwise ``x`` unchanged.
    """
    # Identity test: None is a singleton, and ``is`` does not go through a
    # value's __eq__.
    if x is None:
        return None
    # Explicit comparisons rather than min()/max(), so a NaN input is
    # returned unchanged exactly as before.
    if x > 5.0:
        return 5.0
    if x < 0.0:
        return 0.0
    return x

In [74]:
def predictAlsOrTrain(uid, bid, als_model, train_data):
    """Combine the stored training value and the ALS prediction for one pair.

    Args:
        uid: user id.
        bid: business id.
        als_model: object whose ``predict(uid, bid)`` result is used as the
            ALS prediction; a None result is treated as "no prediction".
        train_data: mapping keyed by ``(uid, bid)``; a missing key is treated
            as "no training value".

    Returns:
        ``normalX`` applied to: ``0.3 * training value + 0.7 * ALS value``
        when both exist, whichever one exists when only one does, or None
        when neither exists.
    """
    train_score = train_data.get((uid, bid))
    als_score = als_model.predict(uid, bid)
    # Identity tests against None, so a legitimate 0.0 still counts as a value.
    if train_score is not None and als_score is not None:
        blended = train_score * 0.3 + als_score * 0.7
    elif als_score is not None:
        blended = als_score
    else:
        # Only the training value is available, or neither is (then None).
        blended = train_score
    return normalX(blended)

In [80]:
# Hybrid predictions with no missing values: each test pair is scored by
# predictAlsOrTrain, and any None result is replaced through dealwithNone
# using business_avg.
model_file = als_not_lonely_model_file
output_file = "../predict/prediction_als.json"
als_model = loadAlsModel(model_file)
prediction = (
    sc.textFile(test_file)
    .map(json.loads)
    .map(lambda review: (review['user_id'], review['business_id']))
    .map(lambda pair: (pair, predictAlsOrTrain(pair[0], pair[1], als_model, train_data)))
    .map(lambda scored: (scored[0], dealwithNone(scored[1], scored[0][1], business_avg)))
    .collect()
)
outputResultToFile(prediction, output_file)

In [82]:
# Raw ALS predictions: no blending with train_data and no business-average
# fallback, so whatever als_model.predict returns is written as-is.
# NOTE(review): predictAlsOrTrain treats a None from predict as "missing", so
# this output may contain null stars — confirm that is intended.
# NOTE(review): this writes to "../predict/prediction_als.json", the same path
# as the hybrid-prediction cell; running both leaves only this cell's result.
model_file = als_not_lonely_model_file
output_file = "../predict/prediction_als.json"
als_model = loadAlsModel(model_file)
prediction = (
    sc.textFile(test_file)
    .map(json.loads)
    .map(lambda review: (review['user_id'], review['business_id']))
    .map(lambda pair: (pair, als_model.predict(pair[0], pair[1])))
    .collect()
)
outputResultToFile(prediction, output_file)