In [1]:
import findspark
findspark.init()

In [2]:
import json
# import os

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel
# from pyspark.sql import SparkSession

In [4]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [5]:
# All inputs and outputs live under one project directory; keeping the
# machine-specific prefix in a single place means only this line has to change
# when the notebook is run somewhere else.
project_dir = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project"

# train_file and checkpoint_file are passed to SparkContext methods further
# down and therefore carry the file:// scheme; model_file is handed to the
# built-in open() and stays a plain path. test_file is only referenced by the
# commented-out evaluation cells at the end of the notebook.
train_file = "file://" + project_dir + "/data/train_review.json"
test_file = "file://" + project_dir + "/data/test_review.json"
model_file = project_dir + "/model/als.json"
checkpoint_file = "file://" + project_dir + "/dev/checkpoint"

In [6]:
# Spark configuration for this notebook: application name "task", master
# "local[*]", and a 4g driver-memory setting.
conf = (
    SparkConf()
    .setAppName("task")
    .setMaster("local[*]")
    .set("spark.driver.memory", "4g")
)
# getOrCreate() returns the context that is already running when this cell is
# executed a second time in the same kernel, whereas calling the SparkContext
# constructor again would raise. Note that an existing context keeps the
# configuration it was started with.
sc = SparkContext.getOrCreate(conf=conf)
# Keep Spark's console logging down to errors only.
sc.setLogLevel("ERROR")

In [31]:
# Highest star rating per (user_id, business_id) pair, collected to the driver
# as a dict keyed by that pair.
# The reduction keeps one running maximum per key rather than concatenating
# every rating into a list and calling max() afterwards; the final map is the
# same.
# NOTE(review): this cell carries execution count 31 although it sits before
# cell 7, and train_data is not read by any later cell shown here — confirm it
# is still needed.
train_data = (
    sc.textFile(train_file)
    .map(json.loads)
    .map(lambda review: ((review['user_id'], review['business_id']), review['stars']))
    .reduceByKey(lambda left, right: max(left, right))
    .collectAsMap()
)

In [7]:
# spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

In [8]:
# Parse every line of the training file as JSON and mark the parsed records
# for persistence (MEMORY_AND_DISK), since the following cells all start from
# this RDD.
raw_data = (
    sc.textFile(train_file)
    .map(json.loads)
    .persist(StorageLevel.MEMORY_AND_DISK)
)

In [9]:
# Ids of users whose number of training reviews is below the threshold,
# collected to the driver as a list.
# NOTE(review): every key that comes out of reduceByKey has been counted at
# least once, so the `< 1` test looks like it can never match — confirm the
# intended cutoff.
lonely_user = (
    raw_data
    .map(lambda review: (review['user_id'], 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] < 1)
    .map(lambda pair: pair[0])
    .collect()
)

In [10]:
# Ids of businesses whose number of training reviews is below the threshold,
# collected to the driver as a list.
# NOTE(review): every key that comes out of reduceByKey has been counted at
# least once, so the `< 1` test looks like it can never match — confirm the
# intended cutoff.
lonely_business = (
    raw_data
    .map(lambda review: (review['business_id'], 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[1] < 1)
    .map(lambda pair: pair[0])
    .collect()
)

In [11]:
# Distinct user ids from the training reviews, as a driver-side list. A user's
# position in this list is later used as that user's integer id.
u_table = (
    raw_data
    .map(lambda review: review['user_id'])
    .distinct()
    .collect()
)

In [12]:
# Distinct business ids from the training reviews, as a driver-side list. A
# business's position in this list is later used as its integer id.
b_table = (
    raw_data
    .map(lambda review: review['business_id'])
    .distinct()
    .collect()
)

In [13]:
# Reverse lookups for the id tables: original string id -> position in
# u_table / b_table. Built with enumerate() so no index arithmetic is needed
# and no loop variables are left behind in the notebook namespace.
u_d = {user_id: index for index, user_id in enumerate(u_table)}
b_d = {business_id: index for index, business_id in enumerate(b_table)}

In [14]:
def meanList(l):
    """Return the arithmetic mean of the numbers in ``l``.

    ``l`` must be non-empty: its length is the divisor, so an empty
    sequence raises ZeroDivisionError.
    """
    total = sum(l)
    count = len(l)
    return total / count

In [15]:
# Build the ALS training set: one Rating per (user, business) pair whose value
# is the mean of all stars recorded for that pair, with the string ids replaced
# by their integer positions from u_d / b_d.
# The exclusion lists are turned into sets first so that each membership test
# in the filter is a hash lookup rather than a scan over a list.
lonely_user_ids = set(lonely_user)
lonely_business_ids = set(lonely_business)
stars_data = (
    raw_data
    .map(lambda review: ((review['user_id'], review['business_id']), [review['stars']]))
    .filter(lambda pair: pair[0][0] not in lonely_user_ids
            and pair[0][1] not in lonely_business_ids)
    # Gather every rating of the same pair into one list before averaging.
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: Rating(u_d[pair[0][0]], b_d[pair[0][1]], meanList(pair[1])))
    .persist(StorageLevel.MEMORY_AND_DISK)
)

In [16]:
# Display the prepared ratings. collect() returns every element of stars_data
# to the driver, so the rendered output grows with the size of the dataset.
stars_data.collect()

[Rating(user=56400, product=7062, rating=5.0),
 Rating(user=60017, product=12653, rating=5.0),
 Rating(user=70635, product=11086, rating=4.0),
 Rating(user=24744, product=2508, rating=2.0),
 Rating(user=5, product=2031, rating=5.0),
 Rating(user=77589, product=3041, rating=1.0),
 Rating(user=45885, product=11600, rating=5.0),
 Rating(user=56405, product=10042, rating=2.0),
 Rating(user=81077, product=9528, rating=5.0),
 Rating(user=35245, product=7067, rating=1.0),
 Rating(user=7029, product=10045, rating=5.0),
 Rating(user=49444, product=7072, rating=3.0),
 Rating(user=45891, product=6545, rating=5.0),
 Rating(user=60023, product=12658, rating=3.0),
 Rating(user=88202, product=6552, rating=3.0),
 Rating(user=35251, product=969, rating=1.0),
 Rating(user=10590, product=9539, rating=5.0),
 Rating(user=56416, product=10052, rating=4.0),
 Rating(user=1, product=4081, rating=3.0),
 Rating(user=3469, product=8522, rating=5.0),
 Rating(user=52926, product=4555, rating=5.0),
 Rating(user=2115

In [17]:
# Tell Spark which directory to use for RDD checkpoints.
sc.setCheckpointDir(checkpoint_file)
# NOTE(review): this assigns an attribute on the imported ALS class; the
# ALS.train call in the next cell passes no checkpoint interval, so it is
# unclear whether this value ever reaches the trainer — confirm.
ALS.checkpointInterval = 2

In [18]:
# Train a rank-1, non-negative ALS factorisation on the prepared ratings
# (70 iterations, regularisation 0.01).
# A fixed seed is passed so that repeated runs do not each start from a fresh
# random initialisation; change or drop it to sample a different start.
modelRDD = ALS.train(
    ratings=stars_data,
    rank=1,
    iterations=70,
    lambda_=0.01,
    nonnegative=True,
    seed=42,
)

In [19]:
def collectAlsModel(modelRDD, u_table, b_table):
    """Collect the trained factors into two plain dictionaries.

    Args:
        modelRDD: object exposing ``userFeatures()`` and ``productFeatures()``,
            each yielding ``(integer id, feature vector)`` pairs.
        u_table: sequence indexed by integer user id, giving the original id.
        b_table: sequence indexed by integer product id, giving the original id.

    Returns:
        ``[user_map, product_map]`` where each map goes from original id to
        the first component of that id's feature vector. Any further
        components of the vector are not kept.
    """
    def first_factor_by_original_id(features, id_table):
        # Translate the integer id back through the table and keep only the
        # leading entry of the feature vector.
        return features \
            .map(lambda row: (id_table[row[0]], list(row[1])[0])) \
            .collectAsMap()

    user_feature = first_factor_by_original_id(modelRDD.userFeatures(), u_table)
    product_feature = first_factor_by_original_id(modelRDD.productFeatures(), b_table)
    return [user_feature, product_feature]

In [20]:
def saveAlsModel(modelRDD, u_table, b_table, model_file):
    """Collect the model's factors and write them to ``model_file`` as JSON.

    The factors are gathered with ``collectAlsModel`` before the file is
    opened; the file is then opened for writing as UTF-8 text and receives
    the JSON encoding of the collected structure.
    """
    feature_maps = collectAlsModel(modelRDD, u_table, b_table)
    with open(model_file, mode='w', encoding='utf-8') as out:
        json.dump(feature_maps, out)

In [21]:
# Write the trained factors, keyed by the original string ids, to model_file.
saveAlsModel(
    modelRDD=modelRDD,
    u_table=u_table,
    b_table=b_table,
    model_file=model_file,
)

In [22]:
# def convertToNum(x, u_table, b_table):
#     # x - (uid, bid)
#     uid = x[0]
#     bid = x[1]
#     try:
#         n_uid = u_table.index(uid)
#         n_bid = b_table.index(bid)
#     except ValueError as e:
#         return (uid, None)
#     return (n_uid, n_bid)

In [23]:
# raw_testing = sc.textFile(test_file) \
#     .map(json.loads) \
#     .map(lambda r: (r['user_id'], r['business_id'])) \
#     .persist(StorageLevel.MEMORY_AND_DISK)

# testing_with_none = raw_testing.map(lambda x: convertToNum(x, u_table, b_table)).persist(StorageLevel.MEMORY_AND_DISK)
# testing_without_none = testing_with_none.filter(lambda x: type(x[0]) != str)
# testing_none = testing_with_none.filter(lambda x: type(x[0]) == str) \
#     .map(lambda x: (x, None)) \
#     .collect()

In [24]:
# test_pre_all = modelRDD.predictAll(testing_without_none).persist(StorageLevel.MEMORY_AND_DISK)

In [25]:
# test_pre = test_pre_all.map(lambda x: ((u_table[x[0]], b_table[x[1]]), x[2])).collect()

In [26]:
# res_dict = {}
# for x in test_pre:
#     res_dict[x[0]] = x[1]
# for x in testing_none:
#     res_dict[x[0]] = x[1]
# # res_dict - {(uid, bid): stars, ...}

In [27]:
# res_dict = {}
# for x in test_pre:
#     v = x[1]
#     if v > 5.0:
#         v = 5.0
#     res_dict[x[0]] = v
# for x in testing_none:
#     res_dict[x[0]] = x[1]
# # res_dict - {(uid, bid): stars, ...}

In [28]:
# prediction = raw_testing.map(lambda x: (x, res_dict[x])).collect()

In [29]:
# def outputResultToFileItembased(prediction):
#     with open(output_file, 'w', encoding='utf-8') as fp:
#         for item in prediction:
#             t = {
#                 'user_id': item[0][0],
#                 'business_id': item[0][1],
#                 'stars': item[1]
#             }
#             fp.write(json.dumps(t))
#             fp.write('\n')

In [30]:
# # output prediction to file
# outputResultToFileItembased(prediction)