In [7]:
import findspark
findspark.init()

In [8]:
import json
# import os

In [9]:
from pyspark import SparkConf, SparkContext, StorageLevel
# from pyspark.sql import SparkSession

In [10]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [11]:
# Input/output locations for this notebook.
# train_file is read through Spark (sc.textFile) and checkpoint_file is handed to
# sc.setCheckpointDir, hence the file:// URIs; model_file is opened with Python's
# built-in open(), hence the plain filesystem path.
# NOTE(review): these are absolute paths on one developer's machine — adjust before running elsewhere.
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
test_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/test_review.json"
model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als.json"
checkpoint_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/dev/checkpoint"

In [12]:
# JSON files loaded with getAvg(); adjustStars() looks ids up in the resulting dicts and
# falls back to their 'UNK' entry.
# presumably these hold precomputed per-user / per-business average star ratings — verify
# against whatever job produces them.
user_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/user_avg.json"
business_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [13]:
def getAvg(avg_file):
    """Parse a UTF-8 encoded JSON file and return the decoded object.

    Args:
        avg_file: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to (the rest of this notebook
        treats it as a dict keyed by id).
    """
    with open(avg_file, encoding='utf-8') as handle:
        return json.loads(handle.read())

In [14]:
# Load both lookup tables on the driver; they are later captured by the Spark lambdas
# that call adjustStars().
user_avg = getAvg(user_avg_file)
business_avg = getAvg(business_avg_file)

In [15]:
# Local Spark context using every available core and a 4 GB driver heap.
conf = SparkConf() \
    .setAppName("task") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
# getOrCreate() reuses a context that is already running in this kernel, so re-executing
# this cell no longer fails with "Cannot run multiple SparkContexts at once". On a fresh
# kernel it behaves like SparkContext(conf=conf).
sc = SparkContext.getOrCreate(conf=conf)
# Keep Spark's console logging down to errors only.
sc.setLogLevel("ERROR")

In [16]:
# spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

In [17]:
# One JSON document per input line; the parsed records are persisted (spilling to disk
# when memory is short) because several later cells re-read them.
raw_data = (
    sc.textFile(train_file)
    .map(json.loads)
    .persist(StorageLevel.MEMORY_AND_DISK)
)

In [18]:
# Distinct user ids collected to the driver; a user's position in this list is the
# integer id used for ALS.
u_table = (
    raw_data
    .map(lambda record: record['user_id'])
    .distinct()
    .collect()
)

In [19]:
# Distinct business ids collected to the driver; a business's position in this list is
# the integer id used for ALS.
b_table = (
    raw_data
    .map(lambda record: record['business_id'])
    .distinct()
    .collect()
)

In [20]:
def adjustStars(stars, uid, bid, user_avg, business_avg):
    """Centre a rating on the mean of its user's and business's averages.

    Args:
        stars: The raw rating value.
        uid: User id to look up in ``user_avg``.
        bid: Business id to look up in ``business_avg``.
        user_avg: Mapping of user id to average; must contain an ``'UNK'``
            entry, which is used when ``uid`` is absent.
        business_avg: Mapping of business id to average; must contain an
            ``'UNK'`` entry, which is used when ``bid`` is absent.

    Returns:
        ``stars`` minus the midpoint of the two looked-up averages.

    Raises:
        KeyError: If either mapping has no ``'UNK'`` entry.
    """
    user_default = user_avg['UNK']
    user_mean = user_avg.get(uid, user_default)
    business_default = business_avg['UNK']
    business_mean = business_avg.get(bid, business_default)
    return stars - (user_mean + business_mean) / 2

In [22]:
# Position of every id inside u_table / b_table, built once on the driver.
# Both tables come from distinct(), so each id occurs exactly once and the dict lookup
# yields the same integer that list.index() did, without a linear scan of the whole
# table for every single review.
u_index = {uid: idx for idx, uid in enumerate(u_table)}
b_index = {bid: idx for idx, bid in enumerate(b_table)}

# Training set for ALS:
#   1. keep (user_id, business_id, stars) from each review,
#   2. replace stars by its offset from the user/business average (adjustStars),
#   3. swap the string ids for their integer positions,
#   4. wrap the triple in an mllib Rating.
stars_data = raw_data.map(lambda x: (x['user_id'], x['business_id'], x['stars'])) \
    .map(lambda x: (x[0], x[1], adjustStars(x[2], x[0], x[1], user_avg, business_avg))) \
    .map(lambda x: (u_index[x[0]], b_index[x[1]], x[2])) \
    .map(lambda x: Rating(x[0], x[1], x[2]))

In [23]:
# Tell Spark where to write RDD checkpoints.
sc.setCheckpointDir(checkpoint_file)
# NOTE(review): this only assigns a plain attribute on the Python ALS class, and the
# ALS.train call further down passes no checkpoint-interval argument, so this may have
# no effect on training — TODO confirm against the pyspark.mllib ALS API.
ALS.checkpointInterval = 2

In [24]:
# Fit the matrix-factorization model on the mean-centred ratings.
# rank=1 is relied upon by collectAlsModel(), which keeps only the first latent factor.
# A fixed seed makes the random initialisation — and therefore the saved model —
# repeatable across "Restart & Run All".
modelRDD = ALS.train(ratings=stars_data, rank=1, iterations=10, lambda_=0.01, seed=42)

In [25]:
def collectAlsModel(modelRDD, u_table, b_table):
    """Pull the learned factors to the driver, keyed by the original ids.

    Only the first component of each feature vector is kept.

    Args:
        modelRDD: Trained model exposing ``userFeatures()`` and
            ``productFeatures()``.
        u_table: Sequence mapping integer user index to the original user id.
        b_table: Sequence mapping integer business index to the original
            business id.

    Returns:
        A two-element list ``[user_map, business_map]``; each map goes from
        original id to the first component of that id's feature vector.
    """
    def first_factor_by_id(feature_rdd, id_table):
        # Each row is (integer index, feature vector); translate the index back
        # to its original id and keep the vector's first entry.
        return feature_rdd \
            .map(lambda row: (id_table[row[0]], list(row[1])[0])) \
            .collectAsMap()

    by_user = first_factor_by_id(modelRDD.userFeatures(), u_table)
    by_business = first_factor_by_id(modelRDD.productFeatures(), b_table)
    return [by_user, by_business]

In [26]:
def saveAlsModel(modelRDD, u_table, b_table, model_file):
    """Collect the model's factors and write them to ``model_file`` as JSON.

    Args:
        modelRDD: Trained model passed through to ``collectAlsModel``.
        u_table: Sequence mapping integer user index to the original user id.
        b_table: Sequence mapping integer business index to the original
            business id.
        model_file: Destination path; the file is overwritten.
    """
    # Collect before opening so a failure while collecting leaves any existing file untouched.
    collected = collectAlsModel(modelRDD, u_table, b_table)
    with open(model_file, mode='w', encoding='utf-8') as out:
        json.dump(collected, fp=out)

In [27]:
# Persist the trained factors (keyed by the original user/business ids) to model_file.
saveAlsModel(modelRDD, u_table, b_table, model_file)