In [1]:
import findspark
findspark.init()

In [2]:
import json
# import os

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel
# from pyspark.sql import SparkSession

In [4]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [5]:
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
test_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/test_review.json"
model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als.json"
checkpoint_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/dev/checkpoint"

In [6]:
LONELY_USER_THRESHOLD = 5
LONELY_BUSINESS_THRESHOLD = 5

In [7]:
conf = SparkConf() \
    .setAppName("task") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

In [8]:
# spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

In [9]:
raw_data = sc.textFile(train_file).map(json.loads).persist(StorageLevel.MEMORY_AND_DISK)

In [10]:
u_table = raw_data.map(lambda x: x['user_id']) \
    .distinct() \
    .collect()

In [11]:
b_table = raw_data.map(lambda x: x['business_id']) \
    .distinct() \
    .collect()

In [12]:
rename_data = raw_data.map(lambda x: (u_table.index(x['user_id']), b_table.index(x['business_id']), x['stars'])) \
    .persist(StorageLevel.MEMORY_AND_DISK)
# RDD - [(n_u, n_b, stars), ...]

In [13]:
def meanList(l):
    return sum(l) / len(l)

In [14]:
data_1 = rename_data.map(lambda x: ((x[0], x[1]), [x[2]])) \
    .reduceByKey(lambda x, y: x + y) \
    .mapValues(lambda v: max(v)) \
    .persist(StorageLevel.MEMORY_AND_DISK)
# RDD - [((n_u, n_b), stars), ...]

In [15]:
data_1.collect()

[((63601, 3539), 4.0),
 ((84681, 7551), 2.0),
 ((28261, 2513), 4.0),
 ((60020, 8032), 4.0),
 ((17632, 3542), 5.0),
 ((81076, 5540), 5.0),
 ((17637, 10041), 4.0),
 ((67124, 10590), 5.0),
 ((31718, 11606), 1.0),
 ((7032, 5036), 5.0),
 ((21143, 4077), 1.0),
 ((88203, 5035), 4.0),
 ((18, 8516), 4.0),
 ((45899, 971), 5.0),
 ((14103, 2039), 4.0),
 ((35253, 2523), 2.0),
 ((56418, 972), 5.0),
 ((84693, 7561), 5.0),
 ((45904, 3552), 5.0),
 ((67137, 4087), 5.0),
 ((77601, 7), 3.0),
 ((3474, 3048), 5.0),
 ((63627, 6051), 5.0),
 ((35260, 7572), 4.0),
 ((31740, 8504), 4.0),
 ((31741, 493), 4.0),
 ((77611, 6561), 1.0),
 ((31742, 11112), 5.0),
 ((52936, 3562), 2.0),
 ((14114, 494), 3.0),
 ((88215, 3061), 4.0),
 ((74147, 4567), 1.0),
 ((21163, 3061), 1.0),
 ((81094, 4568), 2.0),
 ((74151, 33), 4.0),
 ((48, 7090), 5.0),
 ((49, 11115), 2.0),
 ((42349, 9079), 5.0),
 ((60049, 11117), 5.0),
 ((74142, 4102), 4.0),
 ((24771, 4103), 4.0),
 ((14128, 996), 3.0),
 ((60, 11630), 2.0),
 ((17670, 6578), 1.0),
 ((88

In [16]:
def getStarsData(data_1, rename_data, lonely_u, longly_b):
    # stars_data_RDD : RDD - [Rating(user=63601, product=3539, rating=4.0), ...]
    lonely_user = rename_data.map(lambda x: (x[0], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < lonely_u) \
        .map(lambda x: x[0]) \
        .collect()
    
    lonely_business = rename_data.map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < longly_b) \
        .map(lambda x: x[0]) \
        .collect()
    
    stars_data_RDD = data_1.filter(lambda x: x[0][0] not in lonely_user and x[0][1] not in lonely_business) \
        .map(lambda x: Rating(x[0][0], x[0][1], x[1])) \
        .persist(StorageLevel.MEMORY_AND_DISK)
    
    return stars_data_RDD

In [17]:
def collectAlsModel(modelRDD, u_table, b_table):
    user_featrue = modelRDD.userFeatures() \
        .map(lambda x: (u_table[x[0]], list(x[1])[0])) \
        .collectAsMap()
    product_feature = modelRDD.productFeatures() \
        .map(lambda x: (b_table[x[0]], list(x[1])[0])) \
        .collectAsMap()
    return [user_featrue, product_feature]

In [18]:
def saveAlsModel(modelRDD, u_table, b_table, model_file):
    model = collectAlsModel(modelRDD, u_table, b_table)
    with open(model_file, 'w', encoding='utf-8') as fp:
        json.dump(model, fp)

In [19]:
def getAlsModel(sc, stars_data_RDD, u_table, b_table, model_file, rank=1, iterations=70, lambda_=0.01, nonnegative=True):
    sc.setCheckpointDir(checkpoint_file)
    ALS.checkpointInterval = 2
    modelRDD = ALS.train(ratings=stars_data_RDD, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
    saveAlsModel(modelRDD, u_table, b_table, model_file)

In [22]:
LONELY_USER_THRESHOLD = list(range(5, 6))
LONELY_BUSINESS_THRESHOLD = list(range(8, 20))

In [23]:
for u in LONELY_USER_THRESHOLD:
    for b in LONELY_BUSINESS_THRESHOLD:
        model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_%d_%d.json" % (u, b)
        stars_data_RDD = getStarsData(data_1, rename_data, u, b)
        getAlsModel(sc, stars_data_RDD, u_table, b_table, model_file)

In [34]:
# model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_5_5.json"

In [32]:
# stars_data_RDD = getStarsData(data_1, rename_data, 5, 5)

In [35]:
# getAlsModel(sc, stars_data_RDD, u_table, b_table, model_file)