In [1]:
import findspark
findspark.init()

In [2]:
import json

In [3]:
from pyspark import SparkConf, SparkContext, StorageLevel

In [4]:
# Input/output locations used by the cells below.
# NOTE(review): the first two are absolute paths on one machine; adjust before running elsewhere.
# Test reviews, read through sc.textFile() in the prediction cells.
test_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/test_review.json"
# Reassigned by later cells before any visible cell reads this value.
model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als.json"
# Reassigned by later cells before any visible cell writes to this path.
output_file = "../predict/prediction_als_none.json"

In [5]:
class Als():
    """Rating predictor backed by one scalar feature per user and per product.

    A prediction is the product of the user's feature and the product's
    feature, limited to the range [0.0, 5.0].
    """

    def __init__(self):
        # Both lookups stay None until setModel() is called.
        self.user_feature = None
        self.product_feature = None

    def setModel(self, user_feature, product_feature):
        """
        Install the feature lookups used by predict().

        user_feature - {uid: float, ...}
        product_feature - {bid: float, ...}
        """
        self.user_feature = user_feature
        self.product_feature = product_feature

    def predict(self, uid, bid):
        """
        Predict the rating for the pair (uid, bid).

        Returns None when either id has no stored feature; otherwise the
        product of the two features, clamped to [0.0, 5.0].
        """
        u_value = self.user_feature.get(uid)
        b_value = self.product_feature.get(bid)
        # Identity test: a stored feature of 0.0 is valid, only a missing entry is None.
        if u_value is None or b_value is None:
            return None
        res = u_value * b_value
        if res > 5.0:
            return 5.0
        if res < 0.0:
            return 0.0
        return res

In [6]:
def loadAlsModel(model_file):
    """
    Read a JSON model file and wrap its contents in an Als instance.

    The decoded JSON is indexed positionally: element 0 is installed as the
    user features and element 1 as the product features.
    """
    with open(model_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    loaded = Als()
    loaded.setModel(payload[0], payload[1])
    return loaded

In [7]:
def outputResultToFileItembased(prediction, output_file):
    """
    Write predictions to output_file as one JSON object per line.

    prediction - iterable of items shaped ((user_id, business_id), stars)
    output_file - path of the file to create or overwrite (UTF-8)
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for entry in prediction:
            ids = entry[0]
            record = {
                'user_id': ids[0],
                'business_id': ids[1],
                'stars': entry[1]
            }
            # One record per line.
            out.write(json.dumps(record) + '\n')

In [8]:
# Local Spark setup: master "local[*]", spark.driver.memory set to 4g.
conf = (
    SparkConf()
    .setAppName("task")
    .setMaster("local[*]")
    .set("spark.driver.memory", "4g")
)
# getOrCreate() reuses an already-running context, so re-running this cell in
# the same kernel does not fail because a SparkContext already exists.
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel("ERROR")

In [9]:
def loadAlsModels(lonely_pairs):
    """
    Load one ALS model per (u, b) pair, in the order the pairs are given.

    Each pair selects the file als_<u>_<b>.json inside the hard-coded model
    directory; the result is a list of the loaded models.
    """
    path_template = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_%d_%d.json"
    return [loadAlsModel(path_template % (u, b)) for u, b in lonely_pairs]

In [10]:
def getPres(uid, bid, als_models):
    """
    Ask every model for its prediction of (uid, bid).

    Models are queried in list order; the returned tuple holds their answers
    in the opposite order (last model's answer first).
    """
    answers = [model.predict(uid, bid) for model in als_models]
    return tuple(answers[::-1])

In [11]:
def decideWhich(vs):
    """
    Return the first element of vs that is not None, or None if there is none.

    vs - iterable of candidate values (an empty iterable yields None)
    """
    # `is not None` keeps falsy but valid values such as 0.0.
    return next((v for v in vs if v is not None), None)

In [45]:
# Diagonal threshold pairs (6, 6) .. (10, 10), passed to loadAlsModels() below.
lonely_pairs = [(threshold, threshold) for threshold in range(6, 11)]

In [46]:
# lonely_pairs = [(i, i) for i in [3, 5, 7, 10]]

In [47]:
# Load one Als model per pair in lonely_pairs, in the same order as the pairs.
als_models = loadAlsModels(lonely_pairs)

In [48]:
# Cascade prediction: query every loaded model for each test pair, keep the
# first non-None answer in getPres() order, and pass whatever is still None
# to dealwithNone().
# NOTE(review): dealwithNone, user_avg and business_avg are defined in cells
# further down this notebook; they must be executed before this cell.
# NOTE(review): the file name is formatted with (5, 10) while lonely_pairs
# starts at 6 — confirm which range is intended.
output_file = "../predict/prediction_als_none_%d_to_%d.json" % (5, 10)
prediction = (
    sc.textFile(test_file)
    .map(json.loads)
    .map(lambda rec: (rec['user_id'], rec['business_id']))
    .map(lambda pair: (pair, getPres(pair[0], pair[1], als_models)))
    .mapValues(decideWhich)
    .map(lambda kv: (kv[0], dealwithNone(kv[1], kv[0][0], kv[0][1], user_avg, business_avg)))
    .collect()
)
outputResultToFileItembased(prediction, output_file)

In [15]:

# Single-model run: predict every test pair with the als_<i>_<i>.json model and
# write the raw predictions (None wherever the model lacks the user or business).
# NOTE(review): `i` is not assigned in any visible cell; it must already exist
# in the kernel for this cell to run.
model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_%d_%d.json" % (i, i)
output_file = "../predict/prediction_als_none_%d_%d.json" % (i, i)
als_model = loadAlsModel(model_file)
prediction = (
    sc.textFile(test_file)
    .map(json.loads)
    .map(lambda rec: (rec['user_id'], rec['business_id']))
    .map(lambda pair: (pair, als_model.predict(pair[0], pair[1])))
    .collect()
)
outputResultToFileItembased(prediction, output_file)

In [5]:
# Scratch check: list.reverse() reverses the list in place (as used in getPres()).
k1 = [1,2,3]
k1.reverse()

In [6]:
# Display the list after the in-place reverse from the previous cell.
k1

[3, 2, 1]

In [25]:
def getAvg(avg_file):
    """
    Return the decoded contents of the JSON file at avg_file (read as UTF-8).
    """
    with open(avg_file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [26]:
# JSON files loaded with getAvg() in the next cell.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
user_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/user_avg.json"
business_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [27]:
# Lookups passed to dealwithNone(); its fallback reads an 'UNK' entry, so the
# files are presumably dicts containing that key — confirm against the data.
user_avg = getAvg(user_avg_file)
business_avg = getAvg(business_avg_file)

In [28]:
def getMean(v1, v2):
    """Return the arithmetic mean of v1 and v2."""
    total = v1 + v2
    return total / 2

In [29]:
def dealwithNone(v, uid, bid, user_avg, business_avg):
    """
    Replace a missing prediction with the business's average rating.

    v - predicted value, or None when no prediction is available
    uid, user_avg - accepted for interface compatibility; the user average
        does not take part in the fallback
    bid - business id looked up in business_avg
    business_avg - {bid: average, ...} with an 'UNK' entry used for unknown ids

    Returns v unchanged when it is not None; otherwise business_avg[bid], or
    business_avg['UNK'] when bid is absent. Raises KeyError only if the
    'UNK' fallback is actually needed and missing.
    """
    # `is not None` keeps a legitimate prediction of 0.0.
    if v is not None:
        return v
    # Look up 'UNK' lazily so a known business never requires that entry.
    if bid in business_avg:
        return business_avg[bid]
    return business_avg['UNK']

In [50]:
# Grid run: one prediction file per (user threshold, business threshold) model.
# User threshold is 5 only; business thresholds run from 8 through 19.
LONELY_USER_THRESHOLD = list(range(5, 6))
LONELY_BUSINESS_THRESHOLD = list(range(8, 20))

for u in LONELY_USER_THRESHOLD:
    for b in LONELY_BUSINESS_THRESHOLD:
        model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_%d_%d.json" % (u, b)
        output_file = "../predict/prediction_als_%d_%d.json" % (u, b)
        als_model = loadAlsModel(model_file)
        # Predict each test pair with this model, then fill None via dealwithNone().
        prediction = (
            sc.textFile(test_file)
            .map(json.loads)
            .map(lambda rec: (rec['user_id'], rec['business_id']))
            .map(lambda pair: (pair, als_model.predict(pair[0], pair[1])))
            .map(lambda kv: (kv[0], dealwithNone(kv[1], kv[0][0], kv[0][1], user_avg, business_avg)))
            .collect()
        )
        outputResultToFileItembased(prediction, output_file)