In [1]:
import numpy as np

In [2]:
import findspark
findspark.init()

In [3]:
import sys
import time
import json
import re

In [4]:
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [5]:
import importlib

In [6]:
import support

In [7]:
importlib.reload(support)

<module 'support' from '/Users/markduan/duan/USC_course/USC_APDS/INF553/project/dev/support.py'>

In [8]:
JS_THRESHOLD = 0.7

UNK = 3.7961611526341503
LONELY_USER_THRESHOLD = 5
LONELY_BUSINESS_THRESHOLD = 8

In [9]:
business_json = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business.json"
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
stopwords_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/extra_data/stopwords"

als_not_lonely_model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_not_lonely.json"
als_lonely_model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_lonely.json"
checkpoint_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/dev/checkpoint"

u_table_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/u_table.json"
b_table_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/b_table.json"

business_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [10]:
business_jaccard_similarity_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/jaccard.json"
agm_train_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/agm_train.json"

In [11]:
u_table = support.readRenameTable(u_table_file)
b_table = support.readRenameTable(b_table_file)

u_d = {u_table[i]: i for i in range(len(u_table))}
b_d = {b_table[i]: i for i in range(len(b_table))}

In [12]:
business_avg = support.getAvg(business_avg_file)

In [13]:
n_b_avg = {b_d[k]: business_avg[k] for k in business_avg}

In [14]:
conf = SparkConf() \
    .setAppName("task") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
sc = SparkContext(conf=conf)

In [15]:
raw_data = sc.textFile(train_file) \
    .map(json.loads) \
    .persist(StorageLevel.MEMORY_AND_DISK)

In [16]:
b_profile = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \
    .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \
    .map(lambda x: (x[1], [(x[0], x[2])])) \
    .reduceByKey(lambda x, y: x + y) \
    .mapValues(lambda vs: {k: v for k, v in vs}) \
    .collectAsMap()

In [17]:
b_list = list(sorted(b_profile.keys()))
b_length = len(b_profile)

In [18]:
COSINE_CORATED_THRESHOLD = 5
def computeCosineSimilarity(i_dict, j_dict):
    i_keys = set(i_dict.keys())
    j_keys = set(j_dict.keys())
    inter = list(i_keys.intersection(j_keys))
    if len(inter) == 0 or len(inter) < COSINE_CORATED_THRESHOLD:
        return 0.0
    vct_i = np.array([i_dict[k] for k in inter])
    vct_j = np.array([j_dict[k] for k in inter])
    cs = np.dot(vct_i, vct_j) / (np.linalg.norm(vct_i) * np.linalg.norm(vct_j))
    return cs

In [19]:
COSINE_THRESHOLD = 0.8
def getCosSim(i_b, b_profile, b_list):
    i_dict = b_profile[i_b]
    l_ = []
    for j in range(len(b_list)):
        j_b = b_list[j]
        if j_b > i_b:
            j_dict = b_profile[j_b]
            sim = computeCosineSimilarity(i_dict, j_dict)
            if sim >= COSINE_THRESHOLD and sim != 0.0:
                new_1 = (i_b, [(j_b, sim)])
                new_2 = (j_b, [(i_b, sim)])
                l_.append(new_1)
                l_.append(new_2)
    return l_

In [20]:
cos_sim = sc.parallelize(b_list) \
    .flatMap(lambda x: getCosSim(x, b_profile, b_list)) \
    .reduceByKey(lambda x, y: x + y) \
    .mapValues(lambda vs: {k: v for k, v in vs}) \
    .collect()

In [21]:
cos_sim[:20]

[(10008,
  {2: 0.9237604307034011,
   15: 0.9206396184375115,
   16: 0.8000938831904313,
   20: 0.9201355408734887,
   21: 0.9833081801596844,
   38: 0.9827076298239908,
   53: 0.9764334937469208,
   65: 0.9228870083100785,
   87: 0.8737510287347613,
   99: 0.9758381846400692,
   101: 0.9444002816030351,
   114: 0.9576590948053578,
   120: 0.9733551979230721,
   138: 0.9191450300180578,
   151: 0.9914601339836673,
   165: 0.9688682772728905,
   170: 0.9252847888866454,
   177: 0.9610479662344115,
   179: 0.8746479842897222,
   180: 0.9930739280865575,
   181: 0.9889499370655616,
   185: 0.8506730149201002,
   188: 0.9830921920842036,
   198: 0.9232314354605025,
   204: 0.9588232551863847,
   239: 0.9438438449245833,
   252: 0.8598227289184035,
   259: 0.921745835149465,
   260: 0.9033398372969141,
   261: 0.9749005254295223,
   267: 0.9497111242436802,
   273: 0.982018325917156,
   274: 0.8980860319837043,
   279: 0.9941772773664709,
   280: 0.9897859062034421,
   285: 0.98396917080112

In [22]:
len(cos_sim)
# 8266 11097

7615

In [24]:
AGM_THRESHOLD = 1
n = 0
for x in cos_sim:
    if len(x[1]) >= AGM_THRESHOLD:
        n += 1

In [25]:
n
# 5429 9858

7615

In [26]:
data_0 = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \
    .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \
    .map(lambda x: (x[0], [(x[1], x[2])])) \
    .reduceByKey(lambda x, y: x + y) \
    .persist(StorageLevel.MEMORY_AND_DISK)

In [27]:
data_0.collect()

[(82212,
  [(8070, 4.0),
   (10221, 5.0),
   (2187, 3.0),
   (8773, 3.0),
   (12315, 5.0),
   (9881, 3.0),
   (7099, 5.0),
   (4785, 4.0),
   (10067, 4.0),
   (11971, 4.0),
   (1319, 4.0),
   (10422, 5.0),
   (9287, 4.0),
   (1015, 3.0),
   (2393, 3.0),
   (10890, 5.0),
   (3926, 3.0),
   (515, 3.0),
   (12934, 5.0),
   (2598, 5.0),
   (2194, 3.0),
   (6042, 5.0),
   (2696, 4.0),
   (6424, 4.0),
   (7326, 4.0),
   (3302, 3.0),
   (13022, 4.0),
   (4001, 4.0),
   (8582, 4.0),
   (9076, 4.0),
   (8467, 5.0),
   (12495, 3.0),
   (12881, 4.0),
   (10165, 3.0),
   (10929, 3.0),
   (599, 4.0),
   (8142, 5.0),
   (6737, 4.0),
   (9653, 5.0),
   (11040, 4.0),
   (8128, 3.0),
   (2452, 4.0),
   (4279, 5.0),
   (1726, 5.0),
   (10308, 3.0),
   (2411, 4.0),
   (4328, 4.0),
   (12934, 5.0),
   (9084, 5.0),
   (4541, 5.0),
   (1041, 5.0),
   (8215, 4.0),
   (1210, 4.0),
   (12934, 5.0),
   (1973, 2.0),
   (9019, 5.0),
   (3217, 3.0),
   (584, 5.0),
   (1956, 5.0),
   (11944, 4.0),
   (4002, 4.0),
 

In [28]:
def adjustedSim(sim, target, accord, n_b_avg):
    t_avg = n_b_avg.get(target, UNK)
    a_avg = n_b_avg.get(accord, UNK)
    if a_avg > t_avg:
        return sim
    else:
        return 1 / sim

In [70]:
AGM_CORATED_THRESHOLD = 4
AGM_THRESHOLD = 1
AGM_USER_THRESHOLD = 3
def processValues(vs, cos_sim):
    # vs - [(n_b, star), ...]
    # cos_sim - [(0, {1:0.7, ...}), ...]
    if len(vs) >= AGM_USER_THRESHOLD or len(vs) < AGM_THRESHOLD:
        return vs
    v_d = {k: v for k, v in vs}
    v_d_keys = set(v_d.keys())
    vs_agm = []
    for x in cos_sim:
        target_b = x[0]
        if target_b not in v_d_keys:
            sim_b = x[1]
            sim_b_keys = set(sim_b.keys())
            inter = list(v_d_keys.intersection(sim_b_keys))
            if len(inter) >= AGM_THRESHOLD and len(inter) != 0:
                order_b = [(k, sim_b[k]) for k in inter]
                order_b.sort(key=lambda x: x[1], reverse=True)
                inter = [x[0] for x in order_b[:AGM_CORATED_THRESHOLD]]
                v_vct = np.array([v_d[k] for k in inter])
                b_vct_fenzi = np.array([adjustedSim(sim_b[k], target_b, k, n_b_avg) for k in inter])
                b_vct = np.array([sim_b[k] for k in inter])
                
                agm_stars = np.dot(v_vct, b_vct_fenzi) / b_vct.sum()
                if agm_stars > 5.0:
                    agm_stars = 5.0
                vs_agm.append((target_b, agm_stars))
    return vs + vs_agm

In [71]:
data_0.mapValues(lambda vs: len(vs)).filter(lambda x: x[1] < AGM_USER_THRESHOLD and x[1] >= AGM_THRESHOLD).count()
# 37902

1385

In [72]:
data_1 = data_0.mapValues(lambda vs: processValues(vs, cos_sim)).persist(StorageLevel.MEMORY_AND_DISK)
# RDD - [(n_u, [(n_b, star), ...]), ...]

In [73]:
agm_data = data_1.flatMap(lambda x: [(x[0], b, star) for b, star in x[1]]).persist(StorageLevel.MEMORY_AND_DISK)
# RDD - [(n_u, n_b, star), ...]

In [74]:
agm_data.collect()

[(82212, 8070, 4.0),
 (82212, 10221, 5.0),
 (82212, 2187, 3.0),
 (82212, 8773, 3.0),
 (82212, 12315, 5.0),
 (82212, 9881, 3.0),
 (82212, 7099, 5.0),
 (82212, 4785, 4.0),
 (82212, 10067, 4.0),
 (82212, 11971, 4.0),
 (82212, 1319, 4.0),
 (82212, 10422, 5.0),
 (82212, 9287, 4.0),
 (82212, 1015, 3.0),
 (82212, 2393, 3.0),
 (82212, 10890, 5.0),
 (82212, 3926, 3.0),
 (82212, 515, 3.0),
 (82212, 12934, 5.0),
 (82212, 2598, 5.0),
 (82212, 2194, 3.0),
 (82212, 6042, 5.0),
 (82212, 2696, 4.0),
 (82212, 6424, 4.0),
 (82212, 7326, 4.0),
 (82212, 3302, 3.0),
 (82212, 13022, 4.0),
 (82212, 4001, 4.0),
 (82212, 8582, 4.0),
 (82212, 9076, 4.0),
 (82212, 8467, 5.0),
 (82212, 12495, 3.0),
 (82212, 12881, 4.0),
 (82212, 10165, 3.0),
 (82212, 10929, 3.0),
 (82212, 599, 4.0),
 (82212, 8142, 5.0),
 (82212, 6737, 4.0),
 (82212, 9653, 5.0),
 (82212, 11040, 4.0),
 (82212, 8128, 3.0),
 (82212, 2452, 4.0),
 (82212, 4279, 5.0),
 (82212, 1726, 5.0),
 (82212, 10308, 3.0),
 (82212, 2411, 4.0),
 (82212, 4328, 4.0),
 

In [75]:
agm_data.count() 
# 1083832 1915462

2432287

In [76]:
agm_train = agm_data.map(lambda x: ((u_table[x[0]], b_table[x[1]]), x[2])).collect()

In [77]:
support.writeDownRenameTable(agm_train, agm_train_file)

In [78]:
lonely_user = agm_data.map(lambda x: (x[0], 5)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_USER_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()
lonely_business = agm_data.map(lambda x: (x[1], 5)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_BUSINESS_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()

In [79]:
def collectAlsModel(modelRDD, u_table, b_table):
    user_featrue = modelRDD.userFeatures() \
        .map(lambda x: (u_table[x[0]], list(x[1])[0])) \
        .collectAsMap()
    product_feature = modelRDD.productFeatures() \
        .map(lambda x: (b_table[x[0]], list(x[1])[0])) \
        .collectAsMap()
    return [user_featrue, product_feature]

def saveAlsModel(modelRDD, u_table, b_table, model_file):
    model = collectAlsModel(modelRDD, u_table, b_table)
    with open(model_file, 'w', encoding='utf-8') as fp:
        json.dump(model, fp)

In [80]:
als_model_file = als_not_lonely_model_file
stars_data = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \
    .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)
sc.setCheckpointDir(checkpoint_file)
ALS.checkpointInterval = 2
modelRDD = ALS.train(ratings=stars_data, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
saveAlsModel(modelRDD, u_table, b_table, als_model_file)

In [41]:
stars_data_not_lonely = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \
    .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)

In [42]:
stars_data_lonely = agm_data.filter(lambda x: x[0] in lonely_user or x[1] in lonely_business) \
    .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)

In [43]:
sc.setCheckpointDir(checkpoint_file)
ALS.checkpointInterval = 2

In [44]:
modelRDD_not_lonely = ALS.train(ratings=stars_data_not_lonely, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
saveAlsModel(modelRDD_not_lonely, u_table, b_table, als_not_lonely_model_file)

In [45]:
modelRDD_lonely = ALS.train(ratings=stars_data_lonely, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
saveAlsModel(modelRDD_lonely, u_table, b_table, als_lonely_model_file)