In [1]:
import numpy as np

In [2]:
import findspark
findspark.init()

In [3]:
import sys
import time
import json
import re

In [4]:
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [5]:
import importlib

In [6]:
import support

In [7]:
importlib.reload(support)

<module 'support' from '/Users/markduan/duan/USC_course/USC_APDS/INF553/project/dev/support.py'>

In [8]:
JS_THRESHOLD = 0.7
AGM_USER_THRESHOLD = 8
AGM_THRESHOLD = 3
UNK = 3.7961611526341503
LONELY_USER_THRESHOLD = 5
LONELY_BUSINESS_THRESHOLD = 8

In [9]:
business_json = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business.json"
train_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/train_review.json"
stopwords_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/extra_data/stopwords"

als_not_lonely_model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_not_lonely.json"
als_lonely_model_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/als_lonely.json"
checkpoint_file = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/project/dev/checkpoint"

u_table_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/u_table.json"
b_table_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/b_table.json"

business_avg_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/data/business_avg.json"

In [10]:
business_jaccard_similarity_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/jaccard.json"
agm_train_file = "/Users/markduan/duan/USC_course/USC_APDS/INF553/project/model/agm_train.json"

In [11]:
u_table = support.readRenameTable(u_table_file)
b_table = support.readRenameTable(b_table_file)

u_d = {}
for i in range(len(u_table)):
    k = u_table[i]
    u_d[k] = i
b_d = {}
for i in range(len(b_table)):
    k = b_table[i]
    b_d[k] = i

In [12]:
business_avg = support.getAvg(business_avg_file)

In [13]:
n_b_avg = {b_d[k]: business_avg[k] for k in business_avg}

In [14]:
conf = SparkConf() \
    .setAppName("task") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
sc = SparkContext(conf=conf)

In [15]:
# get stopwords
stopwords = sc.textFile(stopwords_file).collect()

In [16]:
def processCategories(v, stopwords):
    # v - "Arcades, Arts & Entertainment"
    v = v.lower()
    pattern = r"[a-z]+" # only words
    words_without_punc_num = re.findall(pattern, v)
    words_clean = set([word for word in words_without_punc_num if word not in stopwords])
    return words_clean

In [17]:
b_profile = sc.textFile(business_json) \
    .map(json.loads) \
    .map(lambda x: (x['business_id'], x['categories'])) \
    .map(lambda x: (b_d[x[0]], x[1])) \
    .mapValues(lambda v: processCategories(v, stopwords)) \
    .collectAsMap()

In [18]:
b_list = list(sorted(b_profile.keys()))
b_length = len(b_profile)

In [19]:
def computeJaccardSimilarity(i_set, j_set):
    fenzi = len(i_set.intersection(j_set))
    fenmu = len(i_set.union(j_set))
    return fenzi / fenmu

In [20]:
def getJS(i_b, b_profile, b_list):
    i_set = b_profile[i_b]
    l_ = []
    for j in range(len(b_list)):
        j_b = b_list[j]
        if j_b > i_b:
            j_set = b_profile[j_b]
            sim = computeJaccardSimilarity(i_set, j_set)
            if sim >= JS_THRESHOLD and sim != 0.0:
                new_1 = (i_b, [(j_b, sim)])
                new_2 = (j_b, [(i_b, sim)])
                l_.append(new_1)
                l_.append(new_2)
    return l_

In [21]:
jaccard_sim = sc.parallelize(b_list) \
    .flatMap(lambda x: getJS(x, b_profile, b_list)) \
    .reduceByKey(lambda x, y: x + y) \
    .mapValues(lambda vs: {k: v for k, v in vs}) \
    .collect()

In [22]:
jaccard_sim[:20]

[(0,
  {812: 0.7857142857142857,
   3914: 0.7142857142857143,
   4363: 0.75,
   6702: 0.7333333333333333,
   9054: 0.8571428571428571,
   11338: 0.8461538461538461}),
 (6024,
  {1: 0.75,
   315: 0.8571428571428571,
   987: 0.8571428571428571,
   2997: 0.75,
   3659: 0.8571428571428571,
   3679: 0.8571428571428571,
   4719: 0.8571428571428571,
   5019: 0.75,
   5395: 0.75,
   7286: 0.75,
   11662: 0.8571428571428571,
   12233: 0.8571428571428571}),
 (11048,
  {2: 0.8,
   165: 0.75,
   615: 0.75,
   725: 1.0,
   1121: 0.75,
   1295: 0.75,
   1307: 0.75,
   1535: 0.75,
   1829: 1.0,
   1874: 0.75,
   2796: 0.75,
   3145: 0.75,
   3245: 0.75,
   3557: 1.0,
   3682: 0.75,
   3873: 0.75,
   4091: 0.75,
   4138: 0.8,
   4281: 0.75,
   4568: 0.75,
   4674: 0.75,
   4801: 0.75,
   5023: 0.75,
   5263: 0.75,
   5463: 0.75,
   5661: 0.75,
   5860: 0.75,
   5875: 0.75,
   5939: 0.75,
   5989: 0.75,
   6303: 0.75,
   6306: 0.75,
   6420: 0.75,
   6453: 0.75,
   6947: 0.75,
   7112: 0.75,
   7437: 0

In [23]:
len(jaccard_sim)
# 8266

11509

In [24]:
n = 0
for x in jaccard_sim:
    if len(x[1]) >= AGM_THRESHOLD:
        n += 1

In [25]:
n
# 5429

10006

In [26]:
raw_data = sc.textFile(train_file) \
    .map(json.loads) \
    .persist(StorageLevel.MEMORY_AND_DISK)

In [27]:
data_0 = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \
    .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \
    .map(lambda x: (x[0], [(x[1], x[2])])) \
    .reduceByKey(lambda x, y: x + y) \
    .persist(StorageLevel.MEMORY_AND_DISK)

In [28]:
data_0.collect()

[(82212,
  [(8070, 4.0),
   (10221, 5.0),
   (2187, 3.0),
   (8773, 3.0),
   (12315, 5.0),
   (9881, 3.0),
   (7099, 5.0),
   (4785, 4.0),
   (10067, 4.0),
   (11971, 4.0),
   (1319, 4.0),
   (10422, 5.0),
   (9287, 4.0),
   (1015, 3.0),
   (2393, 3.0),
   (10890, 5.0),
   (3926, 3.0),
   (515, 3.0),
   (12934, 5.0),
   (2598, 5.0),
   (2194, 3.0),
   (6042, 5.0),
   (2696, 4.0),
   (6424, 4.0),
   (7326, 4.0),
   (3302, 3.0),
   (13022, 4.0),
   (4001, 4.0),
   (8582, 4.0),
   (9076, 4.0),
   (8467, 5.0),
   (12495, 3.0),
   (12881, 4.0),
   (10165, 3.0),
   (10929, 3.0),
   (599, 4.0),
   (8142, 5.0),
   (6737, 4.0),
   (9653, 5.0),
   (11040, 4.0),
   (8128, 3.0),
   (2452, 4.0),
   (4279, 5.0),
   (1726, 5.0),
   (10308, 3.0),
   (2411, 4.0),
   (4328, 4.0),
   (12934, 5.0),
   (9084, 5.0),
   (4541, 5.0),
   (1041, 5.0),
   (8215, 4.0),
   (1210, 4.0),
   (12934, 5.0),
   (1973, 2.0),
   (9019, 5.0),
   (3217, 3.0),
   (584, 5.0),
   (1956, 5.0),
   (11944, 4.0),
   (4002, 4.0),
 

In [29]:
def adjustedSim(sim, target, accord, n_b_avg):
    t_avg = n_b_avg.get(target, UNK)
    a_avg = n_b_avg.get(accord, UNK)
    if a_avg > t_avg:
        return sim
    else:
        return 1 / sim

In [30]:
def processValues(vs, jaccard_sim):
    # vs - [(n_b, star), ...]
    # jaccard_sim - [(0, {1:0.7, ...}), ...]
    if len(vs) >= AGM_USER_THRESHOLD or len(vs) < AGM_THRESHOLD:
        return vs
    v_d = {k: v for k, v in vs}
    v_d_keys = set(v_d.keys())
    vs_agm = []
    for x in jaccard_sim:
        target_b = x[0]
        if target_b not in v_d_keys:
            sim_b = x[1]
            sim_b_keys = set(sim_b.keys())
            inter = list(v_d_keys.intersection(sim_b_keys))
            if len(inter) >= AGM_THRESHOLD and len(inter) != 0:
                v_vct = np.array([v_d[k] for k in inter])
                b_vct_fenzi = np.array([adjustedSim(sim_b[k], target_b, k, n_b_avg) for k in inter])
                b_vct = np.array([sim_b[k] for k in inter])
                
                agm_stars = np.dot(v_vct, b_vct_fenzi) / b_vct.sum()
                if agm_stars > 5.0:
                    agm_stars = 5.0
                vs_agm.append((target_b, agm_stars))
    return vs + vs_agm

In [31]:
data_0.mapValues(lambda vs: len(vs)).filter(lambda x: x[1] < AGM_USER_THRESHOLD).count()
# 37902

53828

In [32]:
data_1 = data_0.mapValues(lambda vs: processValues(vs, jaccard_sim)).persist(StorageLevel.MEMORY_AND_DISK)
# RDD - [(n_u, [(n_b, star), ...]), ...]

In [33]:
agm_data = data_1.flatMap(lambda x: [(x[0], b, star) for b, star in x[1]]).persist(StorageLevel.MEMORY_AND_DISK)
# RDD - [(n_u, n_b, star), ...]

In [34]:
agm_data.collect()

[(82212, 8070, 4.0),
 (82212, 10221, 5.0),
 (82212, 2187, 3.0),
 (82212, 8773, 3.0),
 (82212, 12315, 5.0),
 (82212, 9881, 3.0),
 (82212, 7099, 5.0),
 (82212, 4785, 4.0),
 (82212, 10067, 4.0),
 (82212, 11971, 4.0),
 (82212, 1319, 4.0),
 (82212, 10422, 5.0),
 (82212, 9287, 4.0),
 (82212, 1015, 3.0),
 (82212, 2393, 3.0),
 (82212, 10890, 5.0),
 (82212, 3926, 3.0),
 (82212, 515, 3.0),
 (82212, 12934, 5.0),
 (82212, 2598, 5.0),
 (82212, 2194, 3.0),
 (82212, 6042, 5.0),
 (82212, 2696, 4.0),
 (82212, 6424, 4.0),
 (82212, 7326, 4.0),
 (82212, 3302, 3.0),
 (82212, 13022, 4.0),
 (82212, 4001, 4.0),
 (82212, 8582, 4.0),
 (82212, 9076, 4.0),
 (82212, 8467, 5.0),
 (82212, 12495, 3.0),
 (82212, 12881, 4.0),
 (82212, 10165, 3.0),
 (82212, 10929, 3.0),
 (82212, 599, 4.0),
 (82212, 8142, 5.0),
 (82212, 6737, 4.0),
 (82212, 9653, 5.0),
 (82212, 11040, 4.0),
 (82212, 8128, 3.0),
 (82212, 2452, 4.0),
 (82212, 4279, 5.0),
 (82212, 1726, 5.0),
 (82212, 10308, 3.0),
 (82212, 2411, 4.0),
 (82212, 4328, 4.0),
 

In [35]:
agm_data.count() 
# 1083832 1179716 1030200

1056853

In [36]:
agm_train = agm_data.map(lambda x: ((u_table[x[0]], b_table[x[1]]), x[2])).collect()

In [37]:
support.writeDownRenameTable(agm_train, agm_train_file)

In [38]:
lonely_user = agm_data.map(lambda x: (x[0], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_USER_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()
lonely_business = agm_data.map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_BUSINESS_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()

In [39]:
def collectAlsModel(modelRDD, u_table, b_table):
    user_featrue = modelRDD.userFeatures() \
        .map(lambda x: (u_table[x[0]], list(x[1])[0])) \
        .collectAsMap()
    product_feature = modelRDD.productFeatures() \
        .map(lambda x: (b_table[x[0]], list(x[1])[0])) \
        .collectAsMap()
    return [user_featrue, product_feature]

def saveAlsModel(modelRDD, u_table, b_table, model_file):
    model = collectAlsModel(modelRDD, u_table, b_table)
    with open(model_file, 'w', encoding='utf-8') as fp:
        json.dump(model, fp)

In [40]:
stars_data_not_lonely = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \
    .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)

In [41]:
stars_data_lonely = agm_data.filter(lambda x: x[0] in lonely_user or x[1] in lonely_business) \
    .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)

In [42]:
sc.setCheckpointDir(checkpoint_file)
ALS.checkpointInterval = 2

In [43]:
modelRDD_not_lonely = ALS.train(ratings=stars_data_not_lonely, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
saveAlsModel(modelRDD_not_lonely, u_table, b_table, als_not_lonely_model_file)

In [44]:
modelRDD_lonely = ALS.train(ratings=stars_data_lonely, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
saveAlsModel(modelRDD_lonely, u_table, b_table, als_lonely_model_file)

In [45]:
import os

In [53]:
if not os.path.exists('./model/'):
    os.mkdir('./model/')