In [8]:
import json
import random
import os
from collections import Counter
from typing import Tuple, List, Any

In [5]:
user_path = "/mnt/d/Download/yelp/yelp_academic_dataset_user.json"
business_path = "/mnt/d/Download/yelp/yelp_academic_dataset_business.json"
review_path = "/mnt/d/Download/yelp/yelp_academic_dataset_review.json"

## Make a smaller development set

We randomly select a subset of user and business, then find out all reviews that are written by these users and for these businesses. We use these reviews as our development set.

We choose n users and m businesses to be in our development set. (n and m would be relatively small but still keeps a reasonable number of reviews.)

In [46]:
with open(user_path, 'r', encoding="utf-8") as f:
    user_data = list(map(json.loads, f.readlines()))

In [47]:
with open(business_path, 'r', encoding="utf-8") as f:
    business_data = list(map(json.loads, f.readlines()))

In [48]:
with open(review_path, 'r', encoding="utf-8") as f:
    review_data = list(map(json.loads, f.readlines()))

In [49]:
# # we prioritize restaurants with the most reviews
# business_cnt = Counter([review["business_id"] for review in review_data])
# business_cnt = list(business_cnt.items())
# business_cnt.sort(key=lambda x: x[1], reverse=True)

# we prioritize users with most reviews...
user_cnt = Counter(review["user_id"] for review in review_data)
user_cnt = list(user_cnt.items())
user_cnt.sort(key=lambda x: x[1], reverse=True)

In [54]:
subset_user_id = {user_id for user_id, _ in user_cnt[:100]}
subset_user_data = [user for user in user_data if user["user_id"] in subset_user_id]
subset_review_data = [review for review in review_data if review["user_id"] in subset_user_id]

subset_business_id_cnt = Counter([review["business_id"] for review in subset_review_data])
subset_business_id_cnt = list(subset_business_id_cnt.items())
subset_business_id_cnt.sort(key=lambda x: x[1], reverse=True)

subset_business_id = {business_id for business_id, _ in subset_business_id_cnt[:100]}
subset_business_data = [business for business in business_data if business["business_id"] in subset_business_id]

subset_review_data = [review for review in review_data if review["user_id"] in subset_user_id and review["business_id"] in subset_business_id]
print(len(subset_business_data))
print(len(subset_review_data))
print(len(subset_user_data))

100
1801
100


Write out subsets as dev json files.

In [55]:
output_folder = "/mnt/d/Download/yelp/subset/"

def save_data(
    data: Tuple[List[Any], ...],
    data_folder: str,
    filenames=("yelp_academic_dataset_user.json", "yelp_academic_dataset_business.json", "yelp_academic_dataset_review.json"),
):
    for d, f in zip(data, filenames):
        open(os.path.join(data_folder, f), "w").write("\n".join(map(json.dumps, d)))

save_data((subset_user_data, subset_business_data, subset_review_data), output_folder, filenames=("subset_user.json", "subset_business.json", "subset_review.json"))

Clean up memory space of the original json files. They are gigabytes in size.

In [56]:
del user_data
del business_data
del review_data

## Build training pipeline

We build a memory-based collaborative filtering model, just to make sure the pipeline works. We use the development set to test the pipeline.

In [6]:
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
import numpy as np

In [9]:
def load_data(
    data_folder: str,
    filenames=("yelp_academic_dataset_user.json", "yelp_academic_dataset_business.json", "yelp_academic_dataset_review.json")
) -> Tuple[List[Any], ...]:
    return tuple(map(
        lambda f: list(map(json.loads, open(os.path.join(data_folder, f), "r", encoding="utf-8").readlines())),
        filenames
    ))

(subset_user_data, subset_business_data, subset_review_data) = load_data("/mnt/d/Download/yelp/subset", ("subset_user.json", "subset_business.json", "subset_review.json"))

In [10]:
# split into training and testing
train_len = int(len(subset_review_data) * 0.8)
train_subset_review = subset_review_data[:train_len]
test_subset_review = subset_review_data[train_len:]

In [11]:
user_id_map = {user["user_id"]: i for i, user in enumerate(subset_user_data)}
business_id_map = {business["business_id"]: i for i, business in enumerate(subset_business_data)}

In [12]:
# first step, build a sparse user-business matrix, so that we can scale it up to the whole dataset
matrix = csr_matrix((len(user_id_map), len(business_id_map)), dtype=float)
for review in train_subset_review:
    matrix[user_id_map[review["user_id"]], business_id_map[review["business_id"]]] = review["stars"]

  self._set_intXint(row, col, x.flat[0])


In [13]:
u, s, vt = svds(matrix, k=10)

In [14]:
print(type(u))
print(type(s))
print(type(vt))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [16]:
for review in test_subset_review:
    user_index = user_id_map[review["user_id"]]
    business_index = business_id_map[review["business_id"]]
    prediction = float(u[user_index, :].dot(np.diag(s).dot(vt[:, business_index])))
    print("prediction: {}, actual: {}".format(prediction, review["stars"]))

prediction: 2.2793532787416586, actual: 4.0
prediction: 0.9971262332510351, actual: 2.0
prediction: 2.29548194256379, actual: 3.0
prediction: 0.32417334776352225, actual: 4.0
prediction: 1.8852261795083152, actual: 5.0
prediction: 1.1839549233693598, actual: 5.0
prediction: 1.852341236389862, actual: 5.0
prediction: 0.850113469785263, actual: 4.0
prediction: 0.5363773077513456, actual: 5.0
prediction: 2.1268185984126164, actual: 5.0
prediction: 2.5966672033027316, actual: 5.0
prediction: -0.19436859486841124, actual: 5.0
prediction: 1.852341236389862, actual: 5.0
prediction: 0.12958306908492806, actual: 4.0
prediction: 0.3233847518501618, actual: 5.0
prediction: 0.32417334776352225, actual: 4.0
prediction: 1.805764080769795, actual: 5.0
prediction: 1.852341236389862, actual: 5.0
prediction: 1.679085963912643, actual: 4.0
prediction: 1.0399215149323568, actual: 4.0
prediction: 0.061212363361043465, actual: 5.0
prediction: 0.5544196437638275, actual: 4.0
prediction: 1.5198544509641652, a

In [18]:
print(subset_business_data[0].keys())

dict_keys(['business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours'])
