In [1]:
from typing import Tuple, List, Any
import os
import json
# from interface import Json
from collections import Counter
from random import sample, seed

# Fix the global `random` seed so the `sample` call inside gen_subset is repeatable
# when the notebook is run top to bottom.
seed(6220)

In [2]:
def gen_subset(
    user_data,
    business_data,
    review_data,
    user_n: int,
    business_n: int,
    review_n: int,
):
    """Extract a denser subset of the user / business / review records.

    To control the sparsity of the subset, records are not drawn by pure random
    sampling. Instead:

    0. keep the top `user_n` users that posted the most reviews;
    1. among the reviews posted by those users, keep the top `business_n`
       businesses that received the most reviews;
    2. among all reviews, keep those posted by a kept user to a kept business;
    3. randomly sample min(review_n, number of reviews from step 2) of them.

    Args:
        user_data: iterable of dicts, each with a "user_id" key.
        business_data: iterable of dicts, each with a "business_id" key.
        review_data: re-iterable collection of dicts with "user_id" and
            "business_id" keys (it is traversed several times).
        user_n: maximum number of users to keep.
        business_n: maximum number of businesses to keep.
        review_n: maximum number of reviews to keep.

    Returns:
        A tuple (users, businesses, reviews) of lists. Users and businesses keep
        the order of their input collections; reviews are in the random order
        produced by `random.sample`.
    """

    def most_frequent_ids(id_stream, limit):
        # Rank ids by how often they occur (descending; the sort is stable, so
        # ties keep first-seen order) and return the first `limit` ids as a set.
        ranking = sorted(
            Counter(id_stream).items(), key=lambda pair: pair[1], reverse=True
        )
        return {identifier for identifier, _count in ranking[:limit]}

    # Step 0: the most active reviewers.
    kept_user_ids = most_frequent_ids(
        (entry["user_id"] for entry in review_data), user_n
    )
    kept_users = [entry for entry in user_data if entry["user_id"] in kept_user_ids]

    # Step 1: the businesses those reviewers wrote about most often.
    kept_business_ids = most_frequent_ids(
        (
            entry["business_id"]
            for entry in review_data
            if entry["user_id"] in kept_user_ids
        ),
        business_n,
    )
    kept_businesses = [
        entry for entry in business_data if entry["business_id"] in kept_business_ids
    ]

    # Step 2: reviews that connect a kept user with a kept business.
    matching_reviews = []
    for entry in review_data:
        if (
            entry["user_id"] in kept_user_ids
            and entry["business_id"] in kept_business_ids
        ):
            matching_reviews.append(entry)

    # Step 3: cap the number of reviews by random sampling.
    sample_size = min(review_n, len(matching_reviews))
    kept_reviews = sample(matching_reviews, sample_size)

    return kept_users, kept_businesses, kept_reviews

In [3]:
# Switch to the project directory so the relative dataset paths below resolve.
# NOTE(review): /content/drive/MyDrive looks like a Colab Google Drive mount -
# confirm the drive is mounted before running this cell.
%cd /content/drive/MyDrive/Neural-CF

# Input files (one per dataset), relative to the directory entered above.
user_path = "./Yelp-Dataset/yelp_academic_dataset_user.json"
business_path = "./Yelp-Dataset/yelp_academic_dataset_business.json"
review_path = "./Yelp-Dataset/yelp_academic_dataset_review.json"

/content/drive/MyDrive/Neural-CF


In [4]:
def _read_json_lines(path):
    """Parse a JSON Lines file (one JSON document per line) into a list.

    Args:
        path: path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with the parsed document of every non-blank line, in file order.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file object lazily: readlines() would keep every raw line
        # in memory next to the parsed objects, which matters for files this
        # large. Blank lines (e.g. a trailing newline) are skipped rather than
        # handed to json.loads, which would raise on them.
        return [json.loads(line) for line in f if line.strip()]


user_data = _read_json_lines(user_path)
business_data = _read_json_lines(business_path)
review_data = _read_json_lines(review_path)

In [5]:
# Sanity check: number of user records loaded.
print(len(user_data))

1987897


In [9]:
## Old (inline) method: counts how many businesses, reviews and users remain
## when keeping the top `subsetSize` users and the top `subsetSize` businesses.
subsetSize = 100000

# Rank users by the number of reviews they posted, most active first.
user_cnt = sorted(
    Counter(review["user_id"] for review in review_data).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
subset_user_id = {uid for uid, _ in user_cnt[:subsetSize]}
subset_user_data = [user for user in user_data if user["user_id"] in subset_user_id]

# Rank businesses by the number of reviews they received from the kept users.
subset_business_id_cnt = sorted(
    Counter(
        review["business_id"]
        for review in review_data
        if review["user_id"] in subset_user_id
    ).items(),
    key=lambda pair: pair[1],
    reverse=True,
)
subset_business_id = {bid for bid, _ in subset_business_id_cnt[:subsetSize]}
subset_business_data = [
    business
    for business in business_data
    if business["business_id"] in subset_business_id
]

# Keep only the reviews that link a kept user to a kept business.
subset_review_data = [
    review
    for review in review_data
    if review["user_id"] in subset_user_id
    and review["business_id"] in subset_business_id
]

# Report the subset sizes: businesses, reviews, users (one per line).
print(
    len(subset_business_data),
    len(subset_review_data),
    len(subset_user_data),
    sep="\n",
)

100000
3025734
100000


In [10]:
# review_n equals the number of reviews that pass both filters (the count printed
# by the previous cell), so no review is dropped here; sampling only shuffles
# their order.
subset_user_data, subset_business_data, subset_review_data = gen_subset(
    user_data,
    business_data,
    review_data,
    user_n=100000,
    business_n=100000,
    review_n=3025734,
)

In [11]:
# Subset sizes, one per line: businesses, reviews, users.
print(
    len(subset_business_data),
    len(subset_review_data),
    len(subset_user_data),
    sep="\n",
)

100000
3025734
100000


In [12]:
# Destination directory for the generated subset files.
output_folder = "./Yelp-Dataset/subset_100k-user_cnt/"
# exist_ok=True makes the creation idempotent and removes the gap between a
# separate existence check and the makedirs call.
os.makedirs(output_folder, exist_ok=True)

def save_data(
    data: Tuple[List[Any], ...],
    data_folder: str,
    filenames=("yelp_academic_dataset_user.json", "yelp_academic_dataset_business.json", "yelp_academic_dataset_review.json"),
):
    """Write each dataset to its own JSON Lines file inside `data_folder`.

    Every record is serialized with `json.dumps` and records are separated by a
    single newline (no trailing newline after the last record).

    Args:
        data: datasets to save; each one is an iterable of JSON-serializable
            records.
        data_folder: existing directory that receives the files.
        filenames: file name for each dataset, matched to `data` by position.
            Pairs are formed with `zip`, so entries beyond the shorter of the
            two sequences are ignored.
    """
    for records, filename in zip(data, filenames):
        target = os.path.join(data_folder, filename)
        # The context manager guarantees the handle is flushed and closed even if
        # serialization fails part-way; the encoding matches the utf-8 readers.
        with open(target, "w", encoding="utf-8") as out:
            # Stream record by record instead of joining everything into one
            # huge string first; the bytes written are the same.
            is_first = True
            for record in records:
                if not is_first:
                    out.write("\n")
                out.write(json.dumps(record))
                is_first = False

# Persist the three subsets under output_folder using short subset_* file names.
save_data(
    (subset_user_data, subset_business_data, subset_review_data),
    output_folder,
    filenames=("subset_user.json", "subset_business.json", "subset_review.json"),
)