In [1]:
# Print the path of the `python` executable that the notebook's shell resolves.
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import os
import json
import pickle
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize

# Load Dataset

In [7]:
# Load the raw training split, flatten every review and re-save it as JSON lines.
# Each input line is parsed as a JSON array; positions 0-3 are used as
# item id, user id, rating and a list of (features, sentence text) pairs.
dir_path = '../Dataset/yelp/split/'
output_dir_path = '../Dataset/yelp/'
# Load train dataset
train_review = []
cnt = 0  # stays 0 if the input file is empty
file_path = os.path.join(dir_path, 'train.txt')
output_file_path = os.path.join(output_dir_path, 'train.json')
cnt_empty_review_train = 0          # reviews whose sentence list is empty
cnt_empty_feature_review_train = 0  # reviews whose sentences carry no features at all
# Explicit UTF-8 so the text is decoded the same way regardless of the OS locale.
with open(file_path, encoding='utf-8') as f, \
        open(output_file_path, 'w', encoding='utf-8') as f_out:
    print("Load file: {}".format(file_path))
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
        item_id = str(line_data[0])
        user_id = str(line_data[1])
        rating = int(line_data[2])
        assert rating == line_data[2]   # float is x.0
        review_feat_sent = line_data[3]
        if len(review_feat_sent) == 0:
            cnt_empty_review_train += 1
            continue
        # Split the (features, sentence) pairs into two parallel lists.
        review_feat_list = [feat_sent[0] for feat_sent in review_feat_sent]
        review_text_list = [feat_sent[1] for feat_sent in review_feat_sent]
        cnt_num_feat = sum(len(sent_feats) for sent_feats in review_feat_list)
        if cnt_num_feat == 0:
            cnt_empty_feature_review_train += 1
            continue
        # The review text is the sentences joined by single spaces.
        review_text = " ".join(review_text_list)
        combined_line_data = {
            'user': user_id,
            'item': item_id,
            'rating': rating,
            'review': review_text,
            'features': review_feat_list
        }
        train_review.append([item_id, user_id, rating, review_text, review_feat_list])
        json.dump(combined_line_data, f_out)
        f_out.write('\n')
print('Finish loading train dataset, totally {0} lines. Among them, {1} lines is saved.'.format(
    cnt, len(train_review)
))
print("Number of lines with empty review: {0}\tNumber of lines with no features: {1}".format(
    cnt_empty_review_train, cnt_empty_feature_review_train
))

Load file: ../Dataset/yelp/split/train.txt
100000 lines loaded.
200000 lines loaded.
300000 lines loaded.
400000 lines loaded.
500000 lines loaded.
600000 lines loaded.
700000 lines loaded.
800000 lines loaded.
Finish loading train dataset, totally 863307 lines. Among them, 697479 lines is saved.
Number of lines with empty review: 165828	Number of lines with no features: 0


In [8]:
# Load test dataset
# Same processing as for the training split: each input line is parsed as a
# JSON array whose positions 0-3 are used as item id, user id, rating and a
# list of (features, sentence text) pairs; kept reviews are re-saved as JSON lines.
test_review = []
cnt = 0  # stays 0 if the input file is empty
file_path = os.path.join(dir_path, 'test.txt')
output_file_path = os.path.join(output_dir_path, 'test.json')
cnt_empty_feature_review_test = 0  # reviews whose sentences carry no features at all
cnt_empty_review_test = 0          # reviews whose sentence list is empty
# Explicit UTF-8 so the text is decoded the same way regardless of the OS locale.
with open(file_path, encoding='utf-8') as f, \
        open(output_file_path, 'w', encoding='utf-8') as f_out:
    print("Load file: {}".format(file_path))
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
        item_id = str(line_data[0])
        user_id = str(line_data[1])
        rating = int(line_data[2])
        assert rating == line_data[2]   # float is x.0
        review_feat_sent = line_data[3]
        if len(review_feat_sent) == 0:
            cnt_empty_review_test += 1
            continue
        # Split the (features, sentence) pairs into two parallel lists.
        review_feat_list = [feat_sent[0] for feat_sent in review_feat_sent]
        review_text_list = [feat_sent[1] for feat_sent in review_feat_sent]
        cnt_num_feat = sum(len(sent_feats) for sent_feats in review_feat_list)
        if cnt_num_feat == 0:
            cnt_empty_feature_review_test += 1
            continue
        # The review text is the sentences joined by single spaces.
        review_text = " ".join(review_text_list)
        combined_line_data = {
            'user': user_id,
            'item': item_id,
            'rating': rating,
            'review': review_text,
            'features': review_feat_list
        }
        test_review.append([item_id, user_id, rating, review_text, review_feat_list])
        json.dump(combined_line_data, f_out)
        f_out.write('\n')

print('Finish loading test dataset, totally {0} lines. Among them, {1} lines is saved.'.format(
    cnt, len(test_review)
))
print("Number of lines with empty review: {0}\tNumber of lines with no features: {1}".format(
    cnt_empty_review_test, cnt_empty_feature_review_test
))

Load file: ../Dataset/yelp/split/test.txt
100000 lines loaded.
200000 lines loaded.
Finish loading test dataset, totally 245664 lines. Among them, 196799 lines is saved.
Number of lines with empty review: 48865	Number of lines with no features: 0


# Check Review with 0 sentences

In [10]:
# Count the reviews that sent_tokenize splits into zero sentences. Such a
# review is expected to be the empty string, which the assert verifies.
cnt = 0
for train_data_instance in train_review:
    review_text = train_data_instance[-2]   # second-to-last field is the review text
    review_sents = sent_tokenize(review_text)
    if review_sents:
        continue
    assert review_text == ''
    cnt += 1
print("Number of reviews on train with 0 sentences: {}".format(cnt))

# Same check on the test reviews.
cnt = 0
for test_data_instance in test_review:
    review_text = test_data_instance[-2]
    review_sents = sent_tokenize(review_text)
    if review_sents:
        continue
    assert review_text == ''
    cnt += 1
print("Number of reviews on test with 0 sentences: {}".format(cnt))

Number of reviews on train with 0 sentences: 0
Number of reviews on test with 0 sentences: 0


# Check Duplicate Reviews

In [16]:
# in train set
# Scan the loaded training reviews for (user, item) pairs that occur more than once.
trainset_user_item_set = set()   # every distinct (user_id, item_id) pair seen
duplicate_ui_set_train = set()   # pairs that were seen at least twice
cnt_duplicate_review = 0         # reviews beyond the first one of a pair

for train_data_chunk in train_review:
    item_id, user_id = train_data_chunk[0], train_data_chunk[1]
    assert isinstance(item_id, str)
    assert isinstance(user_id, str)
    ui_pair = (user_id, item_id)
    if ui_pair not in trainset_user_item_set:
        # First time this pair shows up.
        trainset_user_item_set.add(ui_pair)
        continue
    duplicate_ui_set_train.add(ui_pair)
    cnt_duplicate_review += 1

print("[Train] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Train] Number of unique reviews: {}".format(len(trainset_user_item_set)))
print("[Train] Number of duplicate user-item pairs: {}".format(len(duplicate_ui_set_train)))

[Train] Number of duplicate reviews: 29234
[Train] Number of unique reviews: 668245
[Train] Number of duplicate user-item pairs: 25142


In [14]:
# in test set
# Scan the loaded test reviews for (user, item) pairs that occur more than once.
testset_user_item_set = set()   # every distinct (user_id, item_id) pair seen
duplicate_ui_set_test = set()   # pairs that were seen at least twice
cnt_duplicate_review = 0        # reviews beyond the first one of a pair

for test_data_chunk in test_review:
    item_id, user_id = test_data_chunk[0], test_data_chunk[1]
    assert isinstance(item_id, str)
    assert isinstance(user_id, str)
    ui_pair = (user_id, item_id)
    if ui_pair not in testset_user_item_set:
        # First time this pair shows up.
        testset_user_item_set.add(ui_pair)
        continue
    duplicate_ui_set_test.add(ui_pair)
    cnt_duplicate_review += 1

print("[Test] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Test] Number of unique reviews: {}".format(len(testset_user_item_set)))
print("[Test] Number of duplicate user-item pairs: {}".format(len(duplicate_ui_set_test)))

[Test] Number of duplicate reviews: 1993
[Test] Number of unique reviews: 194806
[Test] Number of duplicate user-item pairs: 1932


In [17]:
# Put the loaded reviews into DataFrames. The column order matches the
# per-review lists, which hold [item, user, rating, review, features].
df_train_data = pd.DataFrame(
    data=train_review,
    columns=['item', 'user', 'rating', 'review', 'features'],
)
df_test_data = pd.DataFrame(
    data=test_review,
    columns=['item', 'user', 'rating', 'review', 'features'],
)

In [18]:
# Display the training DataFrame built from the loaded reviews.
df_train_data

Unnamed: 0,item,user,rating,review,features
0,1268,300,2,"the dessert was the best part , but even then ...",[[dessert]]
1,1912,300,3,"when i first came here , i would give this pla...","[[taste, food, chef, restaurants, prices]]"
2,14420,300,5,( parking will be extra difficult on that day ...,[[parking]]
3,720,300,4,the decor of this place is beautiful - i 'm ac...,"[[seats, decor, table], [variety, dishes, food..."
4,2940,300,5,"i would suggest not eating in here you can , t...","[[eating, food], [cakes]]"
...,...,...,...,...,...
697474,7174,14908,2,the food is average and overpriced . atmospher...,"[[food], [atmosphere, buffet]]"
697475,3473,14908,4,pollo con queso is the best . it 's extremely ...,"[[queso], [filling], [salsa, chips, spicy]]"
697476,77,14908,2,the wait is frustrating . the salad i had was ...,"[[wait], [overall, salad]]"
697477,377,14908,1,horrible food . the food was below average . t...,"[[food], [food], [calamari], [fries]]"


In [20]:
# De-duplicate the training reviews: keep exactly one review per (user, item)
# pair, and record the pairs whose duplicate reviews disagree on rating or text.
# Group by multiple columns (user and item).
groupby_user_item = df_train_data.groupby(['user', 'item'])
cnt_duplicate_review = 0
cnt_user_item_pair = 0
filterd_user_item_pair = []   # one single-row DataFrame per (user, item) pair
differ_user_item_pair = []    # full groups whose duplicates are not identical
# One seeded generator shared by every draw, so re-running the cell keeps the
# same review for each duplicated pair instead of a different random one.
dedup_rng = np.random.RandomState(0)
# Iterating the groupby already yields each group's rows, so no per-key
# get_group() lookup is needed.
for key, cur_df_user_item in groupby_user_item:
    cnt_user_item_pair += 1
    if len(cur_df_user_item) == 1:
        # Nothing to de-duplicate for this pair.
        filterd_user_item_pair.append(cur_df_user_item)
        continue
    sampled_cur_df_user_item = cur_df_user_item.sample(n=1, random_state=dedup_rng)
    filterd_user_item_pair.append(sampled_cur_df_user_item)
    rating_list = list(cur_df_user_item['rating'])
    review_text_list = list(cur_df_user_item['review'])
    cnt_duplicate_review += len(cur_df_user_item) - 1
    # The duplicates differ as soon as any rating or review text is not equal
    # to the first one of the group.
    differ_data_flag = any(
        rating != rating_list[0] or text != review_text_list[0]
        for rating, text in zip(rating_list, review_text_list)
    )
    if differ_data_flag:
        differ_user_item_pair.append(cur_df_user_item)
print("Total unique user-item pair: {}".format(cnt_user_item_pair))
print("Total duplicate reviews: {}".format(cnt_duplicate_review))
print("Total duplicate user-item pair with different reviews: {}".format(
    len(differ_user_item_pair))
)
print("Total saved unique user-item reviews: {}".format(
    len(filterd_user_item_pair))
)

Total unique user-item pair: 668245
Total duplicate reviews: 29234
Total duplicate user-item pair with different reviews: 23760
Total saved unique user-item reviews: 668245


In [23]:
# Show the first (user, item) group whose duplicate reviews differ in rating or text.
differ_user_item_pair[0]

Unnamed: 0,item,user,rating,review,features
38373,1002,0,3,taste much better than ayce sushi ! the sushi ...,"[[sushi, taste], [sushi, bite], [service]]"
39302,1002,0,4,price is compatible to other takeout in smalle...,"[[price, takeout, restaurants]]"


In [24]:
# Show the first entry kept by the de-duplication step.
filterd_user_item_pair[0]

Unnamed: 0,item,user,rating,review,features
38534,10000,0,4,recommendations are right on and always fun to...,"[[beer, recommendations], [beers, drink, food]]"


In [25]:
# Stack the kept per-pair frames into one DataFrame with a fresh 0..n-1 index.
combined_filterd_user_item_pair = pd.concat(filterd_user_item_pair, ignore_index=True)

In [26]:
# Display the de-duplicated training DataFrame.
combined_filterd_user_item_pair

Unnamed: 0,item,user,rating,review,features
0,10000,0,4,recommendations are right on and always fun to...,"[[beer, recommendations], [beers, drink, food]]"
1,10019,0,3,just the same pricing but with less item selec...,"[[selections, pricing]]"
2,1002,0,3,taste much better than ayce sushi ! the sushi ...,"[[sushi, taste], [sushi, bite], [service]]"
3,10020,0,3,this is a brand new location of ruelo that rec...,"[[location, plaza], [location, customer, tea, ..."
4,10039,0,4,medium rare is a little dry ... since the stea...,"[[steak, medium], [food, patrons, cheap]]"
...,...,...,...,...,...
668240,4993,9999,5,my two favorites are the carne asada and barba...,"[[juicy, favorites], [combo, taco, tacos, meal..."
668241,704,9999,4,the tacos themselves were a mixed bag . in par...,"[[tacos], [taco], [flavor, vegan], [pork, taco..."
668242,7379,9999,5,everything was great .,[[everything]]
668243,8530,9999,4,our last experience here went pretty well and ...,"[[food], [menu], [wait, chicken], [juicy], [me..."


In [27]:
# Write the filtered result into json
# Each row of the de-duplicated training frame becomes one JSON object on its own line.
output_file_path = os.path.join(output_dir_path, 'train_review_filtered.json')
with open(output_file_path, 'w') as f_out:
    for row_index, row_data in combined_filterd_user_item_pair.iterrows():
        row_data.to_json(f_out)
        f_out.write('\n')

In [28]:
# De-duplicate the test reviews: keep exactly one review per (user, item)
# pair, and record the pairs whose duplicate reviews disagree on rating or text.
# Group by multiple columns (user and item).
groupby_user_item = df_test_data.groupby(['user', 'item'])
cnt_duplicate_review_test = 0
cnt_user_item_pair_test = 0
filterd_user_item_pair_test = []   # one single-row DataFrame per (user, item) pair
differ_user_item_pair_test = []    # full groups whose duplicates are not identical
# One seeded generator shared by every draw, so re-running the cell keeps the
# same review for each duplicated pair instead of a different random one.
dedup_rng_test = np.random.RandomState(0)
# Iterating the groupby already yields each group's rows, so no per-key
# get_group() lookup is needed.
for key, cur_df_user_item in groupby_user_item:
    cnt_user_item_pair_test += 1
    if len(cur_df_user_item) == 1:
        # Nothing to de-duplicate for this pair.
        filterd_user_item_pair_test.append(cur_df_user_item)
        continue
    sampled_cur_df_user_item = cur_df_user_item.sample(n=1, random_state=dedup_rng_test)
    filterd_user_item_pair_test.append(sampled_cur_df_user_item)
    rating_list = list(cur_df_user_item['rating'])
    review_text_list = list(cur_df_user_item['review'])
    cnt_duplicate_review_test += len(cur_df_user_item) - 1
    # The duplicates differ as soon as any rating or review text is not equal
    # to the first one of the group.
    differ_data_flag = any(
        rating != rating_list[0] or text != review_text_list[0]
        for rating, text in zip(rating_list, review_text_list)
    )
    if differ_data_flag:
        differ_user_item_pair_test.append(cur_df_user_item)
print("Total unique user-item pair: {}".format(cnt_user_item_pair_test))
print("Total duplicate reviews: {}".format(cnt_duplicate_review_test))
print("Total duplicate user-item pair with different reviews: {}".format(
    len(differ_user_item_pair_test))
)
print("Total saved unique user-item reviews: {}".format(
    len(filterd_user_item_pair_test))
)

Total unique user-item pair: 194806
Total duplicate reviews: 1993
Total duplicate user-item pair with different reviews: 1766
Total saved unique user-item reviews: 194806


In [29]:
# Stack the kept per-pair test frames into one DataFrame with a fresh 0..n-1 index.
combined_filterd_user_item_pair_test = pd.concat(
    filterd_user_item_pair_test, ignore_index=True)

In [30]:
# Display the de-duplicated test DataFrame.
combined_filterd_user_item_pair_test

Unnamed: 0,item,user,rating,review,features
0,10184,0,3,the pork chop itself is moist and tender .,"[[pork, tender]]"
1,1040,0,4,arrowshoot noodle were chewy and have great bi...,"[[bite, noodle]]"
2,10514,0,4,as there are 5 dim sum chefs working in the ki...,"[[kitchen, everything, chefs], [price], [tea],..."
3,10702,0,3,service is polite and since it was not busy th...,[[service]]
4,10807,0,4,i like how the price still being really reason...,"[[price, food], [spices]]"
...,...,...,...,...,...
194801,4154,9999,3,the main draw to this casinos over the others ...,[[restaurant]]
194802,4565,9999,5,it 's not like normal stouts and the flavor is...,"[[flavor], [taste, staff], [prices]]"
194803,6,9999,3,the food here was very good . the prices were ...,"[[food], [prices, fries, burger], [burgers], [..."
194804,624,9999,5,my two favorite meats for tacos are carne asad...,"[[meats, tacos], [salsa, bar], [salsas, beans,..."


In [31]:
# Write the filtered result into json
# Each row of the de-duplicated test frame becomes one JSON object on its own line.
output_file_path = os.path.join(output_dir_path, 'test_review_filtered.json')
with open(output_file_path, 'w') as f_out:
    for row_index, row_data in combined_filterd_user_item_pair_test.iterrows():
        row_data.to_json(f_out)
        f_out.write('\n')

In [32]:
# get user / item set on the train set
# Building a set straight from each column keeps only the distinct ids.
user_set_train = set(combined_filterd_user_item_pair['user'])
item_set_train = set(combined_filterd_user_item_pair['item'])
print("Number of users on the train set: {}".format(len(user_set_train)))
print("Number of items on the train set: {}".format(len(item_set_train)))

Number of users on the train set: 15639
Number of items on the train set: 21515


In [33]:
# for test-set, remove the user-item pair that appears in train-set
# A test review is written out only if its (user, item) pair is absent from the
# train pairs and both its user and its item occur in the train set.
# Write the filtered result into json
output_file_path = os.path.join(output_dir_path, 'test_review_filtered_clean.json')
cnt_test_useritem_appears_in_train = 0
cnt_test_new_user = 0
cnt_test_new_item = 0
cnt_test_unique_useritem = 0
with open(output_file_path, 'w') as f_out:
    print("Write file: {}".format(output_file_path))
    for row_index, row_data in combined_filterd_user_item_pair_test.iterrows():
        item_id = row_data['item']
        user_id = row_data['user']
        current_user_item_id = (str(user_id), str(item_id))
        if current_user_item_id in trainset_user_item_set:
            # The same pair already has a review in the train set: drop it.
            cnt_test_useritem_appears_in_train += 1
            continue
        is_new_user = user_id not in user_set_train
        is_new_item = item_id not in item_set_train
        if is_new_user:
            cnt_test_new_user += 1
        if is_new_item:
            cnt_test_new_item += 1
        valid_ui_flag = not (is_new_user or is_new_item)
        if valid_ui_flag:
            cnt_test_unique_useritem += 1
            row_data.to_json(f_out)
            f_out.write('\n')
print("Number of user-item pairs appear on test and train: {}".format(
    cnt_test_useritem_appears_in_train))
print("Number of review which user on test but not on train: {}".format(
    cnt_test_new_user))
print("Number of review which item on test but not on train: {}".format(
    cnt_test_new_item))
print("Number of user-item pairs only appear on test: {}".format(cnt_test_unique_useritem))

Write file: ../Dataset/yelp/test_review_filtered_clean.json
Number of user-item pairs appear on test and train: 11147
Number of review which user on test but not on train: 0
Number of review which item on test but not on train: 9
Number of user-item pairs only appear on test: 183650


# Load Clean Train/Test Data

In [34]:
# Load cleaned train
# Read the JSON-lines file back into a list of
# [item, user, rating, review, features] records.
train_clean_file = os.path.join(output_dir_path, 'train_review_filtered.json')
with open(train_clean_file, 'r') as f:
    train_clean_review = [
        [rec['item'], rec['user'], rec['rating'], rec['review'], rec['features']]
        for rec in (json.loads(raw_line) for raw_line in f)
    ]
print("Number of reviews on trainset: {}".format(len(train_clean_review)))

Number of reviews on trainset: 668245


In [35]:
# Wrap the reloaded training records in a DataFrame; the column order matches
# the per-record lists.
df_train_clean_data = pd.DataFrame(
    data=train_clean_review,
    columns=['item', 'user', 'rating', 'review', 'features'],
)

In [36]:
# Display the cleaned training DataFrame reloaded from disk.
df_train_clean_data

Unnamed: 0,item,user,rating,review,features
0,10000,0,4,recommendations are right on and always fun to...,"[[beer, recommendations], [beers, drink, food]]"
1,10019,0,3,just the same pricing but with less item selec...,"[[selections, pricing]]"
2,1002,0,3,taste much better than ayce sushi ! the sushi ...,"[[sushi, taste], [sushi, bite], [service]]"
3,10020,0,3,this is a brand new location of ruelo that rec...,"[[location, plaza], [location, customer, tea, ..."
4,10039,0,4,medium rare is a little dry ... since the stea...,"[[steak, medium], [food, patrons, cheap]]"
...,...,...,...,...,...
668240,4993,9999,5,my two favorites are the carne asada and barba...,"[[juicy, favorites], [combo, taco, tacos, meal..."
668241,704,9999,4,the tacos themselves were a mixed bag . in par...,"[[tacos], [taco], [flavor, vegan], [pork, taco..."
668242,7379,9999,5,everything was great .,[[everything]]
668243,8530,9999,4,our last experience here went pretty well and ...,"[[food], [menu], [wait, chicken], [juicy], [me..."


In [37]:
# Load cleaned test
# Read the JSON-lines file back into a list of
# [item, user, rating, review, features] records.
test_clean_file = os.path.join(output_dir_path, 'test_review_filtered_clean.json')
with open(test_clean_file, 'r') as f:
    test_clean_review = [
        [rec['item'], rec['user'], rec['rating'], rec['review'], rec['features']]
        for rec in (json.loads(raw_line) for raw_line in f)
    ]
print("Number of reviews on testset: {}".format(len(test_clean_review)))

Number of reviews on testset: 183650


In [39]:
# Wrap the reloaded test records in a DataFrame; the column order matches the
# per-record lists.
df_test_clean_data = pd.DataFrame(
    data=test_clean_review,
    columns=['item', 'user', 'rating', 'review', 'features'],
)

In [40]:
# Display the cleaned test DataFrame reloaded from disk.
df_test_clean_data

Unnamed: 0,item,user,rating,review,features
0,10184,0,3,the pork chop itself is moist and tender .,"[[pork, tender]]"
1,1040,0,4,arrowshoot noodle were chewy and have great bi...,"[[bite, noodle]]"
2,10514,0,4,as there are 5 dim sum chefs working in the ki...,"[[kitchen, everything, chefs], [price], [tea],..."
3,10702,0,3,service is polite and since it was not busy th...,[[service]]
4,1081,0,4,"ramen salad - refreshing daily specials , make...","[[salad, ramen], [specials]]"
...,...,...,...,...,...
183645,4154,9999,3,the main draw to this casinos over the others ...,[[restaurant]]
183646,4565,9999,5,it 's not like normal stouts and the flavor is...,"[[flavor], [taste, staff], [prices]]"
183647,6,9999,3,the food here was very good . the prices were ...,"[[food], [prices, fries, burger], [burgers], [..."
183648,624,9999,5,my two favorite meats for tacos are carne asad...,"[[meats, tacos], [salsa, bar], [salsas, beans,..."


# Get User/Item Statistics on Train

## Train - User

In [41]:
# Group the cleaned training reviews by user.
# The key is passed as a plain column name, not a one-element list: with a
# list, newer pandas versions yield 1-tuple keys such as ('42',) when the
# groupby is iterated, which would change the keys of any dict built from it.
groupby_user_train = df_train_clean_data.groupby('user')

In [42]:
# Number of user groups, i.e. distinct users in the cleaned training data.
len(groupby_user_train)

15639

In [43]:
# Number of training reviews per user, as a list (in group order) and as a
# dict keyed by user id.
# size() is indexed by the user id itself, so the dict keys stay plain ids even
# on pandas versions where iterating a groupby built from a one-element list
# yields 1-tuple keys.
user_review_counts = groupby_user_train.size()
user_num_review_list = user_review_counts.tolist()
user_num_review_dict = user_review_counts.to_dict()

In [44]:
# Summary statistics of the per-user review counts.
print("Mean number of review per user: {}".format(np.mean(user_num_review_list)))
print("Min number of review per user: {}".format(np.min(user_num_review_list)))
print("Max number of review per user: {}".format(np.max(user_num_review_list)))

Mean number of review per user: 42.729394462561544
Min number of review per user: 1
Max number of review per user: 1437


In [45]:
# Ten smallest and ten largest per-user review counts: sort once, slice both ends.
user_counts_ascending = sorted(user_num_review_list)
print("Top-10 least numbber of review per user: {}".format(user_counts_ascending[:10]))
print("Top-10 most numbber of review per user: {}".format(user_counts_ascending[-10:]))

Top-10 least numbber of review per user: [1, 1, 1, 1, 1, 1, 1, 2, 2, 2]
Top-10 most numbber of review per user: [607, 651, 714, 763, 796, 871, 901, 958, 1380, 1437]


## Train - Item

In [46]:
# Group the cleaned training reviews by item.
# The key is passed as a plain column name, not a one-element list: with a
# list, newer pandas versions yield 1-tuple keys such as ('42',) when the
# groupby is iterated, which would change the keys of any dict built from it.
groupby_item_train = df_train_clean_data.groupby('item')

In [47]:
# Number of item groups, i.e. distinct items in the cleaned training data.
len(groupby_item_train)

21515

In [48]:
# Number of training reviews per item, as a list (in group order) and as a
# dict keyed by item id.
# size() is indexed by the item id itself, so the dict keys stay plain ids even
# on pandas versions where iterating a groupby built from a one-element list
# yields 1-tuple keys.
item_review_counts = groupby_item_train.size()
item_num_review_list = item_review_counts.tolist()
item_num_review_dict = item_review_counts.to_dict()

In [49]:
# Summary statistics of the per-item review counts.
print("Mean number of review per item: {}".format(np.mean(item_num_review_list)))
print("Min number of review per item: {}".format(np.min(item_num_review_list)))
print("Max number of review per item: {}".format(np.max(item_num_review_list)))

Mean number of review per item: 31.05949337671392
Min number of review per item: 1
Max number of review per item: 688


In [50]:
# Ten smallest and ten largest per-item review counts: sort once, slice both ends.
item_counts_ascending = sorted(item_num_review_list)
print("Top-10 least numbber of review per item: {}".format(item_counts_ascending[:10]))
print("Top-10 most numbber of review per item: {}".format(item_counts_ascending[-10:]))

Top-10 least numbber of review per item: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Top-10 most numbber of review per item: [434, 439, 444, 460, 480, 510, 516, 598, 652, 688]
