In [2]:
!which python

/u/pw7nc/anaconda3/bin/python


In [3]:
import nltk
import re
import json
import pandas as pd
import os
import numpy as np
from nltk.tokenize import sent_tokenize

In [4]:
# Load Data from the original dataset
# dir_path: directory holding the original split files (train.txt / test.txt are read from here).
dir_path = '../Dataset/tripadvisor/ta_output/split/'
# output_dir_path: directory where the converted and filtered JSON files are written.
output_dir_path = '../Dataset/tripadvisor/'

# Load Data

In [7]:
# Load train dataset
# Each input line is a JSON array: [item_id, user_id, {ratings...}, [[features, sentence], ...]].
# Every record is flattened into [item, user, rating, review, feature] and also
# mirrored to train.json as one JSON object per line.
train_review = []
cnt = 0
file_path = os.path.join(dir_path, 'train.txt')
output_file_path = os.path.join(output_dir_path, 'train.json')
# Read explicitly as UTF-8: review text may contain non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
with open(file_path, encoding='utf-8') as f, open(output_file_path, 'w') as f_out:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        item_id = str(line_data[0])
        user_id = str(line_data[1])
        try:
            rating = line_data[2]['overall']
        except KeyError:
            # Skip the record: without this, `rating` would silently keep the
            # previous record's value (or be undefined on the very first line).
            print("No overall score at: ", line_data)
            continue
        review_feat_sent = line_data[3]
        review_text_list = list()
        feature_list = list()
        for feat_sent in review_feat_sent:
            # feat_sent[0] is the list of feature names, feat_sent[1] the sentence text
            review_text_list.append(feat_sent[1])
            assert isinstance(feat_sent[0], list)
            assert isinstance(feat_sent[0][0], str)
            feature_list.append(feat_sent[0])
        review_text = " ".join(review_text_list)
        combined_line_data = {
            'user': user_id,
            'item': item_id,
            'rating': rating,
            'review': review_text,
            'feature': feature_list
        }
        train_review.append([item_id, user_id, rating, review_text, feature_list])
        json.dump(combined_line_data, f_out)
        f_out.write('\n')
        cnt += 1
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))


Load file: ../Dataset/tripadvisor/ta_output/split/train.txt
100000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 261214 lines.


In [8]:
# Load test dataset
# Each input line is a JSON array: [item_id, user_id, {ratings...}, [[features, sentence], ...]].
# Every record is flattened into [item, user, rating, review, feature] and also
# mirrored to test.json as one JSON object per line.
test_review = []
cnt = 0
file_path = os.path.join(dir_path, 'test.txt')
output_file_path = os.path.join(output_dir_path, 'test.json')
# Read explicitly as UTF-8: review text may contain non-ASCII characters and the
# platform default encoding is not guaranteed to be UTF-8.
with open(file_path, encoding='utf-8') as f, open(output_file_path, 'w') as f_out:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        item_id = str(line_data[0])
        user_id = str(line_data[1])
        try:
            rating = line_data[2]['overall']
        except KeyError:
            # Skip the record: without this, `rating` would silently keep the
            # previous record's value (or be undefined on the very first line).
            print("No overall score at: ", line_data)
            continue
        review_feat_sent = line_data[3]
        review_text_list = list()
        feature_list = list()
        for feat_sent in review_feat_sent:
            # feat_sent[0] is the list of feature names, feat_sent[1] the sentence text
            review_text_list.append(feat_sent[1])
            assert isinstance(feat_sent[0], list)
            assert isinstance(feat_sent[0][0], str)
            feature_list.append(feat_sent[0])
        review_text = " ".join(review_text_list)
        combined_line_data = {
            'user': user_id,
            'item': item_id,
            'rating': rating,
            'review': review_text,
            'feature': feature_list
        }
        test_review.append([item_id, user_id, rating, review_text, feature_list])
        json.dump(combined_line_data, f_out)
        f_out.write('\n')
        cnt += 1
        if cnt % 10000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/tripadvisor/ta_output/split/test.txt
10000 lines loaded.
20000 lines loaded.
Finish loading test dataset, totally 26665 lines.


# Check Reviews with 0 Sentences

In [10]:
# Count, per split, the reviews that the sentence tokenizer splits into zero
# sentences; such a review is asserted to be the empty string.
for split_name, split_reviews in (("train", train_review), ("test", test_review)):
    cnt = 0
    for data_instance in split_reviews:
        review_text = data_instance[-2]  # second-to-last field is the review text
        if not sent_tokenize(review_text):
            assert review_text == ''
            cnt += 1
    print("Number of reviews on {} with 0 sentences: {}".format(split_name, cnt))

Number of reviews on train with 0 sentences: 0
Number of reviews on test with 0 sentences: 0


# Check Duplicate Reviews

In [46]:
# in train set
# First occurrence of a (user, item) pair goes into trainset_user_item_set;
# every later occurrence is counted as a duplicate review.
trainset_user_item_set = set()
multiple_review_user_item_set = set()
cnt_duplicate_review = 0

for train_data_chunk in train_review:
    user_item_pair = (str(train_data_chunk[1]), str(train_data_chunk[0]))
    if user_item_pair not in trainset_user_item_set:
        trainset_user_item_set.add(user_item_pair)
        continue
    cnt_duplicate_review += 1
    multiple_review_user_item_set.add(user_item_pair)

print("[Train] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Train] Number of user-item pairs that have multiple reviews: {}".format(
    len(multiple_review_user_item_set)))
print("[Train] Number of unique reviews: {}".format(len(trainset_user_item_set)))

[Train] Number of duplicate reviews: 55619
[Train] Number of user-item pairs that have multiple reviews: 9626
[Train] Number of unique reviews: 205595


In [47]:
# in test set
# First occurrence of a (user, item) pair goes into testset_user_item_set;
# every later occurrence is counted as a duplicate review.
testset_user_item_set = set()
testset_multiple_review_user_item_set = set()
cnt_duplicate_review = 0

for test_data_chunk in test_review:
    user_item_pair = (str(test_data_chunk[1]), str(test_data_chunk[0]))
    if user_item_pair not in testset_user_item_set:
        testset_user_item_set.add(user_item_pair)
        continue
    cnt_duplicate_review += 1
    testset_multiple_review_user_item_set.add(user_item_pair)

print("[Test] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Test] Number of user-item pairs that have multiple reviews: {}".format(
    len(testset_multiple_review_user_item_set)))
print("[Test] Number of unique reviews: {}".format(len(testset_user_item_set)))

[Test] Number of duplicate reviews: 3963
[Test] Number of user-item pairs that have multiple reviews: 1093
[Test] Number of unique reviews: 22702


In [13]:
# Both splits use the same record layout, so the frames share one column list.
review_columns = ['item', 'user', 'rating', 'review', 'feature']
df_train_data = pd.DataFrame(train_review, columns=review_columns)
df_test_data = pd.DataFrame(test_review, columns=review_columns)

In [15]:
df_train_data

Unnamed: 0,item,user,rating,review,feature
0,157,997,3,the best part of the stay was louise cooking w...,"[[location], [location], [value, cleanliness]]"
1,578,997,5,the last concern for most professional travell...,"[[service], [service]]"
2,3713,997,5,"excellent service , highly recommended . great...","[[service], [location], [location], [location]]"
3,1497,997,5,"our flight was delayed , so got into tampa air...","[[location], [rooms, cleanliness], [service], ..."
4,265,997,2,"although staff and service were very good , th...","[[service], [cleanliness], [rooms]]"
...,...,...,...,...,...
261209,4420,3907,5,the lobby and the location of the business cen...,"[[location], [value], [location]]"
261210,1506,3907,3,fantastic location if visiting the disneyland ...,[[location]]
261211,608,3907,1,offered food that was literally non eatable . ...,"[[service], [value], [service], [value], [value]]"
261212,1242,3907,4,overall i had a very good 3 night stay in this...,[[location]]


In [14]:
df_test_data

Unnamed: 0,item,user,rating,review,feature
0,52,997,3,"the price ( with the fees ) is still okay , bu...",[[value]]
1,636,997,5,they have the food bars organized so you can e...,[[service]]
2,1379,997,1,i fell from the shower onto the floor as there...,[[rooms]]
3,645,997,3,talked to sara at main desk twice about this s...,[[service]]
4,1606,997,5,the hotel was very clean and nice and the staf...,"[[service], [value], [rooms, cleanliness], [ro..."
...,...,...,...,...,...
26660,1409,4612,4,very very nice hotel - need a free continental...,[[service]]
26661,1111,4880,5,no complaints food was good cocktails were goo...,"[[service, value], [rooms], [location, service..."
26662,1134,3907,4,i nice hotel that would have gotten 5 stars if...,"[[value], [value], [location], [rooms, location]]"
26663,504,3907,5,"an excellent hotel , rooms are very nice . we ...","[[rooms], [service, location], [value], [rooms]]"


In [16]:
# Deduplicate the train set: keep exactly one review per (user, item) pair, and
# check whether duplicate reviews of a pair share the same rating and review text.
# group by multiple columns (user and item)
DEDUP_SAMPLE_SEED = 42  # fixed seed so the kept duplicate is identical on every run
groupby_user_item = df_train_data.groupby(['user', 'item'])
cnt_duplicate_review = 0
cnt_user_item_pair = 0
filterd_user_item_pair = []   # one single-row (or sampled) frame per pair
differ_user_item_pair = []    # full groups whose duplicates disagree
# Iterating the groupby already yields each group's frame, so no get_group() lookup is needed.
for key, cur_df_user_item in groupby_user_item:
    cnt_user_item_pair += 1
    if len(cur_df_user_item) == 1:
        filterd_user_item_pair.append(cur_df_user_item)
        continue
    # Several reviews for the same pair: keep one of them, chosen reproducibly.
    sampled_cur_df_user_item = cur_df_user_item.sample(
        n=1, random_state=DEDUP_SAMPLE_SEED)
    filterd_user_item_pair.append(sampled_cur_df_user_item)
    rating_list = list(cur_df_user_item['rating'])
    review_text_list = list(cur_df_user_item['review'])
    cnt_duplicate_review += len(cur_df_user_item) - 1
    # The group "differs" when any member's rating or text deviates from the first member's.
    differ_data_flag = (
        any(other_rating != rating_list[0] for other_rating in rating_list)
        or any(other_text != review_text_list[0] for other_text in review_text_list)
    )
    if differ_data_flag:
        differ_user_item_pair.append(cur_df_user_item)
print("Total unique user-item pair: {}".format(cnt_user_item_pair))
print("Total duplicate reviews: {}".format(cnt_duplicate_review))
print("Total duplicate user-item pair with different reviews: {}".format(
    len(differ_user_item_pair))
)
print("Total saved unique user-item reviews: {}".format(
    len(filterd_user_item_pair))
)

Total unique user-item pair: 205595
Total duplicate reviews: 55619
Total duplicate user-item pair with different reviews: 9332
Total saved unique user-item reviews: 205595


In [17]:
differ_user_item_pair[10]

Unnamed: 0,item,user,rating,review,feature
7179,1006,0,2,it is more like a skilled nursing facility tha...,"[[rooms], [rooms], [location]]"
9931,1006,0,4,we got a steal of a deal using priceline for t...,"[[value], [cleanliness], [rooms], [rooms], [se..."
11212,1006,0,3,the downtown is basically a ghost town at nigh...,"[[location], [location]]"
21547,1006,0,1,on the second day housekeeping wanted to clean...,"[[cleanliness, rooms], [service], [service, lo..."
25278,1006,0,2,"valet parking is your only option , ( there is...","[[value], [value]]"
31215,1006,0,2,we arrived early and were told rooms would not...,"[[rooms], [rooms]]"
31955,1006,0,4,"the rooms were beautiful . the decor , as soon...","[[rooms], [rooms, location], [rooms], [rooms],..."
34555,1006,0,1,it seems to be becoming clear that this is a f...,[[cleanliness]]
35066,1006,0,5,i had to cut my trip short and cancel my room ...,"[[location, rooms], [service, value], [service..."
42688,1006,0,5,we pulled up and the valet was quick to help u...,"[[service], [service], [service], [rooms], [ro..."


In [18]:
filterd_user_item_pair[0]

Unnamed: 0,item,user,rating,review,feature
46297,0,0,5,this is our second stay at this hotel ; we sta...,"[[location, rooms], [rooms, cleanliness], [roo..."


In [19]:
# Stack the per-pair frames into one frame with a fresh 0..n-1 index.
combined_filterd_user_item_pair = pd.concat(filterd_user_item_pair, ignore_index=True)

In [20]:
combined_filterd_user_item_pair

Unnamed: 0,item,user,rating,review,feature
0,0,0,5,this is our second stay at this hotel ; we sta...,"[[location, rooms], [rooms, cleanliness], [roo..."
1,1,0,3,"small , cramped rooms , moldy grout in shower ...","[[rooms], [location, service]]"
2,10,0,3,"the room doors would slam when guests leave , ...","[[rooms, value], [service]]"
3,100,0,2,"first , the old style tv was on top of the clo...","[[rooms, location], [value], [service], [locat..."
4,1000,0,5,the food was exceptional - we really enjoyed t...,"[[service, location], [service], [rooms, servi..."
...,...,...,...,...,...
205590,752,999,5,"the rooms are spacious , quiet , and clean . m...","[[rooms], [service], [service], [service], [lo..."
205591,819,999,5,"room was very nice . bed was comfortable , had...","[[rooms], [rooms], [rooms], [rooms], [rooms], ..."
205592,827,999,3,not really the best stay i ever had . room was...,"[[location], [rooms, cleanliness, value]]"
205593,852,999,3,our room was not as nice as i had hoped . the ...,"[[rooms], [cleanliness]]"


In [21]:
# Write the filtered result into json (one JSON object per line)
output_file_path = os.path.join(output_dir_path, 'train_review_filtered.json')
with open(output_file_path, 'w') as f_out:
    for row_idx, row_series in combined_filterd_user_item_pair.iterrows():
        f_out.write(row_series.to_json())
        f_out.write('\n')

In [22]:
# Deduplicate the test set: keep exactly one review per (user, item) pair, and
# check whether duplicate reviews of a pair share the same rating and review text.
# group by multiple columns (user and item)
DEDUP_SAMPLE_SEED = 42  # fixed seed so the kept duplicate is identical on every run
groupby_user_item = df_test_data.groupby(['user', 'item'])
cnt_duplicate_review_test = 0
cnt_user_item_pair_test = 0
filterd_user_item_pair_test = []   # one single-row (or sampled) frame per pair
differ_user_item_pair_test = []    # full groups whose duplicates disagree
# Iterating the groupby already yields each group's frame, so no get_group() lookup is needed.
for key, cur_df_user_item in groupby_user_item:
    cnt_user_item_pair_test += 1
    if len(cur_df_user_item) == 1:
        filterd_user_item_pair_test.append(cur_df_user_item)
        continue
    # Several reviews for the same pair: keep one of them, chosen reproducibly.
    sampled_cur_df_user_item = cur_df_user_item.sample(
        n=1, random_state=DEDUP_SAMPLE_SEED)
    filterd_user_item_pair_test.append(sampled_cur_df_user_item)
    rating_list = list(cur_df_user_item['rating'])
    review_text_list = list(cur_df_user_item['review'])
    cnt_duplicate_review_test += len(cur_df_user_item) - 1
    # The group "differs" when any member's rating or text deviates from the first member's.
    differ_data_flag = (
        any(other_rating != rating_list[0] for other_rating in rating_list)
        or any(other_text != review_text_list[0] for other_text in review_text_list)
    )
    if differ_data_flag:
        differ_user_item_pair_test.append(cur_df_user_item)
print("Total unique user-item pair: {}".format(cnt_user_item_pair_test))
print("Total duplicate reviews: {}".format(cnt_duplicate_review_test))
print("Total duplicate user-item pair with different reviews: {}".format(
    len(differ_user_item_pair_test))
)
print("Total saved unique user-item reviews: {}".format(
    len(filterd_user_item_pair_test))
)

Total unique user-item pair: 22702
Total duplicate reviews: 3963
Total duplicate user-item pair with different reviews: 1090
Total saved unique user-item reviews: 22702


In [23]:
# Stack the per-pair test frames into one frame with a fresh 0..n-1 index.
combined_filterd_user_item_pair_test = pd.concat(
    filterd_user_item_pair_test, ignore_index=True)

In [24]:
combined_filterd_user_item_pair_test

Unnamed: 0,item,user,rating,review,feature
0,0,0,4,we got a room facing disneyland . right out or...,"[[rooms], [rooms], [rooms], [location], [rooms]]"
1,100,0,3,"rooms were of average content , but convenient...","[[location], [service], [location, service]]"
2,1002,0,2,did quite a bit of shopping and eating . the r...,"[[location], [rooms]]"
3,1006,0,2,i only wanted to find a hotel close to the por...,"[[location], [rooms], [rooms], [rooms], [rooms..."
4,1007,0,5,it would be my first choice anytime i 'm in au...,[[value]]
...,...,...,...,...,...
22697,0,999,5,"this was a pleasant place , and with our annua...","[[value], [rooms], [rooms]]"
22698,128,999,5,we enjoyed our stay at the hilton very much ! ...,"[[location], [service], [rooms]]"
22699,429,999,5,from the moment we arrived at the front desk u...,[[service]]
22700,816,999,4,"wifi gratuit , nous n avons pas essayé le brea...",[[service]]


In [25]:
# Write the filtered result into json (one JSON object per line)
output_file_path = os.path.join(output_dir_path, 'test_review_filtered.json')
with open(output_file_path, 'w') as f_out:
    for row_idx, row_series in combined_filterd_user_item_pair_test.iterrows():
        f_out.write(row_series.to_json())
        f_out.write('\n')

In [27]:
# Distinct user ids and item ids present in the deduplicated train set
train_user_ids, train_item_ids = (
    set(combined_filterd_user_item_pair[id_column].unique())
    for id_column in ('user', 'item')
)
print("Number of users on train-set: {}".format(len(train_user_ids)))
print("Number of items on train-set: {}".format(len(train_item_ids)))

Number of users on train-set: 4950
Number of items on train-set: 4493


In [28]:
# Clean the deduplicated test set and write the surviving rows as JSON lines:
#   1. drop every (user, item) pair that also appears in the train set
#   2. drop rows whose user or item never appears in the train set
output_file_path = os.path.join(output_dir_path, 'test_review_filtered_clean.json')
cnt_test_useritem_appears_in_train = 0
cnt_test_useritem_new_in_test = 0
cnt_test_unique_useritem = 0
with open(output_file_path, 'w') as f_out:
    for row_idx, row_series in combined_filterd_user_item_pair_test.iterrows():
        item_id = row_series['item']
        user_id = row_series['user']
        current_user_item_id = (str(user_id), str(item_id))
        if current_user_item_id in trainset_user_item_set:
            # pair already reviewed in train
            cnt_test_useritem_appears_in_train += 1
            continue
        if not (item_id in train_item_ids and user_id in train_user_ids):
            # user or item unseen in train
            cnt_test_useritem_new_in_test += 1
            continue
        cnt_test_unique_useritem += 1
        f_out.write(row_series.to_json())
        f_out.write('\n')
print("Number of user-item pairs appear on test and train: {}".format(
    cnt_test_useritem_appears_in_train))
print("Number of user-item pairs only appear on test: {}".format(cnt_test_unique_useritem))
print("Number of user/item new on test: {}".format(cnt_test_useritem_new_in_test))

Number of user-item pairs appear on test and train: 3243
Number of user-item pairs only appear on test: 19444
Number of user/item new on test: 15


# Load Clean Train/Test Data

In [29]:
# Load cleaned train
# Reads the JSON-lines file back and keeps only item, user, rating and review
# from each record (the 'feature' field is not carried over).
train_clean_file = os.path.join(output_dir_path, 'train_review_filtered.json')
with open(train_clean_file, 'r') as f:
    train_clean_review = [
        [record['item'], record['user'], record['rating'], record['review']]
        for record in map(json.loads, f)
    ]
print("Number of reviews on trainset: {}".format(len(train_clean_review)))

Number of reviews on trainset: 205595


In [30]:
# Tabular view of the cleaned train reviews (four columns, no 'feature').
df_train_clean_data = pd.DataFrame(
    train_clean_review,
    columns=['item', 'user', 'rating', 'review'],
)

In [31]:
df_train_clean_data

Unnamed: 0,item,user,rating,review
0,0,0,5,this is our second stay at this hotel ; we sta...
1,1,0,3,"small , cramped rooms , moldy grout in shower ..."
2,10,0,3,"the room doors would slam when guests leave , ..."
3,100,0,2,"first , the old style tv was on top of the clo..."
4,1000,0,5,the food was exceptional - we really enjoyed t...
...,...,...,...,...
205590,752,999,5,"the rooms are spacious , quiet , and clean . m..."
205591,819,999,5,"room was very nice . bed was comfortable , had..."
205592,827,999,3,not really the best stay i ever had . room was...
205593,852,999,3,our room was not as nice as i had hoped . the ...


In [32]:
# Load cleaned test
# Reads the JSON-lines file back and keeps only item, user, rating and review
# from each record (the 'feature' field is not carried over).
test_clean_file = os.path.join(output_dir_path, 'test_review_filtered_clean.json')
with open(test_clean_file, 'r') as f:
    test_clean_review = [
        [record['item'], record['user'], record['rating'], record['review']]
        for record in map(json.loads, f)
    ]
print("Number of reviews on testset: {}".format(len(test_clean_review)))

Number of reviews on testset: 19444


In [33]:
# Tabular view of the cleaned test reviews (four columns, no 'feature').
df_test_clean_data = pd.DataFrame(
    test_clean_review,
    columns=['item', 'user', 'rating', 'review'],
)

In [34]:
df_test_clean_data

Unnamed: 0,item,user,rating,review
0,1111,0,2,when i mentioned this to the front desk they d...
1,1379,0,3,"the service was good . our room , was not the ..."
2,1391,0,5,we stayed at the signature for four days to ce...
3,1579,0,4,the lake buena vista is a perfect place to sta...
4,1689,0,5,summer ( at the front desk ) was perfect ! she...
...,...,...,...,...
19439,0,999,5,"this was a pleasant place , and with our annua..."
19440,128,999,5,we enjoyed our stay at the hilton very much ! ...
19441,429,999,5,from the moment we arrived at the front desk u...
19442,816,999,4,"wifi gratuit , nous n avons pas essayé le brea..."


In [48]:
# Distinct user ids and item ids present in the cleaned test set
test_user_ids, test_item_ids = (
    set(df_test_clean_data[id_column].unique())
    for id_column in ('user', 'item')
)
print("Number of users on test-set: {}".format(len(test_user_ids)))
print("Number of items on test-set: {}".format(len(test_item_ids)))

Number of users on test-set: 4936
Number of items on test-set: 4121


# Get User/Item Statistics on Train

## Train - User

In [35]:
# Group the cleaned train reviews by user.
# A scalar grouper (not a one-element list) is used so that iterating the
# groupby yields plain user ids as keys; with ['user'], pandas >= 2.0 yields
# 1-tuples such as ('0',), which would change the keys of dicts built from it.
groupby_user_train = df_train_clean_data.groupby('user')

In [36]:
len(groupby_user_train)

4950

In [37]:
# Number of train reviews per user, as a key -> count dict and as a plain list
# of counts in group-iteration order.
user_group_sizes = [(key, len(group)) for key, group in groupby_user_train]
user_num_review_dict = dict(user_group_sizes)
user_num_review_list = [group_size for _key, group_size in user_group_sizes]

In [38]:
# Summary statistics of the per-user review counts.
# numpy is already imported as np in the notebook's import cell, so the
# redundant mid-notebook re-import has been dropped.
print("Number of user: {}".format(len(user_num_review_list)))
for stat_label, stat_func in (("Mean", np.mean), ("Min", np.min), ("Max", np.max)):
    print("{} number of review per user: {}".format(
        stat_label, stat_func(user_num_review_list)
    ))

Number of user: 4950
Mean number of review per user: 41.534343434343434
Min number of review per user: 9
Max number of review per user: 3407


In [39]:
# Ten smallest and ten largest per-user review counts (ascending order).
sorted_user_num_review = sorted(user_num_review_list)
print("Top-10 least numbber of review per user: {}".format(sorted_user_num_review[:10]))
print("Top-10 most numbber of review per user: {}".format(sorted_user_num_review[-10:]))

Top-10 least numbber of review per user: [9, 9, 10, 10, 10, 10, 10, 10, 11, 11]
Top-10 most numbber of review per user: [299, 304, 306, 324, 356, 364, 385, 471, 673, 3407]


## Train - Item

In [40]:
# Group the cleaned train reviews by item.
# A scalar grouper (not a one-element list) is used so that iterating the
# groupby yields plain item ids as keys; with ['item'], pandas >= 2.0 yields
# 1-tuples such as ('0',), which would change the keys of dicts built from it.
groupby_item_train = df_train_clean_data.groupby('item')

In [41]:
len(groupby_item_train)

4493

In [42]:
# Number of train reviews per item, as a key -> count dict and as a plain list
# of counts in group-iteration order.
item_group_sizes = [(key, len(group)) for key, group in groupby_item_train]
item_num_review_dict = dict(item_group_sizes)
item_num_review_list = [group_size for _key, group_size in item_group_sizes]

In [43]:
# Summary statistics of the per-item review counts.
print("Number of items: {}".format(len(item_num_review_list)))
for stat_label, stat_func in (("Mean", np.mean), ("Min", np.min), ("Max", np.max)):
    print("{} number of review per item: {}".format(
        stat_label, stat_func(item_num_review_list)
    ))

Number of items: 4493
Mean number of review per item: 45.75895837970176
Min number of review per item: 7
Max number of review per item: 585


In [44]:
# Ten smallest and ten largest per-item review counts (ascending order).
sorted_item_num_review = sorted(item_num_review_list)
print("Top-10 least numbber of review per item: {}".format(sorted_item_num_review[:10]))
print("Top-10 most numbber of review per item: {}".format(sorted_item_num_review[-10:]))

Top-10 least numbber of review per item: [7, 8, 8, 9, 9, 9, 9, 9, 9, 10]
Top-10 most numbber of review per item: [362, 375, 381, 392, 410, 411, 447, 461, 464, 585]
