In [1]:
# Print the path of the `python` executable that the shell finds first on PATH.
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import re
import json
import os
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en import English
nlp = English()
tokenizer = nlp.tokenizer
import string
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [3]:
# Name of the sub-directory under ../Dataset/ from which every path in this notebook is built.
# NOTE(review): the recorded outputs further down show "../Dataset/yelp/..." paths, so those
# cells were last executed with a different value; re-run top to bottom after changing this.
dataset_name = "tripadvisor"

# Read Data

In [4]:
def load_review_file(file_path, progress_every=None):
    """Read a JSON-lines review file into a list of rows.

    Each line of the file must be a JSON object with the keys
    'user', 'item', 'rating' and 'review'.

    Args:
        file_path: path of the JSON-lines file to read.
        progress_every: if set, print a progress message every this many lines.

    Returns:
        A list of [item_id, user_id, rating, review] lists, in file order.

    Raises:
        KeyError: if a line lacks one of the four expected keys.
        json.JSONDecodeError: if a line is not valid JSON.
    """
    rows = []
    # JSON text is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, encoding="utf-8") as f:
        print("Load file: {}".format(file_path))
        for line_no, line in enumerate(f, start=1):
            line_data = json.loads(line)
            rows.append(
                [line_data['item'], line_data['user'], line_data['rating'], line_data['review']])
            if progress_every and line_no % progress_every == 0:
                print('{} lines loaded.'.format(line_no))
    return rows


dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
file_path = os.path.join(dir_path, 'train_review_filtered.json')
train_review = load_review_file(file_path, progress_every=100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset (no progress messages, as before)
file_path = os.path.join(dir_path, 'test_review_filtered_clean.json')
test_review = load_review_file(file_path)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/tripadvisor/train_review_filtered.json
100000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 205595 lines.
Load file: ../Dataset/tripadvisor/test_review_filtered_clean.json
Finish loading test dataset, totally 19459 lines.


In [5]:
# convert to pandas dataframe
# Wrap the raw row lists in DataFrames; both splits share the same column layout.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data, df_test_data = (
    pd.DataFrame(review_rows, columns=review_columns)
    for review_rows in (train_review, test_review)
)

In [6]:
# Preview the training reviews (the notebook renders a truncated view of the frame).
df_train_data

Unnamed: 0,item,user,rating,review
0,0,0,5,"having done some research , i requested a room..."
1,1,0,1,will not stay here again .
2,10,0,2,the hotel reception was very willing to try an...
3,100,0,3,this was older but clean . the breakfast was t...
4,1000,0,5,the food was exceptional - we really enjoyed t...
...,...,...,...,...
205590,752,999,5,"the rooms are spacious , quiet , and clean . m..."
205591,819,999,5,"room was very nice . bed was comfortable , had..."
205592,827,999,3,not really the best stay i ever had . room was...
205593,852,999,3,our room was not as nice as i had hoped . the ...


In [7]:
# Preview the test reviews (the notebook renders a truncated view of the frame).
df_test_data

Unnamed: 0,item,user,rating,review
0,1111,0,2,when i mentioned this to the front desk they d...
1,1379,0,2,i guess for a highway stop this motel is fine ...
2,1391,0,5,we stayed at the signature for four days to ce...
3,1579,0,4,the lake buena vista is a perfect place to sta...
4,1689,0,5,summer ( at the front desk ) was perfect ! she...
...,...,...,...,...
19454,0,999,5,"this was a pleasant place , and with our annua..."
19455,128,999,5,we enjoyed our stay at the hilton very much ! ...
19456,429,999,5,from the moment we arrived at the front desk u...
19457,816,999,4,"wifi gratuit , nous n avons pas essayé le brea..."


In [8]:
# Number of distinct rating values in the training split.
len(df_train_data['rating'].unique())

5

In [9]:
# List the distinct rating values in ascending order.
print(sorted(list(df_train_data['rating'].unique())))
# for this dataset the ratings are the integers 1 through 5

[1, 2, 3, 4, 5]


In [10]:
# Peek at five raw training reviews (positions 100-104).
list(df_train_data['review'])[100:105]

['personnel and service were outstanding . wonderful western décor on beautiful grounds . a short walk to center of town , but if you prefer a shuttle service is available .',
 'overall , the stay was fine . our bathroom sink faucet dripped and they charge $ 10 / day for high - speed internet . i was able to use a dial - up service for $ 1 per connection .',
 'great location too ! super friendly staff that really went out of their way . i think his name was gordon . in any event , this is my first choice in dc .',
 "the room was basically clean and toiletries replenished daily . beds were comfy . shuttle bus service was superb , the drivers were friendly , knowledgable , and efficient . it 's because the a / c vents went out into the hallways from the rooms . as long as you kept the a / c on and windows closed , your room was fine .",
 "it was incredibly dirty in the halls and the room . the bed was like 100 years old and we could not sleep . seriously do n't stay here - there are plen

In [11]:
# Load Features
# Read both directions of the feature vocabulary (feature word <-> feature id).
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
id2feature_filepath = '../Dataset/{}/train/feature/id2feature.json'.format(dataset_name)
loaded_feature_vocabs = []
for vocab_path in (feature2id_filepath, id2feature_filepath):
    with open(vocab_path, 'r') as f:
        print("Load file: {}".format(vocab_path))
        loaded_feature_vocabs.append(json.load(f))
feature2id_vocab, id2feature_vocab = loaded_feature_vocabs
# The two files must describe the same number of features.
assert len(feature2id_vocab) == len(id2feature_vocab)

Load file: ../Dataset/tripadvisor/train/feature/feature2id.json
Load file: ../Dataset/tripadvisor/train/feature/id2feature.json


In [12]:
# Set of feature words (the keys of feature2id_vocab) for fast membership tests.
feature_words_set = set(feature2id_vocab)

In [14]:
# Write Train Combined Data.
# NOTE: Here, we remove those sentences without any features.
# A review whose sentences all lack feature words is dropped and only counted.
train_combined_review_file = "../Dataset/{}/train_combined.json".format(dataset_name)
cnt_line_without_features = 0   # reviews dropped because no sentence has a feature word
cnt_line = 0                    # reviews actually written
cnt_total = 0                   # reviews seen; keeps the summary valid for an empty frame
idx = -1                        # so that `idx + 1` stays meaningful if the frame has no rows
with open(train_combined_review_file, 'w') as fw:
    print("Write file: {}".format(train_combined_review_file))
    for idx, row in df_train_data.iterrows():
        cnt_total += 1
        # Report progress before any `continue`, so a dropped row cannot swallow a milestone.
        if cnt_total % 10000 == 0:
            print("{} lines".format(cnt_total))
        # Keep only the sentences that contain at least one feature word.
        review_sents_has_feature = [
            rvw_sent for rvw_sent in sent_tokenize(row['review'])
            if any(token in feature_words_set for token in word_tokenize(rvw_sent))
        ]
        if not review_sents_has_feature:
            cnt_line_without_features += 1
            continue
        line_data = {
            "user": row['user'], "item": row['item'], "rating": row['rating'],
            "review": " ".join(review_sents_has_feature)
        }
        json.dump(line_data, fw)
        fw.write("\n")
        cnt_line += 1
print("Finished! Totally {0} lines of training data. Among them {1} lines has no features".format(
    cnt_total, cnt_line_without_features
))


Write file: ../Dataset/tripadvisor/train_combined.json
10000 lines
20000 lines
30000 lines
40000 lines
50000 lines
60000 lines
70000 lines
80000 lines
90000 lines
100000 lines
110000 lines
120000 lines
130000 lines
140000 lines
150000 lines
160000 lines
170000 lines
180000 lines
190000 lines
200000 lines
Finished! Totally 205595 lines of training data. Among them 3243 lines has no features


In [15]:
# Sanity check: every row was either written or counted as having no features.
(cnt_line + cnt_line_without_features) == (idx+1)

True

In [None]:
# Write Test Combined Data.
# NOTE: Here, we remove those sentences without any features.
# A review whose sentences all lack feature words is dropped and only counted.
test_combined_review_file = "../Dataset/{}/test_combined.json".format(dataset_name)
cnt_line_without_features = 0   # reviews dropped because no sentence has a feature word
cnt_line = 0                    # reviews actually written
cnt_total = 0                   # reviews seen; keeps the summary valid for an empty frame
idx = -1                        # so that `idx + 1` stays meaningful if the frame has no rows
with open(test_combined_review_file, 'w') as fw:
    print("Write file: {}".format(test_combined_review_file))
    for idx, row in df_test_data.iterrows():
        cnt_total += 1
        # Report progress before any `continue`, so a dropped row cannot swallow a milestone.
        if cnt_total % 10000 == 0:
            print("{} lines".format(cnt_total))
        # Keep only the sentences that contain at least one feature word.
        review_sents_has_feature = [
            rvw_sent for rvw_sent in sent_tokenize(row['review'])
            if any(token in feature_words_set for token in word_tokenize(rvw_sent))
        ]
        if not review_sents_has_feature:
            cnt_line_without_features += 1
            continue
        line_data = {
            "user": row['user'], "item": row['item'], "rating": row['rating'],
            "review": " ".join(review_sents_has_feature)
        }
        json.dump(line_data, fw)
        fw.write("\n")
        cnt_line += 1
print("Finished! Totally {0} lines of test data. Among them {1} lines has no features".format(
    cnt_total, cnt_line_without_features
))

In [None]:
## Split ##

## Load Sentence2ID and ID2Sentence Mapping From Train

In [11]:
# Read both directions of the training sentence vocabulary (sentence text <-> sentence id).
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
loaded_sentence_maps = {}
for map_path in (sentence2id_filepath, id2sentence_filepath):
    with open(map_path, 'r') as f:
        print("Load file: {}".format(map_path))
        loaded_sentence_maps[map_path] = json.load(f)
trainset_sent_to_id = loaded_sentence_maps[sentence2id_filepath]
trainset_id_to_sent = loaded_sentence_maps[id2sentence_filepath]
# Both files must contain the same number of sentences.
assert len(trainset_sent_to_id) == len(trainset_id_to_sent)
print("Number of sentences on train: {}".format(len(trainset_id_to_sent)))

Load file: ../Dataset/yelp/train/sentence/sentence2id.json
Load file: ../Dataset/yelp/train/sentence/id2sentence.json
Number of sentences on train: 492739


## Load Sentence2ID and ID2Sentence Mapping From Valid/Test

In [12]:
# Read the id->sentence and sentence->id vocabularies of the validation and test splits.
validset_id2sentence_filepath = '../Dataset/{}/valid/sentence/id2sentence.json'.format(dataset_name)
validset_sentence2id_filepath = '../Dataset/{}/valid/sentence/sentence2id.json'.format(dataset_name)
testset_id2sentence_filepath = '../Dataset/{}/test/sentence/id2sentence.json'.format(dataset_name)
testset_sentence2id_filepath = '../Dataset/{}/test/sentence/sentence2id.json'.format(dataset_name)

heldout_sentence_maps = []
for id2sent_path, sent2id_path, summary_template in (
    (validset_id2sentence_filepath, validset_sentence2id_filepath,
     "There are {} sentences in the validation set."),
    (testset_id2sentence_filepath, testset_sentence2id_filepath,
     "There are {} sentences in the test set."),
):
    with open(id2sent_path, 'r') as f:
        print("Load file: {}".format(id2sent_path))
        id_to_sent = json.load(f)
    with open(sent2id_path, 'r') as f:
        print("Load file: {}".format(sent2id_path))
        sent_to_id = json.load(f)
    print(summary_template.format(len(id_to_sent)))
    heldout_sentence_maps.append((id_to_sent, sent_to_id))

(validset_id_to_sent, validset_sent_to_id), (testset_id_to_sent, testset_sent_to_id) = \
    heldout_sentence_maps

# The validation and test splits are required to share one sentence vocabulary.
assert testset_id_to_sent == validset_id_to_sent
assert testset_sent_to_id == validset_sent_to_id

Load file: ../Dataset/yelp/valid/sentence/id2sentence.json
Load file: ../Dataset/yelp/valid/sentence/sentence2id.json
There are 109833 sentences in the validation set.
Load file: ../Dataset/yelp/test/sentence/id2sentence.json
Load file: ../Dataset/yelp/test/sentence/sentence2id.json
There are 109833 sentences in the test set.


## Load Features

In [13]:
# Read the feature vocabulary (feature word <-> feature id) of the training split.
# This reads the same two files as the earlier "Load Features" cell.
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
id2feature_filepath = '../Dataset/{}/train/feature/id2feature.json'.format(dataset_name)
vocab_by_path = {}
for vocab_path in (feature2id_filepath, id2feature_filepath):
    with open(vocab_path, 'r') as f:
        print("Load file: {}".format(vocab_path))
        vocab_by_path[vocab_path] = json.load(f)
feature2id_vocab = vocab_by_path[feature2id_filepath]
id2feature_vocab = vocab_by_path[id2feature_filepath]
# The two files must describe the same number of features.
assert len(feature2id_vocab) == len(id2feature_vocab)

Load file: ../Dataset/yelp/train/feature/feature2id.json
Load file: ../Dataset/yelp/train/feature/id2feature.json


## Load Sentence to Feature Tf Dict

In [14]:
# Read the sentence -> feature-tf dictionaries of the train and test splits.
trainset_sent2featuretf_file = '../Dataset/{}/train/sentence/sentence2featuretf.json'.format(dataset_name)
testset_sent2featuretf_file = '../Dataset/{}/test/sentence/sentence2featuretf.json'.format(dataset_name)
loaded_featuretf = []
for featuretf_path in (trainset_sent2featuretf_file, testset_sent2featuretf_file):
    with open(featuretf_path, 'r') as f:
        print("Load file: {}".format(featuretf_path))
        loaded_featuretf.append(json.load(f))
trainset_sent2featuretf, testset_sent2featuretf = loaded_featuretf

Load file: ../Dataset/yelp/train/sentence/sentence2featuretf.json
Load file: ../Dataset/yelp/test/sentence/sentence2featuretf.json


## Load multi-line train/valid/test data

In [15]:
# Paths of the multi-line user-item -> sentence-id files for each split.
# Note that the valid and test files both use the "..._test_multilines.json" file name.
(trainset_data_multiline_file,
 validset_data_multiline_file,
 testset_data_multiline_file) = (
    path_template.format(dataset_name)
    for path_template in (
        "../Dataset/{}/train/useritem2sentids_multilines.json",
        "../Dataset/{}/valid/useritem2sentids_test_multilines.json",
        "../Dataset/{}/test/useritem2sentids_test_multilines.json",
    )
)

In [16]:
def construct_text_review_train(review_content, review_sent_ids, sent_to_id=None, tokenize=None):
    """Rebuild a review from only those of its sentences whose ids are in `review_sent_ids`.

    The review is split into sentences; each sentence is looked up in the
    sentence -> id mapping and kept when its id is requested and has not been
    kept already. Sentences keep their order of appearance in the review.

    Args:
        review_content: full review text.
        review_sent_ids: list of sentence ids expected in this review (ids must be hashable).
        sent_to_id: sentence -> id mapping; defaults to the global `trainset_sent_to_id`.
        tokenize: callable splitting text into sentences; defaults to nltk's `sent_tokenize`.

    Returns:
        (combined_text, combined_ids): the kept sentences joined by a single
        space, and the list of their ids in the same order.

    A mismatch between the number of kept ids and `len(review_sent_ids)` is
    reported with a printed message and does not raise.
    """
    if sent_to_id is None:
        sent_to_id = trainset_sent_to_id
    if tokenize is None:
        tokenize = sent_tokenize
    wanted_ids = set(review_sent_ids)   # O(1) membership instead of scanning the list
    seen_ids = set()
    combined_review = []
    combined_review_ids = []
    for ex_sent in tokenize(review_content):
        if ex_sent not in sent_to_id:
            continue
        ex_sent_id = sent_to_id[ex_sent]
        if ex_sent_id in wanted_ids and ex_sent_id not in seen_ids:
            seen_ids.add(ex_sent_id)
            combined_review.append(ex_sent)
            combined_review_ids.append(ex_sent_id)
    # Explicit check instead of `assert` inside a bare `except`: it keeps working
    # under `python -O` and no longer hides unrelated exceptions.
    if len(combined_review_ids) != len(review_sent_ids):
        print("Error! review is: {}".format(review_content))
    return " ".join(combined_review), combined_review_ids

In [17]:
def construct_text_review_test(review_content, review_sent_ids, sent_to_id=None, tokenize=None):
    """Rebuild a review from only those of its sentences whose ids are in `review_sent_ids`.

    The review is split into sentences; each sentence is looked up in the
    sentence -> id mapping and kept when its id is requested and has not been
    kept already. Sentences keep their order of appearance in the review.

    Args:
        review_content: full review text.
        review_sent_ids: list of sentence ids expected in this review (ids must be hashable).
        sent_to_id: sentence -> id mapping; defaults to the global `testset_sent_to_id`.
        tokenize: callable splitting text into sentences; defaults to nltk's `sent_tokenize`.

    Returns:
        (combined_text, combined_ids): the kept sentences joined by a single
        space, and the list of their ids in the same order.

    A mismatch between the number of kept ids and `len(review_sent_ids)` is
    reported with a printed message and does not raise.
    """
    if sent_to_id is None:
        sent_to_id = testset_sent_to_id
    if tokenize is None:
        tokenize = sent_tokenize
    wanted_ids = set(review_sent_ids)   # O(1) membership instead of scanning the list
    seen_ids = set()
    combined_review = []
    combined_review_ids = []
    for ex_sent in tokenize(review_content):
        if ex_sent not in sent_to_id:
            continue
        ex_sent_id = sent_to_id[ex_sent]
        if ex_sent_id in wanted_ids and ex_sent_id not in seen_ids:
            seen_ids.add(ex_sent_id)
            combined_review.append(ex_sent)
            combined_review_ids.append(ex_sent_id)
    # Explicit check instead of `assert` inside a bare `except`: it keeps working
    # under `python -O` and no longer hides unrelated exceptions.
    if len(combined_review_ids) != len(review_sent_ids):
        print("Error! review is: {}".format(review_content))
    return " ".join(combined_review), combined_review_ids

In [18]:
# Build two nested lookups over the training reviews:
#   trainset_ui2rvw[user][item]    -> review text
#   trainset_ui2rating[user][item] -> rating
# The asserts require every (user, item) pair to appear at most once.
trainset_ui2rvw = dict()
trainset_ui2rating = dict()
for idx, train_rvw_data in df_train_data.iterrows():
    user_id, item_id = train_rvw_data['user'], train_rvw_data['item']
    rating = train_rvw_data['rating']
    assert isinstance(rating, int)
    rvw_text = train_rvw_data['review']
    assert isinstance(rvw_text, str)
    if user_id not in trainset_ui2rvw:
        # First review of this user: create the per-user dict in both lookups.
        assert user_id not in trainset_ui2rating
        trainset_ui2rvw[user_id] = dict()
        trainset_ui2rating[user_id] = dict()
    else:
        assert item_id not in trainset_ui2rvw[user_id]
        assert user_id in trainset_ui2rating
        assert item_id not in trainset_ui2rating[user_id]
    trainset_ui2rvw[user_id][item_id] = rvw_text
    trainset_ui2rating[user_id][item_id] = rating
    lines_done = idx + 1
    if lines_done % 20000 == 0:
        print("{} lines".format(lines_done))
print("Totally {} lines".format(idx+1))

20000 lines
40000 lines
60000 lines
80000 lines
100000 lines
120000 lines
140000 lines
160000 lines
180000 lines
Totally 191227 lines


In [19]:
# Load train set (multi-line)
# For every user-item line, look up the raw review and rating and rebuild the
# review from the sentence ids listed under "review".
combined_train_review = []
with open(trainset_data_multiline_file) as f:
    print("Load file: {}".format(trainset_data_multiline_file))
    cnt_line = 0
    for cnt_line, line in enumerate(f, start=1):
        line_data = json.loads(line)
        user_id, item_id = line_data['user_id'], line_data['item_id']
        assert isinstance(user_id, str)
        assert isinstance(item_id, str)
        candidate_ids = line_data["candidate"]   # read but not used in this cell
        review_ids = line_data["review"]
        # get the review and rating of this user-item
        this_review = trainset_ui2rvw[user_id][item_id]
        this_rating = trainset_ui2rating[user_id][item_id]
        this_combined_review, this_combined_review_sentids = construct_text_review_train(
            this_review, review_ids)
        combined_row = [
            item_id, user_id, this_rating, this_combined_review, this_combined_review_sentids]
        combined_train_review.append(combined_row)
        if cnt_line % 10000 == 0:
            print("Processed {} lines of data".format(cnt_line))
print("Finished! Totally {} lines.".format(cnt_line))

Load file: ../Dataset/yelp/train/useritem2sentids_multilines.json
Processed 10000 lines of data
Processed 20000 lines of data
Processed 30000 lines of data
Processed 40000 lines of data
Processed 50000 lines of data
Processed 60000 lines of data
Processed 70000 lines of data
Processed 80000 lines of data
Processed 90000 lines of data
Processed 100000 lines of data
Processed 110000 lines of data
Processed 120000 lines of data
Processed 130000 lines of data
Processed 140000 lines of data
Processed 150000 lines of data
Processed 160000 lines of data
Processed 170000 lines of data
Processed 180000 lines of data
Processed 190000 lines of data
Finished! Totally 191227 lines.


In [20]:
# Show the first two rebuilt training rows: [item, user, rating, review text, sentence ids].
print(combined_train_review[:2])

[['1071', '1001', 4, 'place was well lit , very clean , and staff very friendly ... wow , , , i was thinking this is gon na hurt my pocket now : - ( ... on to the food and ordering , aioli was ok , as well as the tomato sauce . garlic cheese bread w crab - very delicious , and definitely enough crab to know there is actually crab in it ... cajun style shrimp boil - wife got this and she loved it , it had really great broth , and their homemade bread was awesome for dipping . staff / bartender was very informative and even the one to suggest trying that saffron cream before ordering ... very attentive and friendly ... as far as a seafood " restaurant " , i \'m not sure how to classify this place , there food is really great , but there is a lack of full plate options / entrees .', ['6093', '6094', '6095']], ['10877', '1001', 3, 'the price was great and so was the subs ... this is a can do place , , , parking can be a bia , so be warned , , will add to a once a month rotation ...', ['609

In [21]:
# Number of rebuilt training rows.
len(combined_train_review)

191227

In [22]:
# Build two nested lookups over the test reviews:
#   testset_ui2rvw[user][item]    -> review text
#   testset_ui2rating[user][item] -> rating
# The asserts require every (user, item) pair to appear at most once.
testset_ui2rvw = dict()
testset_ui2rating = dict()
for idx, test_rvw_data in df_test_data.iterrows():
    user_id, item_id = test_rvw_data['user'], test_rvw_data['item']
    rating = test_rvw_data['rating']
    assert isinstance(rating, int)
    rvw_text = test_rvw_data['review']
    assert isinstance(rvw_text, str)
    if user_id not in testset_ui2rvw:
        # First review of this user: create the per-user dict in both lookups.
        assert user_id not in testset_ui2rating
        testset_ui2rvw[user_id] = dict()
        testset_ui2rating[user_id] = dict()
    else:
        assert item_id not in testset_ui2rvw[user_id]
        assert user_id in testset_ui2rating
        assert item_id not in testset_ui2rating[user_id]
    testset_ui2rvw[user_id][item_id] = rvw_text
    testset_ui2rating[user_id][item_id] = rating
    lines_done = idx + 1
    if lines_done % 20000 == 0:
        print("{} lines".format(lines_done))
print("Totally {} lines".format(idx+1))

20000 lines
40000 lines
Totally 42702 lines


In [23]:
# Load test set (multi-line)
# For every user-item line, look up the raw review and rating and rebuild the
# review from the sentence ids listed under "review".
combined_test_review = []
with open(testset_data_multiline_file) as f:
    print("Load file: {}".format(testset_data_multiline_file))
    cnt_line = 0
    for cnt_line, line in enumerate(f, start=1):
        line_data = json.loads(line)
        # ids are coerced to str here before being used as lookup keys
        user_id, item_id = str(line_data['user_id']), str(line_data['item_id'])
        assert isinstance(user_id, str)
        assert isinstance(item_id, str)
        candidate_ids = line_data["candidate"]   # read but not used in this cell
        review_ids = line_data["review"]
        # get the review and rating of this user-item
        this_review = testset_ui2rvw[user_id][item_id]
        this_rating = testset_ui2rating[user_id][item_id]
        this_combined_review, this_combined_review_sentids = construct_text_review_test(
            this_review, review_ids)
        combined_row = [
            item_id, user_id, this_rating, this_combined_review, this_combined_review_sentids]
        combined_test_review.append(combined_row)
        if cnt_line % 20000 == 0:
            print("Processed {} lines of data".format(cnt_line))
print("Finished! Totally {} lines.".format(cnt_line))

Load file: ../Dataset/yelp/test/useritem2sentids_test_multilines.json
Processed 20000 lines of data
Processed 40000 lines of data
Finished! Totally 42702 lines.


In [24]:
# Number of rebuilt test rows.
len(combined_test_review)

42702

In [25]:
# Show the first two rebuilt test rows: [item, user, rating, review text, sentence ids].
print(combined_test_review[:2])

[['1098', '1001', 4, "great food , great price , great atmosphere ... portion size was huge comparative to the lunch pricing ... iced tea was definitely not china mist or nestle , or lipton , very good also . 2 appetizers with beer , and 2 full entree 's for about 30 bucks out the door ... simply great ... will definitely be back , , often", ['1102', '1103']], ['1473', '1001', 4, "the bbq pork also was way different this time , so i 'm not sure what happened if , short staff for new years or what , , but will try a few more times before concluding it and removed from faves ... the singapore style rice noodles were the same n that saved the day ... other goto is their chinese broccoli ...", ['1104']]]


## Write Train / Test Data into Multi-line Format

In [26]:
# Write Train Data
# One JSON object per line with the keys user / item / rating / review.
# NOTE(review): this is the same path the earlier feature-filtering cell writes to,
# so whichever of the two cells runs last determines the file's content.
train_combined_review_file = "../Dataset/{}/train_combined.json".format(dataset_name)
cnt_line = 0
with open(train_combined_review_file, 'w') as fw:
    print("Write file: {}".format(train_combined_review_file))
    for cnt_line, train_combined_instance in enumerate(combined_train_review, start=1):
        # only the first four fields are written
        item_id, user_id, rating, review_content = (
            train_combined_instance[0], train_combined_instance[1],
            train_combined_instance[2], train_combined_instance[3])
        current_train_combined_dict = {
            "user": user_id, 'item': item_id, 'rating': rating, 'review': review_content}
        # dump this dict into the json file
        fw.write(json.dumps(current_train_combined_dict))
        fw.write('\n')
print("Finished! Totally {} lines of training data.".format(cnt_line))

Write file: ../Dataset/yelp/train_combined.json
Finished! Totally 191227 lines of training data.


In [27]:
# Write Test Data
# One JSON object per line with the keys user / item / rating / review.
test_combined_review_file = "../Dataset/{}/test_combined.json".format(dataset_name)
cnt_line = 0
with open(test_combined_review_file, 'w') as fw:
    print("Write file: {}".format(test_combined_review_file))
    for cnt_line, test_combined_instance in enumerate(combined_test_review, start=1):
        # only the first four fields are written
        item_id, user_id, rating, review_content = (
            test_combined_instance[0], test_combined_instance[1],
            test_combined_instance[2], test_combined_instance[3])
        current_test_combined_dict = {
            "user": user_id, 'item': item_id, 'rating': rating, 'review': review_content}
        # dump this dict into the json file
        fw.write(json.dumps(current_test_combined_dict))
        fw.write('\n')
print("Finished! Totally {} lines of test data.".format(cnt_line))

Write file: ../Dataset/yelp/test_combined.json
Finished! Totally 42702 lines of test data.


## Construct Valid Set

### We also need a validation set. In the graph model, the current validation set uses the same user-item pairs as the test set but different candidate sets. However, we don't have candidate sets here. Therefore, one easy way to construct the validation set is to randomly sample a subset of the test set. Note that the resulting validation set therefore overlaps with the test set.

In [28]:
import random

# Each test instance is kept for validation with probability `valid_ratio`.
valid_ratio = 0.4
# Fixed seed and a dedicated generator so that Restart & Run All reproduces the same
# validation subset regardless of other uses of the global `random` state.
valid_split_seed = 2021
valid_rng = random.Random(valid_split_seed)
combined_valid_review = [
    test_combined_instance
    for test_combined_instance in combined_test_review
    if valid_rng.random() <= valid_ratio
]
print("Finished! Totally {} lines of valid data.".format(len(combined_valid_review)))

Finished! Totally 17086 lines of valid data.


In [29]:
# Write Valid Data
# One JSON object per line with the keys user / item / rating / review.
valid_combined_review_file = "../Dataset/{}/valid_combined.json".format(dataset_name)
cnt_line = 0
with open(valid_combined_review_file, 'w') as fw:
    print("Write file: {}".format(valid_combined_review_file))
    for cnt_line, valid_combined_instance in enumerate(combined_valid_review, start=1):
        # only the first four fields are written
        item_id, user_id, rating, review_content = (
            valid_combined_instance[0], valid_combined_instance[1],
            valid_combined_instance[2], valid_combined_instance[3])
        current_valid_combined_dict = {
            "user": user_id, 'item': item_id, 'rating': rating, 'review': review_content}
        # dump this dict into the json file
        fw.write(json.dumps(current_valid_combined_dict))
        fw.write('\n')
print("Finished! Totally {} lines of valid data.".format(cnt_line))

Write file: ../Dataset/yelp/valid_combined.json
Finished! Totally 17086 lines of valid data.


In [30]:
# Put the rebuilt test rows into a DataFrame, keeping the sentence-id column.
combined_test_columns = ['item', 'user', 'rating', 'review', 'review_sentids']
df_combined_test_review = pd.DataFrame(combined_test_review, columns=combined_test_columns)

In [31]:
# Preview the rebuilt test rows (the notebook renders a truncated view of the frame).
df_combined_test_review

Unnamed: 0,item,user,rating,review,review_sentids
0,1098,1001,4,"great food , great price , great atmosphere .....","[1102, 1103]"
1,1473,1001,4,the bbq pork also was way different this time ...,[1104]
2,157,1001,4,"but that dumb naan , or pita bread stuff was a...",[1105]
3,1707,1001,4,_ price - average - please recognize fresh veg...,"[1106, 1107, 1108]"
4,2911,1001,4,"pizza was very good , fresh ingredients , , no...",[1109]
...,...,...,...,...,...
42697,3933,9999,4,"they do n't have a matinee price , but then ag...",[109817]
42698,4154,9999,3,the main draw to this casinos over the others ...,[109818]
42699,4565,9999,5,it 's not like normal stouts and the flavor is...,"[109819, 109820, 109821]"
42700,624,9999,5,my two favorite meats for tacos are carne asad...,"[109822, 109823, 109824, 109825, 109826, 109827]"


In [32]:
# # Load valid data
# valid_combined_review_file = "../Dataset/{}/valid_combined.json".format(dataset_name)
# combined_valid_review = []
# with open(valid_combined_review_file, 'r') as fw:
#     print("Load file: {}".format(valid_combined_review_file))
#     for line in fw:
#         valid_combined_instance = json.loads(line)
#         user_id = valid_combined_instance['user']
#         item_id = valid_combined_instance['item']
#         rating = valid_combined_instance['rating']
#         review_text = valid_combined_instance['review']
#         df_combined_this_instance = df_combined_test_review.loc[
#             (df_combined_test_review['user'] == user_id) & (df_combined_test_review['item'] == item_id)]
#         this_review_sentids = list(df_combined_this_instance['review_sentids'])[0]
#         combined_valid_review.append(
#             [item_id, user_id, rating, review_text, this_review_sentids]
#         )
# print("Finished, totally {} lines of valid data.".format(len(combined_valid_review)))

# Construct Datasets Aligned with the Format of NRT and NARRE

In [32]:
# Locations of the three combined files written earlier in the notebook.
train_combined_review_file = "../Dataset/{}/train_combined.json".format(dataset_name)
test_combined_review_file = "../Dataset/{}/test_combined.json".format(dataset_name)
valid_combined_review_file = "../Dataset/{}/valid_combined.json".format(dataset_name)

# Load train data
# Rows are [item, user, rating, review]; user/item ids are cast to int.
train_combined_review = []
cnt = 0
with open(train_combined_review_file) as f:
    print("Load file: {}".format(train_combined_review_file))
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        user_id = int(line_data['user'])        # convert str to int
        item_id = int(line_data['item'])        # convert str to int
        rating, review = line_data['rating'], line_data['review']
        train_combined_review.append([item_id, user_id, rating, review])
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading train dataset, totally {} lines.'.format(len(train_combined_review)))

Load file: ../Dataset/yelp/train_combined.json
100000 lines loaded.
Finish loading train dataset, totally 191227 lines.


In [33]:
# Load valid data
# Rows are [item, user, rating, review]; user/item ids are cast to int.
valid_combined_review = []
cnt = 0
with open(valid_combined_review_file) as f:
    print("Load file: {}".format(valid_combined_review_file))
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        user_id = int(line_data['user'])        # convert str to int
        item_id = int(line_data['item'])        # convert str to int
        rating, review = line_data['rating'], line_data['review']
        valid_combined_review.append([item_id, user_id, rating, review])
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading valid dataset, totally {} lines.'.format(len(valid_combined_review)))

Load file: ../Dataset/yelp/valid_combined.json
Finish loading valid dataset, totally 17086 lines.


In [34]:
# Load test data
# Rows are [item, user, rating, review]; user/item ids are cast to int.
test_combined_review = []
cnt = 0
with open(test_combined_review_file) as f:
    print("Load file: {}".format(test_combined_review_file))
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        user_id = int(line_data['user'])        # convert str to int
        item_id = int(line_data['item'])        # convert str to int
        rating, review = line_data['rating'], line_data['review']
        test_combined_review.append([item_id, user_id, rating, review])
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading test dataset, totally {} lines.'.format(len(test_combined_review)))

Load file: ../Dataset/yelp/test_combined.json
Finish loading test dataset, totally 42702 lines.


In [35]:
# Convert Train/Valid/Test DataFrame
# All three splits share the same four-column layout.
combined_columns = ['item', 'user', 'rating', 'review']
df_train_data_combined, df_valid_data_combined, df_test_data_combined = (
    pd.DataFrame(split_rows, columns=combined_columns)
    for split_rows in (train_combined_review, valid_combined_review, test_combined_review)
)

In [36]:
# Distinct item/user ids per split, in order of first appearance.
train_item_ids = list(df_train_data_combined['item'].unique())
train_user_ids = list(df_train_data_combined['user'].unique())
valid_item_ids = list(df_valid_data_combined['item'].unique())
valid_user_ids = list(df_valid_data_combined['user'].unique())
test_item_ids = list(df_test_data_combined['item'].unique())
test_user_ids = list(df_test_data_combined['user'].unique())
# construct the set of the item/user ids on train set
train_item_ids_set = set(train_item_ids)
train_user_ids_set = set(train_user_ids)
# check to make sure that item/user id on valid/test set appears in the train set
for heldout_item_ids in (valid_item_ids, test_item_ids):
    for item_id in heldout_item_ids:
        assert item_id in train_item_ids_set
for heldout_user_ids in (valid_user_ids, test_user_ids):
    for user_id in heldout_user_ids:
        assert user_id in train_user_ids_set

In [37]:
# construct userid-userid_idx mapping
# Indices are handed out 0, 1, 2, ... in the order of train_user_ids.
print("Number of user on train-set: {}".format(len(train_user_ids)))
assert len(train_user_ids) == len(train_user_ids_set)
userid2idx = {}
for user_id in train_user_ids:
    assert user_id not in userid2idx
    userid2idx[user_id] = len(userid2idx)   # next free index
# construct itemid-itemid_idx mapping
# Same scheme, in the order of train_item_ids.
print("Number of item on train-set: {}".format(len(train_item_ids)))
assert len(train_item_ids) == len(train_item_ids_set)
itemid2idx = {}
for item_id in train_item_ids:
    assert item_id not in itemid2idx
    itemid2idx[item_id] = len(itemid2idx)   # next free index
idx_cnt = len(itemid2idx)   # number of item indices handed out

Number of user on train-set: 4604
Number of item on train-set: 7837


In [41]:
# Spot check: index assigned to user id 1001.
userid2idx[1001]

0

In [38]:
# First twenty (user id, index) pairs, in insertion order.
list(userid2idx.items())[:20]

[(1001, 0),
 (10011, 1),
 (1002, 2),
 (10020, 3),
 (10021, 4),
 (10024, 5),
 (10028, 6),
 (1003, 7),
 (10037, 8),
 (1004, 9),
 (10040, 10),
 (10043, 11),
 (10044, 12),
 (10049, 13),
 (1005, 14),
 (10058, 15),
 (10061, 16),
 (10062, 17),
 (10069, 18),
 (10082, 19)]

In [39]:
# Persist the raw ids of both mappings as plain text, one id per line and in
# dict order, so that the id with index 0 sits on line 1, index 1 on line 2, ...
# No newline is written after the last id.
user_idx_file = "./combined_data/{}/users.txt".format(dataset_name)
item_idx_file = "./combined_data/{}/items.txt".format(dataset_name)
for ids_path, id2idx in ((user_idx_file, userid2idx), (item_idx_file, itemid2idx)):
    with open(ids_path, 'w') as f_ids:
        f_ids.write('\n'.join(str(raw_id) for raw_id in id2idx.keys()))

In [43]:
# Dump both raw-id -> index dictionaries as JSON files.
# Keys and values are both converted to strings before serialisation.
user2uid_file = "./combined_data/{}/user2uid.json".format(dataset_name)
item2iid_file = "./combined_data/{}/item2iid.json".format(dataset_name)
user2idx = dict((str(raw_id), str(idx)) for raw_id, idx in userid2idx.items())
item2idx = dict((str(raw_id), str(idx)) for raw_id, idx in itemid2idx.items())
for json_path, str_mapping in ((user2uid_file, user2idx), (item2iid_file, item2idx)):
    with open(json_path, 'w') as f:
        print("Write file: {}".format(json_path))
        json.dump(str_mapping, f)

Write file: ./combined_data/yelp/user2uid.json
Write file: ./combined_data/yelp/item2iid.json


In [44]:
# Build the train dataset in two layouts; user/item ids are replaced by their
# indices from userid2idx / itemid2idx in both.
#   * train_combined_review_with_feature:
#       [item_idx, user_idx, rating, [[feature_words, sentence], ...]]
#       (one pair per review sentence)
#   * train_combined_review_with_feature_comb_sent:
#       [item_idx, user_idx, rating, [[feature_words, text]]]
#       (a single pair: de-duplicated feature words of the whole review and
#        its sentences joined by single spaces)
train_combined_review_with_feature = []
train_combined_review_with_feature_comb_sent = []
for instance in combined_train_review:
    # positions used: 0 = item id, 1 = user id, 2 = rating, 4 = sentence ids
    # (position 3, the review text, is not needed here)
    item_idx = itemid2idx[int(instance[0])]
    user_idx = userid2idx[int(instance[1])]
    rating = instance[2]
    sent_pairs = []            # per-sentence [feature_words, sentence]
    sentences = []             # sentence texts, in review order
    review_feature_words = set()
    for sent_id in instance[4]:
        # feature words = vocabulary entries of the feature ids of this sentence
        words = [id2feature_vocab[fea_id]
                 for fea_id in trainset_sent2featuretf[sent_id].keys()]
        sentence = trainset_id_to_sent[sent_id]
        sent_pairs.append([words, sentence])
        review_feature_words.update(words)
        sentences.append(sentence)
    train_combined_review_with_feature.append(
        [item_idx, user_idx, rating, sent_pairs]
    )
    train_combined_review_with_feature_comb_sent.append(
        [item_idx, user_idx, rating,
         [[list(review_feature_words), " ".join(sentences)]]]
    )

In [45]:
# Peek at the first two train records (one [feature_words, sentence] pair per sentence).
train_combined_review_with_feature[:2]

[[0,
  0,
  4,
  [[['food', 'staff', 'sauce', 'tomato', 'aioli'],
    'place was well lit , very clean , and staff very friendly ... wow , , , i was thinking this is gon na hurt my pocket now : - ( ... on to the food and ordering , aioli was ok , as well as the tomato sauce .'],
   [['cheese', 'bread', 'shrimp', 'broth', 'garlic', 'crab', 'homemade'],
    'garlic cheese bread w crab - very delicious , and definitely enough crab to know there is actually crab in it ... cajun style shrimp boil - wife got this and she loved it , it had really great broth , and their homemade bread was awesome for dipping .'],
   [['food',
     'restaurant',
     'staff',
     'cream',
     'options',
     'plate',
     'seafood',
     'entrees'],
    'staff / bartender was very informative and even the one to suggest trying that saffron cream before ordering ... very attentive and friendly ... as far as a seafood " restaurant " , i \'m not sure how to classify this place , there food is really great , but

In [46]:
# Peek at the first two train records (sentences of each review merged into one pair).
train_combined_review_with_feature_comb_sent[:2]

[[0,
  0,
  4,
  [[['bread',
     'options',
     'crab',
     'restaurant',
     'seafood',
     'homemade',
     'garlic',
     'tomato',
     'shrimp',
     'staff',
     'aioli',
     'cream',
     'sauce',
     'broth',
     'entrees',
     'food',
     'cheese',
     'plate'],
    'place was well lit , very clean , and staff very friendly ... wow , , , i was thinking this is gon na hurt my pocket now : - ( ... on to the food and ordering , aioli was ok , as well as the tomato sauce . garlic cheese bread w crab - very delicious , and definitely enough crab to know there is actually crab in it ... cajun style shrimp boil - wife got this and she loved it , it had really great broth , and their homemade bread was awesome for dipping . staff / bartender was very informative and even the one to suggest trying that saffron cream before ordering ... very attentive and friendly ... as far as a seafood " restaurant " , i \'m not sure how to classify this place , there food is really great 

In [47]:
# Build the test dataset in two layouts; user/item ids are replaced by their
# indices from userid2idx / itemid2idx in both.
#   * test_combined_review_with_feature:
#       [item_idx, user_idx, rating, [[feature_words, sentence], ...]]
#       (one pair per review sentence)
#   * test_combined_review_with_feature_comb_sent:
#       [item_idx, user_idx, rating, [[feature_words, text]]]
#       (a single pair: de-duplicated feature words of the whole review and
#        its sentences joined by single spaces)
test_combined_review_with_feature = []
test_combined_review_with_feature_comb_sent = []
for instance in combined_test_review:
    # positions used: 0 = item id, 1 = user id, 2 = rating, 4 = sentence ids
    # (position 3, the review text, is not needed here)
    item_idx = itemid2idx[int(instance[0])]
    user_idx = userid2idx[int(instance[1])]
    rating = instance[2]
    sent_pairs = []            # per-sentence [feature_words, sentence]
    sentences = []             # sentence texts, in review order
    review_feature_words = set()
    for sent_id in instance[4]:
        # feature words = vocabulary entries of the feature ids of this sentence
        words = [id2feature_vocab[fea_id]
                 for fea_id in testset_sent2featuretf[sent_id].keys()]
        sentence = testset_id_to_sent[sent_id]
        sent_pairs.append([words, sentence])
        review_feature_words.update(words)
        sentences.append(sentence)
    test_combined_review_with_feature.append(
        [item_idx, user_idx, rating, sent_pairs]
    )
    test_combined_review_with_feature_comb_sent.append(
        [item_idx, user_idx, rating,
         [[list(review_feature_words), " ".join(sentences)]]]
    )

In [48]:
# Peek at the first two test records (one [feature_words, sentence] pair per sentence).
test_combined_review_with_feature[:2]

[[2251,
  0,
  4,
  [[['food',
     'price',
     'atmosphere',
     'portion',
     'lunch',
     'size',
     'tea',
     'pricing'],
    'great food , great price , great atmosphere ... portion size was huge comparative to the lunch pricing ... iced tea was definitely not china mist or nestle , or lipton , very good also .'],
   [['beer', 'appetizers', 'bucks'],
    "2 appetizers with beer , and 2 full entree 's for about 30 bucks out the door ... simply great ... will definitely be back , , often"]]],
 [3039,
  0,
  4,
  [[['staff', 'rice', 'pork', 'noodles', 'bbq', 'broccoli'],
    "the bbq pork also was way different this time , so i 'm not sure what happened if , short staff for new years or what , , but will try a few more times before concluding it and removed from faves ... the singapore style rice noodles were the same n that saved the day ... other goto is their chinese broccoli ..."]]]]

In [49]:
# Peek at the first two test records (sentences of each review merged into one pair).
test_combined_review_with_feature_comb_sent[:2]

[[2251,
  0,
  4,
  [[['size',
     'price',
     'pricing',
     'appetizers',
     'portion',
     'bucks',
     'atmosphere',
     'beer',
     'lunch',
     'food',
     'tea'],
    "great food , great price , great atmosphere ... portion size was huge comparative to the lunch pricing ... iced tea was definitely not china mist or nestle , or lipton , very good also . 2 appetizers with beer , and 2 full entree 's for about 30 bucks out the door ... simply great ... will definitely be back , , often"]]],
 [3039,
  0,
  4,
  [[['pork', 'rice', 'broccoli', 'staff', 'noodles', 'bbq'],
    "the bbq pork also was way different this time , so i 'm not sure what happened if , short staff for new years or what , , but will try a few more times before concluding it and removed from faves ... the singapore style rice noodles were the same n that saved the day ... other goto is their chinese broccoli ..."]]]]

In [50]:
# Build the valid dataset in two layouts; user/item ids are replaced by their
# indices from userid2idx / itemid2idx in both.
#   * valid_combined_review_with_feature:
#       [item_idx, user_idx, rating, [[feature_words, sentence], ...]]
#       (one pair per review sentence)
#   * valid_combined_review_with_feature_comb_sent:
#       [item_idx, user_idx, rating, [[feature_words, text]]]
#       (a single pair: de-duplicated feature words of the whole review and
#        its sentences joined by single spaces)
# NOTE(review): sentence features and texts are looked up in the testset_*
# tables, exactly as in the test cell -- presumably valid sentence ids share
# the test-set id space; confirm this is intended.
valid_combined_review_with_feature = []
valid_combined_review_with_feature_comb_sent = []
for instance in combined_valid_review:
    # positions used: 0 = item id, 1 = user id, 2 = rating, 4 = sentence ids
    # (position 3, the review text, is not needed here)
    item_idx = itemid2idx[int(instance[0])]
    user_idx = userid2idx[int(instance[1])]
    rating = instance[2]
    sent_pairs = []            # per-sentence [feature_words, sentence]
    sentences = []             # sentence texts, in review order
    review_feature_words = set()
    for sent_id in instance[4]:
        # feature words = vocabulary entries of the feature ids of this sentence
        words = [id2feature_vocab[fea_id]
                 for fea_id in testset_sent2featuretf[sent_id].keys()]
        sentence = testset_id_to_sent[sent_id]
        sent_pairs.append([words, sentence])
        review_feature_words.update(words)
        sentences.append(sentence)
    valid_combined_review_with_feature.append(
        [item_idx, user_idx, rating, sent_pairs]
    )
    valid_combined_review_with_feature_comb_sent.append(
        [item_idx, user_idx, rating,
         [[list(review_feature_words), " ".join(sentences)]]]
    )

In [51]:
# Peek at the first two valid records (one [feature_words, sentence] pair per sentence).
valid_combined_review_with_feature[:2]

[[2251,
  0,
  4,
  [[['food',
     'price',
     'atmosphere',
     'portion',
     'lunch',
     'size',
     'tea',
     'pricing'],
    'great food , great price , great atmosphere ... portion size was huge comparative to the lunch pricing ... iced tea was definitely not china mist or nestle , or lipton , very good also .'],
   [['beer', 'appetizers', 'bucks'],
    "2 appetizers with beer , and 2 full entree 's for about 30 bucks out the door ... simply great ... will definitely be back , , often"]]],
 [3039,
  0,
  4,
  [[['staff', 'rice', 'pork', 'noodles', 'bbq', 'broccoli'],
    "the bbq pork also was way different this time , so i 'm not sure what happened if , short staff for new years or what , , but will try a few more times before concluding it and removed from faves ... the singapore style rice noodles were the same n that saved the day ... other goto is their chinese broccoli ..."]]]]

In [52]:
# Peek at the first two valid records (sentences of each review merged into one pair).
valid_combined_review_with_feature_comb_sent[:2]

[[2251,
  0,
  4,
  [[['size',
     'price',
     'pricing',
     'appetizers',
     'portion',
     'bucks',
     'atmosphere',
     'beer',
     'lunch',
     'food',
     'tea'],
    "great food , great price , great atmosphere ... portion size was huge comparative to the lunch pricing ... iced tea was definitely not china mist or nestle , or lipton , very good also . 2 appetizers with beer , and 2 full entree 's for about 30 bucks out the door ... simply great ... will definitely be back , , often"]]],
 [3039,
  0,
  4,
  [[['pork', 'rice', 'broccoli', 'staff', 'noodles', 'bbq'],
    "the bbq pork also was way different this time , so i 'm not sure what happened if , short staff for new years or what , , but will try a few more times before concluding it and removed from faves ... the singapore style rice noodles were the same n that saved the day ... other goto is their chinese broccoli ..."]]]]

## Write Train/Valid/Test Datasets (with Features) to Files

In [53]:
# Output locations inside ./combined_data/<dataset>/split:
#   <split>.txt            -> one combined [feature_words, text] pair per review
#   <split>_split_sent.txt -> one [feature_words, sentence] pair per sentence
split_dir = "./combined_data/{}/split".format(dataset_name)
train_combined_with_feature_comb_sent_file = "{}/train.txt".format(split_dir)
test_combined_with_feature_comb_sent_file = "{}/test.txt".format(split_dir)
valid_combined_with_feature_comb_sent_file = "{}/valid.txt".format(split_dir)
train_combined_with_feature_file = "{}/train_split_sent.txt".format(split_dir)
test_combined_with_feature_file = "{}/test_split_sent.txt".format(split_dir)
valid_combined_with_feature_file = "{}/valid_split_sent.txt".format(split_dir)

In [49]:
def _write_json_lines(path, records):
    """Write ``records`` to ``path`` as JSON Lines (one JSON document per line).

    The file is opened once, in 'w' mode, so re-running this cell replaces the
    file. The previous version re-opened the file in append mode for every
    record; a second run therefore appended a duplicate copy and glued its
    first record onto the newline-less last line of the first run.

    Args:
        path: destination file path.
        records: iterable of JSON-serialisable objects.

    As before, no newline is written after the last record.
    """
    with open(path, 'w') as f_wf:
        for line_no, record in enumerate(records):
            if line_no:
                # separator goes *before* every record except the first,
                # which leaves the file without a trailing empty line
                f_wf.write('\n')
            json.dump(record, f_wf)


# write train dataset
_write_json_lines(train_combined_with_feature_file,
                  train_combined_review_with_feature)
print("Finish writing train set.")
_write_json_lines(train_combined_with_feature_comb_sent_file,
                  train_combined_review_with_feature_comb_sent)
print("Finish writing train set (comb review sentences).")
# write test dataset
_write_json_lines(test_combined_with_feature_file,
                  test_combined_review_with_feature)
print("Finish writing test set.")
_write_json_lines(test_combined_with_feature_comb_sent_file,
                  test_combined_review_with_feature_comb_sent)
print("Finish writing test set (comb review sentences).")
# write valid dataset
_write_json_lines(valid_combined_with_feature_file,
                  valid_combined_review_with_feature)
print("Finish writing valid set.")
_write_json_lines(valid_combined_with_feature_comb_sent_file,
                  valid_combined_review_with_feature_comb_sent)
print("Finish writing valid set (comb review sentences).")

Finish writing train set.
Finish writing test set.
Finish writing valid set.


# Write Aligned Proxy Text

## Load Valid / Test Proxy

In [50]:
# Load the valid-set proxy texts into a two-level dict:
#   valid_ui2proxy[str(user)][str(item)] -> 'select_text' of that line.
# The input holds one JSON object per line; a repeated (user, item) pair
# trips the assertion.
valid_ui2sent_proxy_file = '../Dataset/{}/valid/useritem2sentids_withproxy_multilines.json'.format(dataset_name)
valid_ui2proxy = {}
cnt_line = 0
with open(valid_ui2sent_proxy_file, 'r') as f:
    print("Load file: {}".format(valid_ui2sent_proxy_file))
    for cnt_line, line in enumerate(f, 1):
        line_data = json.loads(line)
        user_id = str(line_data['user'])
        item_id = str(line_data['item'])
        proxy_text = line_data['select_text']
        # per-user dict, created on first sight of the user
        item2proxy = valid_ui2proxy.setdefault(user_id, {})
        assert item_id not in item2proxy
        item2proxy[item_id] = proxy_text
print("Number of reviews in valid set: {}".format(cnt_line))

Load file: ../Dataset/yelp/valid/useritem2sentids_withproxy_multilines.json
Number of reviews in valid set: 85748


In [51]:
# Load the test-set proxy texts into a two-level dict:
#   test_ui2proxy[str(user)][str(item)] -> 'select_text' of that line.
# The input holds one JSON object per line; a repeated (user, item) pair
# trips the assertion.
test_ui2sent_proxy_file = '../Dataset/{}/test/useritem2sentids_withproxy_multilines.json'.format(dataset_name)
test_ui2proxy = {}
cnt_line = 0
with open(test_ui2sent_proxy_file, 'r') as f:
    print("Load file: {}".format(test_ui2sent_proxy_file))
    for cnt_line, line in enumerate(f, 1):
        line_data = json.loads(line)
        user_id = str(line_data['user'])
        item_id = str(line_data['item'])
        proxy_text = line_data['select_text']
        # per-user dict, created on first sight of the user
        item2proxy = test_ui2proxy.setdefault(user_id, {})
        assert item_id not in item2proxy
        item2proxy[item_id] = proxy_text
print("Number of reviews in test set: {}".format(cnt_line))

Load file: ../Dataset/yelp/test/useritem2sentids_withproxy_multilines.json
Number of reviews in test set: 85748


In [52]:
# Align every valid review with its proxy text.
# Reads valid_combined.json line by line and, in the same order, writes:
#   * valid_proxy_combined.json : {"user", "item", "proxy"} per line
#   * valid_review.txt          : the review text, one per line
#   * valid_proxy.txt           : the proxy text, one per line
valid_combined_review_file = "../Dataset/{}/valid_combined.json".format(dataset_name)
valid_combined_proxy_file = "../Dataset/{}/valid_proxy_combined.json".format(dataset_name)
valid_review_text_file = "../Dataset/{}/valid_review.txt".format(dataset_name)
valid_proxy_text_file = "../Dataset/{}/valid_proxy.txt".format(dataset_name)
cnt_line = 0
with open(valid_combined_review_file, 'r') as f1, \
        open(valid_combined_proxy_file, 'w') as f2, \
        open(valid_review_text_file, 'w') as f3, \
        open(valid_proxy_text_file, 'w') as f4:
    print("Load file: {}".format(valid_combined_review_file))
    print("Write file: {}".format(valid_combined_proxy_file))
    print("Write file: {}".format(valid_review_text_file))
    print("Write file: {}".format(valid_proxy_text_file))
    for line in f1:
        cnt_line += 1
        valid_combined_instance = json.loads(line)
        user_id = valid_combined_instance['user']
        item_id = valid_combined_instance['item']
        review_text = valid_combined_instance['review']
        # valid_ui2proxy is keyed by str(user) / str(item); coerce the lookup
        # keys the same way so ids stored as JSON numbers do not raise KeyError
        # (a no-op when the ids are already strings)
        proxy_text = valid_ui2proxy[str(user_id)][str(item_id)]
        # write proxy text into file (ids are written as read from the input)
        proxy_line_data = {
            'user': user_id,
            'item': item_id,
            'proxy': proxy_text
        }
        json.dump(proxy_line_data, f2)
        f2.write('\n')
        # write review text
        f3.write(review_text)
        f3.write('\n')
        # write proxy text
        f4.write(proxy_text)
        f4.write('\n')
print("Finished, totally {} lines of valid data.".format(cnt_line))

Load file: ../Dataset/yelp/valid_combined.json
Write file: ../Dataset/yelp/valid_proxy_combined.json
Write file: ../Dataset/yelp/valid_review.txt
Write file: ../Dataset/yelp/valid_proxy.txt
Finished, totally 34369 lines of valid data.


In [53]:
# Align every test review with its proxy text.
# Reads test_combined.json line by line and, in the same order, writes:
#   * test_proxy_combined.json : {"user", "item", "proxy"} per line
#   * test_review.txt          : the review text, one per line
#   * test_proxy.txt           : the proxy text, one per line
test_combined_review_file = "../Dataset/{}/test_combined.json".format(dataset_name)
test_combined_proxy_file = "../Dataset/{}/test_proxy_combined.json".format(dataset_name)
test_review_text_file = "../Dataset/{}/test_review.txt".format(dataset_name)
test_proxy_text_file = "../Dataset/{}/test_proxy.txt".format(dataset_name)
cnt_line = 0
with open(test_combined_review_file, 'r') as f1, \
        open(test_combined_proxy_file, 'w') as f2, \
        open(test_review_text_file, 'w') as f3, \
        open(test_proxy_text_file, 'w') as f4:
    print("Load file: {}".format(test_combined_review_file))
    print("Write file: {}".format(test_combined_proxy_file))
    print("Write file: {}".format(test_review_text_file))
    print("Write file: {}".format(test_proxy_text_file))
    for line in f1:
        cnt_line += 1
        test_combined_instance = json.loads(line)
        user_id = test_combined_instance['user']
        item_id = test_combined_instance['item']
        review_text = test_combined_instance['review']
        # test_ui2proxy is keyed by str(user) / str(item); coerce the lookup
        # keys the same way so ids stored as JSON numbers do not raise KeyError
        # (a no-op when the ids are already strings)
        proxy_text = test_ui2proxy[str(user_id)][str(item_id)]
        # write proxy text into file (ids are written as read from the input)
        proxy_line_data = {
            'user': user_id,
            'item': item_id,
            'proxy': proxy_text
        }
        json.dump(proxy_line_data, f2)
        f2.write('\n')
        # write review text
        f3.write(review_text)
        f3.write('\n')
        # write proxy text
        f4.write(proxy_text)
        f4.write('\n')
print("Finished, totally {} lines of test data.".format(cnt_line))

Load file: ../Dataset/yelp/test_combined.json
Write file: ../Dataset/yelp/test_proxy_combined.json
Write file: ../Dataset/yelp/test_review.txt
Write file: ../Dataset/yelp/test_proxy.txt
Finished, totally 85748 lines of test data.
