In [1]:
# Print the path of the `python` executable that is found first on the shell PATH.
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize
from spacy.lang.en import English
# Instantiate spaCy's English language class; only its tokenizer is taken from it below.
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer
import string
# Punctuation characters from the standard library.
# NOTE(review): `tokenizer` and `punct` are not referenced by any cell visible here — confirm before removing.
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [3]:
# Name of the sub-directory under ../Dataset/ that the input and output paths below are built from.
dataset_name = "tripadvisor"

# Load Data

## Load Dataset

In [4]:
def load_reviews(file_path, report_every):
    """Read a JSON-lines review file into a list of rows.

    Args:
        file_path: Path of a file holding one JSON object per line; each
            object must provide the keys 'user', 'item', 'rating' and 'review'.
        report_every: Print a progress message every time this many lines
            have been read.

    Returns:
        A list of ``[item_id, user_id, rating, review]`` lists, one per input
        line, in file order.
    """
    reviews = []
    with open(file_path) as f:
        print("Load file: {}".format(file_path))
        for line_no, line in enumerate(f, start=1):
            line_data = json.loads(line)
            # Column order matches the DataFrame built from these rows: item first, then user.
            reviews.append([
                line_data['item'],
                line_data['user'],
                line_data['rating'],
                line_data['review'],
            ])
            if line_no % report_every == 0:
                print('{} lines loaded.'.format(line_no))
    return reviews


dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
train_review = load_reviews(os.path.join(dir_path, 'train_review_filtered.json'), 100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
test_review = load_reviews(os.path.join(dir_path, 'test_review_filtered_clean.json'), 10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/tripadvisor/train_review_filtered.json
100000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 205595 lines.
Load file: ../Dataset/tripadvisor/test_review_filtered_clean.json
10000 lines loaded.
Finish loading test dataset, totally 19444 lines.


In [5]:
# Both splits share the same column layout, matching the row order produced while loading.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data = pd.DataFrame(data=train_review, columns=review_columns)
df_test_data = pd.DataFrame(data=test_review, columns=review_columns)

In [6]:
# Display the training DataFrame (one row per review: item, user, rating, review).
df_train_data

Unnamed: 0,item,user,rating,review
0,0,0,5,this is our second stay at this hotel ; we sta...
1,1,0,3,"small , cramped rooms , moldy grout in shower ..."
2,10,0,3,"the room doors would slam when guests leave , ..."
3,100,0,2,"first , the old style tv was on top of the clo..."
4,1000,0,5,the food was exceptional - we really enjoyed t...
...,...,...,...,...
205590,752,999,5,"the rooms are spacious , quiet , and clean . m..."
205591,819,999,5,"room was very nice . bed was comfortable , had..."
205592,827,999,3,not really the best stay i ever had . room was...
205593,852,999,3,our room was not as nice as i had hoped . the ...


In [7]:
# Group by (user, item) and count the pairs that own more than one review.
groupby_user_item = df_train_data.groupby(['user', 'item'])
cnt = 0
# Iterating the GroupBy already yields each group's rows, so no extra
# get_group() lookup is needed per key.
for key, cur_df_user_item in groupby_user_item:
    if len(cur_df_user_item) > 1:
        # Only show the first few offending groups to keep the output short.
        if cnt <= 10:
            print(cur_df_user_item)
        cnt += 1
print("{} data instance are the same".format(cnt))
# A count of 0 means no (user, item) pair has more than one review.

0 data instance are the same


## Load Sentence2ID and ID2Sentence Mapping

In [8]:
# Mapping from sentence text to its (string) sentence id, stored under the train/ directory.
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'r') as sent2id_file:
    print("Load file: {}".format(sentence2id_filepath))
    sent_to_id = json.loads(sent2id_file.read())

Load file: ../Dataset/tripadvisor/train/sentence/sentence2id.json


In [9]:
# Check the Python type of one stored sentence id.
type(sent_to_id['good service .'])

str

In [10]:
# Look up the id of one example sentence.
sent_to_id['good service .']

'200'

In [11]:
# Reverse mapping: (string) sentence id -> sentence text.
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'r') as id2sent_file:
    print("Load file: {}".format(id2sentence_filepath))
    id_to_sent = json.loads(id2sent_file.read())

Load file: ../Dataset/tripadvisor/train/sentence/id2sentence.json


In [12]:
# Look up the sentence text stored for id '0'.
id_to_sent['0']

'great location .'

In [13]:
# Both directions of the sentence mapping must contain the same number of entries.
num_train_sentences = len(sent_to_id)
assert num_train_sentences == len(id_to_sent)
print("Number of sentence (with feature) on train set: {}".format(num_train_sentences))

Number of sentence (with feature) on train set: 740398


## Load Feature Words

In [14]:
# Mapping from feature word to its (string) feature id.
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature2id_filepath, 'r') as feature_file:
    print("Load file: {}".format(feature2id_filepath))
    feature_vocab = json.loads(feature_file.read())

Load file: ../Dataset/tripadvisor/train/feature/feature2id.json


In [15]:
# Number of entries in the feature vocabulary.
len(feature_vocab)

503

In [16]:
# Look up the id stored for one example feature word.
feature_vocab['wifi']

'110'

## Load Sentence2Feature

In [17]:
# Per-sentence feature information, keyed by (string) sentence id.
sentence2feature_filepath = '../Dataset/{}/train/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'r') as sent2feature_file:
    print("Load file: {}".format(sentence2feature_filepath))
    sentence_to_feature = json.loads(sent2feature_file.read())

Load file: ../Dataset/tripadvisor/train/sentence/sentence2feature.json


In [18]:
# Inspect the entry of sentence id '0'.
# NOTE(review): presumably a {feature_id: weight} dict — confirm against the file that produces it.
sentence_to_feature['0']

{'9': 1.0}

In [19]:
# Every sentence id known to sent_to_id is expected to have a feature entry.
num_sentences_with_feature = len(sentence_to_feature)
assert num_sentences_with_feature == len(sent_to_id)
num_sentences_with_feature

740398

# Construct User-Item Pair

## GroupBy User

In [20]:
# Split the training reviews per user and keep a user id -> sub-DataFrame lookup.
group_by_user = df_train_data.groupby('user')
group_by_user_dict = {user_key: user_rows for user_key, user_rows in group_by_user}

In [21]:
# Display the training reviews of one example user (user ids are string keys).
group_by_user_dict['1001']

Unnamed: 0,item,user,rating,review
4578,100,1001,4,the staff was helpful and friendly .
4579,1000,1001,5,we will stay with secrets again .
4580,102,1001,5,"i really enjoy this hotel , staff , amenities ..."
4581,1091,1001,4,pros : great cherry creek location . cons : ba...
4582,1152,1001,1,"and , honest to god , pubic hairs between the ..."
4583,118,1001,4,the resort was n't a disappointment at all . o...
4584,1344,1001,5,"the staff , the accommodations and the food se..."
4585,1408,1001,1,"carpet was full of crumbs . "" non - smoking ro..."
4586,1489,1001,1,i called back by 1:30 pm to talk to the desk a...
4587,1549,1001,4,the front desk clerks were very nice and check...


In [22]:
# Number of distinct users that have a group.
len(group_by_user_dict)

4950

In [23]:
# Distinct user ids in order of first appearance in the training data.
user_id_list = [uid for uid in df_train_data['user'].unique()]

In [24]:
# Number of distinct user ids.
print(len(user_id_list))

4950


In [25]:
# Build user id -> set of sentence ids that occur in any of that user's training reviews.
user_to_sent = dict()
cnt_sentence_with_no_feature = 0

for user_id in user_id_list:
    # user_id is a str here; it indexes the per-user DataFrame lookup
    collected_sent_ids = set()
    for review_text in group_by_user_dict[user_id]['review']:
        # split the review into sentences and keep those known to the sentence2id mapping
        for sentence in sent_tokenize(review_text):
            if sentence not in sent_to_id:
                continue
            sentence_id = sent_to_id[sentence]
            assert isinstance(sentence_id, str)
            if sentence_id not in sentence_to_feature:
                # tracked sentence without any feature entry: count it and skip it
                cnt_sentence_with_no_feature += 1
                continue
            collected_sent_ids.add(sentence_id)
        # TODO: what should we do if there are sentences that appear in multiple reviews?
    if collected_sent_ids:
        user_to_sent[user_id] = collected_sent_ids
    else:
        print("User: {} has no effective sentences, skip it.".format(user_id))
print('{} sentence with no feature'.format(cnt_sentence_with_no_feature))

0 sentence with no feature


In [26]:
# Number of users that ended up with at least one sentence id.
len(user_to_sent)

4950

In [27]:
# Convert each user's set of sentence ids to a list so the mapping can be written as JSON.
user_to_sentids = dict()
for user_id, user_sents in user_to_sent.items():
    sent_id_list = list(user_sents)
    assert len(sent_id_list) > 0
    assert isinstance(user_id, str)
    assert isinstance(sent_id_list[0], str)
    user_to_sentids[user_id] = sent_id_list

In [28]:
# Number of users in the JSON-ready mapping.
len(user_to_sentids)

4950

In [29]:
# Persist the user id -> sentence id list mapping.
user2sentids_filepath = '../Dataset/{}/train/user/user2sentids.json'.format(dataset_name)
with open(user2sentids_filepath, 'w') as user_out:
    print("Write file: {}".format(user2sentids_filepath))
    user_out.write(json.dumps(user_to_sentids))

Write file: ../Dataset/tripadvisor/train/user/user2sentids.json


In [30]:
# Number of candidate sentences each user contributes.
user_side_cdd_sents_num = [len(sent_ids) for sent_ids in user_to_sentids.values()]

In [31]:
# Summary statistics of the per-user sentence counts.
mean_sents_per_user = np.mean(user_side_cdd_sents_num)
print("Mean number of sentence per user: {}".format(mean_sents_per_user))
min_sents_per_user = np.min(user_side_cdd_sents_num)
print("Min number of sentence per user: {}".format(min_sents_per_user))
max_sents_per_user = np.max(user_side_cdd_sents_num)
print("Max number of sentence per user: {}".format(max_sents_per_user))

Mean number of sentence per user: 160.76848484848486
Min number of sentence per user: 12
Max number of sentence per user: 12781


In [32]:
# Sort once, then show both ends of the per-user sentence-count distribution.
sorted_user_sent_counts = sorted(user_side_cdd_sents_num)
print("Top-10 least numbber of sentences per user: {}".format(sorted_user_sent_counts[:10]))
print("Top-10 most numbber of sentences per user: {}".format(sorted_user_sent_counts[-10:]))

Top-10 least numbber of sentences per user: [12, 13, 14, 16, 16, 17, 17, 20, 20, 21]
Top-10 most numbber of sentences per user: [1026, 1037, 1072, 1114, 1141, 1142, 1296, 1326, 1336, 12781]


## GroupBy Item

In [33]:
# Split the training reviews per item and keep an item id -> sub-DataFrame lookup.
group_by_item = df_train_data.groupby('item')
group_by_item_dict = {item_key: item_rows for item_key, item_rows in group_by_item}

In [34]:
# Display the training reviews of one example item (item ids are string keys).
group_by_item_dict['1267']

Unnamed: 0,item,user,rating,review
264,1267,0,5,close to many other cleveland attractions . we...
12954,1267,1128,2,both rooms were very small . air conditioning ...
31017,1267,1420,4,we stayed in a newly renovated room which was ...
39340,1267,1570,2,i also found their customer service to be incr...
41250,1267,1602,3,we really enjoyed our weekend stay in clevelan...
41572,1267,161,5,the award winning sans souci restaurant overlo...
43275,1267,1641,4,the lobby and lobby bar are beautiful - soarin...
45556,1267,1689,5,we did not think that the garage was small or ...
46382,1267,170,5,my husband and i recently stayed here and had ...
48144,1267,1734,5,i 've stopped going anywhere else ... the rena...


In [35]:
# Number of distinct items that have a group.
len(group_by_item_dict)

4493

In [36]:
# Build item id -> set of sentence ids that occur in any training review of that item.
item_id_list = [iid for iid in df_train_data['item'].unique()]
item_to_sent = dict()
cnt_sentence_with_no_feature = 0
for item_id in item_id_list:
    # item ids are expected to be str; they index the per-item DataFrame lookup
    assert isinstance(item_id, str)
    collected_sent_ids = set()
    for review_text in group_by_item_dict[item_id]['review']:
        # split the review into sentences and keep those known to the sentence2id mapping
        for sentence in sent_tokenize(review_text):
            if sentence not in sent_to_id:
                continue
            sentence_id = sent_to_id[sentence]
            assert isinstance(sentence_id, str)
            if sentence_id not in sentence_to_feature:
                # tracked sentence without any feature entry: count it and skip it
                cnt_sentence_with_no_feature += 1
                continue
            # add this sentence to the set of this item
            collected_sent_ids.add(sentence_id)
        # TODO: what should we do if there are sentences that appear in multiple reviews?
    if collected_sent_ids:
        item_to_sent[item_id] = collected_sent_ids
    else:
        print("Item {} has no effective sentence, skip it.".format(item_id))

print('{} sentence with no feature'.format(cnt_sentence_with_no_feature))

0 sentence with no feature


In [37]:
# Number of items that ended up with at least one sentence id.
len(item_to_sent)

4493

In [38]:
# Convert each item's set of sentence ids to a list so the mapping can be written as JSON.
item_to_sentids = dict()
for item_id, item_sents in item_to_sent.items():
    sent_id_list = list(item_sents)
    assert len(sent_id_list) > 0
    assert isinstance(sent_id_list[0], str)
    item_to_sentids[item_id] = sent_id_list

In [39]:
# Number of items in the JSON-ready mapping.
len(item_to_sentids)

4493

In [40]:
# Persist the item id -> sentence id list mapping.
item2sentids_filepath = '../Dataset/{}/train/item/item2sentids.json'.format(dataset_name)
with open(item2sentids_filepath, 'w') as item_out:
    print("Write file: {}".format(item2sentids_filepath))
    item_out.write(json.dumps(item_to_sentids))

Write file: ../Dataset/tripadvisor/train/item/item2sentids.json


In [41]:
# Number of candidate sentences each item contributes.
item_side_cdd_sents_num = [len(sent_ids) for sent_ids in item_to_sentids.values()]

In [42]:
# Summary statistics of the per-item sentence counts.
mean_sents_per_item = np.mean(item_side_cdd_sents_num)
print("Mean number of sentence per item: {}".format(mean_sents_per_item))
min_sents_per_item = np.min(item_side_cdd_sents_num)
print("Min number of sentence per item: {}".format(min_sents_per_item))
max_sents_per_item = np.max(item_side_cdd_sents_num)
print("Max number of sentence per item: {}".format(max_sents_per_item))

Mean number of sentence per item: 185.0462942354774
Min number of sentence per item: 20
Max number of sentence per item: 1943


In [43]:
# Sort once, then show both ends of the per-item sentence-count distribution.
sorted_item_sent_counts = sorted(item_side_cdd_sents_num)
print("Top-10 least numbber of sentence per item: {}".format(sorted_item_sent_counts[:10]))
print("Top-10 most numbber of sentence per item: {}".format(sorted_item_sent_counts[-10:]))

Top-10 least numbber of sentence per item: [20, 21, 22, 23, 24, 25, 25, 28, 28, 28]
Top-10 most numbber of sentence per item: [1288, 1294, 1312, 1377, 1469, 1506, 1594, 1883, 1907, 1943]


# For Each Data Instance in TrainSet

In [45]:
import random

# Upper bound on how many user-side + item-side sentences are kept as
# candidates for one review; the review's own sentences are added on top.
sample_sent_num = 500
# Dedicated, seeded generator so the down-sampling gives the same candidate
# sets on every Restart & Run All.
sample_rng = random.Random(0)

# user id (str) -> item id (str) -> [candidate sentence ids, ground-truth sentence ids]
user_item_candidate_sent_ids = dict()
# Loop over all User
user_cnt = 0
review_cnt = 0
review_with_no_selectd_label_sentence = 0
review_cdd_sentences_down_sampled = 0
review_with_duplicate_sentences = 0
review_with_duplicate_sentences_list = list()
useable_review_cnt = 0
sentence_with_no_feature_cnt = 0
sentence_not_tracked = set()
empty_feature_review = list()
for user_key, user_df in group_by_user:
    user_id = int(user_key)
    user_id_str = str(user_key)
    # get user sents
    cur_user_sent_ids = user_to_sent[user_id_str]
    # item-level dict
    item_candidate_sent_ids = dict()
    # Iterate the two needed columns directly instead of building a Series per row.
    for raw_item_id, review_text in zip(user_df['item'], user_df['review']):
        item_id = int(raw_item_id)
        item_id_str = str(raw_item_id)
        review_cnt += 1
        # get item sents
        cur_item_sent_ids = item_to_sent[item_id_str]
        # Ground-truth sentence ids of this review: the set is used for
        # de-duplication, the list keeps the order of first appearance.
        cur_review_sent_ids = set()
        cur_review_sent_ids_list = list()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the sentence2id dictionary
        review_has_duplicate_sentences = False
        for sent in review_sents:
            if sent not in sent_to_id:
                # this sentence is not tracked by the sentence-id mapping;
                # remember it to report how many sentences are being ignored
                sentence_not_tracked.add(sent)
                continue
            cur_sent_id = sent_to_id[sent]
            assert isinstance(cur_sent_id, str)
            # make sure that this sentence has feature
            if cur_sent_id not in sentence_to_feature:
                sentence_with_no_feature_cnt += 1
                continue
            if cur_sent_id in cur_review_sent_ids:
                review_has_duplicate_sentences = True
            else:
                cur_review_sent_ids.add(cur_sent_id)
                cur_review_sent_ids_list.append(cur_sent_id)
        if review_has_duplicate_sentences:
            review_with_duplicate_sentences += 1
            review_with_duplicate_sentences_list.append(review_text)
        # Consistency check between the set and the ordered list; report a
        # mismatch explicitly instead of hiding an assert behind a bare except.
        if cur_review_sent_ids != set(cur_review_sent_ids_list):
            print(cur_review_sent_ids, cur_review_sent_ids_list)
        ## check whether the true label of the sentence is an empty list of sent_ids
        if len(cur_review_sent_ids) == 0:
            review_with_no_selectd_label_sentence += 1
            empty_feature_review.append([user_id, item_id, review_text])
        else:
            # construct the candidate set which is a union of user sentences and item sentences
            cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
            # sample some sentences
            if len(cur_useritem_sent_ids) > sample_sent_num:
                review_cdd_sentences_down_sampled += 1
                # random.sample() needs a sequence: passing a set is deprecated
                # since Python 3.9 and raises TypeError from 3.11. Sorting also
                # removes the dependence on set iteration order.
                sample_useritem_sent_ids = set(
                    sample_rng.sample(sorted(cur_useritem_sent_ids), sample_sent_num))
            else:
                sample_useritem_sent_ids = cur_useritem_sent_ids
            # union sampled sentences with true labeled sentences
            final_useritem_sent_ids = sample_useritem_sent_ids | cur_review_sent_ids
            # add this into the dict (sorted so the stored list is deterministic)
            item_candidate_sent_ids[item_id_str] = [
                sorted(final_useritem_sent_ids), cur_review_sent_ids_list]
            # add useable review cnt
            useable_review_cnt += 1
    if len(item_candidate_sent_ids) == 0:
        print("User: {} has no useful item, skip this user ...".format(user_id_str))
    else:
        # add the item_candidate_sent_ids dict into the user-level dict
        user_item_candidate_sent_ids[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 1000 == 0:
        print("{} user processed".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, review_with_no_selectd_label_sentence))
print("{} sentences have 0 feature".format(sentence_with_no_feature_cnt))
print("{} reviews have duplicate sentences".format(review_with_duplicate_sentences))
print("{} review's cdd sentences are being down-sampled".format(review_cdd_sentences_down_sampled))
print("{} sentences are not being tracked in the sent2id mapping".format(len(sentence_not_tracked)))

1000 user processed
2000 user processed
3000 user processed
4000 user processed
Finish.
Totally 4950 users
Totally 205595 reviews. Among them 63 reviews has empty true label sentence
0 sentences have 0 feature
99 reviews have duplicate sentences
106761 review's cdd sentences are being down-sampled
5710 sentences are not being tracked in the sent2id mapping


In [47]:
# Inspect one review in which the same tracked sentence occurs more than once.
review_with_duplicate_sentences_list[1]

'great stay - wonderful staff - make it your choice - great value ! great stay - wonderful staff - make it your choice - great value ! great stay - wonderful staff - make it your choice - great value ! great stay - wonderful staff - make it your choice - great value ! great stay - wonderful staff - make it your choice - great value ! great stay - wonderful staff - make it your choice - great value !'

In [48]:
# Number of users that kept at least one usable review.
len(user_item_candidate_sent_ids)

4950

In [49]:
# let's check how many unique reviews are there

cnt_unique_reviews = 0
cnt_empty_true_sent = 0
# number of ground-truth sentences / candidate sentences for every non-empty review
sentence_per_review = []
candidate_sentence_num_cnt_per_review = []
# [user-level] Loop for each user
for user_id_str, user_item_map in user_item_candidate_sent_ids.items():
    # [item-level] Loop for each user-item pair
    for item_id_str, item_entry in user_item_map.items():
        candidate_sent_ids = item_entry[0]
        true_sent_ids = item_entry[1]
        cnt_unique_reviews += 1
        if len(true_sent_ids) == 0:
            cnt_empty_true_sent += 1
            continue
        assert isinstance(candidate_sent_ids[0], str)
        assert isinstance(true_sent_ids[0], str)
        # make sure that all true label sent_ids appear in the corresponding candidate set;
        # a set difference avoids scanning the candidate list once per true sentence
        missing_sent_ids = set(true_sent_ids) - set(candidate_sent_ids)
        assert not missing_sent_ids, "true sentences missing from candidates: {}".format(
            sorted(missing_sent_ids))
        sentence_per_review.append(len(true_sent_ids))
        candidate_sentence_num_cnt_per_review.append(len(candidate_sent_ids))

print("Total number of unique selected reviews: {}".format(cnt_unique_reviews))
print("Total number of review with empty true sentences: {}".format(cnt_empty_true_sent))
print("Total number of unique review with non-empty true sentences: {}".format(
    cnt_unique_reviews - cnt_empty_true_sent))

Total number of unique selected reviews: 205532
Total number of review with empty true sentences: 0
Total number of unique review with non-empty true sentences: 205532


In [50]:
# Summary of ground-truth and candidate sentence counts over all usable reviews.
num_user_item_pairs = len(sentence_per_review)
print("Totally {} user-item pairs in the trainset".format(num_user_item_pairs))
max_true_sents = np.max(sentence_per_review)
print("max number of true sentence per review: {}".format(max_true_sents))
min_true_sents = np.min(sentence_per_review)
print("min number of true sentence per review: {}".format(min_true_sents))
mean_true_sents = np.mean(sentence_per_review)
print("mean number of true sentence per review: {}".format(mean_true_sents))
max_cdd_sents = np.max(candidate_sentence_num_cnt_per_review)
print("max number of candidate sentence per review: {}".format(max_cdd_sents))
min_cdd_sents = np.min(candidate_sentence_num_cnt_per_review)
print("min number of candidate sentence per review: {}".format(min_cdd_sents))
mean_cdd_sents = np.mean(candidate_sentence_num_cnt_per_review)
print("mean number of candidate sentence per review: {}".format(mean_cdd_sents))

Totally 205532 user-item pairs in the trainset
max number of true sentence per review: 62
min number of true sentence per review: 1
mean number of true sentence per review: 4.050074927505206
max number of candidate sentence per review: 545
min number of candidate sentence per review: 36
mean number of candidate sentence per review: 417.56846622423757


In [51]:
# Persist the nested user -> item -> [candidates, ground truth] mapping as one JSON document.
useritem2sentids_filepath = '../Dataset/{}/train/useritem2sentids.json'.format(dataset_name)
with open(useritem2sentids_filepath, 'w') as useritem_out:
    print("Write file: {}".format(useritem2sentids_filepath))
    useritem_out.write(json.dumps(user_item_candidate_sent_ids))

Write file: ../Dataset/tripadvisor/train/useritem2sentids.json


In [52]:
# Spot-check one (user, item) pair: sizes of the candidate set, its two sources, and the ground truth.
example_user_id = '1001'
example_item_id = '1964'
print("User: {0}\tItem: {1}".format(example_user_id, example_item_id))
example_entry = user_item_candidate_sent_ids[example_user_id][example_item_id]
num_example_cdd = len(example_entry[0])
print("Number of (sampled) cdd sents: ", num_example_cdd)
num_example_user_side = len(user_to_sent[example_user_id])
print("Number of user side cdd sents:", num_example_user_side)
num_example_item_side = len(item_to_sent[example_item_id])
print("Number of item side cdd sents:", num_example_item_side)
num_example_gt = len(example_entry[1])
print("Number of GT sentences: ", num_example_gt)

User: 1001	Item: 1964
Number of (sampled) cdd sents:  252
Number of user side cdd sents: 148
Number of item side cdd sents: 106
Number of GT sentences:  2


In [53]:
# Every ground-truth sentence of the example pair must be among its candidates.
example_pair_entry = user_item_candidate_sent_ids[example_user_id][example_item_id]
for sent_id in example_pair_entry[1]:
    assert sent_id in example_pair_entry[0]

## Write train useritem_cdd in a line-by-line format

In [55]:
# Target path of the line-by-line (one JSON object per line) version of useritem2sentids.
train_useritem2sentid_multiline_file = '../Dataset/{}/train/useritem2sentids_multilines.json'.format(dataset_name)
# Start from a clean slate: delete a stale file from an earlier run if there is one.
if not os.path.exists(train_useritem2sentid_multiline_file):
    print("File: {} doesn't exist, creat it.".format(train_useritem2sentid_multiline_file))
else:
    print("File: {} exists, remove it.".format(train_useritem2sentid_multiline_file))
    os.remove(train_useritem2sentid_multiline_file)

File: ../Dataset/tripadvisor/train/useritem2sentids_multilines.json doesn't exist, creat it.


In [56]:
# Write one JSON object per (user, item) pair. The file is opened in 'w' mode so
# that re-running this cell on its own overwrites the file instead of appending a
# second copy of every record.
with open(train_useritem2sentid_multiline_file, 'w') as f1:
    print("Write file: {}".format(train_useritem2sentid_multiline_file))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for user_id_str, user_item_map in user_item_candidate_sent_ids.items():
        assert isinstance(user_id_str, str)
        for item_id_str, item_entry in user_item_map.items():
            assert isinstance(item_id_str, str)
            candidate_sent_ids = item_entry[0]
            gold_revw_sent_ids = item_entry[1]
            assert isinstance(candidate_sent_ids, list)
            assert isinstance(gold_revw_sent_ids, list)
            assert isinstance(candidate_sent_ids[0], str)
            assert isinstance(gold_revw_sent_ids[0], str)
            # every gold sentence must be part of the candidates; the set lookup
            # avoids scanning the candidate list once per gold sentence
            candidate_id_set = set(candidate_sent_ids)
            for cur_id in gold_revw_sent_ids:
                assert cur_id in candidate_id_set
            cur_data_dict = {'user_id': user_id_str, 'item_id': item_id_str, 'candidate': candidate_sent_ids, "review": gold_revw_sent_ids}
            # write this record as one line of the json file
            f1.write(json.dumps(cur_data_dict))
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
            item_set.add(item_id_str)
            if cnt_review % 50000 == 0:
                print("{} lines of train data written.".format(cnt_review))
        cnt_user += 1
        user_set.add(user_id_str)

# each user and each (user, item) pair must have been written exactly once
assert cnt_user == len(user_set)
assert cnt_review == len(useritem_set)
print("Totally {} users".format(cnt_user))
print("Totally {} items".format(len(item_set)))
print("Totally {} reviews".format(cnt_review))

Write file: ../Dataset/tripadvisor/train/useritem2sentids_multilines.json
50000 lines of train data written.
100000 lines of train data written.
150000 lines of train data written.
200000 lines of train data written.
Totally 4950 users
Totally 4493 items
Totally 205532 reviews


In [57]:
!head -n 1 '../Dataset/yelp/train/useritem2sentids_multilines.json'

{"user_id": "1001", "item_id": "1071", "candidate": ["6144", "412871", "438927", "6155", "6127", "6137", "6133", "165483", "45990", "438928", "6163", "45989", "6166", "6187", "409044", "485587", "412869", "6097", "6217", "6227", "6200", "6115", "6119", "6153", "6230", "865", "6102", "453", "23482", "305680", "6210", "208077", "169420", "6168", "6195", "31", "880", "6122", "198007", "394026", "6223", "124875", "98241", "6152", "280101", "6199", "246", "454130", "198008", "107329", "231995", "438926", "6214", "6183", "169415", "124874", "6150", "263583", "45985", "45986", "6103", "305679", "6130", "6182", "30778", "6160", "6146", "231994", "443587", "57278", "84019", "454133", "6156", "6179", "107328", "6196", "6104", "169419", "280534", "23478", "231996", "68133", "6139", "57279", "6140", "6194", "6205", "98507", "6189", "6212", "6116", "6099", "6201", "336082", "6164", "45987", "6221", "169417", "6158", "6142", "6188", "6176", "6174", "98508", "280533", "84017", "23479", "347514", "610

## Save User-Item Pairs (Train Set)

In [58]:
# Build user id -> list of item ids for every usable (user, item) pair in the train set.
user_item_pairs = dict()
cnt_user_item_pairs = 0
for user_id_str, user_item_map in user_item_candidate_sent_ids.items():
    assert isinstance(user_id_str, str)
    assert user_id_str not in user_item_pairs
    items_of_user = list()
    user_item_pairs[user_id_str] = items_of_user
    for item_id_str in user_item_map.keys():
        assert isinstance(item_id_str, str)
        # an item may appear only once per user
        assert item_id_str not in items_of_user
        items_of_user.append(item_id_str)
        cnt_user_item_pairs += 1
print("Total number of user-item pair on trainset: {}".format(cnt_user_item_pairs))

Total number of user-item pair on trainset: 205532


In [59]:
# Persist the user id -> item id list mapping.
train_useritempairs_file = '../Dataset/{}/train/useritem_pairs.json'.format(dataset_name)
with open(train_useritempairs_file, 'w') as pairs_out:
    print("write file: {}".format(train_useritempairs_file))
    pairs_out.write(json.dumps(user_item_pairs))

write file: ../Dataset/tripadvisor/train/useritem_pairs.json
