In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [5]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize
from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer
import string
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [4]:
dataset_name = "tripadvisor"

# Read Data

## Load Dataset

In [6]:
dir_path = '../Dataset/{}'.format(dataset_name)
# Load test dataset
test_review = []
cnt = 0
file_path = os.path.join(dir_path, 'test_review_filtered_clean.json')
with open(file_path) as f:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        user_id = line_data['user']
        item_id = line_data['item']
        rating = line_data['rating']
        review = line_data['review']
        test_review.append([item_id, user_id, rating, review])
        cnt += 1
        if cnt % 10000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/tripadvisor/test_review_filtered_clean.json
10000 lines loaded.
Finish loading test dataset, totally 19444 lines.


In [7]:
dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
train_review = []
cnt = 0
file_path = os.path.join(dir_path, 'train_review_filtered.json')
with open(file_path) as f:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        user_id = line_data['user']
        item_id = line_data['item']
        rating = line_data['rating']
        review = line_data['review']
        train_review.append([item_id, user_id, rating, review])
        cnt += 1
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))

Load file: ../Dataset/tripadvisor/train_review_filtered.json
100000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 205595 lines.


In [8]:
df_train_data = pd.DataFrame(train_review, columns=['item', 'user', 'rating', 'review'])
df_test_data = pd.DataFrame(test_review, columns=['item', 'user', 'rating', 'review'])

In [9]:
df_test_data

Unnamed: 0,item,user,rating,review
0,1111,0,2,when i mentioned this to the front desk they d...
1,1379,0,3,"the service was good . our room , was not the ..."
2,1391,0,5,we stayed at the signature for four days to ce...
3,1579,0,4,the lake buena vista is a perfect place to sta...
4,1689,0,5,summer ( at the front desk ) was perfect ! she...
...,...,...,...,...
19439,0,999,5,"this was a pleasant place , and with our annua..."
19440,128,999,5,we enjoyed our stay at the hilton very much ! ...
19441,429,999,5,from the moment we arrived at the front desk u...
19442,816,999,4,"wifi gratuit , nous n avons pas essayé le brea..."


In [10]:
train_groupby_item_user = df_train_data.groupby(['item', 'user'])
train_groupby_item_user_dict = dict(tuple(train_groupby_item_user))

In [11]:
for idx, row in df_test_data.iterrows():
    user_id_str = row['user']
    item_id_str = row['item']
    assert (item_id_str, user_id_str) not in train_groupby_item_user_dict

## Load Sentence2ID and ID2Sentence Mapping From Training Set

In [12]:
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'r') as f:
    print("Load file: {}".format(sentence2id_filepath))
    trainset_sent_to_id = json.load(f)

Load file: ../Dataset/tripadvisor/train/sentence/sentence2id.json


In [13]:
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'r') as f:
    print("Load file: {}".format(id2sentence_filepath))
    trainset_id_to_sent = json.load(f)

Load file: ../Dataset/tripadvisor/train/sentence/id2sentence.json


In [14]:
assert len(trainset_sent_to_id) == len(trainset_id_to_sent)
print("There are {} sentences in the training set.".format(len(trainset_id_to_sent)))

There are 740398 sentences in the training set.


## Load Feature Words

In [15]:
# Feature words are the same between training and testing
# since we can only know the review text from training set
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature2id_filepath, 'r') as f:
    print("Load file: {}".format(feature2id_filepath))
    feature_vocab = json.load(f)

Load file: ../Dataset/tripadvisor/train/feature/feature2id.json


In [16]:
feature_word_list = list(feature_vocab.keys())
assert len(feature_word_list) == len(feature_vocab)
print('Number of feature words: {}'.format(len(feature_word_list)))

Number of feature words: 503


In [17]:
id2feature_filepath = '../Dataset/{}/train/feature/id2feature.json'.format(dataset_name)
with open(id2feature_filepath, 'r') as f:
    print("Load file: {}".format(id2feature_filepath))
    id2feature_train = json.load(f)

Load file: ../Dataset/tripadvisor/train/feature/id2feature.json


# Build Sentence Vocab on TestSet
## Check Whether there are reviews with no sentence

In [18]:
invalid_data = 0
for idx, row in df_test_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    if len(review_sents) == 0:
        print(row)
        invalid_data += 1

In [19]:
print(invalid_data)

0


In [20]:
def get_tf_score(text, feature_word_list):
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    word_count = vectorizer.fit_transform(text)
    return word_count.toarray()

In [21]:
# sentence vocab
sentence_count = dict()
sentence_with_no_feature = 0
# Loop for each review
# TODO: Do we need to filter out sentences with less than 3 tokens same as during training?
for idx, row in df_test_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    tf_score = get_tf_score(review_sents, feature_word_list)
    tf_sum_sents = np.sum(tf_score, axis=1)
    for i in range(len(review_sents)):
        if tf_sum_sents[i] != 0.0:
            cur_sent = review_sents[i]
            sentence_count[cur_sent] = 1 + sentence_count.get(cur_sent, 0)
        else:
            sentence_with_no_feature += 1
    if (idx+1) % 2000 == 0:
        print("Processed {} lines".format(idx+1))
print("Totally {} tracked sentences".format(len(sentence_count)))
print("There are {} sentences with no feature words".format(sentence_with_no_feature))

Processed 2000 lines
Processed 4000 lines
Processed 6000 lines
Processed 8000 lines
Processed 10000 lines
Processed 12000 lines
Processed 14000 lines
Processed 16000 lines
Processed 18000 lines
Totally 75502 tracked sentences
There are 1095 sentences with no feature words


In [22]:
len(sentence_count)

75502

In [23]:
# sort sentence based on counts (the majority should be 1)
sorted_sent_counts = sorted(sentence_count.items(), key = lambda x: -x[1])

In [24]:
# sentence_vocab_list = list(sentence_count.keys())
# Building mappings from sentences to ids and ids to sentences
testset_sent_to_id = {entry[0]: str(id) for (id, entry) in enumerate(sorted_sent_counts)}
# Since we loaded all the tokenized sentences, we don't need to add the special UNK token
testset_id_to_sent = {str(id): sent for (sent, id) in testset_sent_to_id.items()}

## Save Sentence2ID into Json File (Test / Valid Set)

In [25]:
with open('../Dataset/{}/test/sentence/id2sentence.json'.format(dataset_name), 'w') as f:
    json.dump(testset_id_to_sent, f)

with open('../Dataset/{}/test/sentence/sentence2id.json'.format(dataset_name), 'w') as f:
    json.dump(testset_sent_to_id, f)

with open('../Dataset/{}/valid/sentence/id2sentence.json'.format(dataset_name), 'w') as f:
    json.dump(testset_id_to_sent, f)

with open('../Dataset/{}/valid/sentence/sentence2id.json'.format(dataset_name), 'w') as f:
    json.dump(testset_sent_to_id, f)

## Get Sentence Feature (Test / Valid Set)

In [26]:
# Some Reviews on Test may not contain any features. Remove these reviews.
test_review_has_feature = []
review_has_no_feature_num = 0
review_has_no_feature_num_list = list()
review_has_duplicate_sentences_num = 0
review_has_duplicate_sentences_list = list()

for idx, test_rvw_j in enumerate(test_review):
    rvw_text_j = test_rvw_j[-1]
    rvw_sents_j = sent_tokenize(rvw_text_j)
    rvw_sent_ids_j = set()
    rvw_sent_ids_j_list = list()
    review_has_duplicate_sentences = False
    for sent in rvw_sents_j:
        if sent in testset_sent_to_id:
            sent_id = testset_sent_to_id[sent]
            if sent_id not in rvw_sent_ids_j:
                rvw_sent_ids_j.add(sent_id)
                rvw_sent_ids_j_list.append(sent_id)
            else:
                review_has_duplicate_sentences = True
    if review_has_duplicate_sentences:
        review_has_duplicate_sentences_num += 1
        review_has_duplicate_sentences_list.append(rvw_text_j)
    cnt_num_sents_j = len(rvw_sent_ids_j)
    assert rvw_sent_ids_j == set(rvw_sent_ids_j_list)
    # remove reviews that has no sentences with feature(s)
    if cnt_num_sents_j == 0:
        print(test_rvw_j)
        review_has_no_feature_num += 1
        review_has_no_feature_num_list.append(test_rvw_j)
    else:
        test_review_has_feature.append(test_rvw_j)

print("Total number of test reviews: {}".format(len(test_review)))
print("Number of reviews that have at least 1 features: {}".format(
    len(test_review_has_feature)))
print("Number of reviews that have no features: {}".format(
    review_has_no_feature_num))
print("Number of reviews that have duplicate sentences: {}".format(
    review_has_duplicate_sentences_num))

['1370', '1', 5, '100 % empfehlenswert .']
['160', '1186', 4, '$ 9.00 for 9 pages was very pricey .']
['3120', '1482', 1, 'parken 25 $ + tax pro tag .']
['126', '2', 5, 'parkering var veldig enkelt , men det koster 16 $ natten .']
['69', '260', 5, 'side note , wi - fi is an additional $ 10 .']
Total number of test reviews: 19444
Number of reviews that have at least 1 features: 19439
Number of reviews that have no features: 5
Number of reviews that have duplicate sentences: 9


In [27]:
review_has_duplicate_sentences_list

['dinner in the lobby tim was outstanding service and friendly . dinner in the lobby tim was outstanding service and friendly . dinner in the lobby tim was outstanding service and friendly . dinner in the lobby tim was outstanding service and friendly',
 'nice isolated hotel from main stream austin ; ie not near any real restaurants . rooms large , furniture nice , bed very comfy , towels soft & clean . very friendly front desk staff , fast internet . nice isolated hotel from main stream austin ; ie not near any real restaurants . rooms large , furniture nice , bed very comfy , towels soft & clean . very friendly front desk staff , fast internet .',
 "stay someplace else . do n't stay here . do n't stay here . do n't stay here .",
 "happy to have on - site parking , but @ $ 28 / day is a little pricey . the bathroom was tiny , but at least we did n't see any pubic hairs that did n't belong to us . when we returned from dinner we were ready to retire , but not so our neighbors . the sca

In [28]:
df_test_data

Unnamed: 0,item,user,rating,review
0,1111,0,2,when i mentioned this to the front desk they d...
1,1379,0,3,"the service was good . our room , was not the ..."
2,1391,0,5,we stayed at the signature for four days to ce...
3,1579,0,4,the lake buena vista is a perfect place to sta...
4,1689,0,5,summer ( at the front desk ) was perfect ! she...
...,...,...,...,...
19439,0,999,5,"this was a pleasant place , and with our annua..."
19440,128,999,5,we enjoyed our stay at the hilton very much ! ...
19441,429,999,5,from the moment we arrived at the front desk u...
19442,816,999,4,"wifi gratuit , nous n avons pas essayé le brea..."


In [31]:
df_test_data_feat = pd.DataFrame(
    test_review_has_feature, columns=['item', 'user', 'rating', 'review'])

In [32]:
df_test_data_feat

Unnamed: 0,item,user,rating,review
0,1111,0,2,when i mentioned this to the front desk they d...
1,1379,0,3,"the service was good . our room , was not the ..."
2,1391,0,5,we stayed at the signature for four days to ce...
3,1579,0,4,the lake buena vista is a perfect place to sta...
4,1689,0,5,summer ( at the front desk ) was perfect ! she...
...,...,...,...,...
19434,0,999,5,"this was a pleasant place , and with our annua..."
19435,128,999,5,we enjoyed our stay at the hilton very much ! ...
19436,429,999,5,from the moment we arrived at the front desk u...
19437,816,999,4,"wifi gratuit , nous n avons pas essayé le brea..."


In [33]:
print("Number of users on test set: {}".format(len(df_test_data_feat['user'].unique())))
print("Number of items on test set: {}".format(len(df_test_data_feat['item'].unique())))

Number of users on test set: 4936
Number of items on test set: 4120


In [34]:
# groupby item
group_by_item_test = df_test_data_feat.groupby('item')
group_by_item_dict = dict(tuple(group_by_item_test))

In [35]:
assert len(group_by_item_dict) == len(df_test_data_feat['item'].unique())

In [36]:
peritem_num_sent_testset = dict()
peritemreview_num_sent_testset = list()
for key, item_df_test in group_by_item_dict.items():
    # print(key)
    # print(item_df_test)
    reviews_list = item_df_test['review']
    sentence_count = 0
    for review in reviews_list:
        review_sent_count = 0
        sentences_review = sent_tokenize(review)
        for sent in sentences_review:
            if sent in testset_sent_to_id:
                sentence_count += 1
                review_sent_count += 1
        peritemreview_num_sent_testset.append(review_sent_count)
    peritem_num_sent_testset[key] = sentence_count

In [37]:
assert len(peritemreview_num_sent_testset) == len(df_test_data_feat)

In [38]:
print("Number of review in testset: {}".format(
    len(peritemreview_num_sent_testset)))
print("Mean number of sentence per review in testset: {}".format(
    np.mean(peritemreview_num_sent_testset)))
print("Min number of sentence per review in testset {}".format(
    np.min(peritemreview_num_sent_testset)))
print("Max number of sentence per review in testset {}".format(
    np.max(peritemreview_num_sent_testset)))

Number of review in testset: 19439
Mean number of sentence per review in testset: 4.0509799886825455
Min number of sentence per review in testset 1
Max number of sentence per review in testset 61


In [39]:
print("Number of items in testset: {}".format(
    len(list(peritem_num_sent_testset.values()))
))
print("Mean number of sentence per item in testset: {}".format(
    np.mean(list(peritem_num_sent_testset.values()))
))
print("Min number of sentence per item in testset: {}".format(
    np.min(list(peritem_num_sent_testset.values()))
))
print("Max number of sentence per item in testset: {}".format(
    np.max(list(peritem_num_sent_testset.values()))
))

Number of items in testset: 4120
Mean number of sentence per item in testset: 19.113349514563108
Min number of sentence per item in testset: 1
Max number of sentence per item in testset: 179


In [41]:
# groupby user
group_by_user_test = df_test_data_feat.groupby('user')
group_by_user_dict = dict(tuple(group_by_user_test))
assert len(group_by_user_dict) == len(df_test_data_feat['user'].unique())

In [42]:
peruser_num_sent_testset = dict()
peruserreview_num_sent_testset = list()
for key, user_df_test in group_by_user_dict.items():
    reviews_list = user_df_test['review']
    sentence_count = 0
    for review in reviews_list:
        review_sent_count = 0
        sentences_review = sent_tokenize(review)
        for sent in sentences_review:
            if sent in testset_sent_to_id:
                sentence_count += 1
                review_sent_count += 1
        peruserreview_num_sent_testset.append(review_sent_count)
    peruser_num_sent_testset[key] = sentence_count

In [43]:
assert len(peruserreview_num_sent_testset) == len(df_test_data_feat)

In [44]:
print("Number of review in testset: {}".format(
    len(peruserreview_num_sent_testset)))
print("Mean number of sentence per review in testset: {}".format(
    np.mean(peruserreview_num_sent_testset)))
print("Min number of sentence per review in testset {}".format(
    np.min(peruserreview_num_sent_testset)))
print("Max number of sentence per review in testset {}".format(
    np.max(peruserreview_num_sent_testset)))

Number of review in testset: 19439
Mean number of sentence per review in testset: 4.0509799886825455
Min number of sentence per review in testset 1
Max number of sentence per review in testset 61


In [45]:
print("Number of users in testset: {}".format(
    len(list(peruser_num_sent_testset.values()))
))
print("Mean number of sentence per user in testset: {}".format(
    np.mean(list(peruser_num_sent_testset.values()))
))
print("Min number of sentence per user in testset: {}".format(
    np.min(list(peruser_num_sent_testset.values()))
))
print("Max number of sentence per user in testset: {}".format(
    np.max(list(peruser_num_sent_testset.values()))
))

Number of users in testset: 4936
Mean number of sentence per user in testset: 15.953606158833063
Min number of sentence per user in testset: 1
Max number of sentence per user in testset: 143


## Compute Tf-idf

In [46]:
# get the list of sentence text on testset
testset_sent_text_list = list(testset_sent_to_id.keys())
testset_sent_text_list[:20]
# NOTE: Based on the examples, \
# should we set the short sentence threshold to be 3 instead of 2 on train?
# Currently the threshold is 2 .

['great location .',
 'i would stay here again .',
 'the staff was very friendly and helpful .',
 'i would definitely stay here again .',
 'friendly staff .',
 'the room was clean .',
 'the staff was friendly and helpful .',
 'staff was friendly .',
 'would stay here again .',
 'good location .',
 'i would stay there again .',
 'the staff was very friendly .',
 'the bed was very comfortable .',
 'we would definitely stay here again .',
 'i would stay again .',
 'the beds were very comfortable .',
 'the bed was comfortable .',
 'would definitely stay here again .',
 'great place to stay .',
 'very clean .']

In [47]:
def get_tfidf_embedding(text, feature_word_list):
    """
    :param: text: list, sent_number * word
    :return: 
        vectorizer: 
            vocabulary_: word2id
            get_feature_names(): id2word
        tfidf: array [sent_number, max_word_number]
    """
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    word_count = vectorizer.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(word_count)
    tfidf_weight = tfidf.toarray()
    return vectorizer, tfidf_weight

In [48]:
cntvector, tfidf_weight = get_tfidf_embedding(testset_sent_text_list, feature_word_list)

In [49]:
tfidf_weight.shape

(75502, 503)

In [50]:
def check_vocab_is_same(sklearn_vocab, feature_vocab):
    if len(sklearn_vocab) == len(feature_vocab):
        for key, value in sklearn_vocab.items():
            sklearn_vocab_id = value
            feature_vocab_id = feature_vocab[key]
            if int(feature_vocab_id) == sklearn_vocab_id:
                continue
            else:
                return False
    else:
        return False
    return True

In [51]:
check_vocab_is_same(cntvector.vocabulary_, feature_vocab)

True

In [52]:
testset_sentence_to_feature = dict()
sentence_with_no_feature = 0
tfidf_sum_sents = np.sum(tfidf_weight, axis=1)
for i in range(len(testset_sent_text_list)):
    cur_sent = testset_sent_text_list[i]
    # if this sentence is in the sent_to_id vocabulary
    assert cur_sent in testset_sent_to_id
    # get the sentence_id (str)
    cur_sent_id = testset_sent_to_id[cur_sent]
    assert int(cur_sent_id) == i
    # find all the feature that has non-zero tf-idf weight
    feature_dict = dict()
    for j in range(len(tfidf_weight[i])):
        if tfidf_weight[i][j] != 0.0:
            # get the feature
            feature_id = str(j)
            feature = feature_word_list[j]
            feature_tfidf = tfidf_weight[i][j]
            feature_dict[feature_id] = feature_tfidf
    if len(feature_dict) > 0:
        testset_sentence_to_feature[cur_sent_id] = feature_dict
    else:
        sentence_with_no_feature += 1
    if (i+1) % 10000 == 0:
        print("Processed {} lines".format(i+1))
print("Finish. Totally {} lines".format(i+1))
print("Totally {} sentences has at least 1 feature and {} sentences don't have feature.".format(
    len(testset_sentence_to_feature), sentence_with_no_feature))

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Finish. Totally 75502 lines
Totally 75502 sentences has at least 1 feature and 0 sentences don't have feature.


In [53]:
# save testset_sentence_to_feature into json file
# testset sent_to_id is same as validset, also save this to validset
sentence2feature_filepath = '../Dataset/{}/test/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(testset_sentence_to_feature, f)
sentence2feature_filepath = '../Dataset/{}/valid/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(testset_sentence_to_feature, f)

Write file: ../Dataset/tripadvisor/test/sentence/sentence2feature.json
Write file: ../Dataset/tripadvisor/valid/sentence/sentence2feature.json


In [54]:
print(testset_sentence_to_feature['0'])
print(testset_id_to_sent['0'])

{'9': 1.0}
great location .


In [55]:
print(testset_sentence_to_feature['75501'])
print(testset_id_to_sent['75501'])
for fea_id in testset_sentence_to_feature['75501'].keys():
    print(id2feature_train[fea_id])

{'101': 1.0}
when we were there they had a deal buy 1 entree and get a second for $ 2 .
deal


In [56]:
# get some statistics of sentence_to_feature on testset/validset
num_feature_per_sentence = []
for key, value in testset_sentence_to_feature.items():
    num_feature_per_sentence.append(len(value))
    assert len(value) > 0       # every sentence should have at least 1 feature

In [57]:
print("Mean number of features per sentence: {}".format(np.mean(num_feature_per_sentence)))
print("Max number of features per sentence: {}".format(np.max(num_feature_per_sentence)))
print("Min number of features per sentence: {}".format(np.min(num_feature_per_sentence)))

Mean number of features per sentence: 2.5262377155572038
Max number of features per sentence: 11
Min number of features per sentence: 1


## Load User to SentenceID

In [58]:
train_user2sentids_filepath = '../Dataset/{}/train/user/user2sentids.json'.format(dataset_name)
with open(train_user2sentids_filepath, 'r') as f:
    print("Load file: {}".format(train_user2sentids_filepath))
    trainset_user_to_sent_id = json.load(f)

Load file: ../Dataset/tripadvisor/train/user/user2sentids.json


## Load Item to SentenceID

In [59]:
train_item2sentids_filepath = '../Dataset/{}/train/item/item2sentids.json'.format(dataset_name)
with open(train_item2sentids_filepath, 'r') as f:
    print("Load file: {}".format(train_item2sentids_filepath))
    trainset_item_to_sent_id = json.load(f)

Load file: ../Dataset/tripadvisor/train/item/item2sentids.json


## Load User-Item Pairs on TrainSet

In [60]:
train_useritem_pairs_filepath = '../Dataset/{}/train/useritem_pairs.json'.format(dataset_name)
with open(train_useritem_pairs_filepath, 'r') as f:
    print("Load file: {}".format(train_useritem_pairs_filepath))
    trainset_useritem_pairs = json.load(f)

Load file: ../Dataset/tripadvisor/train/useritem_pairs.json


In [61]:
# Get the user set and item set on train-set
train_user_set = set()
train_item_set = set()
for key,value in trainset_useritem_pairs.items():
    uid = key
    assert uid not in train_user_set
    train_user_set.add(uid)
    for iid in value:
        train_item_set.add(iid)
print("Number of users on the constructed train set: {}".format(len(train_user_set)))
print("Number of items on the constructed train set: {}".format(len(train_item_set)))

Number of users on the constructed train set: 4950
Number of items on the constructed train set: 4493


# For Each Data Instance on TestSet

## GroupBy User

In [62]:
group_by_user_test = df_test_data_feat.groupby('user')
print("Number of users on test-set: {}".format(len(group_by_user_test)))

Number of users on test-set: 4936


## Construct Valid Dataset

In [63]:
import random
sample_sent_num = 500           # this should be among 30, 200 and 500
user_item_candidate_sent_ids_validset = dict()
cnt_empty_true_sentence = 0
user_cnt = 0
review_cnt = 0
review_with_duplicate_sentences = 0
review_with_duplicate_sentences_list = list()
user_item_candidate_sentence_num = list()
user_item_candidate_sentence_num_sampled = list()
cnt_being_cut_useritem = 0
testset_user_id_set = set()
testset_item_id_set = set()
# Loop over all users
user_cnt = 0
for user_df_chunk in list(group_by_user_test):
    user_id = int(user_df_chunk[0])
    user_id_str = str(user_df_chunk[0])
    user_df = user_df_chunk[1]
    if user_id_str not in train_user_set:
        print("User: {} not in train but in test!".format(user_id_str))
        continue
    assert user_id_str not in testset_user_id_set
    testset_user_id_set.add(user_id_str)
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for idx, row in user_df.iterrows():
        item_id = int(row['item'])
        item_id_str = str(row['item'])
        if item_id_str not in train_item_set:
            print("Item: {} not in train but in test!".format(item_id_str))
            continue
        testset_item_id_set.add(item_id_str)
        review_text = row['review']
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # get review_text's sent ids, they are on TEST set
        cur_review_sent_ids = set()
        cur_review_sent_ids_list = list()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the testset_sent_to_id dict
        review_has_duplicate_sentences = False
        for sent in review_sents:
            if sent in testset_sent_to_id:
                cur_sent_id = testset_sent_to_id[sent]
                if cur_sent_id not in cur_review_sent_ids:
                    # add this sentence into the set of current review
                    cur_review_sent_ids.add(cur_sent_id)
                    cur_review_sent_ids_list.append(cur_sent_id)
                else:
                    review_has_duplicate_sentences = True
        if review_has_duplicate_sentences:
            review_with_duplicate_sentences += 1
            review_with_duplicate_sentences_list.append([item_id_str, user_id_str, review_text])
        try:
            assert cur_review_sent_ids == set(cur_review_sent_ids_list)
        except:
            print(cur_review_sent_ids, cur_review_sent_ids_list)
        # construct the candidate set which is an union of user sentence and item sentence
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = set(random.sample(cur_useritem_sent_ids, sample_sent_num))
            cnt_being_cut_useritem += 1
        else:
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict
        if len(cur_review_sent_ids) != 0:
            # only add the ones that contain at least 1 true label sentence (on valid/test set)
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), cur_review_sent_ids_list]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
            user_item_candidate_sentence_num_sampled.append(len(sample_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    if len(item_candidate_sent_ids) == 0:
        print("User: {} has no useful items, skip it.".format(user_id_str))
    else:
        user_item_candidate_sent_ids_validset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 1000 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {0} users, {1} items.'.format(len(testset_user_id_set), len(testset_item_id_set)))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cut due to their length'.format(
    cnt_being_cut_useritem))

1000 user processed.
2000 user processed.
3000 user processed.
4000 user processed.
Finish.
Totally 4936 users, 4120 items.
Totally 19439 reviews. Among them 0 reviews has empty true label sentence
During constructing, 10075 user-item pair are being cutted due to their length


In [64]:
print("Totally {} user item pairs in the testset".format(len(user_item_candidate_sentence_num_sampled)))
print("mean number of candidate sentence: {}".format(np.median(user_item_candidate_sentence_num_sampled)))
print("max number of candidate sentence: {}".format(np.max(user_item_candidate_sentence_num_sampled)))
print("min number of candidate sentence: {}".format(np.min(user_item_candidate_sentence_num_sampled)))

Totally 19439 user item pairs in the testset
mean number of candidate sentence: 500.0
max number of candidate sentence: 500
min number of candidate sentence: 62


In [66]:
print(sorted(user_item_candidate_sentence_num)[-200:])

""" This shows that if we restrict the candidate sentences to have a maximum number of 2000,
we will cut-off about 200 reviews. This will be applied on test set to avoid user-item paris
with too many candidate sentences.
"""

[1977, 1979, 1981, 1984, 1985, 1986, 1988, 1990, 1992, 1993, 1995, 1997, 1997, 1998, 1998, 1999, 2001, 2003, 2004, 2009, 2009, 2011, 2014, 2016, 2017, 2018, 2018, 2019, 2019, 2022, 2024, 2024, 2024, 2026, 2029, 2031, 2031, 2039, 2041, 2043, 2043, 2044, 2044, 2046, 2050, 2050, 2054, 2057, 2059, 2060, 2060, 2062, 2062, 2063, 2063, 2067, 2067, 2069, 2070, 2073, 2074, 2077, 2081, 2081, 2084, 2085, 2087, 2088, 2089, 2089, 2089, 2091, 2092, 2100, 2101, 2103, 2111, 2114, 2125, 2128, 2136, 2140, 2140, 2141, 2144, 2148, 2148, 2155, 2163, 2166, 2168, 2171, 2179, 2179, 2181, 2188, 2191, 2192, 2194, 2204, 2205, 2206, 2218, 2225, 2226, 2227, 2237, 2238, 2240, 2242, 2243, 2247, 2251, 2259, 2261, 2263, 2266, 2268, 2269, 2269, 2278, 2281, 2294, 2296, 2296, 2298, 2299, 2318, 2329, 2333, 2344, 2344, 2344, 2346, 2361, 2365, 2379, 2398, 2425, 2445, 2449, 2461, 2518, 2562, 2569, 2599, 2666, 2681, 2757, 2789, 2832, 3055, 12819, 12822, 12825, 12826, 12833, 12835, 12835, 12836, 12839, 12840, 12844, 12847, 128

' This shows that if we restrict the candidate sentences to have a maximum number of 2000,\nwe will cut-off about 1000 reviews. This will be applied on test set to avoid user-item paris\nwith too many candidate sentences.\n'

In [67]:
len(user_item_candidate_sent_ids_validset)

4936

In [68]:
# save this into json file
valid_useritem2sentids_filepath = '../Dataset/{}/valid/useritem2sentids_test.json'.format(dataset_name)
with open(valid_useritem2sentids_filepath, 'w') as f:
    print("Write file: {}".format(valid_useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids_validset, f)

Write file: ../Dataset/tripadvisor/valid/useritem2sentids_test.json


In [70]:
check_user_id = "0"
check_item_id = "1689"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))

user: 0 	 item: 1689
number of sentence in candidate set: 500
number of sentence in true review set: 3


In [71]:
for sentid in user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1]:
    print(testset_id_to_sent[sentid])

summer ( at the front desk ) was perfect !
she alone is worth the stay at this hotel !
the rest of the staff went out of their way to keep me comfortable for the 5 weeks i was there !


In [72]:
# Checking How Many User/Item/Review are in the valid set
cnt_user = 0
cnt_review = 0
cnt_item_set = set()
for trainset_user_chunk in list(user_item_candidate_sent_ids_validset.items()):
    user_id_str = str(trainset_user_chunk[0])
    user_id = int(trainset_user_chunk[0])
    user_item_chunks = list(trainset_user_chunk[1].items())
    for item_chunk in user_item_chunks:
        item_id_str = str(item_chunk[0])
        item_id = int(item_chunk[0])
        # candidate_true_sent_ids = item_chunk[1]
        # cur_data_dict = {'user_id': user_id, 'item_id': item_id, 'sent_id': candidate_true_sent_ids}
        # write this into the json file
        # json.dump(cur_data_dict, f1)
        # f1.write("\n")
        # assert user_id_str in train_user_id_set
        # assert item_id_str in train_item_id_set
        cnt_item_set.add(item_id_str)
        cnt_review += 1
    cnt_user += 1

print("Total number of reviews: {}".format(cnt_review))
print("Total number of user: {}".format(cnt_user))
print("Total number of item: {}".format(len(cnt_item_set)))

Total number of reviews: 19439
Total number of user: 4936
Total number of item: 4120


In [73]:
# Write useritem2sentids_test into a line-by-line format
valid_useritem2sentids_multiline_filepath = '../Dataset/{}/valid/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(valid_useritem2sentids_multiline_filepath, 'w') as f1:
    print("Write file: {}".format(valid_useritem2sentids_multiline_filepath))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for trainset_user_chunk in list(user_item_candidate_sent_ids_validset.items()):
        user_id_str = str(trainset_user_chunk[0])
        user_id = int(trainset_user_chunk[0])
        user_item_chunks = list(trainset_user_chunk[1].items())
        for item_chunk in user_item_chunks:
            item_id_str = str(item_chunk[0])
            item_id = int(item_chunk[0])
            item_set.add(item_id_str)
            candidate_sent_ids = item_chunk[1][0]
            true_revw_sent_ids = item_chunk[1][1]
            cur_data_dict = {'user_id':user_id, 'item_id':item_id, 'candidate':candidate_sent_ids, "review":true_revw_sent_ids}
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

assert len(user_set) == cnt_user
assert len(useritem_set) == cnt_review
print("Total {} users".format(cnt_user))
print("Total {} items".format(len(item_set)))
print("Totat {} reviews".format(cnt_review))

Write file: ../Dataset/tripadvisor/valid/useritem2sentids_test_multilines.json
Total 4936 users
Total 4120 items
Totat 19439 reviews


# Construct Test Dataset

In [74]:
sample_sent_num = 2000
user_item_candidate_sent_ids_testset = dict()
cnt_empty_true_sentence = 0
user_cnt = 0
review_cnt = 0
review_with_duplicate_sentences = 0
review_with_duplicate_sentences_list = list()
user_item_candidate_sentence_num = list()
user_item_candidate_sentence_num_sampled = list()
cnt_being_cut_useritem = 0
testset_user_id_set = set()
testset_item_id_set = set()
# Loop over all users
user_cnt = 0
for user_df_chunk in list(group_by_user_test):
    user_id = int(user_df_chunk[0])
    user_id_str = str(user_df_chunk[0])
    if user_id_str not in train_user_set:
        print("User: {} not in train but in test!".format(user_id_str))
        continue
    assert user_id_str not in testset_user_id_set
    testset_user_id_set.add(user_id_str)
    user_df = user_df_chunk[1]
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for idx, row in user_df.iterrows():
        item_id = int(row['item'])
        item_id_str = str(row['item'])
        if item_id_str not in train_item_set:
            print("Item: {} not in train but in test!".format(item_id_str))
            continue
        testset_item_id_set.add(item_id_str)
        review_text = row['review']
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # get review_text's sent ids, they are on TEST set
        cur_review_sent_ids = set()
        cur_review_sent_ids_list = list()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the testset_sent_to_id dict
        review_has_duplicate_sentences = False
        for sent in review_sents:
            if sent in testset_sent_to_id:
                cur_sent_id = testset_sent_to_id[sent]
                if cur_sent_id not in cur_review_sent_ids:
                    # add this sentence into the set of current review
                    cur_review_sent_ids.add(cur_sent_id)
                    cur_review_sent_ids_list.append(cur_sent_id)
                else:
                    review_has_duplicate_sentences = True
        if review_has_duplicate_sentences:
            review_with_duplicate_sentences += 1
            review_with_duplicate_sentences_list.append([item_id_str, user_id_str, review_text])
        try:
            assert cur_review_sent_ids == set(cur_review_sent_ids_list)
        except:
            print(cur_review_sent_ids, cur_review_sent_ids_list)
        # construct the candidate set which is an union of user sentence and item sentence
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = set(random.sample(cur_useritem_sent_ids, sample_sent_num))
            cnt_being_cut_useritem += 1
        else:
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict
        if len(cur_review_sent_ids) != 0:
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), cur_review_sent_ids_list]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
            user_item_candidate_sentence_num_sampled.append(len(sample_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    if len(item_candidate_sent_ids) == 0:
        print("User: {} has no useful items, skip it.".format(user_id_str))
    else:
        user_item_candidate_sent_ids_testset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 1000 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {0} users, {1} items.'.format(len(testset_user_id_set), len(testset_item_id_set)))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print("{} reviews have duplicate sentences.".format(review_with_duplicate_sentences))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

1000 user processed.
2000 user processed.
3000 user processed.
4000 user processed.
Finish.
Totally 4936 users, 4120 items.
Totally 19439 reviews. Among them 0 reviews has empty true label sentence
9 reviews have duplicate sentences.
During constructing, 184 user-item pair are being cutted due to their length


In [75]:
print("Totally {} user item pairs in the testset".format(
    len(user_item_candidate_sentence_num)))
print("mean number of candidate sentence: {}".format(
    np.mean(user_item_candidate_sentence_num)))
print("max number of candidate sentence: {}".format(
    np.max(user_item_candidate_sentence_num)))
print("min number of candidate sentence: {}".format(
    np.min(user_item_candidate_sentence_num)))
print("mean number of sampled candidate sentence: {}".format(
    np.mean(user_item_candidate_sentence_num_sampled)))
print("max number of sampled candidate sentence: {}".format(
    np.max(user_item_candidate_sentence_num_sampled)))
print("min number of sampled candidate sentence: {}".format(
    np.min(user_item_candidate_sentence_num_sampled)))

Totally 19439 user item pairs in the testset
mean number of candidate sentence: 638.9731982097844
max number of candidate sentence: 13121
min number of candidate sentence: 62
mean number of sampled candidate sentence: 610.6922166778127
max number of sampled candidate sentence: 2000
min number of sampled candidate sentence: 62


In [76]:
print(sorted(user_item_candidate_sentence_num)[-40:])
print(sorted(user_item_candidate_sentence_num_sampled)[-40:])

[12839, 12840, 12844, 12847, 12848, 12849, 12849, 12849, 12850, 12852, 12853, 12855, 12855, 12855, 12860, 12860, 12861, 12867, 12867, 12881, 12887, 12894, 12916, 12917, 12919, 12920, 12920, 12928, 12950, 12957, 12975, 12981, 13000, 13011, 13023, 13041, 13055, 13067, 13117, 13121]
[2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000]


In [77]:
len(user_item_candidate_sent_ids_testset)

4936

In [78]:
# save this into json file
test_useritem2sentids_filepath = '../Dataset/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem2sentids_filepath, 'w') as f:
    print("Write file: {}".format(test_useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids_testset, f)

Write file: ../Dataset/tripadvisor/test/useritem2sentids_test.json


In [79]:
review_test_cnt = 0
for user_chunk in user_item_candidate_sent_ids_testset.items():
    user_id = user_chunk[0]
    user_dict = user_chunk[1]
    for user_item_chunk in user_dict.items():
        item_id = user_item_chunk[0]
        candidate_sents = user_item_chunk[0]
        true_label_sents = user_item_chunk[1]
        review_test_cnt += 1
print(review_test_cnt)

19439


In [80]:
check_user_id = "0"
check_item_id = "1689"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 0 	 item: 1689
number of sentence in candidate set: 2000
number of sentence in true review set: 3


In [81]:
# Write useritem2sentids_test into a line-by-line format
test_useritem2sentids_multiline_filepath = '../Dataset/{}/test/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(test_useritem2sentids_multiline_filepath, 'w') as f1:
    print("Write file: {}".format(test_useritem2sentids_multiline_filepath))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for trainset_user_chunk in list(user_item_candidate_sent_ids_testset.items()):
        user_id_str = str(trainset_user_chunk[0])
        user_id = int(trainset_user_chunk[0])
        user_item_chunks = list(trainset_user_chunk[1].items())
        for item_chunk in user_item_chunks:
            item_id_str = str(item_chunk[0])
            item_id = int(item_chunk[0])
            item_set.add(item_id_str)
            candidate_sent_ids = item_chunk[1][0]
            true_revw_sent_ids = item_chunk[1][1]
            cur_data_dict = {
                'user_id': user_id,
                'item_id': item_id,
                'candidate': candidate_sent_ids,
                'review': true_revw_sent_ids
            }
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

assert len(user_set) == cnt_user
assert len(useritem_set) == cnt_review
print("Total {} users".format(cnt_user))
print("Total {} items".format(len(item_set)))
print("Totat {} reviews".format(cnt_review))

Write file: ../Dataset/tripadvisor/test/useritem2sentids_test_multilines.json
Total 4936 users
Total 4120 items
Totat 19439 reviews


In [82]:
check_user_id = "999"
check_item_id = "128"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("[VALID] number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("[VALID] number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))
print("[TEST]  number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("[TEST]  number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 999 	 item: 128
[VALID] number of sentence in candidate set: 500
[VALID] number of sentence in true review set: 3
[TEST]  number of sentence in candidate set: 845
[TEST]  number of sentence in true review set: 3


In [83]:
check_user_id = "0"
check_item_id = "1111"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("[VALID] number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("[VALID] number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))
print("[TEST]  number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("[TEST]  number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 0 	 item: 1111
[VALID] number of sentence in candidate set: 500
[VALID] number of sentence in true review set: 1
[TEST]  number of sentence in candidate set: 2000
[TEST]  number of sentence in true review set: 1
