In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize
from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer
import string
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [3]:
dataset_name = "yelp"

# Read Data

## Load Dataset

In [4]:
dir_path = '../Dataset/{}'.format(dataset_name)
# Load test dataset
test_review = []
cnt = 0
file_path = os.path.join(dir_path, 'test_review_filtered_clean.json')
with open(file_path) as f:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        user_id = line_data['user']
        item_id = line_data['item']
        rating = line_data['rating']
        review = line_data['review']
        test_review.append([item_id, user_id, rating, review])
        cnt += 1
        if cnt % 10000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/yelp/test_review_filtered_clean.json
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
Finish loading test dataset, totally 42702 lines.


In [5]:
dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
train_review = []
cnt = 0
file_path = os.path.join(dir_path, 'train_review_filtered.json')
with open(file_path) as f:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        user_id = line_data['user']
        item_id = line_data['item']
        rating = line_data['rating']
        review = line_data['review']
        train_review.append([item_id, user_id, rating, review])
        cnt += 1
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))

Load file: ../Dataset/yelp/train_review_filtered.json
100000 lines loaded.
Finish loading train dataset, totally 191227 lines.


In [6]:
df_train_data = pd.DataFrame(train_review, columns=['item', 'user', 'rating', 'review'])
df_test_data = pd.DataFrame(test_review, columns=['item', 'user', 'rating', 'review'])

In [7]:
df_test_data

Unnamed: 0,item,user,rating,review
0,1098,1001,4,"great food , great price , great atmosphere ....."
1,1473,1001,4,the bbq pork also was way different this time ...
2,157,1001,4,"but that dumb naan , or pita bread stuff was a..."
3,1707,1001,4,_ price - average - please recognize fresh veg...
4,2911,1001,4,"pizza was very good , fresh ingredients , , no..."
...,...,...,...,...
42697,3933,9999,4,"they do n't have a matinee price , but then ag..."
42698,4154,9999,3,the main draw to this casinos over the others ...
42699,4565,9999,5,it 's not like normal stouts and the flavor is...
42700,624,9999,5,my two favorite meats for tacos are carne asad...


In [8]:
train_groupby_item_user = df_train_data.groupby(['item', 'user'])
train_groupby_item_user_dict = dict(tuple(train_groupby_item_user))

In [9]:
for idx, row in df_test_data.iterrows():
    user_id_str = row['user']
    item_id_str = row['item']
    assert (item_id_str, user_id_str) not in train_groupby_item_user_dict

## Load Sentence2ID and ID2Sentence Mapping From Training Set

In [10]:
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'r') as f:
    print("Load file: {}".format(sentence2id_filepath))
    trainset_sent_to_id = json.load(f)

Load file: ../Dataset/yelp/train/sentence/sentence2id.json


In [11]:
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'r') as f:
    print("Load file: {}".format(id2sentence_filepath))
    trainset_id_to_sent = json.load(f)

Load file: ../Dataset/yelp/train/sentence/id2sentence.json


In [12]:
assert len(trainset_sent_to_id) == len(trainset_id_to_sent)
print("There are {} sentences in the training set.".format(len(trainset_id_to_sent)))

There are 492739 sentences in the training set.


## Load Feature Words

In [13]:
# Feature words are the same between training and testing
# since we can only know the review text from training set
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature2id_filepath, 'r') as f:
    print("Load file: {}".format(feature2id_filepath))
    feature_vocab = json.load(f)

Load file: ../Dataset/yelp/train/feature/feature2id.json


In [14]:
feature_word_list = list(feature_vocab.keys())
assert len(feature_word_list) == len(feature_vocab)
print('Number of feature words: {}'.format(len(feature_word_list)))

Number of feature words: 498


In [15]:
id2feature_filepath = '../Dataset/{}/train/feature/id2feature.json'.format(dataset_name)
with open(id2feature_filepath, 'r') as f:
    print("Load file: {}".format(id2feature_filepath))
    id2feature_train = json.load(f)

Load file: ../Dataset/yelp/train/feature/id2feature.json


# Build Sentence Vocab on TestSet
## Check Whether there are reviews with no sentence

In [16]:
invalid_data = 0
for idx, row in df_test_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    if len(review_sents) == 0:
        print(row)
        invalid_data += 1

In [17]:
print(invalid_data)

0


In [18]:
def get_tf_score(text, feature_word_list):
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    word_count = vectorizer.fit_transform(text)
    return word_count.toarray()

In [19]:
# sentence vocab
sentence_count = dict()
sentence_with_no_feature = 0
# Loop for each review
# TODO: Do we need to filter out sentences with less than 3 tokens same as during training?
for idx, row in df_test_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    tf_score = get_tf_score(review_sents, feature_word_list)
    tf_sum_sents = np.sum(tf_score, axis=1)
    for i in range(len(review_sents)):
        if tf_sum_sents[i] != 0.0:
            cur_sent = review_sents[i]
            sentence_count[cur_sent] = 1 + sentence_count.get(cur_sent, 0)
        else:
            sentence_with_no_feature += 1
    if (idx+1) % 2000 == 0:
        print("Processed {} lines".format(idx+1))
print("Totally {} tracked sentences".format(len(sentence_count)))
print("There are {} sentences with no feature words".format(sentence_with_no_feature))

Processed 2000 lines
Processed 4000 lines
Processed 6000 lines
Processed 8000 lines
Processed 10000 lines
Processed 12000 lines
Processed 14000 lines
Processed 16000 lines
Processed 18000 lines
Processed 20000 lines
Processed 22000 lines
Processed 24000 lines
Processed 26000 lines
Processed 28000 lines
Processed 30000 lines
Processed 32000 lines
Processed 34000 lines
Processed 36000 lines
Processed 38000 lines
Processed 40000 lines
Processed 42000 lines
Totally 109833 tracked sentences
There are 897 sentences with no feature words


In [20]:
len(sentence_count)

109833

In [21]:
# sort sentence based on counts (the majority should be 1)
sorted_sent_counts = sorted(sentence_count.items(), key = lambda x: -x[1])

In [22]:
# sentence_vocab_list = list(sentence_count.keys())
# Building mappings from sentences to ids and ids to sentences
testset_sent_to_id = {entry[0]: str(id) for (id, entry) in enumerate(sorted_sent_counts)}
# Since we loaded all the tokenized sentences, we don't need to add the special UNK token
testset_id_to_sent = {str(id): sent for (sent, id) in testset_sent_to_id.items()}

## Save Sentence2ID into Json File (Test / Valid Set)

In [23]:
with open('../Dataset/{}/test/sentence/id2sentence.json'.format(dataset_name), 'w') as f:
    json.dump(testset_id_to_sent, f)

with open('../Dataset/{}/test/sentence/sentence2id.json'.format(dataset_name), 'w') as f:
    json.dump(testset_sent_to_id, f)

with open('../Dataset/{}/valid/sentence/id2sentence.json'.format(dataset_name), 'w') as f:
    json.dump(testset_id_to_sent, f)

with open('../Dataset/{}/valid/sentence/sentence2id.json'.format(dataset_name), 'w') as f:
    json.dump(testset_sent_to_id, f)

## Get Sentence Feature (Test / Valid Set)

In [24]:
df_test_data

Unnamed: 0,item,user,rating,review
0,1098,1001,4,"great food , great price , great atmosphere ....."
1,1473,1001,4,the bbq pork also was way different this time ...
2,157,1001,4,"but that dumb naan , or pita bread stuff was a..."
3,1707,1001,4,_ price - average - please recognize fresh veg...
4,2911,1001,4,"pizza was very good , fresh ingredients , , no..."
...,...,...,...,...
42697,3933,9999,4,"they do n't have a matinee price , but then ag..."
42698,4154,9999,3,the main draw to this casinos over the others ...
42699,4565,9999,5,it 's not like normal stouts and the flavor is...
42700,624,9999,5,my two favorite meats for tacos are carne asad...


In [25]:
print("Number of users on test set: {}".format(len(df_test_data['user'].unique())))
print("Number of items on test set: {}".format(len(df_test_data['item'].unique())))

Number of users on test set: 4604
Number of items on test set: 7602


In [26]:
# groupby item
group_by_item_test = df_test_data.groupby('item')
group_by_item_dict = dict(tuple(group_by_item_test))

In [27]:
assert len(group_by_item_dict) == len(df_test_data['item'].unique())

In [28]:
peritem_num_sent_testset = dict()
peritemreview_num_sent_testset = list()
for key, item_df_test in group_by_item_dict.items():
    # print(key)
    # print(item_df_test)
    reviews_list = item_df_test['review']
    sentence_count = 0
    for review in reviews_list:
        review_sent_count = 0
        sentences_review = sent_tokenize(review)
        for sent in sentences_review:
            if sent in testset_sent_to_id:
                sentence_count += 1
                review_sent_count += 1
        peritemreview_num_sent_testset.append(review_sent_count)
    peritem_num_sent_testset[key] = sentence_count

In [29]:
assert len(peritemreview_num_sent_testset) == len(df_test_data)

In [30]:
print("Number of review in testset: {}".format(
    len(peritemreview_num_sent_testset)))
print("Mean number of sentence per review in testset: {}".format(
    np.mean(peritemreview_num_sent_testset)))
print("Min number of sentence per review in testset {}".format(
    np.min(peritemreview_num_sent_testset)))
print("Max number of sentence per review in testset {}".format(
    np.max(peritemreview_num_sent_testset)))

Number of review in testset: 42702
Mean number of sentence per review in testset: 2.6423118355112174
Min number of sentence per review in testset 1
Max number of sentence per review in testset 20


In [31]:
print("Number of items in testset: {}".format(
    len(list(peritem_num_sent_testset.values()))
))
print("Mean number of sentence per item in testset: {}".format(
    np.mean(list(peritem_num_sent_testset.values()))
))
print("Min number of sentence per item in testset: {}".format(
    np.min(list(peritem_num_sent_testset.values()))
))
print("Max number of sentence per item in testset: {}".format(
    np.max(list(peritem_num_sent_testset.values()))
))

Number of items in testset: 7602
Mean number of sentence per item in testset: 14.842409892133649
Min number of sentence per item in testset: 1
Max number of sentence per item in testset: 157


In [32]:
# groupby user
group_by_user_test = df_test_data.groupby('user')
group_by_user_dict = dict(tuple(group_by_user_test))
assert len(group_by_user_dict) == len(df_test_data['user'].unique())

In [33]:
peruser_num_sent_testset = dict()
peruserreview_num_sent_testset = list()
for key, user_df_test in group_by_user_dict.items():
    reviews_list = user_df_test['review']
    sentence_count = 0
    for review in reviews_list:
        review_sent_count = 0
        sentences_review = sent_tokenize(review)
        for sent in sentences_review:
            if sent in testset_sent_to_id:
                sentence_count += 1
                review_sent_count += 1
        peruserreview_num_sent_testset.append(review_sent_count)
    peruser_num_sent_testset[key] = sentence_count

In [34]:
assert len(peruserreview_num_sent_testset) == len(df_test_data)

In [35]:
print("Number of review in testset: {}".format(
    len(peruserreview_num_sent_testset)))
print("Mean number of sentence per review in testset: {}".format(
    np.mean(peruserreview_num_sent_testset)))
print("Min number of sentence per review in testset {}".format(
    np.min(peruserreview_num_sent_testset)))
print("Max number of sentence per review in testset {}".format(
    np.max(peruserreview_num_sent_testset)))

Number of review in testset: 42702
Mean number of sentence per review in testset: 2.6423118355112174
Min number of sentence per review in testset 1
Max number of sentence per review in testset 20


In [36]:
print("Number of users in testset: {}".format(
    len(list(peruser_num_sent_testset.values()))
))
print("Mean number of sentence per user in testset: {}".format(
    np.mean(list(peruser_num_sent_testset.values()))
))
print("Min number of sentence per user in testset: {}".format(
    np.min(list(peruser_num_sent_testset.values()))
))
print("Max number of sentence per user in testset: {}".format(
    np.max(list(peruser_num_sent_testset.values()))
))

Number of users in testset: 4604
Mean number of sentence per user in testset: 24.507384882710685
Min number of sentence per user in testset: 1
Max number of sentence per user in testset: 202


## Compute Tf-idf

In [37]:
# get the list of sentence text on testset
testset_sent_text_list = list(testset_sent_to_id.keys())
testset_sent_text_list[:20]
# NOTE: Based on the examples, \
# should we set the short sentence threshold to be 3 instead of 2 on train?
# Currently the threshold is 2 .

['service was good .',
 'great service .',
 'the service was great .',
 'service was great .',
 'great food .',
 'friendly staff .',
 'the service was excellent .',
 'good service .',
 'the food was good .',
 'service was excellent .',
 'good food .',
 'the food was great .',
 'food was good .',
 'friendly service .',
 'prices are reasonable .',
 'excellent service .',
 'everything was delicious .',
 'great service !',
 'the food was ok .',
 'the service was good .']

In [38]:
def get_tfidf_embedding(text, feature_word_list):
    """
    :param: text: list, sent_number * word
    :return: 
        vectorizer: 
            vocabulary_: word2id
            get_feature_names(): id2word
        tfidf: array [sent_number, max_word_number]
    """
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    word_count = vectorizer.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(word_count)
    tfidf_weight = tfidf.toarray()
    return vectorizer, tfidf_weight

In [39]:
cntvector, tfidf_weight = get_tfidf_embedding(testset_sent_text_list, feature_word_list)

In [40]:
tfidf_weight.shape

(109833, 498)

In [41]:
def check_vocab_is_same(sklearn_vocab, feature_vocab):
    if len(sklearn_vocab) == len(feature_vocab):
        for key, value in sklearn_vocab.items():
            sklearn_vocab_id = value
            feature_vocab_id = feature_vocab[key]
            if int(feature_vocab_id) == sklearn_vocab_id:
                continue
            else:
                return False
    else:
        return False
    return True

In [42]:
check_vocab_is_same(cntvector.vocabulary_, feature_vocab)

True

In [43]:
testset_sentence_to_feature = dict()
sentence_with_no_feature = 0
tfidf_sum_sents = np.sum(tfidf_weight, axis=1)
for i in range(len(testset_sent_text_list)):
    cur_sent = testset_sent_text_list[i]
    # if this sentence is in the sent_to_id vocabulary
    assert cur_sent in testset_sent_to_id
    # get the sentence_id (str)
    cur_sent_id = testset_sent_to_id[cur_sent]
    assert int(cur_sent_id) == i
    # find all the feature that has non-zero tf-idf weight
    feature_dict = dict()
    for j in range(len(tfidf_weight[i])):
        if tfidf_weight[i][j] != 0.0:
            # get the feature
            feature_id = str(j)
            feature = feature_word_list[j]
            feature_tfidf = tfidf_weight[i][j]
            feature_dict[feature_id] = feature_tfidf
    if len(feature_dict) > 0:
        testset_sentence_to_feature[cur_sent_id] = feature_dict
    else:
        sentence_with_no_feature += 1
    if (i+1) % 10000 == 0:
        print("Processed {} lines".format(i+1))
print("Finish. Totally {} lines".format(i+1))
print("Totally {} sentences has at least 1 feature and {} sentences don't have feature.".format(
    len(testset_sentence_to_feature), sentence_with_no_feature))

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Processed 80000 lines
Processed 90000 lines
Processed 100000 lines
Finish. Totally 109833 lines
Totally 109833 sentences has at least 1 feature and 0 sentences don't have feature.


In [44]:
# save testset_sentence_to_feature into json file
# testset sent_to_id is same as validset, also save this to validset
sentence2feature_filepath = '../Dataset/{}/test/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(testset_sentence_to_feature, f)
sentence2feature_filepath = '../Dataset/{}/valid/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(testset_sentence_to_feature, f)

Write file: ../Dataset/yelp/test/sentence/sentence2feature.json
Write file: ../Dataset/yelp/valid/sentence/sentence2feature.json


In [45]:
print(testset_sentence_to_feature['0'])
print(testset_id_to_sent['0'])

{'1': 1.0}
service was good .


In [46]:
print(testset_sentence_to_feature['12310'])
print(testset_id_to_sent['12310'])
for fea_id in testset_sentence_to_feature['12310'].keys():
    print(id2feature_train[fea_id])

{'12': 0.28007497234632434, '15': 0.5606947392266455, '163': 0.7792171836328879}
i tried the chili cheese fries and to my surprise the fries were crisp yet when you go to town on the chili , softness !
cheese
fries
chili


In [47]:
# get some statistics of sentence_to_feature on testset/validset
num_feature_per_sentence = []
for key, value in testset_sentence_to_feature.items():
    num_feature_per_sentence.append(len(value))
    assert len(value) > 0       # every sentence should have at least 1 feature

In [48]:
print("Mean number of features per sentence: {}".format(np.mean(num_feature_per_sentence)))
print("Max number of features per sentence: {}".format(np.max(num_feature_per_sentence)))
print("Min number of features per sentence: {}".format(np.min(num_feature_per_sentence)))

Mean number of features per sentence: 2.0614114155126417
Max number of features per sentence: 28
Min number of features per sentence: 1


## Load User to SentenceID

In [49]:
train_user2sentids_filepath = '../Dataset/{}/train/user/user2sentids.json'.format(dataset_name)
with open(train_user2sentids_filepath, 'r') as f:
    print("Load file: {}".format(train_user2sentids_filepath))
    trainset_user_to_sent_id = json.load(f)

Load file: ../Dataset/yelp/train/user/user2sentids.json


## Load Item to SentenceID

In [50]:
train_item2sentids_filepath = '../Dataset/{}/train/item/item2sentids.json'.format(dataset_name)
with open(train_item2sentids_filepath, 'r') as f:
    print("Load file: {}".format(train_item2sentids_filepath))
    trainset_item_to_sent_id = json.load(f)

Load file: ../Dataset/yelp/train/item/item2sentids.json


## Load User-Item Pairs on TrainSet

In [51]:
train_useritem_pairs_filepath = '../Dataset/{}/train/useritem_pairs.json'.format(dataset_name)
with open(train_useritem_pairs_filepath, 'r') as f:
    print("Load file: {}".format(train_useritem_pairs_filepath))
    trainset_useritem_pairs = json.load(f)

Load file: ../Dataset/yelp/train/useritem_pairs.json


In [52]:
# Get the user set and item set on train-set
train_user_set = set()
train_item_set = set()
for key,value in trainset_useritem_pairs.items():
    uid = key
    assert uid not in train_user_set
    train_user_set.add(uid)
    for iid in value:
        train_item_set.add(iid)
print("Number of users on the constructed train set: {}".format(len(train_user_set)))
print("Number of items on the constructed train set: {}".format(len(train_item_set)))

Number of users on the constructed train set: 4604
Number of items on the constructed train set: 7837


# For Each Data Instance on TestSet

## GroupBy User

In [53]:
group_by_user_test = df_test_data.groupby('user')
print("Number of users on test-set: {}".format(len(group_by_user_test)))

Number of users on test-set: 4604


## Construct Valid Dataset

In [54]:
import random
sample_sent_num = 500           # this should be among 30, 200 and 500
user_item_candidate_sent_ids_validset = dict()
cnt_empty_true_sentence = 0
user_cnt = 0
review_cnt = 0
user_item_candidate_sentence_num = list()
user_item_candidate_sentence_num_sampled = list()
cnt_being_cut_useritem = 0
# Loop over all users
user_cnt = 0
for user_df_chunk in list(group_by_user_test):
    user_id = int(user_df_chunk[0])
    user_id_str = str(user_df_chunk[0])
    user_df = user_df_chunk[1]
    if user_id_str not in train_user_set:
        continue
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for idx, row in user_df.iterrows():
        item_id = int(row['item'])
        item_id_str = str(row['item'])
        if item_id_str not in train_item_set:
            continue
        review_text = row['review']
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # get review_text's sent ids, they are on TEST set
        cur_review_sent_ids = set()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the testset_sent_to_id dict
        for sent in review_sents:
            if sent in testset_sent_to_id:
                cur_sent_id = testset_sent_to_id[sent]
                # add this sentence into the set of current review
                cur_review_sent_ids.add(cur_sent_id)
        # construct the candidate set which is an union of user sentence and item sentence
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = set(random.sample(cur_useritem_sent_ids, sample_sent_num))
            cnt_being_cut_useritem += 1
        else:
            # FIXED!!
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict
        if len(cur_review_sent_ids) != 0:
            # only add the ones that contain at least 1 true label sentence (on valid/test set)
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), list(cur_review_sent_ids)]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
            user_item_candidate_sentence_num_sampled.append(len(sample_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    if len(item_candidate_sent_ids) == 0:
        print("User: {} has no useful items, skip it.".format(user_id_str))
    else:
        user_item_candidate_sent_ids_validset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 1000 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

1000 user processed.
2000 user processed.
3000 user processed.
4000 user processed.
Finish.
Totally 4604 users
Totally 42702 reviews. Among them 0 reviews has empty true label sentence
During constructing, 3237 user-item pair are being cutted due to their length


In [55]:
print("Totally {} user item pairs in the testset".format(len(user_item_candidate_sentence_num_sampled)))
print("mean number of candidate sentence: {}".format(np.median(user_item_candidate_sentence_num_sampled)))
print("max number of candidate sentence: {}".format(np.max(user_item_candidate_sentence_num_sampled)))
print("min number of candidate sentence: {}".format(np.min(user_item_candidate_sentence_num_sampled)))

Totally 42702 user item pairs in the testset
mean number of candidate sentence: 186.0
max number of candidate sentence: 500
min number of candidate sentence: 35


In [60]:
print(sorted(user_item_candidate_sentence_num)[-200:-100])

""" This shows that if we restrict the candidate sentences to have a maximum number of 1200,
we will cut-off about 1000 reviews. This will be applied on test set to avoid user-item paris
with too many candidate sentences.
"""

[1087, 1088, 1088, 1089, 1089, 1091, 1092, 1092, 1092, 1093, 1094, 1097, 1100, 1100, 1100, 1101, 1101, 1102, 1103, 1103, 1103, 1105, 1106, 1106, 1108, 1108, 1112, 1112, 1115, 1117, 1117, 1119, 1119, 1119, 1122, 1124, 1124, 1129, 1130, 1131, 1134, 1136, 1140, 1141, 1142, 1144, 1148, 1154, 1156, 1156, 1158, 1162, 1163, 1165, 1166, 1167, 1167, 1168, 1168, 1169, 1172, 1174, 1174, 1175, 1177, 1178, 1181, 1183, 1195, 1208, 1219, 1225, 1226, 1242, 1246, 1253, 1265, 1270, 1277, 1303, 1312, 1326, 1335, 1336, 1345, 1356, 1359, 1363, 1368, 1372, 1372, 1376, 1383, 1387, 1388, 1392, 1414, 1415, 1416, 1432]


' This shows that if we restrict the candidate sentences to have a maximum number of 1200,\nwe will cut-off about 1000 reviews. This will be applied on test set to avoid user-item paris\nwith too many candidate sentences.\n'

In [61]:
len(user_item_candidate_sent_ids_validset)

4604

In [62]:
# save this into json file
valid_useritem2sentids_filepath = '../Dataset/{}/valid/useritem2sentids_test.json'.format(dataset_name)
with open(valid_useritem2sentids_filepath, 'w') as f:
    print("Write file: {}".format(valid_useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids_validset, f)

Write file: ../Dataset/yelp/valid/useritem2sentids_test.json


In [66]:
check_user_id = "9999"
check_item_id = "4154"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))

user: 9999 	 item: 4154
number of sentence in candidate set: 116
number of sentence in true review set: 1


In [67]:
for sentid in user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1]:
    print(testset_id_to_sent[sentid])

the main draw to this casinos over the others nearby is their restaurant venues .


In [68]:
# Checking How Many User/Item/Review are in the valid set
cnt_user = 0
cnt_review = 0
cnt_item_set = set()
for trainset_user_chunk in list(user_item_candidate_sent_ids_validset.items()):
    user_id_str = str(trainset_user_chunk[0])
    user_id = int(trainset_user_chunk[0])
    user_item_chunks = list(trainset_user_chunk[1].items())
    for item_chunk in user_item_chunks:
        item_id_str = str(item_chunk[0])
        item_id = int(item_chunk[0])
        # candidate_true_sent_ids = item_chunk[1]
        # cur_data_dict = {'user_id': user_id, 'item_id': item_id, 'sent_id': candidate_true_sent_ids}
        # write this into the json file
        # json.dump(cur_data_dict, f1)
        # f1.write("\n")
        # assert user_id_str in train_user_id_set
        # assert item_id_str in train_item_id_set
        cnt_item_set.add(item_id_str)
        cnt_review += 1
    cnt_user += 1

print("Total number of reviews: {}".format(cnt_review))
print("Total number of user: {}".format(cnt_user))
print("Total number of item: {}".format(len(cnt_item_set)))

Total number of reviews: 42702
Total number of user: 4604
Total number of item: 7602


In [69]:
# Write useritem2sentids_test into a line-by-line format
valid_useritem2sentids_multiline_filepath = '../Dataset/{}/valid/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(valid_useritem2sentids_multiline_filepath, 'w') as f1:
    print("Write file: {}".format(valid_useritem2sentids_multiline_filepath))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for trainset_user_chunk in list(user_item_candidate_sent_ids_validset.items()):
        user_id_str = str(trainset_user_chunk[0])
        user_id = int(trainset_user_chunk[0])
        user_item_chunks = list(trainset_user_chunk[1].items())
        for item_chunk in user_item_chunks:
            item_id_str = str(item_chunk[0])
            item_id = int(item_chunk[0])
            item_set.add(item_id_str)
            candidate_sent_ids = item_chunk[1][0]
            true_revw_sent_ids = item_chunk[1][1]
            cur_data_dict = {'user_id':user_id, 'item_id':item_id, 'candidate':candidate_sent_ids, "review":true_revw_sent_ids}
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

assert len(user_set) == cnt_user
assert len(useritem_set) == cnt_review
print("Total {} users".format(cnt_user))
print("Total {} items".format(len(item_set)))
print("Totat {} reviews".format(cnt_review))

Write file: ../Dataset/yelp/valid/useritem2sentids_test_multilines.json
Total 4604 users
Total 7602 items
Totat 42702 reviews


# Construct Test Dataset

In [70]:
sample_sent_num = 1200
user_item_candidate_sent_ids_testset = dict()
cnt_empty_true_sentence = 0
user_cnt = 0
review_cnt = 0
user_item_candidate_sentence_num = list()
user_item_candidate_sentence_num_sampled = list()
cnt_being_cut_useritem = 0
# Loop over all users
user_cnt = 0
for user_df_chunk in list(group_by_user_test):
    user_id = int(user_df_chunk[0])
    user_id_str = str(user_df_chunk[0])
    if user_id_str not in train_user_set:
        continue
    user_df = user_df_chunk[1]
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for idx, row in user_df.iterrows():
        item_id = int(row['item'])
        item_id_str = str(row['item'])
        if item_id_str not in train_item_set:
            continue
        review_text = row['review']
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # get review_text's sent ids, they are on TEST set
        cur_review_sent_ids = set()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the testset_sent_to_id dict
        for sent in review_sents:
            if sent in testset_sent_to_id:
                cur_sent_id = testset_sent_to_id[sent]
                # add this sentence into the set of current review
                cur_review_sent_ids.add(cur_sent_id)
        # set union
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = set(random.sample(cur_useritem_sent_ids, sample_sent_num))
            cnt_being_cut_useritem += 1
        else:
            # FIXED!!
            # sample_useritem_sent_ids = cur_user_sent_ids
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict
        if len(cur_review_sent_ids) != 0:
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), list(cur_review_sent_ids)]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
            user_item_candidate_sentence_num_sampled.append(len(sample_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    user_item_candidate_sent_ids_testset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 1000 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

1000 user processed.
2000 user processed.
3000 user processed.
4000 user processed.
Finish.
Totally 4604 users
Totally 42702 reviews. Among them 0 reviews has empty true label sentence
During constructing, 131 user-item pair are being cutted due to their length


In [71]:
print("Totally {} user item pairs in the testset".format(
    len(user_item_candidate_sentence_num)))
print("mean number of candidate sentence: {}".format(
    np.mean(user_item_candidate_sentence_num)))
print("max number of candidate sentence: {}".format(
    np.max(user_item_candidate_sentence_num)))
print("min number of candidate sentence: {}".format(
    np.min(user_item_candidate_sentence_num)))
print("mean number of sampled candidate sentence: {}".format(
    np.mean(user_item_candidate_sentence_num_sampled)))
print("max number of sampled candidate sentence: {}".format(
    np.max(user_item_candidate_sentence_num_sampled)))
print("min number of sampled candidate sentence: {}".format(
    np.min(user_item_candidate_sentence_num_sampled)))

Totally 42702 user item pairs in the testset
mean number of candidate sentence: 239.02236429207062
max number of candidate sentence: 2211
min number of candidate sentence: 35
mean number of sampled candidate sentence: 237.80698796309306
max number of sampled candidate sentence: 1200
min number of sampled candidate sentence: 35


In [72]:
print(sorted(user_item_candidate_sentence_num)[-40:])
print(sorted(user_item_candidate_sentence_num_sampled)[-40:])

[1768, 1772, 1781, 1790, 1791, 1792, 1795, 1798, 1801, 1808, 1814, 1815, 1815, 1818, 1819, 1820, 1821, 1826, 1831, 1832, 1833, 1851, 1852, 1857, 1874, 1880, 1889, 1892, 1897, 1910, 1914, 1914, 1923, 1928, 1955, 1963, 2017, 2153, 2176, 2211]
[1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200, 1200]


In [73]:
len(user_item_candidate_sent_ids_testset)

4604

In [74]:
# save this into json file
test_useritem2sentids_filepath = '../Dataset/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem2sentids_filepath, 'w') as f:
    print("Write file: {}".format(test_useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids_testset, f)

Write file: ../Dataset/yelp/test/useritem2sentids_test.json


In [75]:
review_test_cnt = 0
for user_chunk in user_item_candidate_sent_ids_testset.items():
    user_id = user_chunk[0]
    user_dict = user_chunk[1]
    for user_item_chunk in user_dict.items():
        item_id = user_item_chunk[0]
        candidate_sents = user_item_chunk[0]
        true_label_sents = user_item_chunk[1]
        review_test_cnt += 1
print(review_test_cnt)

42702


In [76]:
check_user_id = "9999"
check_item_id = "4154"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 9999 	 item: 4154
number of sentence in candidate set: 116
number of sentence in true review set: 1


In [77]:
# Write useritem2sentids_test into a line-by-line format
test_useritem2sentids_multiline_filepath = '../Dataset/{}/test/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(test_useritem2sentids_multiline_filepath, 'w') as f1:
    print("Write file: {}".format(test_useritem2sentids_multiline_filepath))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for trainset_user_chunk in list(user_item_candidate_sent_ids_testset.items()):
        user_id_str = str(trainset_user_chunk[0])
        user_id = int(trainset_user_chunk[0])
        user_item_chunks = list(trainset_user_chunk[1].items())
        for item_chunk in user_item_chunks:
            item_id_str = str(item_chunk[0])
            item_id = int(item_chunk[0])
            item_set.add(item_id_str)
            candidate_sent_ids = item_chunk[1][0]
            true_revw_sent_ids = item_chunk[1][1]
            cur_data_dict = {
                'user_id': user_id,
                'item_id': item_id,
                'candidate': candidate_sent_ids,
                'review': true_revw_sent_ids
            }
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

assert len(user_set) == cnt_user
assert len(useritem_set) == cnt_review
print("Total {} users".format(cnt_user))
print("Total {} items".format(len(item_set)))
print("Totat {} reviews".format(cnt_review))

Write file: ../Dataset/yelp/test/useritem2sentids_test_multilines.json
Total 4604 users
Total 7602 items
Totat 42702 reviews


In [79]:
check_user_id = "30"
check_item_id = "354"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("[VALID] number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("[VALID] number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))
print("[TEST]  number of sentence in candidate set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("[TEST]  number of sentence in true review set: {}".format(
    len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 30 	 item: 354
[VALID] number of sentence in candidate set: 500
[VALID] number of sentence in true review set: 2
[TEST]  number of sentence in candidate set: 1078
[TEST]  number of sentence in true review set: 2
