In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize
from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer
import string
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [3]:
dataset_name = 'wine'

# Read Data

## Load Dataset

In [4]:
dir_path = '../Dataset/{}'.format(dataset_name)
# Load test dataset
test_review = []
cnt = 0
file_path = os.path.join(dir_path, 'test_filtered.json')
with open(file_path) as f:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        user_id = line_data['user']
        item_id = line_data['item']
        rating = line_data['rating']
        review = line_data['review']
        test_review.append([item_id, user_id, rating, review])
        cnt += 1
        if cnt % 10000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/wine/test_filtered.json
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
50000 lines loaded.
Finish loading test dataset, totally 59294 lines.


In [5]:
dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
train_review = []
cnt = 0
file_path = os.path.join(dir_path, 'train_filtered.json')
with open(file_path) as f:
    print("Load file: {}".format(file_path))
    for line in f:
        line_data = json.loads(line)
        user_id = line_data['user']
        item_id = line_data['item']
        rating = line_data['rating']
        review = line_data['review']
        train_review.append([item_id, user_id, rating, review])
        cnt += 1
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))

Load file: ../Dataset/wine/train_filtered.json
100000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 248452 lines.


In [6]:
df_train_data = pd.DataFrame(train_review, columns=['item', 'user', 'rating', 'review'])
df_test_data = pd.DataFrame(test_review, columns=['item', 'user', 'rating', 'review'])

In [7]:
df_test_data

Unnamed: 0,item,user,rating,review
0,1176422,131074,91,"ripe red currant jam , cedar box , spices and ..."
1,892706,131074,92,this wine is an absolute beauty . quite burgun...
2,598605,131074,93,wonderful complex nose . youthful fruit of str...
3,684257,131074,90,sensual cabernet with velvety tannins and coco...
4,544877,131074,97,"explosion of aromas on the nose , from spice t..."
...,...,...,...,...
59289,224276,130851,89,"purple red color . blackberries , currants , c..."
59290,4058,130971,94,"ooof , this wine was incredible ! insomnia due..."
59291,260890,130971,91,decanted 1/2 hour and this nice . lots of choc...
59292,1760,152917,93,"interesting bottle , but let me start by sayin..."


In [8]:
train_groupby_item_user = df_train_data.groupby(['item', 'user'])
train_groupby_item_user_dict = dict(tuple(train_groupby_item_user))

In [9]:
for idx, row in df_test_data.iterrows():
    user_id_str = row['user']
    item_id_str = row['item']
    assert (item_id_str, user_id_str) not in train_groupby_item_user_dict

## Load Sentence2ID and ID2Sentence Mapping From Training Set

In [10]:
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'r') as f:
    print("Load file: {}".format(sentence2id_filepath))
    trainset_sent_to_id = json.load(f)

Load file: ../Dataset/wine/train/sentence/sentence2id.json


In [11]:
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'r') as f:
    print("Load file: {}".format(id2sentence_filepath))
    trainset_id_to_sent = json.load(f)

Load file: ../Dataset/wine/train/sentence/id2sentence.json


In [12]:
assert len(trainset_sent_to_id) == len(trainset_id_to_sent)
print("There are {} sentences in the training set.".format(len(trainset_id_to_sent)))

There are 554564 sentences in the training set.


## Load Feature Words

In [13]:
# Feature words are the same between training and testing
# since we can only know the review text from training set
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature2id_filepath, 'r') as f:
    print("Load file: {}".format(feature2id_filepath))
    feature_vocab = json.load(f)

Load file: ../Dataset/wine/train/feature/feature2id.json


In [14]:
feature_word_list = list(feature_vocab.keys())
assert len(feature_word_list) == len(feature_vocab)
print('Number of feature words: {}'.format(len(feature_word_list)))

Number of feature words: 215


In [15]:
id2feature_filepath = '../Dataset/{}/train/feature/id2feature.json'.format(dataset_name)
with open(id2feature_filepath, 'r') as f:
    print("Load file: {}".format(id2feature_filepath))
    id2feature_train = json.load(f)

Load file: ../Dataset/wine/train/feature/id2feature.json


# Build Sentence Vocab on TestSet

## Check Whether there are reviews with no sentence

In [16]:
invalid_data = 0
for idx, row in df_test_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    if len(review_sents) == 0:
        print(row)
        invalid_data += 1

In [17]:
print(invalid_data)

0


In [18]:
def get_tf_score(text, feature_word_list):
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    word_count = vectorizer.fit_transform(text)
    return word_count.toarray()

In [19]:
# sentence vocab
sentence_count = dict()
sentence_with_no_feature = 0
# Loop for each review
# TODO: Do we need to filter out sentences with less than 3 tokens same as during training?
for idx, row in df_test_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    tf_score = get_tf_score(review_sents, feature_word_list)
    tf_sum_sents = np.sum(tf_score, axis=1)
    for i in range(len(review_sents)):
        if tf_sum_sents[i] != 0.0:
            cur_sent = review_sents[i]
            sentence_count[cur_sent] = 1 + sentence_count.get(cur_sent, 0)
        else:
            sentence_with_no_feature += 1
    if (idx+1) % 2000 == 0:
        print("Processed {} lines".format(idx+1))
print("Totally {} tracked sentences".format(len(sentence_count)))
print("There are {} sentences with no feature words".format(sentence_with_no_feature))

Processed 2000 lines
Processed 4000 lines
Processed 6000 lines
Processed 8000 lines
Processed 10000 lines
Processed 12000 lines
Processed 14000 lines
Processed 16000 lines
Processed 18000 lines
Processed 20000 lines
Processed 22000 lines
Processed 24000 lines
Processed 26000 lines
Processed 28000 lines
Processed 30000 lines
Processed 32000 lines
Processed 34000 lines
Processed 36000 lines
Processed 38000 lines
Processed 40000 lines
Processed 42000 lines
Processed 44000 lines
Processed 46000 lines
Processed 48000 lines
Processed 50000 lines
Processed 52000 lines
Processed 54000 lines
Processed 56000 lines
Processed 58000 lines
Totally 136421 tracked sentences
There are 115424 sentences with no feature words


In [20]:
len(sentence_count)

136421

In [21]:
# sort sentence based on counts (the majority should be 1)
sorted_sent_counts = sorted(sentence_count.items(), key = lambda x: -x[1])

In [22]:
# sentence_vocab_list = list(sentence_count.keys())
# Building mappings from sentences to ids and ids to sentences
testset_sent_to_id = {entry[0]: str(id) for (id, entry) in enumerate(sorted_sent_counts)}
# Since we loaded all the tokenized sentences, we don't need to add the special UNK token
testset_id_to_sent = {str(id): sent for (sent, id) in testset_sent_to_id.items()}

## Save Sentence2ID into Json File (Test / Valid set)

In [23]:
with open('../Dataset/{}/test/sentence/id2sentence.json'.format(dataset_name), 'w') as f:
    json.dump(testset_id_to_sent, f)

with open('../Dataset/{}/test/sentence/sentence2id.json'.format(dataset_name), 'w') as f:
    json.dump(testset_sent_to_id, f)

with open('../Dataset/{}/valid/sentence/id2sentence.json'.format(dataset_name), 'w') as f:
    json.dump(testset_id_to_sent, f)

with open('../Dataset/{}/valid/sentence/sentence2id.json'.format(dataset_name), 'w') as f:
    json.dump(testset_sent_to_id, f)

## Get Sentence Feature (Test/Valid Set)

In [24]:
df_test_data

Unnamed: 0,item,user,rating,review
0,1176422,131074,91,"ripe red currant jam , cedar box , spices and ..."
1,892706,131074,92,this wine is an absolute beauty . quite burgun...
2,598605,131074,93,wonderful complex nose . youthful fruit of str...
3,684257,131074,90,sensual cabernet with velvety tannins and coco...
4,544877,131074,97,"explosion of aromas on the nose , from spice t..."
...,...,...,...,...
59289,224276,130851,89,"purple red color . blackberries , currants , c..."
59290,4058,130971,94,"ooof , this wine was incredible ! insomnia due..."
59291,260890,130971,91,decanted 1/2 hour and this nice . lots of choc...
59292,1760,152917,93,"interesting bottle , but let me start by sayin..."


In [25]:
print("Number of users on test set: {}".format(len(df_test_data['user'].unique())))
print("Number of items on test set: {}".format(len(df_test_data['item'].unique())))

Number of users on test set: 6080
Number of items on test set: 14529


In [26]:
# groupby item
group_by_item_test = df_test_data.groupby('item')
group_by_item_dict = dict(tuple(group_by_item_test))

In [27]:
assert len(group_by_item_dict) == len(df_test_data['item'].unique())

In [28]:
group_by_item_dict['246']

Unnamed: 0,item,user,rating,review
32,246,5,89,there is nothing similar to a leonetti nose . ...
3231,246,808,90,"still young in colour , dark cherry center , c..."
31295,246,163680,87,i agree with my previous posting . not a bad w...
49005,246,78074,88,"nose was very weak , barely perceptible . howe..."


In [29]:
peritem_num_sent_testset = dict()
peritemreview_num_sent_testset = list()
for key, item_df_test in group_by_item_dict.items():
    # print(key)
    # print(item_df_test)
    reviews_list = item_df_test['review']
    sentence_count = 0
    for review in reviews_list:
        review_sent_count = 0
        sentences_review = sent_tokenize(review)
        for sent in sentences_review:
            if sent in testset_sent_to_id:
                sentence_count += 1
                review_sent_count += 1
        peritemreview_num_sent_testset.append(review_sent_count)
    peritem_num_sent_testset[key] = sentence_count

In [30]:
assert len(peritemreview_num_sent_testset) == len(df_test_data)

In [31]:
print("Number of review in testset: {}".format(
    len(peritemreview_num_sent_testset)))
print("Mean number of sentence per review in testset: {}".format(
    np.mean(peritemreview_num_sent_testset)))
print("Min number of sentence per review in testset {}".format(
    np.min(peritemreview_num_sent_testset)))
print("Max number of sentence per review in testset {}".format(
    np.max(peritemreview_num_sent_testset)))

Number of review in testset: 59294
Mean number of sentence per review in testset: 2.398556346341957
Min number of sentence per review in testset 1
Max number of sentence per review in testset 16


In [33]:
print("Number of items in testset: {}".format(
    len(list(peritem_num_sent_testset.values()))
))
print("Mean number of sentence per item in testset: {}".format(
    np.mean(list(peritem_num_sent_testset.values()))
))
print("Min number of sentence per item in testset: {}".format(
    np.min(list(peritem_num_sent_testset.values()))
))
print("Max number of sentence per item in testset: {}".format(
    np.max(list(peritem_num_sent_testset.values()))
))

Number of items in testset: 14529
Mean number of sentence per item in testset: 9.788698465138689
Min number of sentence per item in testset: 1
Max number of sentence per item in testset: 128


## Compute Tf-idf

In [34]:
# get the list of sentence text on testset
testset_sent_text_list = list(testset_sent_to_id.keys())
testset_sent_text_list[:10]
# NOTE: Based on the examples, \
# should we set the short sentence threshold to be 2 instead of 3 on train?

['great wine .',
 'nice wine .',
 'very nice wine .',
 'nose',
 'good acidity .',
 'dark purple color .',
 'dark ruby color .',
 'dark purple .',
 'nice acidity .',
 'excellent wine .']

In [35]:
def get_tfidf_embedding(text, feature_word_list):
    """
    :param: text: list, sent_number * word
    :return: 
        vectorizer: 
            vocabulary_: word2id
            get_feature_names(): id2word
        tfidf: array [sent_number, max_word_number]
    """
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    word_count = vectorizer.fit_transform(text)
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(word_count)
    tfidf_weight = tfidf.toarray()
    return vectorizer, tfidf_weight

In [36]:
cntvector, tfidf_weight = get_tfidf_embedding(testset_sent_text_list, feature_word_list)

In [37]:
tfidf_weight.shape

(136421, 215)

In [38]:
def check_vocab_is_same(sklearn_vocab, feature_vocab):
    if len(sklearn_vocab) == len(feature_vocab):
        for key, value in sklearn_vocab.items():
            sklearn_vocab_id = value
            feature_vocab_id = feature_vocab[key]
            if int(feature_vocab_id) == sklearn_vocab_id:
                continue
            else:
                return False
    else:
        return False
    return True

In [39]:
check_vocab_is_same(cntvector.vocabulary_, feature_vocab)

True

In [40]:
testset_sentence_to_feature = dict()
sentence_with_no_feature = 0
tfidf_sum_sents = np.sum(tfidf_weight, axis=1)
for i in range(len(testset_sent_text_list)):
    cur_sent = testset_sent_text_list[i]
    # if this sentence is in the sent_to_id vocabulary
    assert cur_sent in testset_sent_to_id
    # get the sentence_id (str)
    cur_sent_id = testset_sent_to_id[cur_sent]
    assert int(cur_sent_id) == i
    # find all the feature that has non-zero tf-idf weight
    feature_dict = dict()
    for j in range(len(tfidf_weight[i])):
        if tfidf_weight[i][j] != 0.0:
            # get the feature
            feature_id = str(j)
            feature = feature_word_list[j]
            feature_tfidf = tfidf_weight[i][j]
            feature_dict[feature_id] = feature_tfidf
    if len(feature_dict) > 0:
        testset_sentence_to_feature[cur_sent_id] = feature_dict
    else:
        sentence_with_no_feature += 1
    if (i+1) % 10000 == 0:
        print("Processed {} lines".format(i+1))
print("Finish. Totally {} lines".format(i+1))
print("Totally {} sentences has at least 1 feature and {} sentences don't have feature.".format(
    len(testset_sentence_to_feature), sentence_with_no_feature))

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Processed 80000 lines
Processed 90000 lines
Processed 100000 lines
Processed 110000 lines
Processed 120000 lines
Processed 130000 lines
Finish. Totally 136421 lines
Totally 136421 sentences has at least 1 feature and 0 sentences don't have feature.


In [41]:
# save testset_sentence_to_feature into json file
# testset sent_to_id is same as validset, also save this to validset
sentence2feature_filepath = '../Dataset/{}/test/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(testset_sentence_to_feature, f)
sentence2feature_filepath = '../Dataset/{}/valid/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(testset_sentence_to_feature, f)

Write file: ../Dataset/wine/test/sentence/sentence2feature.json
Write file: ../Dataset/wine/valid/sentence/sentence2feature.json


In [42]:
print(testset_sentence_to_feature['0'])
print(testset_id_to_sent['0'])

{'0': 1.0}
great wine .


In [44]:
print(testset_sentence_to_feature['12310'])
print(testset_id_to_sent['12310'])
for fea_id in testset_sentence_to_feature['12310'].keys():
    print(id2feature_train[fea_id])

{'2': 0.3526602148196162, '3': 0.7377476845216935, '13': 0.5756380172176452}
on the palate , similar notes - lots of citrus , dark chocolate , dark fruits .
palate
dark
citrus


In [45]:
# get some statistics of sentence_to_feature on testset/validset
num_feature_per_sentence = []
for key, value in testset_sentence_to_feature.items():
    num_feature_per_sentence.append(len(value))
    assert len(value) > 0       # every sentence should have at least 1 feature

In [46]:
print("Mean number of features per sentence: {}".format(np.mean(num_feature_per_sentence)))
print("Max number of features per sentence: {}".format(np.max(num_feature_per_sentence)))
print("Min number of features per sentence: {}".format(np.min(num_feature_per_sentence)))

Mean number of features per sentence: 1.7446727410002858
Max number of features per sentence: 13
Min number of features per sentence: 1


## Load User to SentenceID

In [47]:
train_user2sentids_filepath = '../Dataset/{}/train/user/user2sentids.json'.format(dataset_name)
with open(train_user2sentids_filepath, 'r') as f:
    print("Load file: {}".format(train_user2sentids_filepath))
    trainset_user_to_sent_id = json.load(f)

Load file: ../Dataset/wine/train/user/user2sentids.json


## Load Item to SentenceID

In [48]:
train_item2sentids_filepath = '../Dataset/{}/train/item/item2sentids.json'.format(dataset_name)
with open(train_item2sentids_filepath, 'r') as f:
    print("Load file: {}".format(train_item2sentids_filepath))
    trainset_item_to_sent_id = json.load(f)

Load file: ../Dataset/wine/train/item/item2sentids.json


## Load User-Item Pairs on TrainSet

In [49]:
train_useritem_pairs_filepath = '../Dataset/{}/train/useritem_pairs.json'.format(dataset_name)
with open(train_useritem_pairs_filepath, 'r') as f:
    print("Load file: {}".format(train_useritem_pairs_filepath))
    trainset_useritem_pairs = json.load(f)

Load file: ../Dataset/wine/train/useritem_pairs.json


In [50]:
# Get the user set and item set on train-set
train_user_set = set()
train_item_set = set()
for key,value in trainset_useritem_pairs.items():
    uid = key
    assert uid not in train_user_set
    train_user_set.add(uid)
    for iid in value:
        train_item_set.add(iid)
print("Number of users on the constructed train set: {}".format(len(train_user_set)))
print("Number of items on the constructed train set: {}".format(len(train_item_set)))

Number of users on the constructed train set: 6080
Number of items on the constructed train set: 15253


# For Each Data Instance in TestSet

## GroupBy User

In [51]:
group_by_user_test = df_test_data.groupby('user')
print("Number of users on test-set: {}".format(len(group_by_user_test)))

Number of users on test-set: 6080


## Construct Valid Dataset

#### Remember in validset we are doing samping as what we did on the train set

In [52]:
import random
sample_sent_num = 500           # this should be among 30, 200 and 500
user_item_candidate_sent_ids_validset = dict()
cnt_empty_true_sentence = 0
user_cnt = 0
review_cnt = 0
user_item_candidate_sentence_num = list()
user_item_candidate_sentence_num_sampled = list()
cnt_being_cut_useritem = 0
# Loop over all users
user_cnt = 0
for user_df_chunk in list(group_by_user_test):
    user_id = int(user_df_chunk[0])
    user_id_str = str(user_df_chunk[0])
    user_df = user_df_chunk[1]
    if user_id_str not in train_user_set:
        continue
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for idx, row in user_df.iterrows():
        item_id = int(row['item'])
        item_id_str = str(row['item'])
        if item_id_str not in train_item_set:
            continue
        review_text = row['review']
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # get review_text's sent ids, they are on TEST set
        cur_review_sent_ids = set()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the testset_sent_to_id dict
        for sent in review_sents:
            if sent in testset_sent_to_id:
                cur_sent_id = testset_sent_to_id[sent]
                # add this sentence into the set of current review
                cur_review_sent_ids.add(cur_sent_id)
        # construct the candidate set which is an union of user sentence and item sentence
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = set(random.sample(cur_useritem_sent_ids, sample_sent_num))
            cnt_being_cut_useritem += 1
        else:
            # FIXED!!
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict
        if len(cur_review_sent_ids) != 0:
            # only add the ones that contain at least 1 true label sentence (on valid/test set)
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), list(cur_review_sent_ids)]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
            user_item_candidate_sentence_num_sampled.append(len(sample_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    if len(item_candidate_sent_ids) == 0:
        print("User: {} has no useful items, skip it.".format(user_id_str))
    else:
        user_item_candidate_sent_ids_validset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 500 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

500 user processed.
1000 user processed.
1500 user processed.
2000 user processed.
2500 user processed.
3000 user processed.
3500 user processed.
4000 user processed.
4500 user processed.
5000 user processed.
5500 user processed.
6000 user processed.
Finish.
Totally 6080 users
Totally 59294 reviews. Among them 0 reviews has empty true label sentence
During constructing, 9853 user-item pair are being cutted due to their length


In [53]:
print("Totally {} user item pairs in the testset".format(len(user_item_candidate_sentence_num_sampled)))
print("mean number of candidate sentence: {}".format(np.median(user_item_candidate_sentence_num_sampled)))
print("max number of candidate sentence: {}".format(np.max(user_item_candidate_sentence_num_sampled)))
print("min number of candidate sentence: {}".format(np.min(user_item_candidate_sentence_num_sampled)))

Totally 59294 user item pairs in the testset
mean number of candidate sentence: 224.0
max number of candidate sentence: 500
min number of candidate sentence: 12


In [72]:
print(sorted(user_item_candidate_sentence_num)[-200:-100])

""" This shows that if we restrict the candidate sentences to have a maximum number of 1500,
we will cut-off about 200 reviews. This will be applied on test set to avoid user-item paris
with too many candidate sentences.
"""

[1320, 1323, 1324, 1326, 1326, 1329, 1329, 1330, 1330, 1330, 1335, 1337, 1338, 1340, 1341, 1341, 1345, 1348, 1349, 1350, 1350, 1354, 1354, 1355, 1357, 1359, 1359, 1360, 1362, 1372, 1372, 1373, 1374, 1399, 1419, 1453, 1454, 1457, 1483, 1486, 1488, 1489, 1489, 1490, 1490, 1490, 1491, 1492, 1492, 1493, 1493, 1493, 1494, 1494, 1494, 1497, 1497, 1498, 1498, 1498, 1498, 1498, 1498, 1499, 1500, 1500, 1501, 1501, 1502, 1502, 1503, 1503, 1503, 1505, 1507, 1507, 1508, 1508, 1508, 1509, 1510, 1512, 1512, 1512, 1513, 1514, 1515, 1515, 1519, 1520, 1521, 1523, 1523, 1524, 1525, 1527, 1527, 1531, 1532, 1532]


' This shows that if we restrict the candidate sentences to have a maximum number of 1000,\nwe will cut-off about 1500 reviews. This will be applied on test set to avoid user-item paris\nwith too many candidate sentences.\n'

In [62]:
len(user_item_candidate_sent_ids_validset)

6080

In [63]:
# save this into json file
valid_useritem2sentids_filepath = '../Dataset/{}/valid/useritem2sentids_test.json'.format(dataset_name)
with open(valid_useritem2sentids_filepath, 'w') as f:
    print("Write file: {}".format(valid_useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids_validset, f)

Write file: ../Dataset/wine/valid/useritem2sentids_test.json


In [65]:
check_user_id = "17048"
check_item_id = "17185"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("number of sentence in candidate set: {}".format(len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("number of sentence in true review set: {}".format(len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))

user: 17048 	 item: 17185
number of sentence in candidate set: 286
number of sentence in true review set: 2


In [67]:
for sentid in user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1]:
    print(testset_id_to_sent[sentid])

wow after 3 hours of air this has really opened up , smooth coffee , leather and fruit flavors .
decent nose of ccocoa , black fruit and graphite .


### However, the GT text is actually: "first bottle of a dozen . needs at least 2 hours of air before it opens . decent nose of ccocoa , black fruit and graphite . exceptioanlly dry but smoothes out with some air . hope it improves with time . wow after 3 hours of air this has really opened up , smooth coffee , leather and fruit flavors .". Should think more about how to select features.

In [68]:
# Checking How Many User/Item/Review are in the valid set
cnt_user = 0
cnt_review = 0
cnt_item_set = set()
for trainset_user_chunk in list(user_item_candidate_sent_ids_validset.items()):
    user_id_str = str(trainset_user_chunk[0])
    user_id = int(trainset_user_chunk[0])
    user_item_chunks = list(trainset_user_chunk[1].items())
    for item_chunk in user_item_chunks:
        item_id_str = str(item_chunk[0])
        item_id = int(item_chunk[0])
        # candidate_true_sent_ids = item_chunk[1]
        # cur_data_dict = {'user_id': user_id, 'item_id': item_id, 'sent_id': candidate_true_sent_ids}
        # write this into the json file
        # json.dump(cur_data_dict, f1)
        # f1.write("\n")
        # assert user_id_str in train_user_id_set
        # assert item_id_str in train_item_id_set
        cnt_item_set.add(item_id_str)
        cnt_review += 1
    cnt_user += 1

print("Total number of reviews: {}".format(cnt_review))
print("Total number of user: {}".format(cnt_user))
print("Total number of item: {}".format(len(cnt_item_set)))

Total number of reviews: 59294
Total number of user: 6080
Total number of item: 14529


In [69]:
# Write useritem2sentids_test into a line-by-line format
valid_useritem2sentids_multiline_filepath = '../Dataset/{}/valid/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(valid_useritem2sentids_multiline_filepath, 'w') as f1:
    print("Write file: {}".format(valid_useritem2sentids_multiline_filepath))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for trainset_user_chunk in list(user_item_candidate_sent_ids_validset.items()):
        user_id_str = str(trainset_user_chunk[0])
        user_id = int(trainset_user_chunk[0])
        user_item_chunks = list(trainset_user_chunk[1].items())
        for item_chunk in user_item_chunks:
            item_id_str = str(item_chunk[0])
            item_id = int(item_chunk[0])
            item_set.add(item_id_str)
            candidate_sent_ids = item_chunk[1][0]
            true_revw_sent_ids = item_chunk[1][1]
            cur_data_dict = {'user_id':user_id, 'item_id':item_id, 'candidate':candidate_sent_ids, "review":true_revw_sent_ids}
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

assert len(user_set) == cnt_user
assert len(useritem_set) == cnt_review
print("Total {} users".format(cnt_user))
print("Total {} items".format(len(item_set)))
print("Totat {} reviews".format(cnt_review))

Write file: ../Dataset/wine/valid/useritem2sentids_test_multilines.json
Total 6080 users
Total 14529 items
Totat 59294 reviews


# Construct Test Dataset

In [73]:
sample_sent_num = 1500
user_item_candidate_sent_ids_testset = dict()
cnt_empty_true_sentence = 0
user_cnt = 0
review_cnt = 0
user_item_candidate_sentence_num = list()
user_item_candidate_sentence_num_sampled = list()
cnt_being_cut_useritem = 0
# Loop over all users
user_cnt = 0
for user_df_chunk in list(group_by_user_test):
    user_id = int(user_df_chunk[0])
    user_id_str = str(user_df_chunk[0])
    if user_id_str not in train_user_set:
        continue
    user_df = user_df_chunk[1]
    # get user sentences, these sentences are on TRAIN set
    cur_user_sent_ids = set(trainset_user_to_sent_id[user_id_str])
    # item-level dict
    item_candidate_sent_ids = dict()
    for idx, row in user_df.iterrows():
        item_id = int(row['item'])
        item_id_str = str(row['item'])
        if item_id_str not in train_item_set:
            continue
        review_text = row['review']
        review_cnt += 1
        # get item sentences, they are on TRAIN set
        cur_item_sent_ids = set(trainset_item_to_sent_id[item_id_str])
        # get review_text's sent ids, they are on TEST set
        cur_review_sent_ids = set()
        ## tokenize this review
        review_sents = sent_tokenize(review_text)
        ## check whether this sentence is in the testset_sent_to_id dict
        for sent in review_sents:
            if sent in testset_sent_to_id:
                cur_sent_id = testset_sent_to_id[sent]
                # add this sentence into the set of current review
                cur_review_sent_ids.add(cur_sent_id)
        # set union
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        # sample some sentences (they are on TRAIN set)
        if len(cur_useritem_sent_ids) > sample_sent_num:
            sample_useritem_sent_ids = set(random.sample(cur_useritem_sent_ids, sample_sent_num))
            cnt_being_cut_useritem += 1
        else:
            # FIXED!!
            # sample_useritem_sent_ids = cur_user_sent_ids
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # add this into the dict
        if len(cur_review_sent_ids) != 0:
            item_candidate_sent_ids[item_id_str] = [list(sample_useritem_sent_ids), list(cur_review_sent_ids)]
            user_item_candidate_sentence_num.append(len(cur_useritem_sent_ids))
            user_item_candidate_sentence_num_sampled.append(len(sample_useritem_sent_ids))
        else:
            cnt_empty_true_sentence += 1

    # add this item-level dict into the user-level dict
    user_item_candidate_sent_ids_testset[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 500 == 0:
        print("{} user processed.".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, cnt_empty_true_sentence))
print('During constructing, {} user-item pair are being cutted due to their length'.format(cnt_being_cut_useritem))

500 user processed.
1000 user processed.
1500 user processed.
2000 user processed.
2500 user processed.
3000 user processed.
3500 user processed.
4000 user processed.
4500 user processed.
5000 user processed.
5500 user processed.
6000 user processed.
Finish.
Totally 6080 users
Totally 59294 reviews. Among them 0 reviews has empty true label sentence
During constructing, 134 user-item pair are being cutted due to their length


In [74]:
print("Totally {} user item pairs in the testset".format(
    len(user_item_candidate_sentence_num)))
print("mean number of candidate sentence: {}".format(
    np.mean(user_item_candidate_sentence_num)))
print("max number of candidate sentence: {}".format(
    np.max(user_item_candidate_sentence_num)))
print("min number of candidate sentence: {}".format(
    np.min(user_item_candidate_sentence_num)))
print("mean number of sampled candidate sentence: {}".format(
    np.mean(user_item_candidate_sentence_num_sampled)))
print("max number of sampled candidate sentence: {}".format(
    np.max(user_item_candidate_sentence_num_sampled)))
print("min number of sampled candidate sentence: {}".format(
    np.min(user_item_candidate_sentence_num_sampled)))

Totally 59294 user item pairs in the testset
mean number of candidate sentence: 301.1362363814214
max number of candidate sentence: 1981
min number of candidate sentence: 12
mean number of sampled candidate sentence: 300.9716497453368
max number of sampled candidate sentence: 1500
min number of sampled candidate sentence: 12


In [75]:
print(sorted(user_item_candidate_sentence_num)[-40:])
print(sorted(user_item_candidate_sentence_num_sampled)[-40:])

[1585, 1586, 1590, 1592, 1596, 1597, 1598, 1599, 1601, 1604, 1605, 1606, 1607, 1608, 1610, 1612, 1613, 1613, 1616, 1616, 1618, 1620, 1626, 1626, 1628, 1628, 1633, 1634, 1638, 1640, 1641, 1650, 1659, 1680, 1692, 1734, 1739, 1831, 1849, 1981]
[1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500]


In [76]:
len(user_item_candidate_sent_ids_testset)

6080

In [77]:
# save this into json file
test_useritem2sentids_filepath = '../Dataset/{}/test/useritem2sentids_test.json'.format(dataset_name)
with open(test_useritem2sentids_filepath, 'w') as f:
    print("Write file: {}".format(test_useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids_testset, f)

Write file: ../Dataset/wine/test/useritem2sentids_test.json


In [78]:
review_test_cnt = 0
for user_chunk in user_item_candidate_sent_ids_testset.items():
    user_id = user_chunk[0]
    user_dict = user_chunk[1]
    for user_item_chunk in user_dict.items():
        item_id = user_item_chunk[0]
        candidate_sents = user_item_chunk[0]
        true_label_sents = user_item_chunk[1]
        review_test_cnt += 1
print(review_test_cnt)

59294


In [79]:
check_user_id = "17048"
check_item_id = "17185"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("number of sentence in candidate set: {}".format(len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("number of sentence in true review set: {}".format(len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 17048 	 item: 17185
number of sentence in candidate set: 286
number of sentence in true review set: 2


In [80]:
# Write useritem2sentids_test into a line-by-line format
test_useritem2sentids_multiline_filepath = '../Dataset/{}/test/useritem2sentids_test_multilines.json'.format(dataset_name)
with open(test_useritem2sentids_multiline_filepath, 'w') as f1:
    print("Write file: {}".format(test_useritem2sentids_multiline_filepath))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for trainset_user_chunk in list(user_item_candidate_sent_ids_testset.items()):
        user_id_str = str(trainset_user_chunk[0])
        user_id = int(trainset_user_chunk[0])
        user_item_chunks = list(trainset_user_chunk[1].items())
        for item_chunk in user_item_chunks:
            item_id_str = str(item_chunk[0])
            item_id = int(item_chunk[0])
            item_set.add(item_id_str)
            candidate_sent_ids = item_chunk[1][0]
            true_revw_sent_ids = item_chunk[1][1]
            cur_data_dict = {'user_id':user_id, 'item_id':item_id, 'candidate':candidate_sent_ids, "review":true_revw_sent_ids}
            # write this into the json file
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
        cnt_user += 1
        user_set.add(user_id_str)

assert len(user_set) == cnt_user
assert len(useritem_set) == cnt_review
print("Total {} users".format(cnt_user))
print("Total {} items".format(len(item_set)))
print("Totat {} reviews".format(cnt_review))

Write file: ../Dataset/wine/test/useritem2sentids_test_multilines.json
Total 6080 users
Total 14529 items
Totat 59294 reviews


In [81]:
check_user_id = "2124"
check_item_id = "1372"
print("user: {0} \t item: {1}".format(check_user_id, check_item_id))
print("[VALID] number of sentence in candidate set: {}".format(len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][0])))
print("[VALID] number of sentence in true review set: {}".format(len(user_item_candidate_sent_ids_validset[check_user_id][check_item_id][1])))
print("[TEST]  number of sentence in candidate set: {}".format(len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][0])))
print("[TEST]  number of sentence in true review set: {}".format(len(user_item_candidate_sent_ids_testset[check_user_id][check_item_id][1])))

user: 2124 	 item: 1372
[VALID] number of sentence in candidate set: 500
[VALID] number of sentence in true review set: 1
[TEST]  number of sentence in candidate set: 603
[TEST]  number of sentence in true review set: 1
