In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize
from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer
import string
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [3]:
# Name of the dataset sub-directory; the '../Dataset/{}' paths in later cells are formatted with it.
dataset_name = "wine"

# Load Data

## Load Dataset

In [4]:
def load_review_file(file_path, progress_every):
    """Read a JSON-lines review file into a list of rows.

    Each non-blank line must be a JSON object with the keys 'user', 'item',
    'rating' and 'review'.

    Args:
        file_path: path of the JSON-lines file to read.
        progress_every: print a progress message every this many loaded rows.

    Returns:
        A list of [item_id, user_id, rating, review] lists, in file order.

    Raises:
        KeyError: if a record is missing one of the four expected keys.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    rows = []
    with open(file_path) as f:
        print("Load file: {}".format(file_path))
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
            if not line.strip():
                continue
            line_data = json.loads(line)
            user_id = line_data['user']
            item_id = line_data['item']
            rating = line_data['rating']
            review = line_data['review']
            rows.append([item_id, user_id, rating, review])
            if len(rows) % progress_every == 0:
                print('{} lines loaded.'.format(len(rows)))
    return rows


dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
file_path = os.path.join(dir_path, 'train_filtered.json')
train_review = load_review_file(file_path, 100000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
file_path = os.path.join(dir_path, 'test_filtered.json')
test_review = load_review_file(file_path, 10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/wine/train_filtered.json
100000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 248452 lines.
Load file: ../Dataset/wine/test_filtered.json
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
50000 lines loaded.
Finish loading test dataset, totally 59294 lines.


In [5]:
# Both splits share the same column layout, matching the row order built while loading.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data = pd.DataFrame(train_review, columns=review_columns)
df_test_data = pd.DataFrame(test_review, columns=review_columns)

In [6]:
# Preview the training reviews: one row per (item, user, rating, review) record.
df_train_data

Unnamed: 0,item,user,rating,review
0,1015705,131074,93,this wine is simply outstanding . the nose has...
1,661436,131074,83,"ripe but diluted bing cherry , a hint of spice..."
2,458829,131074,92,blackberry fruit and smoke . remarkable depth ...
3,887649,131074,88,"clear and crisp , tangy lemon and fresh vegeta..."
4,744973,131074,90,juicy young fruit . darker . currant . some pl...
...,...,...,...,...
248447,70969,152917,93,"very structured , tannins still firm , a tad b..."
248448,470778,152917,88,me thinks this wine is passing it 's prime as ...
248449,1785,152917,90,wonderfully aged bordeaux . it took about 5 to...
248450,481908,152917,95,outstanding effort from kosta browne . i had t...


In [7]:
# Count (user, item) pairs that occur in more than one training row.
# The groupby iterator already yields each group's DataFrame, so it is used
# directly instead of looking the same group up again with get_group().
groupby_user_item = df_train_data.groupby(['user', 'item'])
cnt = 0
for key, cur_df_user_item in groupby_user_item:
    if len(cur_df_user_item) > 1:
        # Only show the first few offending groups to keep the output short.
        if cnt <= 10:
            print(cur_df_user_item)
        cnt += 1
print("{} data instance are the same".format(cnt))
# A count of 0 means no (user, item) pair has more than one review.

0 data instance are the same


## Load Sentence2ID and ID2Sentence Mapping

In [9]:
# Load the sentence-text -> sentence-id mapping of the train split.
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'r') as sentence2id_file:
    print("Load file: {}".format(sentence2id_filepath))
    sent_to_id = json.load(sentence2id_file)

Load file: ../Dataset/wine/train/sentence/sentence2id.json


In [10]:
# Check the Python type of a stored sentence id.
type(sent_to_id['very nice wine .'])

str

In [11]:
# Load the sentence-id -> sentence-text mapping of the train split.
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'r') as id2sentence_file:
    print("Load file: {}".format(id2sentence_filepath))
    id_to_sent = json.load(id2sentence_file)

Load file: ../Dataset/wine/train/sentence/id2sentence.json


In [12]:
# Look up the sentence text stored under id '0'.
id_to_sent['0']

'very nice wine .'

In [13]:
# The forward and reverse sentence mappings must contain the same number of entries.
num_train_sentences = len(sent_to_id)
assert num_train_sentences == len(id_to_sent)
print("Number of sentence (with feature) on train set: {}".format(num_train_sentences))

Number of sentence (with feature) on train set: 554564


## Load Feature Words

In [14]:
# Load the feature-word -> feature-id mapping of the train split.
feature2id_filepath = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature2id_filepath, 'r') as feature2id_file:
    print("Load file: {}".format(feature2id_filepath))
    feature_vocab = json.load(feature2id_file)

Load file: ../Dataset/wine/train/feature/feature2id.json


In [15]:
# Number of entries in the feature vocabulary.
len(feature_vocab)

215

In [16]:
# Look up the id stored for the feature word 'aroma'.
feature_vocab['aroma']

'28'

## Load Sentence2Feature

In [17]:
# Load the sentence-id -> feature mapping of the train split.
sentence2feature_filepath = '../Dataset/{}/train/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'r') as sentence2feature_file:
    print("Load file: {}".format(sentence2feature_filepath))
    sentence_to_feature = json.load(sentence2feature_file)

Load file: ../Dataset/wine/train/sentence/sentence2feature.json


In [19]:
# Inspect the feature entry stored for sentence id '0'.
sentence_to_feature['0']

{'0': 1.0}

In [18]:
# Every sentence in the sentence2id mapping should have a feature entry, so the sizes must match.
num_sentences_with_feature = len(sentence_to_feature)
assert num_sentences_with_feature == len(sent_to_id)
num_sentences_with_feature

554564

# Construct User-Item Pair
## GroupBy User

In [20]:
# Split the training frame per user and keep the pieces in a dict keyed by user id.
group_by_user = df_train_data.groupby('user')
group_by_user_dict = {user_key: user_frame for user_key, user_frame in group_by_user}

In [21]:
# Inspect the training rows grouped under user '3'.
group_by_user_dict['3']

Unnamed: 0,item,user,rating,review
70,77773,3,90,served with fresh black italian truffles over ...
71,788700,3,91,"beautiful wine - yeast , nutty bread , dry & c..."
72,178465,3,90,"delicious bottle , none of us familiar with it..."
73,246,3,90,a delightful surprise ... wonderful floral ( v...
74,7570,3,89,"a v ery concentrated and big wine ... spciy , ..."
75,251,3,90,a very pleasant surprise ! drank w/markj over ...
76,4260,3,91,"outstanding ! fresh back from spain , and havi..."
77,288673,3,89,"dark , concentrated , very young , but delightful"
78,7103,3,85,"initial nose had touch of ammonia , which diss..."
79,4350,3,90,fantastic ! dominant notes of vibrant fruit an...


In [22]:
# Number of user groups in the training frame.
len(group_by_user_dict)

6080

In [23]:
# Distinct user ids, in order of first appearance in the training frame.
unique_user_ids = pd.unique(df_train_data['user'])
user_id_list = list(unique_user_ids)

In [24]:
# Number of distinct user ids in the training frame.
print(len(user_id_list))

6080


In [25]:
# Map each training user to the set of sentence ids found in that user's reviews.
user_to_sent = {}
cnt_sentence_with_no_feature = 0

for user_id in user_id_list:
    # user_id is a str; collect ids over every review written by this user
    collected_sent_ids = set()
    for review_text in list(group_by_user_dict[user_id]['review']):
        # split the review into sentences and keep the ones present in sentence2id
        for sentence in sent_tokenize(review_text):
            if sentence not in sent_to_id:
                continue
            sentence_id = sent_to_id[sentence]
            assert isinstance(sentence_id, str)
            if sentence_id not in sentence_to_feature:
                # the sentence has an id but no feature entry: count it and move on
                cnt_sentence_with_no_feature += 1
                continue
            collected_sent_ids.add(sentence_id)
        # TODO: what should we do if there are sentences that appear in multiple reviews?
    if collected_sent_ids:
        user_to_sent[user_id] = collected_sent_ids
    else:
        print("User: {} has no effective sentences, skip it.".format(user_id))
print('{} sentence with no feature'.format(cnt_sentence_with_no_feature))

0 sentence with no feature


In [26]:
# Number of users that ended up with at least one sentence id.
len(user_to_sent)

6080

In [27]:
# Convert each user's set of sentence ids into a list so it can be dumped as JSON.
user_to_sentids = {}
for user_id, user_sents in user_to_sent.items():
    assert len(user_sents) > 0
    assert isinstance(user_id, str)
    sent_id_list = list(user_sents)
    assert isinstance(sent_id_list[0], str)
    user_to_sentids[user_id] = sent_id_list

In [28]:
# Number of users in the JSON-serialisable mapping.
len(user_to_sentids)

6080

In [29]:
# Persist the user -> sentence-id-list mapping as a single JSON object.
# The path uses a single '/' after 'Dataset', matching every other path in this notebook.
user2sentids_filepath = '../Dataset/{}/train/user/user2sentids.json'.format(dataset_name)
with open(user2sentids_filepath, 'w') as f:
    print("Write file: {}".format(user2sentids_filepath))
    json.dump(user_to_sentids, f)

Write file: ../Dataset//wine/train/user/user2sentids.json


In [40]:
# Number of candidate sentences available on the user side, one entry per user.
user_side_cdd_sents_num = [len(sent_ids) for sent_ids in user_to_sentids.values()]

In [42]:
# Summary statistics of the per-user sentence counts.
user_sent_mean = np.mean(user_side_cdd_sents_num)
print("Mean number of sentence per user: {}".format(user_sent_mean))
user_sent_min = np.min(user_side_cdd_sents_num)
print("Min number of sentence per user: {}".format(user_sent_min))
user_sent_max = np.max(user_side_cdd_sents_num)
print("Max number of sentence per user: {}".format(user_sent_max))

Mean number of sentence per user: 93.92467105263158
Min number of sentence per user: 1
Max number of sentence per user: 1529


In [43]:
# Sort the per-user counts once and show both ends of the distribution.
user_sent_counts_sorted = sorted(user_side_cdd_sents_num)
print("Top-10 least numbber of sentences per user: {}".format(user_sent_counts_sorted[:10]))
print("Top-10 most numbber of sentences per user: {}".format(user_sent_counts_sorted[-10:]))

Top-10 least numbber of sentences per user: [1, 2, 3, 4, 5, 5, 6, 6, 7, 7]
Top-10 most numbber of sentences per user: [1032, 1033, 1034, 1077, 1148, 1183, 1197, 1237, 1473, 1529]


## GroupBy Item

In [30]:
# Split the training frame per item and keep the pieces in a dict keyed by item id.
group_by_item = df_train_data.groupby('item')
group_by_item_dict = {item_key: item_frame for item_key, item_frame in group_by_item}

In [31]:
# Inspect the training rows grouped under item '14785'.
group_by_item_dict['14785']

Unnamed: 0,item,user,rating,review
5412,14785,193,90,this winery continues to be a great qpr and th...
13300,14785,809,89,"dinner at plateau club - a very nice cab , cle..."
18527,14785,1417,93,beautiful wine .
45913,14785,7887,91,this is really drinking well right now . plent...
76826,14785,14747,90,"really good now , berry cassis and hint of oak..."
101169,14785,23509,91,dark red with slight fade at rim . wonderfully...
151235,14785,41815,92,that is some kind of cabernet sauvignon i tell...
175729,14785,53459,88,"strawberries and blueberries , pops out at you..."


In [32]:
# Number of item groups in the training frame.
len(group_by_item_dict)

15253

In [33]:
# Map each training item to the set of sentence ids found in the reviews written about it.
item_id_list = list(df_train_data['item'].unique())
item_to_sent = {}
cnt_sentence_with_no_feature = 0
for item_id in item_id_list:
    # item ids are expected to be strings
    assert isinstance(item_id, str)
    collected_sent_ids = set()
    for review_text in list(group_by_item_dict[item_id]['review']):
        # split the review into sentences and keep the ones present in sentence2id
        for sentence in sent_tokenize(review_text):
            if sentence not in sent_to_id:
                continue
            sentence_id = sent_to_id[sentence]
            assert isinstance(sentence_id, str)
            if sentence_id not in sentence_to_feature:
                # the sentence has an id but no feature entry: count it and move on
                cnt_sentence_with_no_feature += 1
                continue
            # keep this sentence in the set of the current item
            collected_sent_ids.add(sentence_id)
        # TODO: what should we do if there are sentences that appear in multiple reviews?
    if collected_sent_ids:
        item_to_sent[item_id] = collected_sent_ids
    else:
        print("Item {} has no effective sentence, skip it.".format(item_id))

print('{} sentence with no feature'.format(cnt_sentence_with_no_feature))

0 sentence with no feature


In [34]:
# Number of items that ended up with at least one sentence id.
len(item_to_sent)

15253

In [35]:
# Convert each item's set of sentence ids into a list so it can be dumped as JSON.
item_to_sentids = {}
for item_id, item_sents in item_to_sent.items():
    assert len(item_sents) > 0
    sent_id_list = list(item_sents)
    assert isinstance(sent_id_list[0], str)
    item_to_sentids[item_id] = sent_id_list

In [36]:
# Number of items in the JSON-serialisable mapping.
len(item_to_sentids)

15253

In [38]:
# Persist the item -> sentence-id-list mapping as a single JSON object.
item2sentids_filepath = '../Dataset/{}/train/item/item2sentids.json'.format(dataset_name)
with open(item2sentids_filepath, 'w') as item2sentids_file:
    print("Write file: {}".format(item2sentids_filepath))
    json.dump(item_to_sentids, item2sentids_file)

Write file: ../Dataset/wine/train/item/item2sentids.json


In [44]:
# Number of candidate sentences available on the item side, one entry per item.
item_side_cdd_sents_num = [len(sent_ids) for sent_ids in item_to_sentids.values()]

In [45]:
# Summary statistics of the per-item sentence counts.
item_sent_mean = np.mean(item_side_cdd_sents_num)
print("Mean number of sentence per item: {}".format(item_sent_mean))
item_sent_min = np.min(item_side_cdd_sents_num)
print("Min number of sentence per item: {}".format(item_sent_min))
item_sent_max = np.max(item_side_cdd_sents_num)
print("Max number of sentence per item: {}".format(item_sent_max))

Mean number of sentence per item: 37.76299744312594
Min number of sentence per item: 5
Max number of sentence per item: 452


In [46]:
# Sort the per-item counts once and show both ends of the distribution.
item_sent_counts_sorted = sorted(item_side_cdd_sents_num)
print("Top-10 least numbber of sentence per item: {}".format(item_sent_counts_sorted[:10]))
print("Top-10 most numbber of sentence per item: {}".format(item_sent_counts_sorted[-10:]))

Top-10 least numbber of sentence per item: [5, 5, 5, 5, 6, 6, 6, 6, 6, 6]
Top-10 most numbber of sentence per item: [300, 302, 304, 315, 318, 320, 358, 365, 426, 452]


# For Each Data Instance in TrainSet

In [47]:
import random

# Build, for every training review (user-item pair), the candidate sentence ids
# and the "true" sentence ids:
#   user_item_candidate_sent_ids[user_id][item_id] = [candidate_ids, review_ids]
# Candidates are the union of the user's and the item's sentences, down-sampled
# to at most `sample_sent_num`, plus the review's own sentences.

# Maximum number of user/item sentences sampled per review (before the review's
# own sentences are added back in).
sample_sent_num = 500
# Seed the sampler so that re-running the notebook yields the same candidate sets.
random.seed(42)

user_item_candidate_sent_ids = dict()
user_cnt = 0
review_cnt = 0
review_with_no_selectd_label_sentence = 0
useable_review_cnt = 0
sentence_with_no_feature_cnt = 0
sentence_not_tracked = set()
# Loop over all users
for user_key, user_df in group_by_user:
    user_id_str = str(user_key)
    # Users/items that were skipped earlier (no effective sentence) are absent from
    # the mappings; treat them as having no sentences rather than raising KeyError.
    cur_user_sent_ids = user_to_sent.get(user_id_str, set())
    # item-level dict
    item_candidate_sent_ids = dict()
    # zip over the two needed columns; avoids building a Series per row as iterrows() does
    for raw_item_id, review_text in zip(user_df['item'], user_df['review']):
        item_id_str = str(raw_item_id)
        review_cnt += 1
        cur_item_sent_ids = item_to_sent.get(item_id_str, set())
        # collect the ids of this review's own sentences
        cur_review_sent_ids = set()
        for sent in sent_tokenize(review_text):
            if sent in sent_to_id:
                cur_sent_id = sent_to_id[sent]
                assert isinstance(cur_sent_id, str)
                # only keep sentences that have a feature entry
                if cur_sent_id in sentence_to_feature:
                    cur_review_sent_ids.add(cur_sent_id)
                else:
                    sentence_with_no_feature_cnt += 1
            else:
                # remember sentences missing from the sentence-id mapping
                # so we can report how many are being ignored
                sentence_not_tracked.add(sent)
        if len(cur_review_sent_ids) == 0:
            # the review has no usable true-label sentence: skip it
            review_with_no_selectd_label_sentence += 1
            continue
        # candidate pool: union of user sentences and item sentences
        cur_useritem_sent_ids = cur_user_sent_ids | cur_item_sent_ids
        if len(cur_useritem_sent_ids) > sample_sent_num:
            # random.sample() needs a sequence (sets are rejected from Python 3.11);
            # sorting also makes the draw independent of set iteration order.
            sample_useritem_sent_ids = set(
                random.sample(sorted(cur_useritem_sent_ids), sample_sent_num))
        else:
            sample_useritem_sent_ids = cur_useritem_sent_ids
        # the true sentences must always be part of the candidates
        final_useritem_sent_ids = sample_useritem_sent_ids | cur_review_sent_ids
        item_candidate_sent_ids[item_id_str] = [list(final_useritem_sent_ids), list(cur_review_sent_ids)]
        useable_review_cnt += 1
    if len(item_candidate_sent_ids) == 0:
        print("User: {} has no useful item, skip this user ...".format(user_id_str))
    else:
        # add the item-level dict into the user-level dict
        user_item_candidate_sent_ids[user_id_str] = item_candidate_sent_ids
    user_cnt += 1
    if user_cnt % 200 == 0:
        print("{} user processed".format(user_cnt))

print('Finish.')
print('Totally {} users'.format(user_cnt))
print('Totally {0} reviews. Among them {1} reviews has empty true label sentence'.format(
    review_cnt, review_with_no_selectd_label_sentence))
print("{} sentences has 0 feature".format(sentence_with_no_feature_cnt))
print("{} sentences are not being tracked in the sent2id mapping".format(len(sentence_not_tracked)))

200 user processed
400 user processed
600 user processed
800 user processed
1000 user processed
1200 user processed
1400 user processed
1600 user processed
1800 user processed
2000 user processed
2200 user processed
2400 user processed
2600 user processed
2800 user processed
3000 user processed
3200 user processed
3400 user processed
3600 user processed
3800 user processed
4000 user processed
4200 user processed
4400 user processed
4600 user processed
4800 user processed
5000 user processed
5200 user processed
5400 user processed
5600 user processed
5800 user processed
6000 user processed
Finish.
Totally 6080 users
Totally 248452 reviews. Among them 2651 reviews has empty true label sentence
0 sentences has 0 feature
386202 sentences are not being tracked in the sent2id mapping


In [48]:
# Number of users that kept at least one usable review.
len(user_item_candidate_sent_ids)

6080

In [49]:
# Walk every stored (user, item) entry, validate it, and gather per-review counts.

cnt_unique_reviews = 0
cnt_empty_true_sent = 0
sentence_per_review = []
candidate_sentence_num_cnt_per_review = []
# [user-level] one entry per user
for raw_user_id, per_item_entries in user_item_candidate_sent_ids.items():
    user_id_str = str(raw_user_id)
    # [item-level] one entry per user-item pair
    for raw_item_id, sent_id_lists in per_item_entries.items():
        item_id_str = str(raw_item_id)
        candidate_sent_ids = sent_id_lists[0]
        true_sent_ids = sent_id_lists[1]
        cnt_unique_reviews += 1
        if len(true_sent_ids) == 0:
            cnt_empty_true_sent += 1
            continue
        assert isinstance(candidate_sent_ids[0], str)
        assert isinstance(true_sent_ids[0], str)
        # every true-label sentence id must appear in the corresponding candidate list
        assert all(true_sent_id in candidate_sent_ids for true_sent_id in true_sent_ids)
        sentence_per_review.append(len(true_sent_ids))
        candidate_sentence_num_cnt_per_review.append(len(candidate_sent_ids))

print("Total number of unique selected reviews: {}".format(cnt_unique_reviews))
print("Total number of review with empty true sentences: {}".format(cnt_empty_true_sent))
print("Total number of unique review with non-empty true sentences: {}".format(
    cnt_unique_reviews - cnt_empty_true_sent))

Total number of unique selected reviews: 245801
Total number of review with empty true sentences: 0
Total number of unique review with non-empty true sentences: 245801


In [50]:
# Report max/min/mean of the true-sentence and candidate-sentence counts per review.
print("Totally {} user-item pairs in the trainset".format(len(sentence_per_review)))
review_stat_rows = [
    ("max number of true sentence per review: {}", np.max, sentence_per_review),
    ("min number of true sentence per review: {}", np.min, sentence_per_review),
    ("mean number of true sentence per review: {}", np.mean, sentence_per_review),
    ("max number of candidate sentence per review: {}", np.max, candidate_sentence_num_cnt_per_review),
    ("min number of candidate sentence per review: {}", np.min, candidate_sentence_num_cnt_per_review),
    ("mean number of candidate sentence per review: {}", np.mean, candidate_sentence_num_cnt_per_review),
]
for stat_template, stat_func, stat_values in review_stat_rows:
    print(stat_template.format(stat_func(stat_values)))

Totally 245801 user-item pairs in the trainset
max number of true sentence per review: 30
min number of true sentence per review: 1
mean number of true sentence per review: 2.3441076317834346
max number of candidate sentence per review: 514
min number of candidate sentence per review: 10
mean number of candidate sentence per review: 254.12354709704192


In [51]:
# Persist the nested user -> item -> [candidate ids, review ids] mapping as one JSON object.
useritem2sentids_filepath = '../Dataset/{}/train/useritem2sentids.json'.format(dataset_name)
with open(useritem2sentids_filepath, 'w') as useritem2sentids_file:
    print("Write file: {}".format(useritem2sentids_filepath))
    json.dump(user_item_candidate_sent_ids, useritem2sentids_file)

Write file: ../Dataset/wine/train/useritem2sentids.json


In [52]:
# Spot check: number of candidate sentence ids stored for user '3' and item '246'.
len(user_item_candidate_sent_ids['3']['246'][0])

51

In [53]:
# Spot check: number of sentence ids collected for user '3'.
len(user_to_sent['3'])

26

In [55]:
# Spot check: number of sentence ids collected for item '246'.
len(item_to_sent['246'])

27

In [56]:
# Spot check: number of true (review) sentence ids stored for user '3' and item '246'.
len(user_item_candidate_sent_ids['3']['246'][1])

2

In [57]:
# Spot check: every true sentence id of user '3' / item '246' is among its candidates.
spot_candidate_ids = user_item_candidate_sent_ids['3']['246'][0]
spot_review_ids = user_item_candidate_sent_ids['3']['246'][1]
assert all(sent_id in spot_candidate_ids for sent_id in spot_review_ids)

## Write train useritem_cdd in a line-by-line format

In [58]:
# Write useritem2sentids into a line-by-line format
train_useritem2sentid_multiline_file = '../Dataset/{}/train/useritem2sentids_multilines.json'.format(dataset_name)
if os.path.exists(train_useritem2sentid_multiline_file):
    print("File: {} exists, remove it.".format(train_useritem2sentid_multiline_file))
    os.remove(train_useritem2sentid_multiline_file)
else:
    print("File: {} doesn't exist, creat it.".format(train_useritem2sentid_multiline_file))

File: ../Dataset/wine/train/useritem2sentids_multilines.json doesn't exist, creat it.


In [59]:
# Write one JSON object per line with the keys "user_id", "item_id", "candidate", "review".
# The file is opened in 'w' mode so that re-running this cell replaces the file
# instead of appending a second copy of every line.
with open(train_useritem2sentid_multiline_file, 'w') as f1:
    print("Write file: {}".format(train_useritem2sentid_multiline_file))
    cnt_user = 0
    cnt_review = 0
    user_set = set()
    item_set = set()
    useritem_set = set()
    for user_id_str, per_item_entries in user_item_candidate_sent_ids.items():
        assert isinstance(user_id_str, str)
        for item_id_str, sent_id_lists in per_item_entries.items():
            assert isinstance(item_id_str, str)
            candidate_sent_ids = sent_id_lists[0]
            gold_revw_sent_ids = sent_id_lists[1]
            assert isinstance(candidate_sent_ids, list)
            assert isinstance(gold_revw_sent_ids, list)
            assert isinstance(candidate_sent_ids[0], str)
            assert isinstance(gold_revw_sent_ids[0], str)
            # every gold sentence id must be among the candidates; a set keeps
            # the membership test cheap for long candidate lists
            candidate_id_lookup = set(candidate_sent_ids)
            for cur_id in gold_revw_sent_ids:
                assert cur_id in candidate_id_lookup
            cur_data_dict = {'user_id': user_id_str, 'item_id': item_id_str, 'candidate': candidate_sent_ids, "review": gold_revw_sent_ids}
            # one JSON document per line
            json.dump(cur_data_dict, f1)
            f1.write("\n")
            cnt_review += 1
            useritem_set.add((user_id_str, item_id_str))
            item_set.add(item_id_str)
            if cnt_review % 50000 == 0:
                print("{} lines of train data written.".format(cnt_review))
        cnt_user += 1
        user_set.add(user_id_str)

# each user and each (user, item) pair must have been written exactly once
assert cnt_user == len(user_set)
assert cnt_review == len(useritem_set)
print("Totally {} users".format(cnt_user))
print("Totally {} items".format(len(item_set)))
print("Totally {} reviews".format(cnt_review))

Write file: ../Dataset/wine/train/useritem2sentids_multilines.json
50000 lines of train data written.
100000 lines of train data written.
150000 lines of train data written.
200000 lines of train data written.
Totally 6080 users
Totally 15253 items
Totally 245801 reviews


In [60]:
!head -n 1 '../Dataset/wine/train/useritem2sentids_multilines.json'

{"user_id": "100", "item_id": "1113634", "candidate": ["14360", "14601", "14462", "14477", "334751", "14412", "14426", "401123", "14509", "14676", "64804", "352507", "2822", "14507", "14608", "14504", "14585", "14387", "14599", "327393", "14680", "386215", "401122", "346833", "437184", "501682", "14396", "14366", "14416", "329746", "14674", "14634", "14513", "14522", "14700", "320831", "14618", "325917", "14590", "516924", "14503", "14451", "14673", "219511", "875", "14380", "14400", "327390", "14485", "447671", "14393", "516925", "14395", "14576", "14571", "14392", "14381", "14709", "14528", "14406", "14583", "14466", "64803", "14559", "14667", "14478", "14549", "14373", "14708", "14610", "14461", "14473", "14616", "14597", "14611", "14703", "14565", "14632", "14697", "394227", "539457", "14383", "14414", "14415", "437186", "14482", "501683", "14658", "14615", "14584", "14551", "166", "14463", "14517", "386217", "14612", "14564", "14410", "386218", "14550", "293290", "320830", "14644"

## Save User-Item Pairs (Train Set)

In [64]:
# Collect, for every user, the list of item ids that have a usable review.
user_item_pairs = {}
cnt_user_item_pairs = 0
for user_id_str, per_item_entries in user_item_candidate_sent_ids.items():
    assert isinstance(user_id_str, str)
    assert user_id_str not in user_item_pairs
    paired_item_ids = []
    user_item_pairs[user_id_str] = paired_item_ids
    for item_id_str in per_item_entries:
        assert isinstance(item_id_str, str)
        # an item must not be listed twice for the same user
        assert item_id_str not in paired_item_ids
        paired_item_ids.append(item_id_str)
        cnt_user_item_pairs += 1
print("Total number of user-item pair on trainset: {}".format(cnt_user_item_pairs))

Total number of user-item pair on trainset: 245801


In [65]:
# Persist the user -> item-id-list mapping as a single JSON object.
train_useritempairs_file = '../Dataset/{}/train/useritem_pairs.json'.format(dataset_name)
with open(train_useritempairs_file, 'w') as useritempairs_file:
    print("write file: {}".format(train_useritempairs_file))
    json.dump(user_item_pairs, useritempairs_file)

write file: ../Dataset/wine/train/useritem_pairs.json
