In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [4]:
import re
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import spacy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en import English
nlp = English()
# Create a Tokenizer with the default settings for English
# including punctuation rules and exceptions
tokenizer = nlp.tokenizer
import string
punct = string.punctuation
from sklearn.feature_extraction import _stop_words

In [5]:
dataset_name = "wine"

# Read Data

In [6]:
def load_reviews(file_path, log_every):
    """Read a JSON-lines review file into a list of rows.

    :param file_path: path to a text file holding one JSON object per line;
        each object must provide the keys 'user', 'item', 'rating' and 'review'
    :param log_every: print a progress message every time this many rows
        have been loaded
    :return: list of [item_id, user_id, rating, review] lists, in file order
    """
    reviews = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, encoding='utf-8') as f:
        print("Load file: {}".format(file_path))
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of crashing inside json.loads.
                continue
            line_data = json.loads(line)
            reviews.append([
                line_data['item'],
                line_data['user'],
                line_data['rating'],
                line_data['review'],
            ])
            if len(reviews) % log_every == 0:
                print('{} lines loaded.'.format(len(reviews)))
    return reviews


dir_path = '../Dataset/{}'.format(dataset_name)
# Load train dataset
train_review = load_reviews(os.path.join(dir_path, 'train_filtered.json'), log_every=50000)
print('Finish loading train dataset, totally {} lines.'.format(len(train_review)))
# Load test dataset
test_review = load_reviews(os.path.join(dir_path, 'test_filtered.json'), log_every=10000)
print('Finish loading test dataset, totally {} lines.'.format(len(test_review)))

Load file: ../Dataset/wine/train_filtered.json
50000 lines loaded.
100000 lines loaded.
150000 lines loaded.
200000 lines loaded.
Finish loading train dataset, totally 248452 lines.
Load file: ../Dataset/wine/test_filtered.json
10000 lines loaded.
20000 lines loaded.
30000 lines loaded.
40000 lines loaded.
50000 lines loaded.
Finish loading test dataset, totally 59294 lines.


## Convert List Data to Pandas DataFrame

In [7]:
# Both splits share one column layout: item, user, rating, review.
review_columns = ['item', 'user', 'rating', 'review']
df_train_data = pd.DataFrame(train_review, columns=review_columns)
df_test_data = pd.DataFrame(test_review, columns=review_columns)

In [8]:
df_train_data

Unnamed: 0,item,user,rating,review
0,1015705,131074,93,this wine is simply outstanding . the nose has...
1,661436,131074,83,"ripe but diluted bing cherry , a hint of spice..."
2,458829,131074,92,blackberry fruit and smoke . remarkable depth ...
3,887649,131074,88,"clear and crisp , tangy lemon and fresh vegeta..."
4,744973,131074,90,juicy young fruit . darker . currant . some pl...
...,...,...,...,...
248447,70969,152917,93,"very structured , tannins still firm , a tad b..."
248448,470778,152917,88,me thinks this wine is passing it 's prime as ...
248449,1785,152917,90,wonderfully aged bordeaux . it took about 5 to...
248450,481908,152917,95,outstanding effort from kosta browne . i had t...


In [9]:
# Report how many distinct users and items appear in each split.
train_users = df_train_data['user'].unique()
train_items = df_train_data['item'].unique()
print("Number of users on train: {}\tNumber of items on train: {}".format(len(train_users), len(train_items)))
test_users = df_test_data['user'].unique()
test_items = df_test_data['item'].unique()
print("Number of users on test: {}\tNumber of items on test: {}".format(len(test_users), len(test_items)))

Number of users on train: 6080	Number of items on train: 15253
Number of users on test: 6080	Number of items on test: 14529


# Compute Sentence Tf-idf

In [10]:
def catDoc(textlist):
    """Flatten an iterable of iterables into one list.

    :param textlist: iterable whose elements are themselves iterable
    :return: list with the elements of every inner iterable, in order
    """
    return [element for tlist in textlist for element in tlist]

In [11]:
def get_tfidf_embedding(text, feature_word_list):
    """
    Compute tf-idf weights of the feature words for every entry of ``text``.

    :param text: list of strings, one per document (e.g. one per sentence)
    :param feature_word_list: fixed vocabulary handed to CountVectorizer
    :return:
        vectorizer: the fitted CountVectorizer
            vocabulary_: word2id
            (id2word: get_feature_names() in older scikit-learn releases;
            presumably get_feature_names_out() in newer ones -- verify
            against the installed version)
        tfidf_weight: dense array with one row per entry of ``text``
    """
    vectorizer = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    counts = vectorizer.fit_transform(text)
    # The idf statistics are fitted on the same texts that get transformed.
    tfidf_weight = TfidfTransformer().fit_transform(counts).toarray()
    return vectorizer, tfidf_weight

In [12]:
def get_tf_score(text, feature_word_list):
    """Count the feature words in every entry of ``text``.

    :param text: list of strings, one per document
    :param feature_word_list: fixed vocabulary handed to CountVectorizer
    :return: dense count array with one row per entry of ``text``
    """
    counter = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    return counter.fit_transform(text).toarray()

In [13]:
def get_df_score(text, feature_word_list):
    """Compute the document frequency of every feature word.

    :param text: list of strings, one per document
    :param feature_word_list: fixed vocabulary handed to CountVectorizer
    :return: 1-D array; entry k is the number of documents in which the
        k-th vocabulary word occurs at least once
    """
    counter = CountVectorizer(lowercase=True, vocabulary=feature_word_list)
    term_counts = counter.fit_transform(text).toarray()
    # A document contributes to a word's df as soon as its count is positive.
    word_present = term_counts > 0
    return word_present.sum(axis=0)

In [14]:
def compress_array(a, id2word, vocab):
    """
    Turn a dense document-by-word matrix into a nested dict of its non-zero entries.

    :param a: matrix, [N, M], N is document number, M is word number
    :param id2word: maps a column index of ``a`` to the corresponding word
    :param vocab: maps a word to the id used as key in the inner dicts
    :return: dict {row index: {vocab[id2word[column]]: value}} that keeps
        only the entries of ``a`` that differ from 0; rows without any such
        entry map to an empty dict
    """
    return {
        doc_idx: {
            vocab[id2word[word_idx]]: weight
            for word_idx, weight in enumerate(doc_row)
            if weight != 0
        }
        for doc_idx, doc_row in enumerate(a)
    }

# Load Feature Words

In [15]:
feature_2_id_file = '../Dataset/{}/train/feature/feature2id.json'.format(dataset_name)
with open(feature_2_id_file, 'r') as f:
    print("Load file: {}".format(feature_2_id_file))
    feature_vocab = json.load(f)

Load file: ../Dataset/wine/train/feature/feature2id.json


In [16]:
len(feature_vocab)

215

In [17]:
feature_vocab['aroma']

'28'

In [18]:
feature_word_list = list(feature_vocab.keys())
print('Number of feature words: {}'.format(len(feature_word_list)))

Number of feature words: 215


In [19]:
# Invert the feature vocabulary: feature id -> feature word.
id2feature_dict = {feature_id: feature_word for feature_word, feature_id in feature_vocab.items()}

In [21]:
id_2_feature_file = '../Dataset/{}/train/feature/id2feature.json'.format(dataset_name)
with open(id_2_feature_file, 'w') as f:
    print("Write file: {}".format(id_2_feature_file))
    json.dump(id2feature_dict, f)

Write file: ../Dataset/wine/train/feature/id2feature.json


# Check Whether Any Review Has No Sentences

In [22]:
# Count (and show) the training reviews for which sent_tokenize yields no sentence.
invalid_data = 0
for _, row in df_train_data.iterrows():
    if not sent_tokenize(row['review']):
        print(row)
        invalid_data += 1

In [23]:
print(invalid_data)

0


# Construct Sentence Vocab

In [25]:
# sentence vocab
sentence_count = dict()
sentence_with_no_feature = 0
# Loop for each review
for idx, row in df_train_data.iterrows():
    review_text = row['review']
    review_sents = sent_tokenize(review_text)
    tf_score = get_tf_score(review_sents, feature_word_list)
    # _, tf_score = get_tfidf_embedding(review_sents, feature_word_list)
    # Sum up the tf-value for each sentence so that if this sum is 0, this sentence should be removed
    tfidf_sum_sents = np.sum(tf_score, axis=1)
    for i in range(len(review_sents)):
        if tfidf_sum_sents[i] != 0.0:
            cur_sent = review_sents[i]
            # check whether this sentence has more than 3 tokens
            tokens = word_tokenize(cur_sent)
            cnt_tokens = 0
            for token in tokens:
                if token.isdigit() or (token in punct):
                    pass
                else:
                    cnt_tokens += 1
            # only sentence with more than 2 effective tokens can be added into the sentence vocab
            if cnt_tokens < 3:
                pass
            else:
                sentence_count[cur_sent] = 1 + sentence_count.get(cur_sent, 0)
        else:
            sentence_with_no_feature += 1
    if (idx+1) % 10000 == 0:
        print("Processed {} lines".format(idx+1))
print('Finish.')

Processed 10000 lines
Processed 20000 lines
Processed 30000 lines
Processed 40000 lines
Processed 50000 lines
Processed 60000 lines
Processed 70000 lines
Processed 80000 lines
Processed 90000 lines
Processed 100000 lines
Processed 110000 lines
Processed 120000 lines
Processed 130000 lines
Processed 140000 lines
Processed 150000 lines
Processed 160000 lines
Processed 170000 lines
Processed 180000 lines
Processed 190000 lines
Processed 200000 lines
Processed 210000 lines
Processed 220000 lines
Processed 230000 lines
Processed 240000 lines
Finish.


In [26]:
print("Number of sentences with feature word(s): {0}\nNumber of sentences w/o feature word: {1}".format(
    len(sentence_count), sentence_with_no_feature
))

Number of sentences with feature word(s): 554564
Number of sentences w/o feature word: 481196


In [27]:
# sort sentence based on counts (the majority should be 1)
# Most frequent sentences first; sorted() is stable even with reverse=True,
# so ties keep their insertion order from sentence_count.
sorted_sent_counts = sorted(sentence_count.items(), key=lambda pair: pair[1], reverse=True)

In [28]:
# Sentence ids are the (string) ranks in the frequency-sorted list, so id '0'
# is the most frequent sentence.
sent_to_id = {sent: str(rank) for rank, (sent, _count) in enumerate(sorted_sent_counts)}
# Every sentence in the vocab has an id, so no special UNK entry is added.
id_to_sent = {sent_id: sent for sent, sent_id in sent_to_id.items()}

In [30]:
assert len(sent_to_id) == len(id_to_sent)

In [32]:
id_to_sent['0']

'very nice wine .'

In [33]:
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'w') as f:
    print("Write file: {}".format(id2sentence_filepath))
    json.dump(id_to_sent, f)

Write file: ../Dataset/wine/train/sentence/id2sentence.json


In [34]:
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'w') as f:
    print("Write file: {}".format(sentence2id_filepath))
    json.dump(sent_to_id, f)

Write file: ../Dataset/wine/train/sentence/sentence2id.json


In [35]:
# Load id2sentence and sentence2id, check whether they are the same as the newly processed mappings
id2sentence_filepath = '../Dataset/{}/train/sentence/id2sentence.json'.format(dataset_name)
with open(id2sentence_filepath, 'r') as f:
    print("Load file: {}".format(id2sentence_filepath))
    trainset_id2sent = json.load(f)
sentence2id_filepath = '../Dataset/{}/train/sentence/sentence2id.json'.format(dataset_name)
with open(sentence2id_filepath, 'r') as f:
    print("Load file: {}".format(sentence2id_filepath))
    trainset_sent2id = json.load(f)

Load file: ../Dataset/wine/train/sentence/id2sentence.json
Load file: ../Dataset/wine/train/sentence/sentence2id.json


In [36]:
assert trainset_id2sent == id_to_sent
assert trainset_sent2id == sent_to_id

# Get Sentence Feature

In [37]:
df_train_data

Unnamed: 0,item,user,rating,review
0,1015705,131074,93,this wine is simply outstanding . the nose has...
1,661436,131074,83,"ripe but diluted bing cherry , a hint of spice..."
2,458829,131074,92,blackberry fruit and smoke . remarkable depth ...
3,887649,131074,88,"clear and crisp , tangy lemon and fresh vegeta..."
4,744973,131074,90,juicy young fruit . darker . currant . some pl...
...,...,...,...,...
248447,70969,152917,93,"very structured , tannins still firm , a tad b..."
248448,470778,152917,88,me thinks this wine is passing it 's prime as ...
248449,1785,152917,90,wonderfully aged bordeaux . it took about 5 to...
248450,481908,152917,95,outstanding effort from kosta browne . i had t...


In [38]:
def check_vocab_is_same(sklearn_vocab, feature_vocab):
    """Check that two word-to-id vocabularies describe the same mapping.

    :param sklearn_vocab: dict word -> integer id (e.g. CountVectorizer.vocabulary_)
    :param feature_vocab: dict word -> id, where the id is anything ``int()``
        accepts (the ids loaded from feature2id.json are strings)
    :return: True when both vocabularies contain exactly the same words and
        every word has the same numeric id in both; False otherwise
    """
    if len(sklearn_vocab) != len(feature_vocab):
        return False
    for word, sklearn_vocab_id in sklearn_vocab.items():
        # Equal sizes do not guarantee equal key sets: a word missing from
        # feature_vocab is a mismatch, not a KeyError.
        if word not in feature_vocab:
            return False
        if int(feature_vocab[word]) != sklearn_vocab_id:
            return False
    return True

In [39]:
sentence_text_list = list(sent_to_id.keys())

In [44]:
len(sentence_text_list)

554564

In [40]:
sentence_text_list[:10]

['very nice wine .',
 'dark purple color .',
 'dark ruby color .',
 'deep purple color .',
 'very good wine .',
 'dark garnet color .',
 'love this wine .',
 'dark red color .',
 'deep ruby color .',
 'i love this wine .']

In [41]:
cntvector, tfidf_weight = get_tfidf_embedding(sentence_text_list, feature_word_list)

In [42]:
df_count = get_df_score(sentence_text_list, feature_word_list)

In [43]:
df_count.shape

(215,)

In [45]:
# Document frequency of each feature word over the sentence vocab, raw and
# divided by the number of sentences. df_count is aligned with feature_word_list.
num_sentences = len(sentence_text_list)
trainset_feature_df = dict(zip(feature_word_list, df_count))
trainset_feature_df_norm = {
    word: count / num_sentences
    for word, count in zip(feature_word_list, df_count)
}

In [46]:
type(trainset_feature_df['aroma'])

numpy.int64

In [47]:
# Convert NumPy integer scalars to built-in int so the dict can be written
# with json.dump. np.integer matches every NumPy integer width (int32, int64,
# ...), whereas checking for np.int64 alone would skip the other widths.
# Only existing keys are reassigned, so the dict size is unchanged while iterating.
for key, value in trainset_feature_df.items():
    if isinstance(value, np.integer):
        trainset_feature_df[key] = int(value)

In [48]:
type(trainset_feature_df['aroma'])

int

In [49]:
trainset_feat_df_file = '../Dataset/{}/train/feature/feature2df.json'.format(dataset_name)

with open(trainset_feat_df_file, 'w') as f:
    print("Write file: {}".format(trainset_feat_df_file))
    json.dump(trainset_feature_df, f)

Write file: ../Dataset/wine/train/feature/feature2df.json


In [50]:
trainset_feature_df_sort = dict(sorted(trainset_feature_df.items(), key = lambda x: -x[1]))

In [51]:
# 1-based rank of every feature word, following the order of trainset_feature_df_sort.
trainset_feature_df_sort_list = list(trainset_feature_df_sort.keys())
trainset_feature_df_sort_rank = {
    feature: rank
    for rank, feature in enumerate(trainset_feature_df_sort_list, start=1)
}

In [52]:
this_word = 'aroma'
print("df value: {}".format(trainset_feature_df[this_word]))
print("rank of the feature: {}".format(trainset_feature_df_sort_rank[this_word]))

df value: 4526
rank of the feature: 35


In [53]:
tfidf_weight.shape

(554564, 215)

In [54]:
check_vocab_is_same(cntvector.vocabulary_, feature_vocab)

True

In [55]:
# sentence id (str) -> {feature id (str): tf-idf weight} for the non-zero weights.
sentence_to_feature = dict()
sentence_with_no_feature = 0
tfidf_sum_sents = np.sum(tfidf_weight, axis=1)
print("Shape of tf-idf sum: {}".format(tfidf_sum_sents.shape))
for i, cur_sent in enumerate(sentence_text_list):
    # sentence_text_list holds the keys of sent_to_id, so each sentence must
    # be present and its id must equal its position in the list.
    assert cur_sent in sent_to_id
    # get the sentence_id (str)
    cur_sent_id = sent_to_id[cur_sent]
    assert int(cur_sent_id) == i
    sent_weights = tfidf_weight[i]
    # np.flatnonzero yields the columns with a non-zero weight in ascending
    # order, so only those columns are visited instead of scanning every
    # column in Python. The column index serves as the feature id.
    feature_dict = {
        str(j): sent_weights[j]
        for j in np.flatnonzero(sent_weights).tolist()
    }
    if len(feature_dict) > 0:
        sentence_to_feature[cur_sent_id] = feature_dict
    else:
        sentence_with_no_feature += 1
    if (i+1) % 50000 == 0:
        print("Processed {} lines".format(i+1))
# Use the list length rather than the loop variable, which would be undefined
# (or stale from an earlier cell) if the list were empty.
print("Finish. Totally {} lines".format(len(sentence_text_list)))
print("Totally {} sentences has at least 1 feature and {} sentences don't have feature.".format(
    len(sentence_to_feature), sentence_with_no_feature))

Shape of tf-idf sum: (554564,)
Processed 50000 lines
Processed 100000 lines
Processed 150000 lines
Processed 200000 lines
Processed 250000 lines
Processed 300000 lines
Processed 350000 lines
Processed 400000 lines
Processed 450000 lines
Processed 500000 lines
Processed 550000 lines
Finish. Totally 554564 lines
Totally 554564 sentences has at least 1 feature and 0 sentences don't have feature.


In [56]:
sentence2feature_filepath = '../Dataset/{}/train/sentence/sentence2feature.json'.format(dataset_name)
with open(sentence2feature_filepath, 'w') as f:
    print("Write file: {}".format(sentence2feature_filepath))
    json.dump(sentence_to_feature, f)

Write file: ../Dataset/wine/train/sentence/sentence2feature.json


In [57]:
sentence_to_feature['0']

{'0': 1.0}

In [58]:
id_to_sent['0']

'very nice wine .'

In [59]:
id2feature_dict['0']

'wine'

In [60]:
# Number of features attached to each sentence; every sentence must have at least one.
num_feature_per_sentence = []
for sent_features in sentence_to_feature.values():
    feature_total = len(sent_features)
    num_feature_per_sentence.append(feature_total)
    assert feature_total > 0

In [61]:
print("Mean number of features per sentence: {}".format(np.mean(num_feature_per_sentence)))
print("Max number of features per sentence: {}".format(np.max(num_feature_per_sentence)))
print("Min number of features per sentence: {}".format(np.min(num_feature_per_sentence)))

Mean number of features per sentence: 1.7565943696309172
Max number of features per sentence: 13
Min number of features per sentence: 1


# Get User to Feature

In [62]:
df_train_data

Unnamed: 0,item,user,rating,review
0,1015705,131074,93,this wine is simply outstanding . the nose has...
1,661436,131074,83,"ripe but diluted bing cherry , a hint of spice..."
2,458829,131074,92,blackberry fruit and smoke . remarkable depth ...
3,887649,131074,88,"clear and crisp , tangy lemon and fresh vegeta..."
4,744973,131074,90,juicy young fruit . darker . currant . some pl...
...,...,...,...,...
248447,70969,152917,93,"very structured , tannins still firm , a tad b..."
248448,470778,152917,88,me thinks this wine is passing it 's prime as ...
248449,1785,152917,90,wonderfully aged bordeaux . it took about 5 to...
248450,481908,152917,95,outstanding effort from kosta browne . i had t...


## GroupBy User

In [63]:
group_by_user = df_train_data.groupby('user')
user_id_list = []
user_reviews = []
# Each group is a (user id, rows of that user) pair; concatenate the user's
# reviews into one space-separated document.
for raw_user_id, user_df in group_by_user:
    user_id_list.append(int(raw_user_id))
    user_reviews.append(" ".join(user_df['review']))

In [64]:
print("Number of users: {}".format(len(user_id_list)))

Number of users: 6080


In [65]:
assert len(user_id_list) == len(user_reviews)

## Compute User Tf-idf

In [66]:
cntvector_user, tfidf_weight_user = get_tfidf_embedding(user_reviews, feature_word_list)

In [67]:
check_vocab_is_same(cntvector_user.vocabulary_, feature_vocab)

True

In [68]:
tfidf_weight_user.shape

(6080, 215)

In [70]:
print(feature_word_list[:20])

['wine', 'fruit', 'palate', 'dark', 'acidity', 'cherry', 'spice', 'smooth', 'taste', 'vanilla', 'pinot', 'alcohol', 'tart', 'citrus', 'blackberry', 'plum', 'acid', 'ruby', 'lemon', 'apple']


In [71]:
# user id (str) -> {feature id (str): tf-idf weight} for the non-zero weights.
user_to_feature = dict()
for i, cur_user_id in enumerate(user_id_list):
    user_weights = tfidf_weight_user[i]
    assert len(user_weights) == len(feature_vocab)
    feature_dict = dict()
    for j, feature_tfidf in enumerate(user_weights):
        if feature_tfidf == 0.0:
            continue
        # NOTE: make sure that the feature_id is str format
        feature_id = str(j)
        # The column index must be the id stored for this word in feature_vocab.
        assert feature_vocab[feature_word_list[j]] == feature_id
        feature_dict[feature_id] = feature_tfidf
    # Every user is expected to mention at least one feature word.
    assert len(feature_dict) > 0
    user_to_feature[str(cur_user_id)] = feature_dict
    if (i+1) % 500 == 0:
        print("{} user processed.".format(i+1))
print("Totally {} users".format(i+1))

500 user processed.
1000 user processed.
1500 user processed.
2000 user processed.
2500 user processed.
3000 user processed.
3500 user processed.
4000 user processed.
4500 user processed.
5000 user processed.
5500 user processed.
6000 user processed.
Totally 6080 users


In [72]:
len(user_to_feature)

6080

In [73]:
# Number of features attached to each user; every user must have at least one.
num_feature_per_user = []
for user_features in user_to_feature.values():
    feature_total = len(user_features)
    num_feature_per_user.append(feature_total)
    assert feature_total > 0

In [74]:
print("Mean number of features per user: {}".format(np.mean(num_feature_per_user)))
print("Max number of features per user: {}".format(np.max(num_feature_per_user)))
print("Min number of features per user: {}".format(np.min(num_feature_per_user)))

Mean number of features per user: 29.992598684210527
Max number of features per user: 129
Min number of features per user: 1


In [75]:
len(user_to_feature['3'])

18

## Save User to Feature Mapping into Json File

In [76]:
user2feature_filepath = '../Dataset/{}/train/user/user2feature.json'.format(dataset_name)
with open(user2feature_filepath, 'w') as f:
    print("Write file: {}".format(user2feature_filepath))
    json.dump(user_to_feature, f)

Write file: ../Dataset/wine/train/user/user2feature.json


# Get Item to Feature

## GroupBy Item

In [77]:
group_by_item = df_train_data.groupby('item')
item_id_list = []
item_reviews = []
# Loop over all items: each group is an (item id, rows of that item) pair;
# concatenate the item's reviews into one space-separated document.
for raw_item_id, item_df in group_by_item:
    item_id_list.append(str(raw_item_id))
    item_reviews.append(" ".join(item_df['review']))

In [78]:
print("Number of items: {}".format(len(item_id_list)))

Number of items: 15253


In [79]:
assert len(item_id_list) == len(item_reviews)

## Compute Item Tf-idf

In [80]:
cntvector_item, tfidf_weight_item = get_tfidf_embedding(item_reviews, feature_word_list)

In [81]:
check_vocab_is_same(cntvector_item.vocabulary_, feature_vocab)

True

In [82]:
tfidf_weight_item.shape

(15253, 215)

In [83]:
# item id (str) -> {feature id (str): tf-idf weight} for the non-zero weights.
item_to_feature = dict()
for i, cur_item_id in enumerate(item_id_list):
    item_weights = tfidf_weight_item[i]
    assert len(item_weights) == len(feature_vocab)
    feature_dict = dict()
    for j, feature_tfidf in enumerate(item_weights):
        if feature_tfidf == 0.0:
            continue
        feature_id = str(j)
        # The column index must be the id stored for this word in feature_vocab.
        assert feature_id == feature_vocab[feature_word_list[j]]
        feature_dict[feature_id] = feature_tfidf
    # Every item is expected to have at least one feature word in its reviews.
    assert len(feature_dict) > 0
    item_to_feature[cur_item_id] = feature_dict
    if (i+1) % 1000 == 0:
        print("{} items processed.".format(i+1))
print('Finish. Totally {} items'.format(i+1))

1000 items processed.
2000 items processed.
3000 items processed.
4000 items processed.
5000 items processed.
6000 items processed.
7000 items processed.
8000 items processed.
9000 items processed.
10000 items processed.
11000 items processed.
12000 items processed.
13000 items processed.
14000 items processed.
15000 items processed.
Finish. Totally 15253 items


In [84]:
len(item_to_feature)

15253

In [85]:
# Number of features attached to each item; every item must have at least one.
num_feature_per_item = []
for item_features in item_to_feature.values():
    feature_total = len(item_features)
    num_feature_per_item.append(feature_total)
    assert feature_total > 0

In [86]:
print("Mean number of features per item: {}".format(np.mean(num_feature_per_item)))
print("Max number of features per item: {}".format(np.max(num_feature_per_item)))
print("Min number of features per item: {}".format(np.min(num_feature_per_item)))

Mean number of features per item: 22.61850127843703
Max number of features per item: 70
Min number of features per item: 5


In [87]:
item2feature_filepath = '../Dataset/{}/train/item/item2feature.json'.format(dataset_name)
with open(item2feature_filepath, 'w') as f:
    print("Write file: {}".format(item2feature_filepath))
    json.dump(item_to_feature, f)

Write file: ../Dataset/wine/train/item/item2feature.json


# Compute Top User/Item Features

In [88]:
# TODO: Sanity Check