In [1]:
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import os
import json
import pickle
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize

# Load Dataset

In [3]:
# Location of the pickled review DataFrame.
df_data_file = "/u/pw7nc/experiments/Dataset/wine/df.pickle"
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
unpickled_df = pd.read_pickle(df_data_file)

In [4]:
unpickled_df

Unnamed: 0,userid,itemid,rating,review,tokens,boa
0,1,18856,96,"Olive, horse sweat, dirty saddle, and smoke. ...","[olive, ,, horse, sweat, ,, dirty, saddle, ,, ...","{175: 2, 493: 1, 412: 1, 336: 1, 3: 1, 593: 1,..."
1,1,3495,93,A remarkably floral nose with violet and cham...,"[a, remarkably, floral, nose, with, violet, an...","{2: 1, 92: 1, 3: 1, 0: 1}"
2,1,40451,92,"Fantastic wine! Blackberry, smoke, olive, ste...","[fantastic, wine, !, blackberry, ,, smoke, ,, ...","{0: 3, 21: 1, 175: 1, 2: 1, 3: 1, 7: 1}"
3,1,26767,,Perfect cork. Perfect fill. Somewhat alluring...,"[perfect, cork, ., perfect, fill, ., somewhat,...","{54: 1, 20: 1, 22: 1, 221: 1, 5: 1}"
4,1,31665,,"OMFG, this wine just does not quit. I need to...","[omfg, ,, this, wine, just, does, not, quit, ....","{0: 1, 5: 1, 29: 1, 145: 1, 15: 1, 376: 1}"
...,...,...,...,...,...,...
2025990,152917,16814,94,Wow! This is a big Barolo - the bouquet upon ...,"[wow, !, this, is, a, big, barolo, -, the, bou...","{48: 1, 1: 1, 31: 1, 5: 1, 0: 1, 74: 1}"
2025991,152917,168156,91,"Opened six bottles, all drank very nicely - W...","[opened, six, bottles, ,, all, drank, very, ni...","{0: 3, 6: 1, 5: 1, 2: 2, 1: 1, 74: 1}"
2025992,152917,216895,85,"Dark, inky purple. A little oxidized. Some be...","[dark, ,, inky, purple, ., a, little, oxidized...","{4: 1, 1: 1, 140: 1, 2: 1, 0: 1}"
2025993,152917,712760,89,"Great Village, see my previous notes","[great, village, ,, see, my, previous, notes]",{}


In [5]:
# Report how many distinct users and items appear in the raw dataset.
num_distinct_users = len(unpickled_df['userid'].unique())
num_distinct_items = len(unpickled_df['itemid'].unique())
print("Number of users: {}".format(num_distinct_users))
print("Number of items: {}".format(num_distinct_items))

Number of users: 44268
Number of items: 485179


# Check for Reviews with 0 Sentences

In [6]:
# Count the reviews that the sentence tokenizer splits into zero sentences.
# Such reviews are expected to be blank (empty or whitespace only); any
# review that is NOT blank but still yields no sentence is printed so it can
# be inspected manually.
cnt = 0
for review_text in unpickled_df['review']:
    if len(sent_tokenize(review_text)) == 0:
        # Explicit condition instead of `assert` inside a bare `except:`,
        # which would also have hidden unrelated errors and is disabled
        # entirely when Python runs with -O.
        if review_text.strip() != '':
            print(review_text)
        cnt += 1
print("Number of reviews on whole dataset with 0 sentences: {}".format(cnt))

Number of reviews on whole dataset with 0 sentences: 58


# Check Duplicate User-Item

In [7]:
# Check whether any (user, item) pair is reviewed more than once.
# Building the set of pairs in one pass over the two columns avoids
# DataFrame.iterrows(), which materialises a Series per row and is very slow
# on a frame of this size. Every row either contributes a new pair to the set
# or is a repeat of an earlier pair, so the number of duplicates is the row
# count minus the number of distinct pairs.
user_item_set = set(zip(unpickled_df['userid'], unpickled_df['itemid']))
cnt_duplicate_review = len(unpickled_df) - len(user_item_set)

print("[Train] Number of duplicate reviews: {}".format(cnt_duplicate_review))
print("[Train] Number of unique reviews: {}".format(len(user_item_set)))

[Train] Number of duplicate reviews: 0
[Train] Number of unique reviews: 1685890


# Check Number of Reviews per User/Item

## User

In [8]:
# Group the raw reviews by user and by item.
# The column is passed as a scalar name, not a one-element list: when
# iterating a groupby built from a list of length 1, pandas >= 2.0 yields
# 1-tuples as keys, which would break the `isinstance(key, str)` checks used
# when these groups are iterated. A scalar name yields the plain id on every
# pandas version.
groupby_user_whole = unpickled_df.groupby('userid')
groupby_item_whole = unpickled_df.groupby('itemid')

In [9]:
# Number of groups equals the number of distinct users / items.
num_user_groups = len(groupby_user_whole)
num_item_groups = len(groupby_item_whole)
print("Number of unique user: {}".format(num_user_groups))
print("Number of unique item: {}".format(num_item_groups))

Number of unique user: 44268
Number of unique item: 485179


In [10]:
# Count how many reviews each user wrote.
# The dict maps user id -> review count; the list holds the same counts in
# group-iteration order (dicts preserve insertion order).
user_num_review_dict = dict()
for cur_user_id, cur_user_df in groupby_user_whole:
    assert isinstance(cur_user_id, str)
    assert cur_user_id not in user_num_review_dict
    user_num_review_dict[cur_user_id] = len(cur_user_df)
user_num_review_list = list(user_num_review_dict.values())
cnt = len(user_num_review_list)
print("Finished! Totally {} users".format(cnt))

Finished! Totally 44268 users


In [11]:
# Summary statistics of the per-user review counts.
user_review_counts = np.asarray(user_num_review_list)
print("Mean number of review per user: {}".format(user_review_counts.mean()))
print("Min number of review per user: {}".format(user_review_counts.min()))
print("Max number of review per user: {}".format(user_review_counts.max()))

Mean number of review per user: 38.083717357910906
Min number of review per user: 1
Max number of review per user: 26374


In [12]:
# Show the 10 smallest and the 10 largest per-user review counts.
user_counts_ascending = sorted(user_num_review_list)
print("Top-10 least numbber of review per user: {}".format(user_counts_ascending[:10]))
print("Top-10 most numbber of review per user: {}".format(user_counts_ascending[-10:]))

Top-10 least numbber of review per user: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Top-10 most numbber of review per user: [4601, 4915, 5160, 6086, 6202, 6235, 6620, 6944, 7973, 26374]


## Item

In [13]:
# Count how many reviews each item received.
# The dict maps item id -> review count; the list holds the same counts in
# group-iteration order (dicts preserve insertion order).
item_num_review_dict = dict()
for cur_item_id, cur_item_df in groupby_item_whole:
    assert isinstance(cur_item_id, str)
    assert cur_item_id not in item_num_review_dict
    item_num_review_dict[cur_item_id] = len(cur_item_df)
item_num_review_list = list(item_num_review_dict.values())
cnt = len(item_num_review_list)
print("Finished! Totally {} items".format(cnt))

Finished! Totally 485179 items


In [14]:
# Summary statistics of the per-item review counts.
item_review_counts = np.asarray(item_num_review_list)
print("Mean number of review per item: {}".format(item_review_counts.mean()))
print("Min number of review per item: {}".format(item_review_counts.min()))
print("Max number of review per item: {}".format(item_review_counts.max()))

Mean number of review per item: 3.4747794113100525
Min number of review per item: 1
Max number of review per item: 515


In [15]:
# Show the 10 smallest and the 10 largest per-item review counts.
item_counts_ascending = sorted(item_num_review_list)
print("Top-10 least numbber of review per item: {}".format(item_counts_ascending[:10]))
print("Top-10 most numbber of review per item: {}".format(item_counts_ascending[-10:]))

Top-10 least numbber of review per item: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Top-10 most numbber of review per item: [264, 266, 277, 279, 285, 290, 362, 416, 434, 515]


# Load Feature Words

In [18]:
# Read the feature vocabulary, one word per line.
# feature_word_list keeps the words in file order; feature2id_dict maps each
# word to its position in that list, stored as a string.
feature_file_path = "/u/pw7nc/experiments/Dataset/wine/feature/feature_pure.txt"
feature_word_list = []
feature2id_dict = dict()
with open(feature_file_path, 'r') as f_feat:
    for feat_idx, raw_line in enumerate(f_feat):
        cur_feat_word = raw_line.strip()
        # every feature word must appear only once in the file
        assert cur_feat_word not in feature2id_dict
        feature2id_dict[cur_feat_word] = str(feat_idx)
        feature_word_list.append(cur_feat_word)
print("Totally {} features loaded.".format(len(feature2id_dict)))

Totally 212 features loaded.


In [19]:
# Persist the feature-word -> id mapping as a single JSON object.
feature2id_file_path = "/u/pw7nc/experiments/Dataset/wine/feature/feature2id.json"
with open(feature2id_file_path, 'w') as f_feat:
    f_feat.write(json.dumps(feature2id_dict))

# Filter Dataset

In [22]:
def is_empty_review(review_text):
    """ Tell whether a review is empty.

    A review counts as empty when the sentence tokenizer finds no sentence
    in it; in that case the text is asserted to be blank (zero characters
    or whitespace only).

    Returns True for an empty review, False otherwise.
    """
    if sent_tokenize(review_text):
        # at least one sentence was found
        return False
    assert review_text.strip() == ''
    return True

In [30]:
def is_no_feature_review(review_text, feature_word_list):
    """ Check whether a review contains no feature word at all.

    The review is split into sentences, each sentence is lower-cased and
    word-tokenized, and the tokens are compared against the feature words.

    Args:
        review_text: raw review text.
        feature_word_list: iterable of feature words (compared against
            lower-cased tokens, so they are expected to be lower case).

    Returns:
        True if none of the feature words occurs as a token in the review,
        False as soon as one of them is found.
    """
    # set lookup instead of scanning the token list once per feature word
    feature_words = set(feature_word_list)
    for rvw_sent in sent_tokenize(review_text):
        rvw_tokens = word_tokenize(rvw_sent.lower())
        if not feature_words.isdisjoint(rvw_tokens):
            # stop at the first hit; remaining sentences need no tokenizing
            return False
    return True

In [31]:
example_review = "this Wine is Dark. with Sweet Aroma."
print(is_no_feature_review(example_review, feature_word_list))

False


In [32]:
# 1. Remove Empty Review and No Feature Review
# Rows that survive both checks are collected as
# [item_id, user_id, rating, review] lists in whole_review_data.
whole_review_data = []
cnt_lines = 0
cnt_num_empty_review = 0
cnt_num_no_feature_review = 0
progress_msg = "{0} lines being processed. {1} empty reviews. {2} no feature reviews."
for _, row in unpickled_df.iterrows():
    review = row['review']
    if is_empty_review(review):
        # 1. the review is empty
        cnt_num_empty_review += 1
    elif is_no_feature_review(review, feature_word_list):
        # 2. the review contains no feature word
        cnt_num_no_feature_review += 1
    else:
        # keep it; NOTE: rating's type is str
        whole_review_data.append([row['itemid'], row['userid'], row['rating'], review])
    cnt_lines += 1
    if cnt_lines % 100000 == 0:
        print(progress_msg.format(cnt_lines, cnt_num_empty_review, cnt_num_no_feature_review))
print("{0} reviews has empty review.\n{1} reviews has no feature.\n{2} useful reviews.".format(
    cnt_num_empty_review, cnt_num_no_feature_review, len(whole_review_data)
))

200000 lines being processed.
400000 lines being processed.
600000 lines being processed.
800000 lines being processed.
1000000 lines being processed.
1200000 lines being processed.
1400000 lines being processed.
1600000 lines being processed.
58 reviews has empty review.
362562 reviews has no feature.
1323270 useful reviews.


In [33]:
# 2. Convert review to lower case and tokenize it.
print(whole_review_data[0])

['18856', '1', ' 96', ' Olive, horse sweat, dirty saddle, and smoke. This actually got quite a bit more spicy and expressive with significant aeration. This was a little dry on the palate first but filled out considerably in time, lovely, loaded with tapenade, leather, dry and powerful, very black olive, meaty. This improved considerably the longer it was open. A terrific bottle of 1981, 96+ and improving. This may well be my favorite vintage of Beau except for perhaps the 1990.']


In [43]:
# Lower-case and tokenize the review text of every kept row, in place.
# Also strip surrounding whitespace from itemid / userid / rating
# (the rating is still a str because some of them are N/A).
for line_no, row in enumerate(whole_review_data, start=1):
    tokenized_review = " ".join(word_tokenize(row[-1].lower()))
    # columns 0..2 are itemid, userid, rating
    for col in range(3):
        row[col] = row[col].strip()
    # replace the raw review by its space-joined lower-case tokens
    row[-1] = tokenized_review
    if line_no % 200000 == 0:
        print("{} lines processed.".format(line_no))

200000 lines processed.
400000 lines processed.
600000 lines processed.
800000 lines processed.
1000000 lines processed.
1200000 lines processed.


In [46]:
print(whole_review_data[0])

['18856', '1', '96', 'olive , horse sweat , dirty saddle , and smoke . this actually got quite a bit more spicy and expressive with significant aeration . this was a little dry on the palate first but filled out considerably in time , lovely , loaded with tapenade , leather , dry and powerful , very black olive , meaty . this improved considerably the longer it was open . a terrific bottle of 1981 , 96+ and improving . this may well be my favorite vintage of beau except for perhaps the 1990 .']


In [45]:
print(len(whole_review_data))

1323270


# Check How Many N/A Ratings There Are

In [47]:
# Count the rows whose rating (third column) is the literal string "N/A".
cnt_na_rating = sum(1 for review_row in whole_review_data if review_row[2] == "N/A")
print("Totally {0} lines of data. Among them {1} lines has N/A rating.".format(
    len(whole_review_data), cnt_na_rating
))

Totally 1323270 lines of data. Among them 281022 lines has N/A rating.


# Get User/Item Statistics on the Filtered Dataset

In [48]:
df_filtered_review = pd.DataFrame(whole_review_data, columns=['item', 'user', 'rating', 'review'])

In [49]:
df_filtered_review.head()

Unnamed: 0,item,user,rating,review
0,18856,1,96.0,"olive , horse sweat , dirty saddle , and smoke..."
1,3495,1,93.0,a remarkably floral nose with violet and chamb...
2,40451,1,92.0,"fantastic wine ! blackberry , smoke , olive , ..."
3,26767,1,,perfect cork . perfect fill . somewhat allurin...
4,31665,1,,"omfg , this wine just does not quit . i need t..."


In [50]:
# Group the filtered reviews by user and by item.
# The column is passed as a scalar name, not a one-element list: when
# iterating a groupby built from a list of length 1, pandas >= 2.0 yields
# 1-tuples as keys, which would break the `isinstance(key, str)` checks used
# when these groups are iterated. A scalar name yields the plain id on every
# pandas version.
groupby_user_whole_filter = df_filtered_review.groupby('user')
groupby_item_whole_filter = df_filtered_review.groupby('item')

In [51]:
# Number of groups equals the number of distinct users / items after filtering.
num_user_groups_filter = len(groupby_user_whole_filter)
num_item_groups_filter = len(groupby_item_whole_filter)
print("Number of unique user: {}".format(num_user_groups_filter))
print("Number of unique item: {}".format(num_item_groups_filter))

Number of unique user: 36711
Number of unique item: 417340


## User

In [52]:
# Count how many reviews each user has in the filtered dataset.
# The dict maps user id -> review count; the list holds the same counts in
# group-iteration order.
user_num_review_filter_list = list()
user_num_review_filter_dict = dict()
cnt = 0
for cur_user_id, cur_user_df in groupby_user_whole_filter:
    assert isinstance(cur_user_id, str)
    # Uniqueness must be checked against the dict of user ids. The list only
    # holds integer counts, so a membership test there could never fail and
    # cost a linear scan per user.
    assert cur_user_id not in user_num_review_filter_dict
    user_num_review_filter_dict[cur_user_id] = len(cur_user_df)
    user_num_review_filter_list.append(len(cur_user_df))
    cnt += 1
print("Finished! Totally {} users".format(cnt))

Finished! Totally 36711 users


In [53]:
# Summary statistics of the per-user review counts after filtering.
user_review_filter_counts = np.asarray(user_num_review_filter_list)
print("Mean number of review per user: {}".format(user_review_filter_counts.mean()))
print("Min number of review per user: {}".format(user_review_filter_counts.min()))
print("Max number of review per user: {}".format(user_review_filter_counts.max()))

Mean number of review per user: 36.04559941162049
Min number of review per user: 1
Max number of review per user: 26115


In [54]:
# Show the 10 smallest and the 10 largest per-user review counts after filtering.
user_filter_counts_ascending = sorted(user_num_review_filter_list)
print("Top-10 least numbber of review per user: {}".format(user_filter_counts_ascending[:10]))
print("Top-10 most numbber of review per user: {}".format(user_filter_counts_ascending[-10:]))

Top-10 least numbber of review per user: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Top-10 most numbber of review per user: [4053, 4126, 4242, 4320, 4370, 4805, 5323, 5964, 7553, 26115]


## Item

In [55]:
# Count how many reviews each item has in the filtered dataset.
# The dict maps item id -> review count; the list holds the same counts in
# group-iteration order (dicts preserve insertion order).
item_num_review_filter_dict = dict()
for cur_item_id, cur_item_df in groupby_item_whole_filter:
    assert isinstance(cur_item_id, str)
    assert cur_item_id not in item_num_review_filter_dict
    item_num_review_filter_dict[cur_item_id] = len(cur_item_df)
item_num_review_filter_list = list(item_num_review_filter_dict.values())
cnt = len(item_num_review_filter_list)
print("Finished! Totally {} items".format(cnt))

Finished! Totally 417340 items


In [56]:
# Summary statistics of the per-item review counts after filtering.
item_review_filter_counts = np.asarray(item_num_review_filter_list)
print("Mean number of review per item: {}".format(item_review_filter_counts.mean()))
print("Min number of review per item: {}".format(item_review_filter_counts.min()))
print("Max number of review per item: {}".format(item_review_filter_counts.max()))

Mean number of review per item: 3.170724109838501
Min number of review per item: 1
Max number of review per item: 305


In [57]:
# Show the 10 smallest and the 10 largest per-item review counts after filtering.
item_filter_counts_ascending = sorted(item_num_review_filter_list)
print("Top-10 least numbber of review per item: {}".format(item_filter_counts_ascending[:10]))
print("Top-10 most numbber of review per item: {}".format(item_filter_counts_ascending[-10:]))

Top-10 least numbber of review per item: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Top-10 most numbber of review per item: [210, 211, 213, 213, 225, 236, 239, 299, 303, 305]


# Save Cleaned Dataset into File

In [58]:
# Write the cleaned reviews as JSON lines: one JSON object per review with
# the keys user, item, rating and review.
whole_data_filtered_file = "/u/pw7nc/experiments/Dataset/wine/whole_cleaned.json"
with open(whole_data_filtered_file, 'w') as f_out:
    for out_item, out_user, out_rating, out_review in whole_review_data:
        # rows are stored as [item, user, rating, review]
        serialized_row = json.dumps({
            'user': out_user,
            'item': out_item,
            'rating': out_rating,
            'review': out_review
        })
        f_out.write(serialized_row)
        f_out.write('\n')