In [1]:
# Shell escape: print the path of the `python` executable found on the shell's PATH.
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import nltk
import spacy
import re
import json
import pandas as pd
import os
import numpy as np

# Read Data

In [3]:
# Read the cleaned review file line by line; every line is parsed as one JSON record.
dir_path = '../Dataset/wine/'
file_path = os.path.join(dir_path, 'whole_cleaned.json')
# Each record is stored as [item_id, user_id, rating, review].
whole_review = []
cnt = 0
with open(file_path) as f:
    for cnt, line in enumerate(f, start=1):
        line_data = json.loads(line)
        user_id, item_id, rating, review = (
            line_data[field] for field in ('user', 'item', 'rating', 'review')
        )
        whole_review.append([item_id, user_id, rating, review])
        # Progress message every 100k lines.
        if cnt % 100000 == 0:
            print('{} lines loaded.'.format(cnt))
print('Finish loading whole dataset, totally {} lines.'.format(len(whole_review)))

100000 lines loaded.
200000 lines loaded.
300000 lines loaded.
400000 lines loaded.
500000 lines loaded.
600000 lines loaded.
700000 lines loaded.
800000 lines loaded.
900000 lines loaded.
1000000 lines loaded.
1100000 lines loaded.
1200000 lines loaded.
1300000 lines loaded.
Finish loading whole dataset, totally 1323270 lines.


## Convert List Data to Pandas Dataframe

In [4]:
# Wrap the [item, user, rating, review] rows in a DataFrame with named columns.
df_whole_data = pd.DataFrame(whole_review, columns=['item', 'user', 'rating', 'review'])

In [5]:
# Display the freshly loaded review table.
df_whole_data

Unnamed: 0,item,user,rating,review
0,18856,1,96,"olive , horse sweat , dirty saddle , and smoke..."
1,3495,1,93,a remarkably floral nose with violet and chamb...
2,40451,1,92,"fantastic wine ! blackberry , smoke , olive , ..."
3,26767,1,,perfect cork . perfect fill . somewhat allurin...
4,31665,1,,"omfg , this wine just does not quit . i need t..."
...,...,...,...,...
1323265,1055257,152917,93,excellent interpretation of a mersault . the c...
1323266,860528,152917,92,"ok , so argentina likes to grow their malbecs ..."
1323267,16814,152917,94,wow ! this is a big barolo - the bouquet upon ...
1323268,168156,152917,91,"opened six bottles , all drank very nicely - w..."


# Filter User/Item

In [6]:
"""
A dataframe is satisfactory when every user and every item in the dataset
has at least 10 reviews and no more than 1000 reviews.
"""
# Inclusive bounds on the number of reviews a user / item may have to be kept.
lower_thres = 10
upper_thres = 1000


def _bucket_group_sizes(df_review, column, lower, upper):
    """Count the distinct values of `column` by how many rows each one has.

    Returns a tuple ``(n_selected, n_more_than_upper, n_less_than_lower)``.
    """
    sizes = df_review.groupby(column).size()
    too_many = sizes > upper
    # Same precedence as an if/elif chain: a group already counted as
    # "too many" is never counted as "too few" as well.
    too_few = (sizes < lower) & ~too_many
    n_more = int(too_many.sum())
    n_less = int(too_few.sum())
    return len(sizes) - n_more - n_less, n_more, n_less


def is_satisfy_df(df_review, lower=None, upper=None):
    """Report whether every user and every item has an acceptable review count.

    Args:
        df_review: DataFrame with (at least) the columns 'user' and 'item';
            one row per review.
        lower: minimum number of reviews per user / item (inclusive).
            Defaults to the module-level ``lower_thres``.
        upper: maximum number of reviews per user / item (inclusive).
            Defaults to the module-level ``upper_thres``.

    Returns:
        True when no user and no item has fewer than `lower` or more than
        `upper` reviews, False otherwise.

    Side effects:
        Prints one summary line for users and one for items.
    """
    # Resolve the defaults at call time so later changes to the globals are honoured.
    if lower is None:
        lower = lower_thres
    if upper is None:
        upper = upper_thres

    n_user_ok, n_user_more, n_user_less = _bucket_group_sizes(df_review, 'user', lower, upper)
    print("user_selected: {0} \t user(>upper): {1} \t user(<lower): {2}".format(
        n_user_ok, n_user_more, n_user_less))

    n_item_ok, n_item_more, n_item_less = _bucket_group_sizes(df_review, 'item', lower, upper)
    print("item_selected: {0} \t item(>upper): {1} \t item(<lower): {2}".format(
        n_item_ok, n_item_more, n_item_less))

    # Satisfied only if there is no rare/popular user and no rare/popular item.
    return (n_user_more == 0 and n_user_less == 0
            and n_item_more == 0 and n_item_less == 0)

In [7]:
def _drop_out_of_range(df_review, column, lower, upper):
    """Keep only rows whose `column` value occurs between `lower` and `upper` times (inclusive).

    Returns a new DataFrame with a fresh 0..n-1 index; the input is not modified.
    """
    # Number of rows sharing each row's `column` value, aligned with the rows.
    occurrences = df_review[column].map(df_review[column].value_counts())
    keep_mask = (occurrences >= lower) & (occurrences <= upper)
    return df_review.loc[keep_mask].reset_index(drop=True)


# Alternately drop out-of-range users and out-of-range items until the frame
# satisfies is_satisfy_df. Dropping users can push items below the lower bound
# and vice versa, which is why a single pass is not enough.
iter_cnt = 0
while True:
    # User pass
    print("Start Drop")
    df_whole_data = _drop_out_of_range(df_whole_data, 'user', lower_thres, upper_thres)
    # check whether this dataframe satisfies the requirement
    if is_satisfy_df(df_whole_data):
        break
    print('Finish User')
    # Item pass
    print("Start Drop")
    df_whole_data = _drop_out_of_range(df_whole_data, 'item', lower_thres, upper_thres)
    # check whether this dataframe satisfies the requirement
    if is_satisfy_df(df_whole_data):
        break
    print('Finish Item')
    iter_cnt += 1
    if iter_cnt % 10 == 0:
        print("{} iterations of filter".format(iter_cnt))

# Keep the list-of-rows view in sync with the filtered frame
# (column order is item, user, rating, review).
whole_review = df_whole_data.values.tolist()

Start Drop
user_selected: 11507 	 user(>upper): 0 	 user(<lower): 0
item_selected: 17205 	 item(>upper): 0 	 item(<lower): 317251
Finish User
Start Drop
user_selected: 6373 	 user(>upper): 0 	 user(<lower): 4851
item_selected: 17205 	 item(>upper): 0 	 item(<lower): 0
Finish Item
Start Drop
user_selected: 6373 	 user(>upper): 0 	 user(<lower): 0
item_selected: 15447 	 item(>upper): 0 	 item(<lower): 1758
Finish User
Start Drop
user_selected: 6118 	 user(>upper): 0 	 user(<lower): 255
item_selected: 15447 	 item(>upper): 0 	 item(<lower): 0
Finish Item
Start Drop
user_selected: 6118 	 user(>upper): 0 	 user(<lower): 0
item_selected: 15284 	 item(>upper): 0 	 item(<lower): 163
Finish User
Start Drop
user_selected: 6085 	 user(>upper): 0 	 user(<lower): 33
item_selected: 15284 	 item(>upper): 0 	 item(<lower): 0
Finish Item
Start Drop
user_selected: 6085 	 user(>upper): 0 	 user(<lower): 0
item_selected: 15256 	 item(>upper): 0 	 item(<lower): 28
Finish User
Start Drop
user_selected: 6082

In [8]:
# Display the review table left after the user/item filtering loop.
df_whole_data

Unnamed: 0,item,user,rating,review
0,682432,131074,91,"excellent ! dried cherry , some fig . leather ..."
1,490003,131074,94,rich and with more concentration than the rese...
2,1026221,131074,91,outstanding value wine . purple-ruby color . d...
3,605994,131074,93,what can i say about this bottle ? certainly n...
4,1181887,131074,94,"this wine has serious concentration , and offe..."
...,...,...,...,...
307742,65694,152917,89,"color is still deep , meniscus starting to sho..."
307743,470778,152917,88,me thinks this wine is passing it 's prime as ...
307744,2705,152917,89,this much maligned vintage actually is drinkin...
307745,70969,152917,93,"very structured , tannins still firm , a tad b..."


## Check Whether the Filtered Dataframe Satisfies the Requirement

In [9]:
# Check User: bucket every user of the filtered frame by number of reviews.
group_by_user = df_whole_data.groupby('user')
user_selected, user_review_morethanupper, user_review_lessthanlower = set(), set(), set()
user_num_reviews = []
for user_key, user_df in group_by_user:
    user_id = int(user_key)
    num_reviews = len(user_df)
    # Pick the bucket first, then add the id once.
    if num_reviews > upper_thres:
        target_set = user_review_morethanupper
    elif num_reviews < lower_thres:
        target_set = user_review_lessthanlower
    else:
        target_set = user_selected
    target_set.add(user_id)
    user_num_reviews.append(num_reviews)

In [10]:
# Report how many users landed in each bucket.
user_bucket_report = (
    ("Number of user selected: {}", user_selected),
    ("Number of user with more than upper reviews: {}", user_review_morethanupper),
    ("Number of user with less than lower reviews: {}", user_review_lessthanlower),
)
for template, bucket in user_bucket_report:
    print(template.format(len(bucket)))

Number of user selected: 6080
Number of user with more than upper reviews: 0
Number of user with less than lower reviews: 0


In [11]:
# Summary statistics of the per-user review counts.
user_stat_report = (
    ("Mean number of reviews per user: {}", np.mean),
    ("Max number of reviews per user: {}", np.max),
    ("Min number of reviews per user: {}", np.min),
)
for template, reducer in user_stat_report:
    print(template.format(reducer(user_num_reviews)))

Mean number of reviews per user: 50.61628289473684
Max number of reviews per user: 591
Min number of reviews per user: 10


In [12]:
# Check Item: bucket every item of the filtered frame by number of reviews.
group_by_item = df_whole_data.groupby('item')
item_selected, item_review_morethanupper, item_review_lessthanlower = set(), set(), set()
item_num_reviews = []
for item_key, item_df in group_by_item:
    item_id = int(item_key)
    num_reviews = len(item_df)
    # Pick the bucket first, then add the id once.
    if num_reviews > upper_thres:
        target_set = item_review_morethanupper
    elif num_reviews < lower_thres:
        target_set = item_review_lessthanlower
    else:
        target_set = item_selected
    target_set.add(item_id)
    item_num_reviews.append(num_reviews)

In [13]:
# Report how many items landed in each bucket.
item_bucket_report = (
    ("Number of item selected: {}", item_selected),
    ("Number of item with more than upper reviews: {}", item_review_morethanupper),
    ("Number of item with less than lower reviews: {}", item_review_lessthanlower),
)
for template, bucket in item_bucket_report:
    print(template.format(len(bucket)))

Number of item selected: 15253
Number of item with more than upper reviews: 0
Number of item with less than lower reviews: 0


In [14]:
# Summary statistics of the per-item review counts.
item_stat_report = (
    ("Mean number of reviews per item: {}", np.mean),
    ("Max number of reviews per item: {}", np.max),
    ("Min number of reviews per item: {}", np.min),
)
for template, reducer in item_stat_report:
    print(template.format(reducer(item_num_reviews)))

Mean number of reviews per item: 20.176162066478724
Max number of reviews per item: 247
Min number of reviews per item: 10


In [15]:
# Final check: run the same predicate the filtering loop uses on the filtered frame.
is_satisfy_df(df_whole_data)

user_selected: 6080 	 user(>upper): 0 	 user(<lower): 0
item_selected: 15253 	 item(>upper): 0 	 item(<lower): 0


True

## Handle N/A Ratings

In [16]:
# Pull the rating column of the filtered frame out into a plain list.
rating_list_filter = [rating_value for rating_value in df_whole_data['rating']]
num_ratings = len(rating_list_filter)
print("Number of ratings: {}".format(num_ratings))

Number of ratings: 307747


In [17]:
# Count the 'N/A' ratings; every other rating must parse to an int in [0, 100].
cnt_na_rating = 0
for rating in rating_list_filter:
    if rating != 'N/A':
        cur_rating_int = int(rating)
        assert 0 <= cur_rating_int <= 100
        continue
    cnt_na_rating += 1
print("Number of N/A ratings: {}".format(cnt_na_rating))

Number of N/A ratings: 53896


In [23]:
def _valid_int_ratings(raw_ratings):
    """Convert every entry that is not 'N/A' to int, checking it lies in [0, 100]."""
    parsed = []
    for raw_rating in raw_ratings:
        if raw_rating == 'N/A':
            continue
        rating_int = int(raw_rating)
        assert rating_int >= 0 and rating_int <= 100
        parsed.append(rating_int)
    return parsed


def compute_avg_cdd_rating(df_whole_data, user_id, item_id):
    """Estimate a rating from the user's other reviews and the item's other reviews.

    This can be used to substitute the N/A (i.e. missing) rating of a
    user-item pair.

    Args:
        df_whole_data: DataFrame with the columns 'user', 'item' and 'rating';
            a rating is either the string 'N/A' or something ``int()`` accepts.
        user_id: value compared (``==``) against the 'user' column.
        item_id: value compared (``==``) against the 'item' column.

    Returns:
        ``(avg_cdd_rating, cdd_ratings)``: the mean of the candidate ratings
        rounded to a Python int, and the list of candidate ratings
        (user-side ratings first, then item-side ratings).

    Raises:
        ValueError: if neither the user nor the item has any non-'N/A' rating,
            so no estimate can be computed.
        AssertionError: if a rating or the resulting average is outside [0, 100].

    Note:
        A row matching both `user_id` and `item_id` with a valid rating is
        collected on both sides and therefore counted twice.
    """
    user_side_ratings = df_whole_data.loc[df_whole_data['user'] == user_id, 'rating']
    item_side_ratings = df_whole_data.loc[df_whole_data['item'] == item_id, 'rating']
    # Candidate ratings: everything on either side that is not 'N/A'.
    cdd_ratings = _valid_int_ratings(user_side_ratings) + _valid_int_ratings(item_side_ratings)

    if not cdd_ratings:
        # np.mean([]) would yield NaN and fail later with an opaque error.
        raise ValueError(
            "No non-N/A rating available for user {!r} or item {!r}".format(user_id, item_id))

    # round() on a NumPy float is not guaranteed to return a built-in int on
    # every NumPy version, so convert explicitly.
    avg_cdd_rating = int(round(float(np.mean(cdd_ratings))))
    assert avg_cdd_rating >= 0 and avg_cdd_rating <= 100
    return avg_cdd_rating, cdd_ratings

In [26]:
# Try the estimator on a single user/item pair; both ids are passed as strings.
avg_rating, cdd_ratings = compute_avg_cdd_rating(df_whole_data, "131074", "490003")

In [21]:
# Display the review table again before it is written to disk.
df_whole_data

Unnamed: 0,item,user,rating,review
0,682432,131074,91,"excellent ! dried cherry , some fig . leather ..."
1,490003,131074,94,rich and with more concentration than the rese...
2,1026221,131074,91,outstanding value wine . purple-ruby color . d...
3,605994,131074,93,what can i say about this bottle ? certainly n...
4,1181887,131074,94,"this wine has serious concentration , and offe..."
...,...,...,...,...
307742,65694,152917,89,"color is still deep , meniscus starting to sho..."
307743,470778,152917,88,me thinks this wine is passing it 's prime as ...
307744,2705,152917,89,this much maligned vintage actually is drinkin...
307745,70969,152917,93,"very structured , tannins still firm , a tad b..."


## Save Filtered Dataset

In [30]:
# Write the filtered reviews as JSON lines. Rows whose rating is 'N/A' get an
# estimated rating from compute_avg_cdd_rating instead.
dir_path = '../Dataset/wine/'
file_path = os.path.join(dir_path, 'whole_filtered.json')
cnt_na_estimate = 0
# Count written rows explicitly rather than relying on the index labels;
# starting at 0 keeps the final summary valid for an empty frame.
cnt_written = 0
with open(file_path, 'w') as f_out:
    print("Write file: {}".format(file_path))
    for cnt_written, (row_label, row_data) in enumerate(df_whole_data.iterrows(), start=1):
        row_user_id = row_data['user']
        row_item_id = row_data['item']
        row_rating_str = row_data['rating']
        if row_rating_str == 'N/A':
            # Missing rating: substitute the user-side / item-side average.
            row_rating_int = compute_avg_cdd_rating(df_whole_data, row_user_id, row_item_id)[0]
            cnt_na_estimate += 1
        else:
            row_rating_int = int(row_rating_str)
            assert row_rating_int >= 0 and row_rating_int <= 100
        row_dict = {
            'user': row_user_id,
            'item': row_item_id,
            'rating': row_rating_int,
            'review': row_data['review']
        }
        # dump this into file, one JSON object per line
        json.dump(row_dict, f_out)
        f_out.write("\n")
        if cnt_written % 50000 == 0:
            print("{} lines processed.".format(cnt_written))
print("Totally {0} lines of data saved to file. Among them {1} lines use the estimated rating".format(
    cnt_written, cnt_na_estimate
))

Write file: ../Dataset/wine/whole_filtered.json


AttributeError: '_io.TextIOWrapper' object has no attribute 'wirte'