In [1]:
# Environment sanity check: ask the shell which `python` executable is first on PATH.
!which python

/u/pw7nc/anaconda3/bin/python


In [2]:
import spacy
import nltk
import re
import json
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from nltk.tokenize import sent_tokenize, word_tokenize

# Load Dataset

In [3]:
dir_path = '../Dataset/yelp/'
# Load cleaned train
# The file is JSON-lines: one review object per line. Each review is stored as
# [item, user, rating, review, features] so it can be turned into a DataFrame later.
train_clean_file = os.path.join(dir_path, 'train_review_filtered.json')
train_review = []
# Explicit UTF-8: JSON text is UTF-8, and relying on the locale default encoding
# makes the load platform-dependent.
with open(train_clean_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        line_data = json.loads(line)
        train_review.append(
            [line_data[field] for field in ('item', 'user', 'rating', 'review', 'features')]
        )
print("Number of reviews on trainset: {}".format(len(train_review)))

Number of reviews on trainset: 668245


In [4]:
# Load cleaned test
# Same JSON-lines layout as the train file: one review object per line, stored as
# [item, user, rating, review, features].
test_clean_file = os.path.join(dir_path, 'test_review_filtered_clean.json')
test_review = []
# Explicit UTF-8: JSON text is UTF-8, and relying on the locale default encoding
# makes the load platform-dependent.
with open(test_clean_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        line_data = json.loads(line)
        test_review.append(
            [line_data[field] for field in ('item', 'user', 'rating', 'review', 'features')]
        )
print("Number of reviews on testset: {}".format(len(test_review)))

Number of reviews on testset: 183650


## Convert List Data to Pandas DataFrame

In [5]:
# Wrap both review lists in DataFrames with an identical schema; the column order
# mirrors the order of the fields in each per-review list.
df_train_data, df_test_data = (
    pd.DataFrame(split_rows, columns=['item', 'user', 'rating', 'review', 'features'])
    for split_rows in (train_review, test_review)
)

In [6]:
# Preview the training DataFrame (a bare last expression is rendered by the notebook).
df_train_data

Unnamed: 0,item,user,rating,review,features
0,10000,0,4,recommendations are right on and always fun to...,"[[beer, recommendations], [beers, drink, food]]"
1,10019,0,3,just the same pricing but with less item selec...,"[[selections, pricing]]"
2,1002,0,3,taste much better than ayce sushi ! the sushi ...,"[[sushi, taste], [sushi, bite], [service]]"
3,10020,0,3,this is a brand new location of ruelo that rec...,"[[location, plaza], [location, customer, tea, ..."
4,10039,0,4,medium rare is a little dry ... since the stea...,"[[steak, medium], [food, patrons, cheap]]"
...,...,...,...,...,...
668240,4993,9999,5,my two favorites are the carne asada and barba...,"[[juicy, favorites], [combo, taco, tacos, meal..."
668241,704,9999,4,the tacos themselves were a mixed bag . in par...,"[[tacos], [taco], [flavor, vegan], [pork, taco..."
668242,7379,9999,5,everything was great .,[[everything]]
668243,8530,9999,4,our last experience here went pretty well and ...,"[[food], [menu], [wait, chicken], [juicy], [me..."


In [7]:
# Preview the test DataFrame (a bare last expression is rendered by the notebook).
df_test_data

Unnamed: 0,item,user,rating,review,features
0,10184,0,3,the pork chop itself is moist and tender .,"[[pork, tender]]"
1,1040,0,4,arrowshoot noodle were chewy and have great bi...,"[[bite, noodle]]"
2,10514,0,4,as there are 5 dim sum chefs working in the ki...,"[[kitchen, everything, chefs], [price], [tea],..."
3,10702,0,3,service is polite and since it was not busy th...,[[service]]
4,1081,0,4,"ramen salad - refreshing daily specials , make...","[[salad, ramen], [specials]]"
...,...,...,...,...,...
183645,4154,9999,3,the main draw to this casinos over the others ...,[[restaurant]]
183646,4565,9999,5,it 's not like normal stouts and the flavor is...,"[[flavor], [taste, staff], [prices]]"
183647,6,9999,3,the food here was very good . the prices were ...,"[[food], [prices, fries, burger], [burgers], [..."
183648,624,9999,5,my two favorite meats for tacos are carne asad...,"[[meats, tacos], [salsa, bar], [salsas, beans,..."


In [8]:
# Number of distinct users and items in the train split.
# dropna=False keeps the count equal to the length of .unique().
print("number of user in trainset: {}".format(df_train_data['user'].nunique(dropna=False)))
print("number of item in trainset: {}".format(df_train_data['item'].nunique(dropna=False)))

number of user in trainset: 15639
number of item in trainset: 21515


## Check the already labeled features

In [34]:
# from spacy.lang.en import English

# nlp = English()
# nlp.add_pipe(nlp.create_pipe('sentencizer'))
# doc = nlp("the pork chop itself is moist and tender . everything was great .")
# assert len(list(doc.sents)) == 2
# print(list(doc.sents))
# print(list(doc.sents)[0].text)

In [35]:
# Sanity-check the labelled features of every training review:
#   1. count the reviews whose number of NLTK-tokenized sentences differs from the
#      number of per-sentence feature lists (soft check, only counted);
#   2. require every labelled feature word to be one of the review's NLTK word
#      tokens (hard check, fails the cell on the first violation).
num_rows_differ_sents_features_list = 0
# enumerate() drives the progress counter so it does not depend on the index labels.
for row_num, (_, row) in enumerate(df_train_data.iterrows(), start=1):
    rvw_text = row['review']
    rvw_features = row['features']
    rvw_sents = sent_tokenize(rvw_text)
    rvw_tokens = word_tokenize(rvw_text)
    # A set makes each membership test O(1) instead of scanning the token list.
    rvw_token_set = set(rvw_tokens)
    # Flatten the per-sentence feature lists into one list of feature words.
    rvw_features_concat = []
    for rvw_sent_feats in rvw_features:
        rvw_features_concat.extend(rvw_sent_feats)

    # Plain comparison instead of assert + bare except: nothing unrelated gets
    # swallowed, and the count stays correct when asserts are disabled (python -O).
    if len(rvw_sents) != len(rvw_features):
        num_rows_differ_sents_features_list += 1

    for rvw_feat in rvw_features_concat:
        assert rvw_feat in rvw_token_set, (
            "feature {!r} does not appear in the tokens of review #{}".format(rvw_feat, row_num)
        )

    if row_num % 50000 == 0:
        print("{} lines".format(row_num))

50000 lines
100000 lines
150000 lines
200000 lines
250000 lines
300000 lines
350000 lines
400000 lines
450000 lines
500000 lines
550000 lines
600000 lines
650000 lines


In [36]:
# TODO: NLTK's sentence tokenizer does not always split a review into the same
# number of sentences as there are per-sentence feature lists in the data.
# Preserving the input structure (a review as a list of sentences) may be a better fit.
print(
    "Number of reviews with different number of tokenized sentences and list of sentence features: {}"
    .format(num_rows_differ_sents_features_list)
)
# The feature-in-tokens assertions passed, so every labelled feature word does
# appear in the text of its review.

Number of reviews with different number of tokenized sentences and list of sentence features: 41484


In [39]:
# Extract Features
# Give every distinct feature word a string id ("0", "1", ...) in order of its
# first appearance in the training reviews.
feature2id_vocab = {}
cnt_features = 0
for per_sentence_feats in df_train_data['features']:
    for sent_feats in per_sentence_feats:
        for feat in sent_feats:
            if feat in feature2id_vocab:
                continue
            feature2id_vocab[feat] = str(cnt_features)
            cnt_features += 1
# The running counter and the vocabulary size must agree.
assert len(feature2id_vocab) == cnt_features
print("Totally {} features".format(cnt_features))

Totally 498 features


In [40]:
# Reverse lookup: string id -> feature word.
id2feature_vocab = dict((feat_id, feat) for feat, feat_id in feature2id_vocab.items())

In [42]:
# Persist both vocabularies (feature -> id and id -> feature) as JSON files.
feature2id_file = "../Dataset/yelp/train/feature/feature2id.json"
# Create the output directory if needed; open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(feature2id_file), exist_ok=True)
with open(feature2id_file, 'w') as f:
    print("Write file: {}".format(feature2id_file))
    json.dump(feature2id_vocab, f)

id2feature_file = "../Dataset/yelp/train/feature/id2feature.json"
os.makedirs(os.path.dirname(id2feature_file), exist_ok=True)
with open(id2feature_file, 'w') as f:
    print("Write file: {}".format(id2feature_file))
    json.dump(id2feature_vocab, f)


Write file: ../Dataset/yelp/train/feature/feature2id.json
Write file: ../Dataset/yelp/train/feature/id2feature.json


# Check Whether Test Features All Appear in Train Feature Set

In [43]:
# Sanity-check the labelled features of every test review:
#   1. count the reviews whose number of NLTK-tokenized sentences differs from the
#      number of per-sentence feature lists (soft check, only counted);
#   2. require every labelled feature word to be one of the review's NLTK word
#      tokens (hard check, fails the cell on the first violation).
num_rows_differ_sents_features_list = 0
# enumerate() drives the progress counter so it does not depend on the index labels.
for row_num, (_, row) in enumerate(df_test_data.iterrows(), start=1):
    rvw_text = row['review']
    rvw_features = row['features']
    rvw_sents = sent_tokenize(rvw_text)
    rvw_tokens = word_tokenize(rvw_text)
    # A set makes each membership test O(1) instead of scanning the token list.
    rvw_token_set = set(rvw_tokens)
    # Flatten the per-sentence feature lists into one list of feature words.
    rvw_features_concat = []
    for rvw_sent_feats in rvw_features:
        rvw_features_concat.extend(rvw_sent_feats)
    # Align sentences with feature lists.
    # Plain comparison instead of assert + bare except: nothing unrelated gets
    # swallowed, and the count stays correct when asserts are disabled (python -O).
    if len(rvw_sents) != len(rvw_features):
        num_rows_differ_sents_features_list += 1
    # Check that the features indeed appear in the review text.
    for rvw_feat in rvw_features_concat:
        assert rvw_feat in rvw_token_set, (
            "feature {!r} does not appear in the tokens of review #{}".format(rvw_feat, row_num)
        )

    if row_num % 50000 == 0:
        print("{} lines".format(row_num))

50000 lines
100000 lines
150000 lines


In [45]:
# Report how many test reviews have a sentence count (NLTK) that differs from the
# number of per-sentence feature lists.
print(
    "Number of reviews with different number of tokenized sentences and list of sentence features: {}"
    .format(num_rows_differ_sents_features_list)
)

Number of reviews with different number of tokenized sentences and list of sentence features: 10823


In [44]:
# Verify that the train vocabulary covers the test set: print every feature
# occurrence in the test reviews whose word is missing from feature2id_vocab and
# count those occurrences (a word missing several times is counted each time).
test_new_feature_num = 0
for rvw_features in df_test_data['features']:
    for rvw_sent_feats in rvw_features:
        for rvw_feat in rvw_sent_feats:
            # Plain membership test instead of assert + bare except: the result no
            # longer depends on asserts being enabled, and unrelated exceptions
            # are not swallowed.
            if rvw_feat not in feature2id_vocab:
                print(rvw_feat)
                test_new_feature_num += 1

print("Totally {} features on test but not on train".format(test_new_feature_num))

Totally 0 features on test but not on train
