In [27]:
# Environment check: print the path of the `python` executable that the shell finds on PATH.
!which python

/u/pw7nc/anaconda3/bin/python


In [38]:
import spacy
import nltk
import re
import json
import pandas as pd
import os
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from nltk.tokenize import sent_tokenize, word_tokenize
import string
# All ASCII punctuation characters as a single string; later cells compare feature words against it.
punct = string.punctuation

In [29]:
# Dataset identifier. It is not referenced again in the cells visible here; the paths below
# spell out 'tripadvisor' directly.
dataset_name = "tripadvisor"

# Load Dataset

In [30]:
dir_path = '../Dataset/tripadvisor/'
# Load the cleaned train reviews. The file is read as JSON Lines: one JSON object per line.
train_clean_file = os.path.join(dir_path, 'train_review_filtered.json')
# Fields copied from every record, in the column order the DataFrame cell expects.
review_fields = ('item', 'user', 'rating', 'review', 'feature')
train_review = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
with open(train_clean_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. an empty trailing line) instead of crashing in json.loads.
            continue
        line_data = json.loads(line)
        # A record that lacks one of the fields still raises KeyError, as before.
        train_review.append([line_data[field] for field in review_fields])
print("Number of reviews on trainset: {}".format(len(train_review)))

Number of reviews on trainset: 205595


In [31]:
# Load the cleaned test reviews. The file is read as JSON Lines: one JSON object per line.
test_clean_file = os.path.join(dir_path, 'test_review_filtered_clean.json')
# Fields copied from every record, in the column order the DataFrame cell expects.
review_fields = ('item', 'user', 'rating', 'review', 'feature')
test_review = []
# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding
# (the rendered test frame below contains non-ASCII review text).
with open(test_clean_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. an empty trailing line) instead of crashing in json.loads.
            continue
        line_data = json.loads(line)
        # A record that lacks one of the fields still raises KeyError, as before.
        test_review.append([line_data[field] for field in review_fields])
print("Number of reviews on testset: {}".format(len(test_review)))

Number of reviews on testset: 19444


## Convert List Data to Pandas Dataframe

In [32]:
# Wrap the raw row lists in DataFrames; both splits use the same five column names.
review_columns = ['item', 'user', 'rating', 'review', 'feature']
df_train_data, df_test_data = (
    pd.DataFrame(rows, columns=review_columns)
    for rows in (train_review, test_review)
)

In [33]:
# Display the train DataFrame (the rendered output below is abbreviated to its first and last rows).
df_train_data

Unnamed: 0,item,user,rating,review,feature
0,0,0,5,this is our second stay at this hotel ; we sta...,"[[location, rooms], [rooms, cleanliness], [roo..."
1,1,0,3,"small , cramped rooms , moldy grout in shower ...","[[rooms], [location, service]]"
2,10,0,3,"the room doors would slam when guests leave , ...","[[rooms, value], [service]]"
3,100,0,2,"first , the old style tv was on top of the clo...","[[rooms, location], [value], [service], [locat..."
4,1000,0,5,the food was exceptional - we really enjoyed t...,"[[service, location], [service], [rooms, servi..."
...,...,...,...,...,...
205590,752,999,5,"the rooms are spacious , quiet , and clean . m...","[[rooms], [service], [service], [service], [lo..."
205591,819,999,5,"room was very nice . bed was comfortable , had...","[[rooms], [rooms], [rooms], [rooms], [rooms], ..."
205592,827,999,3,not really the best stay i ever had . room was...,"[[location], [rooms, cleanliness, value]]"
205593,852,999,3,our room was not as nice as i had hoped . the ...,"[[rooms], [cleanliness]]"


In [34]:
# Display the test DataFrame (the rendered output below is abbreviated to its first and last rows).
df_test_data

Unnamed: 0,item,user,rating,review,feature
0,1111,0,2,when i mentioned this to the front desk they d...,[[service]]
1,1379,0,3,"the service was good . our room , was not the ...","[[service], [rooms], [cleanliness], [cleanline..."
2,1391,0,5,we stayed at the signature for four days to ce...,"[[value], [service]]"
3,1579,0,4,the lake buena vista is a perfect place to sta...,[[location]]
4,1689,0,5,summer ( at the front desk ) was perfect ! she...,"[[service], [value, location], [service, rooms]]"
...,...,...,...,...,...
19439,0,999,5,"this was a pleasant place , and with our annua...","[[value], [rooms], [rooms]]"
19440,128,999,5,we enjoyed our stay at the hilton very much ! ...,"[[location], [service], [rooms]]"
19441,429,999,5,from the moment we arrived at the front desk u...,[[service]]
19442,816,999,4,"wifi gratuit , nous n avons pas essayé le brea...",[[service]]


In [35]:
# Report how many distinct users and items occur in the train split.
# dropna=False keeps the count identical to len(unique()), which would include a missing value.
n_train_users = df_train_data['user'].nunique(dropna=False)
n_train_items = df_train_data['item'].nunique(dropna=False)
print("number of user in trainset: {}".format(n_train_users))
print("number of item in trainset: {}".format(n_train_items))

number of user in trainset: 4950
number of item in trainset: 4493


## Check the already labeled features

In [36]:
# Load the attribute vocabulary from ta_output/attributes.json.
# NOTE(review): the container type (list vs. dict) is not visible here; later cells only
# iterate over it and take its len().
attribute_file = os.path.join(dir_path, 'ta_output/attributes.json')
# Decode explicitly as UTF-8 so the parse does not depend on the platform's default encoding.
with open(attribute_file, 'r', encoding='utf-8') as f:
    print("Load file: {}".format(attribute_file))
    attributes_vocab = json.load(f)

Load file: ../Dataset/tripadvisor/ta_output/attributes.json


In [37]:
# Load the aspect vocabulary from ta_output/aspects.json.
# NOTE(review): presumably a dict keyed by aspect word (going by the name) - confirm; later
# cells only iterate over it and take its len().
aspect_file = os.path.join(dir_path, 'ta_output/aspects.json')
# Decode explicitly as UTF-8 so the parse does not depend on the platform's default encoding.
with open(aspect_file, 'r', encoding='utf-8') as f:
    print("Load file: {}".format(aspect_file))
    aspect_dict = json.load(f)

Load file: ../Dataset/tripadvisor/ta_output/aspects.json


In [39]:
# Individual punctuation characters, for per-character membership tests.
punct_chars = set(punct)


def _is_punct_only(word):
    """Return True if `word` is empty or consists solely of punctuation characters.

    The earlier check `word in punct` was a substring search against the punctuation
    string: it caught single characters such as "$" but let multi-character tokens
    like "--" or "..." through. Every word the old check removed is still removed.
    """
    return all(ch in punct_chars for ch in word)


# Merge attribute and aspect words into one feature set, dropping punctuation-only entries.
# Attributes are processed first, then aspects, so the "Remove:" lines keep their order.
feature_set = set()
for vocab in (attributes_vocab, aspect_dict):
    for word in vocab:
        if _is_punct_only(word):
            print("Remove: ", word)
        else:
            feature_set.add(word)
print("Number of attributes: {}".format(len(attributes_vocab)))
print("Number of aspects: {}".format(len(aspect_dict)))
print("--"*10+"Features are a combination of attributes and aspects"+"--"*10)
# The set de-duplicates words that occur in both vocabularies.
print("Number of features: {}".format(len(feature_set)))

Remove:  $
Remove:  %
Number of attributes: 440
Number of aspects: 159
--------------------Features are a combination of attributes and aspects--------------------
Number of features: 505


In [13]:
# from spacy.lang.en import English

# nlp = English()
# nlp.add_pipe(nlp.create_pipe('sentencizer'))
# doc = nlp("the pork chop itself is moist and tender . everything was great .")
# assert len(list(doc.sents)) == 2
# print(list(doc.sents))
# print(list(doc.sents)[0].text)

In [14]:
# For every train review, count how many distinct feature words occur among its tokens.
cnt_line_without_features = 0   # reviews containing no feature word at all
cnt_rvw_features = list()       # per-review counts, only for reviews with at least one feature
for line_no, rvw_text in enumerate(df_train_data['review'], start=1):
    # Tokenize once into a set: the intersection replaces one linear scan of the token
    # list per feature word and yields the same count of distinct feature words.
    rvw_tokens = set(word_tokenize(rvw_text))
    num_rvw_features = len(feature_set & rvw_tokens)
    if num_rvw_features == 0:
        cnt_line_without_features += 1
    else:
        cnt_rvw_features.append(num_rvw_features)

    # Progress report every 50,000 reviews, based on position rather than index label.
    if line_no % 50000 == 0:
        print("{} lines".format(line_no))

print("Number of reviews that has no feature words: {}".format(cnt_line_without_features))
# The average covers only reviews with at least one feature word (zero-feature reviews are
# not appended above). Guard the empty case so np.mean does not warn on an empty list.
avg_rvw_features = np.mean(cnt_rvw_features) if cnt_rvw_features else float('nan')
print("Avg number of features per review: {}".format(avg_rvw_features))


50000 lines
100000 lines
150000 lines
200000 lines
Number of reviews that has no feature words: 0
Avg number of features per review: 9.114409397115688


In [24]:
# Number of train reviews with at least one feature word (only those were appended to the list).
len(cnt_rvw_features)

205595

In [15]:
# Tally how often each feature word occurs across the tokens of all train reviews.
# Every token occurrence is counted, so a word repeated within one review adds more than once.
feature_count = dict()
for review_text in df_train_data['review']:
    for token in word_tokenize(review_text):
        if token not in feature_set:
            continue
        feature_count[token] = feature_count.get(token, 0) + 1
# One dictionary entry exists per distinct feature word that was encountered.
cnt_features = len(feature_count)
print("Totally {} features appear in train-set reviews.".format(cnt_features))


Totally 505 features appear in train-set reviews.


In [16]:
# Order features from most to least frequent; equal counts keep their original relative order.
sorted_feat_counts = sorted(feature_count.items(), key=lambda pair: pair[1], reverse=True)
# Assign each feature word its rank as a string id, and build the inverse lookup alongside.
feature2id_vocab = {}
id2feature_vocab = {}
for rank, (word, _count) in enumerate(sorted_feat_counts):
    feature_id = str(rank)
    feature2id_vocab[word] = feature_id
    id2feature_vocab[feature_id] = word

In [17]:
# Persist both vocabulary mappings (word -> id and id -> word) as JSON files.
feature2id_file = "../Dataset/tripadvisor/train/feature/feature2id.json"
id2feature_file = "../Dataset/tripadvisor/train/feature/id2feature.json"
for out_file, vocab in ((feature2id_file, feature2id_vocab),
                        (id2feature_file, id2feature_vocab)):
    # Create the output directory if needed; open(..., 'w') fails when it is missing.
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    with open(out_file, 'w', encoding='utf-8') as f:
        print("Write file: {}".format(out_file))
        json.dump(vocab, f)


Write file: ../Dataset/tripadvisor/train/feature/feature2id.json
Write file: ../Dataset/tripadvisor/train/feature/id2feature.json


In [19]:
# Copy the sorted (word, count) pairs into a dict, asserting that words are unique and that
# the counts never increase from one entry to the next.
# NOTE(review): despite the "df" name, the counts come from sorted_feat_counts, which the
# counting cell fills with one increment per token occurrence - confirm whether a
# per-review document frequency was intended.
feature2df = dict()
last_df = -1
for feat_word, feat_cnt in sorted_feat_counts:
    assert feat_word not in feature2df
    # The ordering check is skipped while last_df is not positive (initially the -1 sentinel).
    assert last_df <= 0 or last_df >= feat_cnt
    last_df = feat_cnt
    feature2df[feat_word] = feat_cnt

In [20]:
# Save the feature -> count mapping as a JSON file.
feature2df_file = "../Dataset/tripadvisor/train/feature/feature2df.json"
with open(feature2df_file, mode='w') as out_fp:
    print("Write file: {}".format(feature2df_file))
    json.dump(feature2df, fp=out_fp)

Write file: ../Dataset/tripadvisor/train/feature/feature2df.json


# Check Whether All Test Features Appear in the Train Feature Set

In [25]:
# For every test review, count how many distinct words of the train feature vocabulary
# (the keys of feature2id_vocab) occur among its tokens.
cnt_line_without_features_testset = 0   # test reviews containing no train feature word
cnt_rvw_features_testset = list()       # per-review counts, only for reviews with >= 1 feature
# Build the vocabulary set once, outside the loop.
train_feature_words = set(feature2id_vocab)
for line_no, rvw_text in enumerate(df_test_data['review'], start=1):
    # The set intersection replaces one linear scan of the token list per vocabulary word
    # and yields the same count of distinct feature words.
    rvw_tokens = set(word_tokenize(rvw_text))
    num_rvw_features = len(train_feature_words & rvw_tokens)
    if num_rvw_features == 0:
        cnt_line_without_features_testset += 1
    else:
        cnt_rvw_features_testset.append(num_rvw_features)
    # Progress report every 10,000 reviews, based on position rather than index label.
    if line_no % 10000 == 0:
        print("{} lines".format(line_no))
# Use len() rather than the loop variable: with an empty frame the loop variable would be
# stale from an earlier cell (or undefined). For the default 0..n-1 index the value is unchanged.
print("Finish! Totally {} lines.".format(len(df_test_data)))
print("Number of reviews that has no feature words: {}".format(cnt_line_without_features_testset))
# The average covers only reviews with at least one feature word. Guard the empty case so
# np.mean does not warn on an empty list.
avg_rvw_features_testset = np.mean(cnt_rvw_features_testset) if cnt_rvw_features_testset else float('nan')
print("Avg number of features per review: {}".format(avg_rvw_features_testset))

10000 lines
Finish! Totally 19444 lines.
Number of reviews that has no feature words: 0
Avg number of features per review: 9.11335116231228


In [26]:
# Number of test reviews with at least one train feature word (only those were appended to the list).
len(cnt_rvw_features_testset)

19444