In [2]:
# This notebook processes Amazon reviews derived from the datasets available here:
# http://jmcauley.ucsd.edu/data/amazon/

import orjson

In [3]:
# This dataset consists of a list of new-line separated entries. Entries are maps with the following keys:
#  reviewerID
#  asin
#  reviewerName
#  helpful (list of 2 numbers)
#  reviewText
#  overall (score as a float. always [1,2,3,4,5])
#  summary
#  unixReviewTime
#  reviewTime

# Converts the given entry with the above format into a new map with the following keys:
#  sentence (string)
#  label (integer)
def process_entry(entry, min_helpful=0, max_words=0):
    if not 'helpful' in entry:
        return None
    if entry['helpful'][0] < min_helpful:
        return None
    if entry['reviewText'].count(' ')+1 > max_words:
        return None
    return {'sentence': entry['reviewText'],
            'label': int(entry['overall']),
            }



def process_file(filepath, result):
    with open(filepath) as file:
        line = file.readline()
        result = [[], [], [], [], []]
        while line:
            pent = process_entry(orjson.loads(line), min_helpful=5, max_words=300)
            if pent:
                result[pent['label']-1].append(pent)
            line = file.readline()

#def balance_reviews(review_list):


In [7]:
import random
import glob, os

# Fetch the reviews.
folder = "C:/Users/jbetk/Documents/data/ml/sentiment_analysis/amazon_reviews/"
os.chdir(folder)
reviews = [[], [], [], [], []]
for file in glob.glob("*.json"):
    print("Processing file " + file)
    process_file(file, reviews)

# Normalize the actual ratings values so there are equal training samples for each value.
rating_counts = [0] * 5
for (i, rl) in enumerate(reviews):
    rating_counts[i] = len(rl)

min_rating_count = min(rating_counts)
balanced_reviews = []
validation_reviews = []
for rl in reviews:
    validation_reviews.extend(rl[0:100])
    balanced_reviews.extend(random.choices(rl[100:], k=min_rating_count-100))
    
# Shuffle the reviews
random.shuffle(balanced_reviews)

Processing file Books_5.json
Processing file CDs_and_Vinyl_5.json
Processing file Clothing_Shoes_and_Jewelry_5.json
Processing file Digital_Music_5.json
Processing file Electronics_5.json
Processing file Health_and_Personal_Care_5.json
Processing file Home_and_Kitchen_5.json
Processing file processed.json
Processing file validation.json
Processing file Video_Games_5.json


In [8]:
print(min_rating_count)

94159


In [None]:
with open(folder + "processed.json", "wb") as output_file:
    output_file.write(orjson.dumps(balanced_reviews))
    output_file.close()
    
with open(folder + "validation.json", "wb") as output_file:
    output_file.write(orjson.dumps(validation_reviews))
    output_file.close()


In [None]:
random.choices(reviews[0], k=10)
