## Setting up

In [0]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
cd /content/drive/My\ Drive/Colab\ Notebooks/data

/content/drive/My Drive/Colab Notebooks/data


In [0]:
import json
import re
from random import randrange, sample, shuffle

## Data Loading

In [0]:
def review_obj_to_dict(review_obj):
  """Decode one serialized JSON review record.

  Leading and trailing whitespace (for example the newline left on a line
  read from a file) is stripped before the text is handed to `json.loads`.
  Returns whatever `json.loads` produces for that text.
  """
  return json.loads(review_obj.strip())

## Data Cleaning

In [0]:
def clean_review_text(text):
  """Normalize raw review text into a single line of plain text.

  Steps, in order:
    1. HTML tags and HTML character entities (named, decimal, or hex) are
       each replaced by a space. Entities are dropped, not decoded.
    2. Anything starting with ``http`` up to the next whitespace is removed.
    3. Runs of tabs, newlines, and carriage returns become one space.
    4. Runs of spaces are collapsed to one space.
    5. Leading and trailing whitespace is stripped.

  Args:
    text: The raw review string.

  Returns:
    The cleaned string, containing no tab, newline, or carriage-return
    characters.
  """
  text = re.sub('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', ' ', text) # remove HTML tags and entities
  text = re.sub(r'http\S+', '', text) # remove hyperlinks
  # '\r' is included so that CR / CRLF line endings inside a review cannot
  # split a record when the text is later stored one review per line.
  text = re.sub('[\t\n\r]+', ' ', text) # remove tabs, newlines, carriage returns
  text = re.sub(' +', ' ', text) # remove multiple spaces
  return text.strip() # strip whitespace at the ends

In [0]:
def load_review_text(filename):
    """Load and clean the review text from a JSON-lines review file.

    Each non-blank line of the file is parsed with `review_obj_to_dict`.
    The review text is read from the 'review_text' key when `filename` is
    exactly 'books.json', and from 'reviewText' for any other filename.
    Records that do not contain the key are skipped.

    Args:
        filename: Path of the file to read; one JSON object per line.

    Returns:
        A list of review strings, each passed through `clean_review_text`,
        in file order.
    """
    key = 'review_text' if filename == 'books.json' else 'reviewText'
    reviews = []
    # JSON text is UTF-8; stating it avoids depending on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of readlines(), so the raw lines
        # are not all held in memory alongside the result list.
        for line in f:
            if not line.strip():
                # A blank line (e.g. at end of file) is not a JSON document.
                continue
            review_dict = review_obj_to_dict(line)
            if key in review_dict:
                reviews.append(clean_review_text(review_dict[key]))
    return reviews

In [0]:
# Load and clean the review text of each of the eight categories.
# Filenames are bare, so they resolve against the current working directory.
# Each result is bound separately so categories that loaded successfully stay
# available if a later file fails.
all_food_reviews = load_review_text('food.json')
all_music_reviews = load_review_text('music.json')
all_movies_reviews = load_review_text('movies.json')
all_instruments_reviews = load_review_text('instruments.json')
all_toys_reviews = load_review_text('toys.json')
all_clothing_reviews = load_review_text('clothing.json')
all_sports_reviews = load_review_text('sports.json')
all_books_reviews = load_review_text('books.json')

In [0]:
# Number of reviews loaded per category, in load order.
list(map(len, (
    all_food_reviews,
    all_music_reviews,
    all_movies_reviews,
    all_instruments_reviews,
    all_toys_reviews,
    all_clothing_reviews,
    all_sports_reviews,
    all_books_reviews,
)))

[1143470, 169623, 1697533, 231344, 1827796, 278677, 296337, 1378033]

The datasets vary widely in size, so we draw the same number of reviews (10,000) from each category to keep the classes balanced. Rather than taking the first or the last 10,000 entries, we select 10,000 entries at random, so that the sample is not biased by the order of the records in each file.

In [0]:
def get_random_indices(max_range, size=10000):
    """Return `size` distinct random integers drawn from ``range(max_range)``.

    Args:
        max_range: Exclusive upper bound of the integers to draw from.
        size: How many distinct integers to draw (default 10000).

    Returns:
        A set of `size` distinct integers, each in ``[0, max_range)``.

    Raises:
        AssertionError: If `max_range` is smaller than `size`.
    """
    assert max_range >= size
    if size <= 0:
        # A non-positive size yields an empty set rather than an error.
        return set()
    # random.sample draws without replacement in one pass; repeatedly drawing
    # and discarding duplicates gets very slow as size approaches max_range.
    return set(sample(range(max_range), size))

In [0]:
def get_reviews_subset(reviews_list, size=10000):
    """Return a random sample of distinct entries from `reviews_list`.

    Positions are chosen with `get_random_indices`, so no position is picked
    twice. The order of the returned items follows the iteration order of
    the index set and should not be relied on.

    Args:
        reviews_list: Indexable sequence of reviews to sample from.
        size: Number of entries to draw (default 10000).

    Returns:
        A list of `size` entries taken from `reviews_list`.
    """
    indices = get_random_indices(len(reviews_list), size)
    return [reviews_list[i] for i in indices]

In [0]:
# Draw one random subset per category, in the same order the data was loaded.
(
    food_reviews,
    music_reviews,
    movies_reviews,
    instruments_reviews,
    toys_reviews,
    clothing_reviews,
    sports_reviews,
    books_reviews,
) = [
    get_reviews_subset(category_reviews)
    for category_reviews in (
        all_food_reviews,
        all_music_reviews,
        all_movies_reviews,
        all_instruments_reviews,
        all_toys_reviews,
        all_clothing_reviews,
        all_sports_reviews,
        all_books_reviews,
    )
]

In [0]:
# Size of each sampled subset, in the same category order as above.
list(map(len, (
    food_reviews,
    music_reviews,
    movies_reviews,
    instruments_reviews,
    toys_reviews,
    clothing_reviews,
    sports_reviews,
    books_reviews,
)))

[10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000]

Compile all subsets into one big dataset.

In [0]:
# Pair every sampled review with its category label, category by category.
all_reviews = [
    (label, review)
    for label, category_reviews in (
        ('food', food_reviews),
        ('music', music_reviews),
        ('movies', movies_reviews),
        ('instruments', instruments_reviews),
        ('toys', toys_reviews),
        ('clothing', clothing_reviews),
        ('sports', sports_reviews),
        ('books', books_reviews),
    )
    for review in category_reviews
]

len(all_reviews)

80000

Shuffle the dataset to randomize the order of the labels. 

In [0]:
# random.shuffle reorders the list in place, so re-running this cell shuffles
# the already-shuffled list again. No seed is set in the cells visible here,
# so the resulting order differs from run to run.
shuffle(all_reviews)

Write the dataset to a text file, one example per line in the form `label<TAB>review`.

In [0]:
def write_dataset(reviews, combined_file):
    """Write labeled reviews to a tab-separated text file.

    Each (label, review) pair becomes one line of the form
    ``label<TAB>review``. Any existing file at `combined_file` is
    overwritten. Tabs or newlines inside a label or review are written
    as-is and would break the one-record-per-line layout.

    Args:
        reviews: Iterable of (label, review) pairs.
        combined_file: Path of the output file.
    """
    # Explicit UTF-8 so non-ASCII review text is written identically on every
    # platform instead of depending on the locale's default encoding.
    with open(combined_file, 'w', encoding='utf-8') as f:
        f.writelines(f"{label}\t{review}\n" for label, review in reviews)

In [0]:
# The bare filename means the file is created in the current working directory.
write_dataset(all_reviews, 'reviews_8_categories.txt')