# Download Amazon Reviews Data

This Amazon Reviews dataset is collected from the University of California's San Diego McAuley Lab (https://amazon-reviews-2023.github.io/main.html). 

It includes **user reviews, item metadata, and various product links.**

The Amazon Reivews dataset has 34 categories. This project will focus on the Home and Kitchen cateogry, which has 23.2 million users, 3.7 million items, and 67.4 million ratings. 	

### Citation 
    @article{hou2024bridging,
      title={Bridging Language and Items for Retrieval and Recommendation},
      author={Hou, Yupeng and Li, Jiacheng and He, Zhankui and Yan, An and Chen, Xiusi and McAuley, Julian},
      journal={arXiv preprint arXiv:2403.03952},
      year={2024}
    }					

In [8]:
import json
from pprint import pprint
import glob
import time

import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

### Creating stopwords list

In [9]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS
print(f'Original stopwords count: {len(stop_words)}')

# Include/ exclude certain words
exclude_stopwords = {'well', 'off', 'very', 'not', 'few', 'much'}
stop_words -= exclude_stopwords
print(f'Updated stopwords count: {len(stop_words)}')

# Remove adjectives from stopwords list using spaCy
exclude_adjectives = {word for word in stop_words if nlp(word)[0].pos_ == "ADJ"}
print('------------')
print(exclude_adjectives)
stop_words -= exclude_adjectives
print(f'Stopwords count: {len(stop_words)}')

# Convert the final stop words set to a list
print('------------')
stop_words = list(stop_words)
print(stop_words[:10])

Original stopwords count: 301
Updated stopwords count: 301
------------
set()
Stopwords count: 301
------------
['is', '‘ll', 'each', 'cannot', 'in', 'after', 'thru', 'some', 'why', 'say']


## Load Reviews and Meta Data

Read the JSON Lines file, vectorize its text content using TF-IDF, and combine the chunks into one dataset. 

In [3]:
reviews = "../data/Home_and_Kitchen.jsonl"
meta = "../data/meta_Home_and_Kitchen.jsonl"

### Reviews Data

In [13]:
def process_reviews_chunks(file, chunksize = 100000):
    """
    This function reads a JSON Lines file in chunks, fits the TF-IDF vectorizer on the first chunk,
    and transforms subsequent chunks. The resulting sparse matrices are combined using stacking.
    """

    # Setting as global variables
    global n, total_rows  
    n = 1 
    total_rows = 0

    tfidf = TfidfVectorizer(stop_words = stop_words, min_df = 2)

    # Read the file in chunks
    chunks = pd.read_json(file, lines=True, chunksize=chunksize, encoding='utf-8', encoding_errors='ignore')
    vectors = []  
    first_chunk = True

    for chunk in chunks:
        if first_chunk: 
            tfidf_matrix = tfidf.fit_transform(chunk['text'])
            first_chunk = False
        else: 
            tfidf_matrix = tfidf.transform(chunk['text'])
        
        vectors.append(tfidf_matrix)
        print(f"{len(chunk)} rows added")
        n += 1 
        total_rows += len(chunk)
            
    print("Done")
    print(f"Total rows: {total_rows}")
    return vstack(vectors)

In [None]:
start = time.process_time()

home_reviews = process_reviews_chunks(reviews)

end = time.process_time()
print(end - start)
print('Execution time:', elapsed_time, 'seconds')



100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added
100000 rows added


### Meta Data

In [None]:
def process_meta_chunks(file, chunksize = 100000):
    """
    Processes chunks of records from metadata JSON Lines file and appends them to a DataFrame.
    """

    # Setting as global variables
    global n, total_rows  
    n = 1 
    total_rows = 0

    # Read the file in chunks
    chunks = pd.read_json(file, lines=True, chunksize=chunksize, encoding='utf-8', encoding_errors='ignore')
    dfs = []  
    n_chunks = 0

    for chunk in chunks:
        dfs.append(chunk)
        print(f"{len(chunk)} rows added")
        n += 1 
        total_rows += len(chunk)
        # Uncomment the code below if you want to work with a subset
        # if n_chunks >= 10:  
        #     break  
            
    print("Done")
    print(f"Total rows: {total_rows}")
    return pd.concat(dfs, ignore_index=True)

In [None]:
start = time.process_time()

home_meta = process_meta_chunks(meta)

end = time.process_time()
print(end - start)
print('Execution time:', elapsed_time, 'seconds')

## Export csv's

In [None]:
# cleaned_text = pd.read_csv('../data/cleaned_subset.csv')
# home_meta = pd.read_csv('../data/home_meta.csv')