# Download Amazon Reviews Data

This Amazon Reviews dataset was collected by the McAuley Lab at the University of California, San Diego (https://amazon-reviews-2023.github.io/main.html).

It includes **user reviews, item metadata, and various product links.**

The Amazon Reviews dataset has 34 categories. This project focuses on the Home and Kitchen category, which has 23.2 million users, 3.7 million items, and 67.4 million ratings.

### Citation 
    @article{hou2024bridging,
      title={Bridging Language and Items for Retrieval and Recommendation},
      author={Hou, Yupeng and Li, Jiacheng and He, Zhankui and Yan, An and Chen, Xiusi and McAuley, Julian},
      journal={arXiv preprint arXiv:2403.03952},
      year={2024}
    }					

In [1]:
import json
import time

import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
import spacy 
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from scipy.sparse import vstack

from scipy.sparse import save_npz

## Loading Reviews and Meta Data

Read the JSON Lines file in chunks and combine the chunks into one dataset. 

In [2]:
# Relative paths to the Home and Kitchen reviews file and its item-metadata file.
# Both are JSON Lines files (one JSON record per line).
reviews = "../data/Home_and_Kitchen.jsonl"
meta = "../data/meta_Home_and_Kitchen.jsonl"

### Reviews Data

In [31]:
def vectorize_json_text(file, chunksize=1000000, max_chunks=2):
    """
    Builds a TF-IDF sparse matrix from the 'text' field of a JSON Lines file, one chunk at a time.

    The TF-IDF vectorizer is fitted on the first chunk only; every later chunk is
    transformed with that same vocabulary and IDF weights, so all chunk matrices
    share the same columns and can be stacked vertically.

    Parameters:
        file: path to the JSON Lines file (one JSON object per line).
        chunksize: number of records vectorized per chunk.
        max_chunks: stop after this many full chunks (default 2, i.e. a subset of
            the file). Pass None to process the whole file.

    Returns:
        A scipy sparse matrix with one row per accepted record, in file order.

    Raises:
        ValueError: if chunksize/max_chunks are not positive, or if the file
            contains no usable records.
    """
    if chunksize < 1:
        raise ValueError("chunksize must be a positive integer")
    if max_chunks is not None and max_chunks < 1:
        raise ValueError("max_chunks must be a positive integer or None")

    tfidf = TfidfVectorizer(stop_words = 'english', min_df = 2)
    vectors = []
    fitted = False
    total_rows = 0
    n_chunks = 0
    n_skipped = 0
    # Only the review text is kept in memory; the rest of each record is not needed here.
    texts = []

    def add_chunk(batch):
        """Vectorize one batch of texts, fitting the vectorizer if it has not been fitted yet."""
        nonlocal fitted, total_rows
        if fitted:
            matrix = tfidf.transform(batch)
        else:
            # Also covers files shorter than one chunk, where the only batch is the remainder.
            matrix = tfidf.fit_transform(batch)
            fitted = True
        vectors.append(matrix)
        total_rows += len(batch)
        print(f"{len(batch)} rows added")

    # Read the JSON lines file manually to handle decoding string errors
    with open(file, 'r', encoding = 'utf-8', errors = 'replace') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Malformed lines are skipped, but counted so the loss is visible.
                n_skipped += 1
                continue

            if not isinstance(data, dict):
                print("Warning: Ignored line - Not a dictionary")
                continue

            # A missing or non-string 'text' becomes an empty document so the record
            # still gets a (all-zero) row instead of crashing the vectorizer.
            text = data.get('text')
            texts.append(text if isinstance(text, str) else '')

            if len(texts) >= chunksize:
                add_chunk(texts)
                texts = []
                n_chunks += 1
                if max_chunks is not None and n_chunks >= max_chunks:
                    break

        # Process any remaining lines (empty when the loop stopped at max_chunks)
        if texts:
            add_chunk(texts)

    print("Done")
    print(f"Total rows: {total_rows}")
    if n_skipped:
        print(f"Skipped {n_skipped} malformed lines")
    if not vectors:
        raise ValueError(f"No usable records found in {file!r}")
    return vstack(vectors)

In [32]:
# Vectorize the reviews file and report how long it took.
# NOTE: time.process_time() returns CPU time (user + system) of the current process
# and does not include time elapsed during sleep, so the value printed below
# can be lower than wall-clock time.
start = time.process_time()

home_sparse_matrix = vectorize_json_text(reviews)

end = time.process_time()
elapsed_time = end - start
print(f'Execution time: {elapsed_time} seconds')

NameError: name 'data' is not defined

In [None]:
# Show the matrix returned by vectorize_json_text as the notebook cell output.
home_sparse_matrix

In [None]:
def process_reviews_chunks(file, chunksize=100000, max_chunks=2):
    """
    Reads a JSON Lines file in chunks, removes the 'text' column from each chunk,
    and returns the chunks concatenated into a single DataFrame.

    Parameters:
        file: path to the reviews JSON Lines file.
        chunksize: number of lines read per chunk.
        max_chunks: stop after this many chunks (default 2, i.e. a subset of
            the file). Pass None to read the whole file.

    Returns:
        A DataFrame with a fresh 0..n-1 index and no 'text' column; an empty
        DataFrame if the file yields no chunks.

    Raises:
        ValueError: if max_chunks is not a positive integer or None.
        KeyError: if a chunk has no 'text' column to drop.
    """
    if max_chunks is not None and max_chunks < 1:
        raise ValueError("max_chunks must be a positive integer or None")

    dfs = []
    total_rows = 0

    # Read the JSON file in chunks. The reader is used as a context manager so
    # the underlying file handle is closed even when the loop stops early.
    with pd.read_json(file, lines=True, chunksize = chunksize) as chunks:
        for n_chunks, chunk in enumerate(chunks, start=1):
            chunk = chunk.drop(columns = 'text')
            dfs.append(chunk)
            total_rows += len(chunk)
            print(f"{len(chunk)} rows added")

            if max_chunks is not None and n_chunks >= max_chunks:
                break

    print("Done")
    print(f"Total rows: {total_rows}")
    if not dfs:
        # pd.concat raises on an empty list; an empty frame is a clearer result.
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Load the reviews (without their 'text' column) and report how long it took.
# NOTE: time.process_time() returns CPU time (user + system) of the current process
# and does not include time elapsed during sleep, so the value printed below
# can be lower than wall-clock time.
start = time.process_time()

home_reviews = process_reviews_chunks(reviews)

end = time.process_time()
elapsed_time = end - start
print(f'Execution time: {elapsed_time} seconds')

### Metadata

In [None]:
def process_meta_chunks(file, chunksize = 100000, max_chunks = 2):
    """
    Reads the metadata JSON Lines file in chunks and returns the chunks
    concatenated into a single DataFrame.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (encoding_errors='ignore').

    Parameters:
        file: path to the metadata JSON Lines file.
        chunksize: number of lines read per chunk.
        max_chunks: stop after this many chunks (default 2, i.e. a subset of
            the file). Pass None to read the whole file.

    Returns:
        A DataFrame with a fresh 0..n-1 index; an empty DataFrame if the file
        yields no chunks.

    Raises:
        ValueError: if max_chunks is not a positive integer or None.
    """
    if max_chunks is not None and max_chunks < 1:
        raise ValueError("max_chunks must be a positive integer or None")

    dfs = []
    total_rows = 0

    # Read the JSON file in chunks. The reader is used as a context manager so
    # the underlying file handle is closed even when the loop stops early.
    with pd.read_json(file, lines=True, chunksize=chunksize,
                      encoding='utf-8', encoding_errors='ignore') as chunks:
        for n_chunks, chunk in enumerate(chunks, start=1):
            dfs.append(chunk)
            print(f"{len(chunk)} rows added")
            total_rows += len(chunk)

            if max_chunks is not None and n_chunks >= max_chunks:
                break

    print("Done")
    print(f"Total rows: {total_rows}")
    if not dfs:
        # pd.concat raises on an empty list; an empty frame is a clearer result.
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Load the item metadata and report how long it took.
# NOTE: time.process_time() returns CPU time (user + system) of the current process
# and does not include time elapsed during sleep, so the value printed below
# can be lower than wall-clock time.
start = time.process_time()

home_meta = process_meta_chunks(meta)

end = time.process_time()
elapsed_time = end - start
print(f'Execution time: {elapsed_time} seconds')

## Export CSVs and sparse matrix

In [None]:
# Save the reviews TF-IDF sparse matrix to disk in .npz format
save_npz('../data/reviews_matrix.npz', home_sparse_matrix)

# Export reviews and meta csv's
# NOTE(review): the two commented-out lines below READ the CSV files with
# pd.read_csv; nothing in this cell writes home_reviews / home_meta to disk.
# Confirm whether an export step (e.g. DataFrame.to_csv) was intended here.
# home_reviews = pd.read_csv('../data/home_reviews.csv')
# home_meta = pd.read_csv('../data/home_metadata.csv')