# Download Amazon Reviews Data

This Amazon Reviews dataset was collected by the McAuley Lab at the University of California, San Diego (https://amazon-reviews-2023.github.io/main.html). 

It includes **user reviews, item metadata, and various product links.**

The Amazon Reviews dataset has 34 categories. This project will focus on the Sports and Outdoors category, which has 10.3 million users, 1.6 million items, and 19.6 million ratings. 	

### Citation 
    @article{hou2024bridging,
      title={Bridging Language and Items for Retrieval and Recommendation},
      author={Hou, Yupeng and Li, Jiacheng and He, Zhankui and Yan, An and Chen, Xiusi and McAuley, Julian},
      journal={arXiv preprint arXiv:2403.03952},
      year={2024}
    }					

In [None]:
import json
import time

import numpy as np 
import pandas as pd
import os

import spacy 
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS as stopwords
from scipy.sparse import vstack

from scipy.sparse import save_npz

## Load Reviews and Meta Data

Read the JSON Lines file in chunks and combine the chunks into one dataset. 

In [None]:
# Locations of the raw JSON Lines files, relative to this notebook.
# Both names are rebound to the loaded DataFrames by the loading cells below.
meta = "../data/meta_Sports_and_Outdoors.jsonl"
reviews = "../data/Sports_and_Outdoors.jsonl"

### Reviews Data

In [None]:
def process_reviews_chunks(file, chunksize=100000, max_chunks=None):
    """
    Load a reviews JSON Lines file chunk by chunk into one DataFrame.

    Every chunk gets a 'product_name' column, computed by applying
    ``extract_product_name`` (defined outside this cell) to its 'details'
    column, before it is collected.

    Args:
        file: Path to the JSON Lines file.
        chunksize: Number of lines read per chunk.
        max_chunks: Read at most this many chunks, to work with a subset.
            ``None`` (the default) reads the whole file.

    Returns:
        The processed chunks concatenated under a fresh 0..n-1 index, or an
        empty DataFrame when no chunk was read.

    Raises:
        ValueError: If ``max_chunks`` is negative.
    """
    from itertools import islice

    dfs = []
    total_rows = 0

    # The chunked reader keeps the file open; the context manager closes it
    # even when we stop before the end of the file or an error is raised.
    with pd.read_json(file, lines=True, chunksize=chunksize) as chunks:
        # islice(..., None) iterates everything; otherwise it stops after
        # max_chunks chunks without parsing an extra one.
        for chunk in islice(chunks, max_chunks):
            # Extract the product name from the 'details' column
            chunk['product_name'] = chunk['details'].apply(extract_product_name)

            dfs.append(chunk)
            total_rows += len(chunk)
            print(f"{len(chunk)} rows added")

    print("Done")
    print(f"Total rows: {total_rows}")

    # pd.concat raises on an empty list, so handle "nothing read" explicitly
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Time the full load. `reviews` enters this cell as the file path and leaves
# it as the loaded DataFrame.
start = time.process_time()
reviews = process_reviews_chunks(reviews)
end = time.process_time()
elapsed_time = end - start

# Report the CPU time used, the number of rows, and the available columns
print(
    f'Execution time: {elapsed_time} seconds',
    f'Length of Reviews DataFrame: {len(reviews)}',
    reviews.columns,
    sep='\n',
)

### Item Metadata

In [None]:
def process_meta_chunks(file, chunksize = 100000, max_chunks=None):
    """
    Load a metadata JSON Lines file chunk by chunk into one DataFrame.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (``encoding_errors='ignore'``).

    Args:
        file: Path to the JSON Lines file.
        chunksize: Number of lines read per chunk.
        max_chunks: Read at most this many chunks, to work with a subset.
            ``None`` (the default) reads the whole file.

    Returns:
        The chunks concatenated under a fresh 0..n-1 index, or an empty
        DataFrame when no chunk was read.

    Raises:
        ValueError: If ``max_chunks`` is negative.
    """
    from itertools import islice

    dfs = []
    total_rows = 0

    # The chunked reader keeps the file open; the context manager closes it
    # even when we stop before the end of the file or an error is raised.
    with pd.read_json(file, lines=True, chunksize=chunksize,
                      encoding='utf-8', encoding_errors='ignore') as chunks:
        # islice(..., None) iterates everything; otherwise it stops after
        # max_chunks chunks without parsing an extra one.
        for chunk in islice(chunks, max_chunks):
            dfs.append(chunk)
            print(f"{len(chunk)} rows added")
            total_rows += len(chunk)

    print("Done")
    print(f"Total rows: {total_rows}")

    # pd.concat raises on an empty list, so handle "nothing read" explicitly
    if not dfs:
        return pd.DataFrame()
    return pd.concat(dfs, ignore_index=True)

In [None]:
# Time the full load. `meta` enters this cell as the file path and leaves it
# as the loaded DataFrame.
start = time.process_time()
meta = process_meta_chunks(meta)
end = time.process_time()
elapsed_time = end - start

# Report the CPU time used, the number of rows, and the available columns
print(
    f'Execution time: {elapsed_time} seconds',
    f'Length of Meta DataFrame: {len(meta)}',
    meta.columns,
    sep='\n',
)

### Feature Engineering

In [None]:
# Derive the calendar year of each review from its datetime 'timestamp' column
review_timestamps = reviews['timestamp']
reviews['year'] = review_timestamps.dt.year

## Merge reviews and metadata dataframes and export to csv

In [None]:
# Join each review to its item's metadata on the shared 'parent_asin' key.
# NOTE(review): reviews_subset and meta_subset are not defined in the cells
# visible in this notebook — confirm where they are created before running.
merge_df = pd.merge(reviews_subset, meta_subset, on='parent_asin')

# Overlapping column names get pandas' default suffixes: _x for the left
# (reviews) frame and _y for the right (metadata) frame. Keep the review
# title and the metadata images, and drop columns that are not needed.
merge_df = merge_df.drop(
    columns=['title_y', 'images_x', 'Unnamed: 0',
             'bought_together', 'subtitle', 'author']
).rename(columns={'title_x': 'title', 'images_y': 'images'})

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly rather than wrapped in print().
merge_df.info()
print('---------------')

# Export to csv
merge_df.to_csv('../data/merge_df.csv')
print('Exported merged DataFrame')