In [1]:
import pyspark.pandas as ps
from pyspark.sql import SparkSession

# Attach to the Spark session that already exists, or start one with default
# settings if there is none; every later cell reads through this handle.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()



In [2]:
# Category names used to build the per-category file paths below
# (`/data/meta_<category>.json` and `/data/<category>.json`).
categories = [
    'AMAZON_FASHION',
    'All_Beauty',
    'Appliances',
    'Arts_Crafts_and_Sewing',
    'Automotive',
    'Books',
    'CDs_and_Vinyl',
    'Cell_Phones_and_Accessories',
    'Clothing_Shoes_and_Jewelry',
    'Digital_Music',
    'Electronics',
    'Gift_Cards',
    'Grocery_and_Gourmet_Food',
    'Home_and_Kitchen',
    'Industrial_and_Scientific',
    'Kindle_Store',
    'Luxury_Beauty',
    'Magazine_Subscriptions',
    'Movies_and_TV',
    'Musical_Instruments',
    'Office_Products',
    'Patio_Lawn_and_Garden',
    'Pet_Supplies',
    'Prime_Pantry',
    'Software',
    'Sports_and_Outdoors',
    'Tools_and_Home_Improvement',
    'Toys_and_Games',
    'Video_Games',
]

## Metadata

In [3]:
# Read one metadata file per category into a pandas-on-Spark DataFrame,
# keyed by category name; the `asin` column becomes the index of each frame.
dataframes = {}
for category in categories:
    dataframes[category] = ps.read_json(f'/data/meta_{category}.json', index_col='asin')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/15 01:26:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/15 01:26:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/01/15 01:26:25 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [4]:
# Tag every row with the category whose file it was read from, so the origin
# is still known once the frames are concatenated.
# NOTE(review): if the raw metadata already carries a `category` field, this
# assignment replaces it — confirm that is intended.
for category in dataframes:
    dataframes[category]['category'] = category

In [5]:
# Columns removed from every category frame before the frames are combined.
dropped_columns = ['similar_item', 'details', 'tech1', 'tech2']
df_list = [
    df_category.drop(columns=dropped_columns)
    for df_category in dataframes.values()
]

In [6]:
# Stack the per-category metadata frames row-wise into a single frame
# (axis=0 / join='outer' are the defaults, spelled out for the reader).
df = ps.concat(df_list, axis=0, join='outer')

In [7]:
# Persist the combined metadata as Parquet; `index_col` writes the index out
# as a column named `asin`.
df.to_parquet(path='/data/meta.parquet', index_col='asin')

                                                                                

In [None]:
# Same combined metadata, exported as JSON with the index written as `asin`.
df.to_json(path='/data/meta.json', index_col='asin')

In [None]:
# Export the combined metadata as CSV, writing only the flat columns: Spark's
# CSV writer cannot serialise nested values, so array/struct/map columns are
# left out.
#
# A pandas-on-Spark DataFrame has no `.schema` attribute (attribute access
# falls back to a column lookup and raises AttributeError), so the Spark schema
# is obtained through the `.spark` accessor instead.
meta_schema = df.spark.schema()
nested_type_names = {'array', 'struct', 'map'}
flat_columns = [
    column for column in df.columns
    if meta_schema[column].dataType.typeName() not in nested_type_names
]
df.to_csv('/data/meta.csv', index_col='asin', columns=flat_columns)

## Review data

In [None]:

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, BooleanType, ArrayType, LongType

# Explicit Spark schema for the per-category review files, so every category
# is read with the same column types instead of relying on inference.
# All fields (top-level and nested) are nullable.

# Keys of the nested `style` struct; each is read as a nullable string.
# NOTE(review): these names must match the keys in the raw JSON exactly
# (including any punctuation) or the field will come back as null — confirm
# against a sample record.
style_keys = [
    'Color',
    'Color Name',
    'Design',
    'Flavor',
    'Format',
    'Item Package Quantity',
    'Package Quantity',
    'Package Type',
    'Pattern',
    'Scent Name',
    'Size',
    'Size Name',
    'Style',
    'Style Name',
]
style_type = StructType([StructField(key, StringType(), True) for key in style_keys])

# (column name, Spark type) pairs in the order they appear in the schema.
review_fields = [
    ('asin', StringType()),
    ('image', ArrayType(StringType())),
    ('overall', DoubleType()),
    ('reviewText', StringType()),
    ('reviewTime', StringType()),
    ('reviewerID', StringType()),
    ('reviewerName', StringType()),
    ('style', style_type),
    ('summary', StringType()),
    ('unixReviewTime', LongType()),
    ('verified', BooleanType()),
    ('vote', StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in review_fields])


In [None]:
# Read each category's review file with the explicit schema, wrap the Spark
# DataFrame as a pandas-on-Spark DataFrame, and index it by (reviewerID, asin).
# Note: this rebinds `dataframes`, which previously held the metadata frames.
dataframes = {}
for category in categories:
    reviews_sdf = spark.read.schema(schema).json(f'/data/{category}.json')
    dataframes[category] = ps.DataFrame(reviews_sdf).set_index(['reviewerID', 'asin'])

In [12]:
# Stack the per-category review frames row-wise into a single frame
# (axis=0 / join='outer' are the defaults, spelled out for the reader).
df = ps.concat([*dataframes.values()], axis=0, join='outer')

In [13]:
# Persist the combined reviews as Parquet; both index levels are written out
# as the columns `reviewerID` and `asin`.
review_index_cols = ['reviewerID', 'asin']
df.to_parquet(path='/data/data.parquet', index_col=review_index_cols)



In [None]:
# Same combined reviews, exported as JSON with both index levels written out
# as the columns `reviewerID` and `asin`.
df.to_json(path='/data/data.json', index_col=['reviewerID', 'asin'])