In [0]:
#%pip install torch==2.1.0

In [0]:
import boto3
import matplotlib as mpl
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, col, isnan
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType



In [0]:
import os

# Credentials and region used to reach the S3 bucket. Replace the two
# placeholders with your own AWS Access Key / Secret Key, and set the region
# to the one where your files are stored.
# NOTE(review): never commit real keys in a notebook; prefer loading them from
# a secrets store or pre-set environment variables.
access_key = 'My Access Key'
secret_key = 'My Secret Key'
aws_region = "us-east-2"

# Export the keys as environment variables so boto3 can pick them up later
os.environ.update({
    'AWS_ACCESS_KEY_ID': access_key,
    'AWS_SECRET_ACCESS_KEY': secret_key,
})

# Copy of the secret key with every "/" percent-encoded as "%2F"
encoded_secret_key = "%2F".join(secret_key.split("/"))

In [0]:
# Pass the credentials and the regional S3 endpoint to the Hadoop
# configuration under the fs.s3a.* keys.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", access_key)
hadoop_conf.set("fs.s3a.secret.key", secret_key)
hadoop_conf.set("fs.s3a.endpoint", f"s3.{aws_region}.amazonaws.com")

In [0]:
# Location of the multilingual US reviews TSV in the landing area
file_path = 's3://amazon-reviews-ma/landing/amazon_reviews_multilingual_US_v1_00.tsv'

# Read the tab-separated file; the first row is the header and column types
# are inferred from the data.
# NOTE(review): `spark` is used here although the SparkSession is only built
# in the following cell, so this relies on the runtime predefining `spark`.
sdf = (
    spark.read
    .options(sep='\t', header=True, inferSchema=True)
    .csv(file_path)
)


In [0]:
# Build the Spark session for this analysis, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("AmazonReviewsAnalysis")
    .getOrCreate()
)

In [0]:
# Preview the first 20 rows of the raw data (long cell values are truncated)
sdf.show(n=20, truncate=True)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   53096384| R63J84G1LOX6R|1563890119|     763187671|The Sandman Vol. ...|           Books|          4|            0|          1|   N|                N|ignore the review...|this is the first...| 1995-08-13|
|         US|   53096399|R1BALOA11Z06MT|1559947608|     381720534|The 22 Immutable ...|           Books|          4|    

In [0]:
# Total number of rows in the loaded DataFrame
row_count = sdf.count()
row_count

Out[9]: 6931166

In [0]:
# Basic statistics for the numeric rating / vote columns
numeric_columns = ["star_rating", "helpful_votes", "total_votes"]
sdf.select(*numeric_columns).summary("count", "min", "max", "mean").show()

# Same overview for the review headline and review body, the text columns we
# are planning to use for our model
text_columns = ["review_headline", "review_body"]
sdf.select(*text_columns).summary("count", "min", "max").show()

+-------+-----------------+------------------+-----------------+
|summary|      star_rating|     helpful_votes|      total_votes|
+-------+-----------------+------------------+-----------------+
|  count|          6931165|           6931165|          6931165|
|    min|                1|                 0|                0|
|    max|                5|             27550|            28727|
|   mean|4.306755934969085|2.0380304032583267|3.241855878485074|
+-------+-----------------+------------------+-----------------+

+-------+--------------------+--------------------+
|summary|     review_headline|         review_body|
+-------+--------------------+--------------------+
|  count|             6931154|             6930594|
|    min|\tBeautiful!\t201...|\bPlease make oth...|
|    max|🙌🙌🙌🙌🙌🙌🙌🙌?...|🚾🛁🗽🏡🗿🏧🏬🏈?...|
+-------+--------------------+--------------------+



In [0]:
# Count, per column we might use, how many rows hold a NULL or NaN value
columns_to_check = [
    "star_rating",
    "review_body",
    "review_headline",
    "review_id",
    "helpful_votes",
    "total_votes",
]
missing_counts = [
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in columns_to_check
]
sdf.select(missing_counts).show()

+-----------+-----------+---------------+---------+-------------+-----------+
|star_rating|review_body|review_headline|review_id|helpful_votes|total_votes|
+-----------+-----------+---------------+---------+-------------+-----------+
|          1|        572|             19|        0|            1|          1|
+-----------+-----------+---------------+---------+-------------+-----------+



In [0]:
# Report how many review_id values occur more than once (report only; nothing
# is removed here).
review_id_counts = sdf.groupBy('review_id').count()
duplicate_count = review_id_counts.filter("count > 1").count()
print(duplicate_count)
# De-duplication on 'review_id' is currently disabled; uncomment to enable:
#sdf = sdf.dropDuplicates(['review_id'])

0


In [0]:
# Remove every row in which any of the required columns is empty (null or NaN)
required_columns = ["star_rating", "review_body", "review_headline"]
sdf = sdf.dropna(how="any", subset=required_columns)

In [0]:
# Columns we do not expect to need for the analysis
columns_to_drop = [
    "marketplace",
    "product_parent",
    "vine",
]

# Remove them from the DataFrame
sdf = sdf.drop(*columns_to_drop)

In [0]:
# Define a function to strip out any non-ascii characters from review_headlines and review_body
def ascii_only(mystring):
    """Return ``mystring`` with every non-ASCII character removed.

    Args:
        mystring: The text to clean, or ``None``.

    Returns:
        The input with all non-ASCII characters dropped. ``None`` is passed
        through as ``None``. An empty string stays an empty string, so the
        cleaning step never turns a non-null value into a null.
    """
    # Test for None explicitly: a plain truthiness check would also map ""
    # to None, while an all-non-ASCII string would come back as "".
    if mystring is None:
        return None
    return mystring.encode('ascii', 'ignore').decode('ascii')
# Wrap ascii_only as a Spark User-Defined Function (UDF)
ascii_udf = udf(ascii_only)

# Add an ASCII-only copy of each text column next to the original
cleaned_to_source = {
    "clean_review_headline": "review_headline",
    "clean_review_body": "review_body",
}
for cleaned_name, source_name in cleaned_to_source.items():
    sdf = sdf.withColumn(cleaned_name, ascii_udf(source_name))

# Re-check the cleaned headline and body
cleaned_summary = sdf.select("clean_review_headline", "clean_review_body").summary(
    "count", "min", "max"
)
cleaned_summary.show()

+-------+---------------------+--------------------+
|summary|clean_review_headline|   clean_review_body|
+-------+---------------------+--------------------+
|  count|              6930583|             6930583|
|    min|                     |                    |
|    max| ~~~~~~~ A MOVIE F...|~~~~~~~~~~~~~~~~~...|
+-------+---------------------+--------------------+



In [0]:
# Show the two new cleaned text columns in full (no truncation of cell values)
cleaned_columns = ["clean_review_headline", "clean_review_body"]
sdf.select(*cleaned_columns).show(truncate=False)

+--------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|clean_review_headline                                   |clean_review_body                                                                                                                                                                                                  

In [0]:
# Spot-check the cleaned columns on just five rows, untruncated
cleaned_sample = sdf.select("clean_review_headline", "clean_review_body").limit(5)
cleaned_sample.show(truncate=False)

+-------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|clean_review_headline                                  |clean_review_body                                                                                                                                                                                                                                                                                                                                                                                                         |
+-------------------------------------------------------+-----

In [0]:
# Sanity check: count the NULL values in each newly created cleaned column
cleaned_columns = ['clean_review_headline', 'clean_review_body']
null_counts = [count(when(col(c).isNull(), c)).alias(c) for c in cleaned_columns]
null_values = sdf.select(null_counts)

null_values.show()

+---------------------+-----------------+
|clean_review_headline|clean_review_body|
+---------------------+-----------------+
|                    0|                0|
+---------------------+-----------------+



In [0]:
# Preview the first 20 rows after cleaning (long cell values are truncated)
sdf.show(n=20, truncate=True)

+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+-----------------+--------------------+--------------------+-----------+---------------------+--------------------+
|customer_id|     review_id|product_id|       product_title|product_category|star_rating|helpful_votes|total_votes|verified_purchase|     review_headline|         review_body|review_date|clean_review_headline|   clean_review_body|
+-----------+--------------+----------+--------------------+----------------+-----------+-------------+-----------+-----------------+--------------------+--------------------+-----------+---------------------+--------------------+
|   53096384| R63J84G1LOX6R|1563890119|The Sandman Vol. ...|           Books|          4|            0|          1|                N|ignore the review...|this is the first...| 1995-08-13| ignore the review...|this is the first...|
|   53096399|R1BALOA11Z06MT|1559947608|The 22 Immutable ...|           Books

In [0]:
# Summary of the original review headline and review body
original_text_columns = ["review_headline", "review_body"]
sdf.select(*original_text_columns).summary("count", "min", "max").show()

In [0]:
# Summary of the cleaned review headline and body, for comparison with the
# originals
cleaned_text_columns = ["clean_review_headline", "clean_review_body"]
sdf.select(*cleaned_text_columns).summary("count", "min", "max").show()

In [0]:
# Source file: s3://amazon-reviews-ma/landing/amazon_reviews_multilingual_US_v1_00.tsv

In [0]:
# Define a function to clean the DataFrame
def clean_amazon_reviews(file_path, output_file_path_s3):
    """Clean one Amazon reviews TSV file and write the result as Parquet.

    The file is read as tab-separated text with a header row and an inferred
    schema. Rows with a null star_rating, review_body or review_headline are
    dropped, the marketplace, product_parent and vine columns are removed, and
    ASCII-only copies of the headline and body are added as
    clean_review_headline and clean_review_body.

    Args:
        file_path: Path of the source TSV file.
        output_file_path_s3: Path the Parquet output is written to; anything
            already there is overwritten.

    Returns:
        ``output_file_path_s3``, unchanged.
    """
    required_columns = ["star_rating", "review_body", "review_headline"]
    columns_to_drop = ["marketplace", "product_parent", "vine"]

    def ascii_only(mystring):
        """Drop non-ASCII characters; None stays None and "" stays ""."""
        # Test for None explicitly: a truthiness check would also turn an
        # empty string into None and introduce nulls after the null-drop.
        if mystring is None:
            return None
        return mystring.encode('ascii', 'ignore').decode('ascii')

    ascii_udf = udf(ascii_only, StringType())

    # Reading the file from S3
    sdf = spark.read.csv(file_path, sep='\t', header=True, inferSchema=True)

    # Drop incomplete rows, then the columns that are not needed
    sdf = sdf.na.drop(subset=required_columns)
    sdf = sdf.drop(*columns_to_drop)

    # Keep the original text columns and add ASCII-only copies next to them
    sdf = sdf.withColumn("clean_review_headline", ascii_udf('review_headline'))
    sdf = sdf.withColumn("clean_review_body", ascii_udf('review_body'))

    # Write the cleaned DataFrame back to S3 as a Parquet file
    sdf.write.mode('overwrite').parquet(output_file_path_s3)

    # Return the path to the cleaned data
    return output_file_path_s3

# S3 prefixes for the incoming TSV files and for the cleaned Parquet output
S3_LANDING_PREFIX = 's3://amazon-reviews-ma/landing'
S3_RAW_PREFIX = 's3://amazon-reviews-ma/raw'

# Dataset stems handled by this run. Source and destination paths are derived
# from the stem so the two can never drift apart.
# For run time purposes the files are processed about 10 at a time; stems that
# are commented out are not part of this run.
datasets_to_clean = [
    'amazon_reviews_multilingual_US_v1_00',
    # 'amazon_reviews_us_Automotive_v1_00',
    # 'amazon_reviews_us_Baby_v1_00',
    # 'amazon_reviews_us_Beauty_v1_00',
    # 'amazon_reviews_us_Books_v1_02',
    # 'amazon_reviews_us_Camera_v1_00',
    # 'amazon_reviews_us_Digital_Ebook_Purchase_v1_01',
    # 'amazon_reviews_us_Digital_Music_Purchase_v1_00',
    # 'amazon_reviews_us_Digital_Software_v1_00',
    # 'amazon_reviews_us_Digital_Video_Download_v1_00',
]

# List of source files and their corresponding destination paths for cleaned data
files_to_clean = [
    {
        'source': f'{S3_LANDING_PREFIX}/{stem}.tsv',
        'destination': f'{S3_RAW_PREFIX}/cleaned_{stem}.parquet',
    }
    for stem in datasets_to_clean
]

# Clean every listed file in turn and report where its output was written
for paths in files_to_clean:
    cleaned_path = clean_amazon_reviews(
        file_path=paths['source'],
        output_file_path_s3=paths['destination'],
    )
    print('Cleaned file written to: {}'.format(cleaned_path))

Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_multilingual_US_v1_00.parquet


In [0]:
# Define a function to clean the DataFrame
# NOTE(review): this repeats the definition from the earlier cell; the later
# definition is the one in effect once both cells have run.
def clean_amazon_reviews(file_path, output_file_path_s3):
    """Clean one Amazon reviews TSV file and write the result as Parquet.

    The file is read as tab-separated text with a header row and an inferred
    schema. Rows with a null star_rating, review_body or review_headline are
    dropped, the marketplace, product_parent and vine columns are removed, and
    ASCII-only copies of the headline and body are added as
    clean_review_headline and clean_review_body.

    Args:
        file_path: Path of the source TSV file.
        output_file_path_s3: Path the Parquet output is written to; anything
            already there is overwritten.

    Returns:
        ``output_file_path_s3``, unchanged.
    """
    required_columns = ["star_rating", "review_body", "review_headline"]
    columns_to_drop = ["marketplace", "product_parent", "vine"]

    def ascii_only(mystring):
        """Drop non-ASCII characters; None stays None and "" stays ""."""
        # Test for None explicitly: a truthiness check would also turn an
        # empty string into None and introduce nulls after the null-drop.
        if mystring is None:
            return None
        return mystring.encode('ascii', 'ignore').decode('ascii')

    ascii_udf = udf(ascii_only, StringType())

    # Reading the file from S3
    sdf = spark.read.csv(file_path, sep='\t', header=True, inferSchema=True)

    # Drop incomplete rows, then the columns that are not needed
    sdf = sdf.na.drop(subset=required_columns)
    sdf = sdf.drop(*columns_to_drop)

    # Keep the original text columns and add ASCII-only copies next to them
    sdf = sdf.withColumn("clean_review_headline", ascii_udf('review_headline'))
    sdf = sdf.withColumn("clean_review_body", ascii_udf('review_body'))

    # Write the cleaned DataFrame back to S3 as a Parquet file
    sdf.write.mode('overwrite').parquet(output_file_path_s3)

    # Return the path to the cleaned data
    return output_file_path_s3

# S3 prefixes for the incoming TSV files and for the cleaned Parquet output
S3_LANDING_PREFIX = 's3://amazon-reviews-ma/landing'
S3_RAW_PREFIX = 's3://amazon-reviews-ma/raw'

# Dataset stems handled by this run. Source and destination paths are derived
# from the stem so the two can never drift apart.
# Stems that are commented out are not part of this run.
datasets_to_clean = [
    # 'amazon_reviews_us_Apparel_v1_00',
    # 'amazon_reviews_us_Automotive_v1_00',
    # 'amazon_reviews_us_Baby_v1_00',
    # 'amazon_reviews_us_Beauty_v1_00',
    # 'amazon_reviews_us_Books_v1_02',
    # 'amazon_reviews_us_Camera_v1_00',
    # 'amazon_reviews_us_Digital_Ebook_Purchase_v1_01',
    # 'amazon_reviews_us_Digital_Music_Purchase_v1_00',
    # 'amazon_reviews_us_Digital_Software_v1_00',
    # 'amazon_reviews_us_Digital_Video_Download_v1_00',
    # 'amazon_reviews_us_Digital_Video_Games_v1_00',
    # 'amazon_reviews_us_Electronics_v1_00',
    # 'amazon_reviews_us_Furniture_v1_00',
    # 'amazon_reviews_us_Gift_Card_v1_00',
    # 'amazon_reviews_us_Grocery_v1_00',
    # 'amazon_reviews_us_Health_Personal_Care_v1_00',
    # 'amazon_reviews_us_Major_Appliances_v1_00',
    # 'amazon_reviews_us_Mobile_Apps_v1_00',
    # 'amazon_reviews_us_Mobile_Electronics_v1_00',
    # 'amazon_reviews_us_Music_v1_00',
    # 'amazon_reviews_us_Musical_Instruments_v1_00',
    # 'amazon_reviews_us_Office_Products_v1_00',
    # 'amazon_reviews_us_Outdoors_v1_00',
    # 'amazon_reviews_us_PC_v1_00',
    # 'amazon_reviews_us_Personal_Care_Appliances_v1_00',
    # 'amazon_reviews_us_Pet_Products_v1_00',
    'amazon_reviews_us_Shoes_v1_00',
    'amazon_reviews_us_Software_v1_00',
    'amazon_reviews_us_Sports_v1_00',
    'amazon_reviews_us_Tools_v1_00',
    'amazon_reviews_us_Toys_v1_00',
    'amazon_reviews_us_Video_DVD_v1_00',
    'amazon_reviews_us_Video_Games_v1_00',
    'amazon_reviews_us_Video_v1_00',
    'amazon_reviews_us_Watches_v1_00',
    'amazon_reviews_us_Wireless_v1_00',
]

# List of source files and their corresponding destination paths for cleaned data
files_to_clean = [
    {
        'source': f'{S3_LANDING_PREFIX}/{stem}.tsv',
        'destination': f'{S3_RAW_PREFIX}/cleaned_{stem}.parquet',
    }
    for stem in datasets_to_clean
]

# Clean every listed file in turn and report where its output was written
for paths in files_to_clean:
    cleaned_path = clean_amazon_reviews(
        file_path=paths['source'],
        output_file_path_s3=paths['destination'],
    )
    print('Cleaned file written to: {}'.format(cleaned_path))

Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Shoes_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Software_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Sports_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Tools_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Toys_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Video_DVD_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Video_Games_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Video_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Watches_v1_00.parquet
Cleaned file written to: s3://amazon-reviews-ma/raw/cleaned_amazon_reviews_us_Wireless_v1_00.parquet
