In [1]:
# library imports 

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType,IntegerType, FloatType,BooleanType,DateType

In [2]:
# Build the Spark session used by the rest of the notebook
# (getOrCreate returns an already-active session when one exists).

spark = (
    SparkSession.builder
    .appName("Amazon Product Review Analysis")
    .getOrCreate()
)

### Load Dataset 

| Column | Description | 
| :--- | :--- |
| marketplace | Two-letter country code of the marketplace where the review was written. |
| customer_id | Random identifier that can be used to aggregate reviews written by a single author. |
| review_id | The unique ID of the review. |
| product_id | The unique Product ID the review pertains to. In the multilingual dataset the reviews for the same product in different countries can be grouped by the same product_id. | 
| product_parent | Random identifier that can be used to aggregate reviews for the same product. |
| product_title | Title of the product. | 
| product_category | Broad product category that can be used to group reviews (also used to group the dataset into coherent parts). | 
| star_rating | The 1-5 star rating of the review. | 
| helpful_votes | Number of helpful votes. | 
| total_votes | Number of total votes the review received. | 
| vine | Review was written as part of the Vine program. |
| verified_purchase | The review is on a verified purchase. |
| review_headline | The title of the review. |
| review_body | The review text. |
| review_date | The date the review was written. | 


DATA FORMAT
Tab ('\t') separated text file, without quote or escape characters.
First line in each file is header; 1 line corresponds to 1 record.


In [9]:
# Load the review sample: a tab-separated text file with a header row and
# one record per line.
review_data = '../data/sample_us.tsv'

# Explicit schema so Spark does not need to infer column types from the file.
# NOTE(review): Spark generally treats columns read from files as nullable
# regardless of the nullable flags below -- confirm with printSchema() before
# relying on the non-null declarations.
aws_product_review_schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), False),
    StructField("product_title", StringType(), False),
    StructField("product_category", StringType(), False),
    StructField("star_rating", IntegerType(), False),
    StructField("helpful_votes", IntegerType(), False),
    StructField("total_votes", IntegerType(), False),
    StructField("vine", StringType(), False),
    StructField("verified_purchase", StringType(), False),
    StructField("review_headline", StringType(), False),
    StructField("review_body", StringType(), False),
    StructField("review_date", DateType(), False),
])

# The data set is documented as having no quote or escape characters, so
# quoting is switched off (quote=""). With Spark's default '"' quote character
# a literal double quote inside a title/headline/body would open a quoted
# field and swallow the tabs (and columns) that follow it.
awsProductReview_raw_data = spark.read.csv(
    review_data,
    header=True,
    sep="\t",
    quote="",
    schema=aws_product_review_schema,
)



In [10]:
# Preview the first five rows of the loaded reviews.
awsProductReview_raw_data.show(n=5)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   18778586| RDIJS7QYB6XNR|B00EDBY7X8|     122952789|Monopoly Junior B...|            Toys|          5|            0|          0|   N|                Y|          Five Stars|        Excellent!!!| 2015-08-31|
|         US|   24769659|R36ED1U38IELG8|B00D7JFOPC|     952062646|56 Pieces of Wood...|            Toys|          5|    