In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [0]:
# Download the PostgreSQL JDBC driver jar into the working directory.
# NOTE(review): the SparkSession cell expects it at /content/postgresql-42.2.9.jar,
# so this assumes the notebook runs from /content - confirm outside Colab.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-05-06 20:55:54--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-05-06 20:55:54 (7.82 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [0]:
from pyspark.sql import SparkSession

# Build (or reuse) the "CloudETL" session with the PostgreSQL JDBC jar on the
# driver classpath so the JDBC writes at the end of the notebook can load it
spark = (
    SparkSession.builder
    .appName("CloudETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [0]:
from pyspark import SparkFiles

# Register the remote review archive with Spark
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# Resolve the path of the added file, then read it as tab-separated text
# with column names taken from the first row
reviews_path = SparkFiles.get("amazon_reviews_us_Video_Games_v1_00.tsv.gz")
video_games_df = spark.read.csv(reviews_path, sep="\t", header=True)
video_games_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   12039526| RTIS3L2M1F5SM|B001CXYMFS|     737716809|Thrustmaster T-Fl...|     Video Games|          5|            0|          0|   N|                Y|an amazing joysti...|Used this for Eli...| 2015-08-31|
|         US|    9636577| R1ZV7R40OLHKD|B00M920ND6|     569686175|Tonsee 6 buttons ...|     Video Games|          5|    

## Count number of records (rows) in dataset

In [0]:
# Number of rows in the dataset as loaded
original_count = video_games_df.count()
print("Original Count", original_count)

Original Count 1785997


## Drop incomplete rows

In [0]:
# Discard every row that has a null in any column, then report what is left
video_games_df = video_games_df.na.drop()
new_count = video_games_df.count()
print("New Count", new_count)

New Count 1785886
Unique Count 1785886


## Examine the schema

In [0]:
# Print each column's name, data type, and nullability
video_games_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



## Create DataFrames that match the table schema file
## Create Review ID Table


In [0]:
# Keep only the columns that make up the review ID table
review_id_columns = ["review_id", "customer_id", "product_id", "product_parent", "review_date"]
review_id_df = video_games_df.select(review_id_columns)
review_id_df.show(5)

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R1004PYTPK6ELD|   38281029|B00004WHWF|      89143877| 2002-01-23|
|R100EZDMO39LBZ|    1386752|B00OZBFUBY|     872540442| 2015-06-10|
|R1011I65X7RSKT|   41907229|B00006ISBU|     654790631| 2003-12-23|
|R101V84BKDOR1I|   12034223|B001KX5042|     279727821| 2014-08-08|
|R101VJUP2TFB3Y|   31532612|B009DL2TBA|     586138868| 2015-01-05|
+--------------+-----------+----------+--------------+-----------+
only showing top 5 rows



In [0]:
# Keep a single row per review_id
review_id_df = review_id_df.dropDuplicates(subset=["review_id"])

In [0]:
# Ensure column data types match schema
from pyspark.sql.types import *

# Columns that arrive as strings, paired with the type each should be cast to
# (IntegerType and DateType are provided by the wildcard import above)
review_id_casts = [
    ("customer_id", IntegerType()),
    ("product_parent", IntegerType()),
    ("review_date", DateType()),
]
for column_name, target_type in review_id_casts:
    review_id_df = review_id_df.withColumn(
        column_name, review_id_df[column_name].cast(target_type)
    )

review_id_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: integer (nullable = true)
 |-- review_date: date (nullable = true)



## Create Products Table

In [0]:
# Keep only the columns that make up the products table
product_columns = ["product_id", "product_title"]
products_df = video_games_df.select(product_columns)
products_df.show(5)

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B00004WHWF|         Red Faction|
|B00OZBFUBY|USPRO&reg; PS3 Bl...|
|B00006ISBU|       NBA Live 2003|
|B001KX5042|Imagine:  Cheerle...|
|B009DL2TBA|PlayStation 3 500...|
+----------+--------------------+
only showing top 5 rows



In [0]:
# Keep a single row per product_id
products_df = products_df.dropDuplicates(subset=["product_id"])

In [0]:
# Ensure column data types match schema
# (nothing is cast in this cell; it only prints the current column types for inspection)
products_df.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_title: string (nullable = true)



## Create Customers Table

In [0]:
# Start the customers table from the customer_id column alone
customers_df = video_games_df.select("customer_id")
customers_df.show(5)

+-----------+
|customer_id|
+-----------+
|   38281029|
|    1386752|
|   41907229|
|   12034223|
|   31532612|
+-----------+
only showing top 5 rows



In [0]:
# Count the rows recorded for each customer id.
# Duplicates are deliberately kept: grouping collapses the ids anyway, and
# dropping rows first would distort the count values.
customers_df = customers_df.groupBy("customer_id").count()
# The rows are shown in whatever order Spark returns them. The previous
# orderBy/select statement here discarded its result, so it never sorted anything.
customers_df.show(5)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|    5401088|    5|
|   37964777|   16|
|   20872523|    1|
|   44777937|    4|
|    2384511|    1|
+-----------+-----+
only showing top 5 rows



In [0]:
# Give the aggregate column the name "customer_count"
customers_df = customers_df.withColumnRenamed(existing="count", new="customer_count")
customers_df.show(5)

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|    5401088|             5|
|   37964777|            16|
|   20872523|             1|
|   44777937|             4|
|    2384511|             1|
+-----------+--------------+
only showing top 5 rows



In [0]:
# Ensure column data types match schema: both columns are cast to integer
for column_name in ("customer_id", "customer_count"):
    customers_df = customers_df.withColumn(
        column_name, customers_df[column_name].cast(IntegerType())
    )

customers_df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_count: integer (nullable = false)



## Create Vine Table

In [0]:
# Keep only the columns that make up the vine table
vine_columns = ["review_id", "star_rating", "helpful_votes", "total_votes", "vine"]
vine_df = video_games_df.select(vine_columns)
vine_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1004PYTPK6ELD|          5|            0|          0|   N|
|R100EZDMO39LBZ|          4|            0|          0|   N|
|R1011I65X7RSKT|          5|            2|          2|   N|
|R101V84BKDOR1I|          5|            0|          0|   N|
|R101VJUP2TFB3Y|          5|            0|          0|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [0]:
# Keep a single row per review_id
vine_df = vine_df.dropDuplicates(subset=["review_id"])

In [0]:
# Ensure column data types match schema: the rating and vote columns are cast to integer
for column_name in ("star_rating", "helpful_votes", "total_votes"):
    vine_df = vine_df.withColumn(
        column_name, vine_df[column_name].cast(IntegerType())
    )

vine_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)



In [0]:
# Split the reviews into two dataframes: Vine and non-Vine.
# This cell builds the Vine half (rows whose vine flag is "Y").
is_vine_review = vine_df["vine"] == "Y"
vine_reviews_df = vine_df.filter(is_vine_review)
vine_reviews_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R17J2E6TK7BO8D|          4|            0|          0|   Y|
|R1FECXHM1GG32Z|          5|            1|          2|   Y|
|R1HEHO3N86PF8H|          3|           12|         15|   Y|
|R1PW9XE8R7N257|          5|            1|          2|   Y|
|R1RJEOYGF7V23V|          4|            0|          0|   Y|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [0]:
# The non-Vine half: rows whose vine flag is "N"
is_non_vine_review = vine_df["vine"] == "N"
non_vine_reviews_df = vine_df.filter(is_non_vine_review)
non_vine_reviews_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1004PYTPK6ELD|          5|            0|          0|   N|
|R1006LDKUHBWUK|          5|            0|          0|   N|
|R100DAG2IK0FKR|          2|            0|          1|   N|
|R101B69JLWW7AS|          5|            0|          0|   N|
|R10292JZNM6DM6|          3|            4|          5|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



In [0]:
# What's the total number of Vine vs. non-Vine reviews?
vine_review_total = vine_reviews_df.count()
print("Total Number of Vine Reviews", vine_review_total)
non_vine_review_total = non_vine_reviews_df.count()
print("Total Number of Non-Vine Reviews", non_vine_review_total)

Total Number of Vine Reviews 4290
Total Number of Non-Vine Reviews 1781596


In [0]:
# How many Vine reviews rate the products as 5 stars?
# Shows the review count for every star rating, lowest rating first.
star_vine_df = vine_reviews_df.groupBy("star_rating").count()
star_vine_sorted_df = star_vine_df.orderBy("star_rating")
star_vine_sorted_df.select("star_rating", "count").show()

+-----------+-----+
|star_rating|count|
+-----------+-----+
|          1|   60|
|          2|  194|
|          3|  718|
|          4| 1711|
|          5| 1607|
+-----------+-----+



In [0]:
# How many Non-Vine reviews rate the products as 5 stars?
# Shows the review count for every star rating, lowest rating first.
star_non_vine_df = non_vine_reviews_df.groupBy("star_rating").count()
star_non_vine_sorted_df = star_non_vine_df.orderBy("star_rating")
star_non_vine_sorted_df.select("star_rating", "count").show()

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|          1| 192088|
|          2|  94599|
|          3| 153141|
|          4| 316519|
|          5|1025249|
+-----------+-------+



In [0]:
# What was the average rating for Vine reviews?
from pyspark.sql.functions import mean

vine_mean_rating_df = vine_reviews_df.select(mean("star_rating"))
vine_mean_rating_df.show()

+------------------+
|  avg(star_rating)|
+------------------+
|4.0748251748251745|
+------------------+



In [0]:
# What was the average rating for Non-Vine reviews?
# (uses `mean`, imported in the Vine average cell)
non_vine_mean_rating_df = non_vine_reviews_df.select(mean("star_rating"))
non_vine_mean_rating_df.show()

+----------------+
|avg(star_rating)|
+----------------+
|4.05985981109073|
+----------------+



In [0]:
# How many helpful votes did Vine reviews receive?
helpful_vine_reviews_df = vine_reviews_df.select("review_id", "helpful_votes")
# A group with no keys aggregates over the whole dataframe
total_helpful_vine = helpful_vine_reviews_df.groupBy().sum()
total_helpful_vine.show()

+------------------+
|sum(helpful_votes)|
+------------------+
|             10076|
+------------------+



In [0]:
# How many helpful votes did Non-Vine reviews receive?
helpful_non_vine_reviews_df = non_vine_reviews_df.select("review_id", "helpful_votes")
# A group with no keys aggregates over the whole dataframe
total_helpful_non_vine = helpful_non_vine_reviews_df.groupBy().sum()
total_helpful_non_vine.show()

+------------------+
|sum(helpful_votes)|
+------------------+
|           4024920|
+------------------+



In [0]:
# How many total votes did Vine reviews receive?
votes_vine_reviews_df = vine_reviews_df.select("review_id", "total_votes")
# A group with no keys aggregates over the whole dataframe
total_votes_vine = votes_vine_reviews_df.groupBy().sum()
total_votes_vine.show()

+----------------+
|sum(total_votes)|
+----------------+
|           14064|
+----------------+



In [0]:
# How many total votes did Non-Vine reviews receive?
votes_non_vine_reviews_df = non_vine_reviews_df.select("review_id", "total_votes")
# A group with no keys aggregates over the whole dataframe
total_votes_non_vine = votes_non_vine_reviews_df.groupBy().sum()
total_votes_non_vine.show()

+----------------+
|sum(total_votes)|
+----------------+
|         6696252|
+----------------+



## Write DataFrames to RDS

In [0]:
# Connection settings for the PostgreSQL database on RDS.
# `mode`, `jdbc_url`, and `config` are consumed by the write cells below.
mode = "append"
jdbc_url = "jdbc:postgresql://amazon-reviews-db.cistst9iemrt.us-west-1.rds.amazonaws.com:5432/video_games_db"
# Credentials are read from the RDS_USER / RDS_PASSWORD environment variables
# when they are set, so the real password never has to be typed into (and
# later scrubbed from) the notebook. The fallbacks are the values that were
# previously hardcoded here; `os` is imported in the setup cell.
config = {
    "user": os.environ.get("RDS_USER", "root"),
    "password": os.environ.get("RDS_PASSWORD", "Removed due to public upload"),
    "driver": "org.postgresql.Driver"
}

In [0]:
# Append the review ID dataframe to the "review_id_table" table over JDBC
review_id_df.write.jdbc(
    url=jdbc_url,
    table="review_id_table",
    mode=mode,
    properties=config,
)

In [0]:
# Append the products dataframe to the "products" table over JDBC
products_df.write.jdbc(
    url=jdbc_url,
    table="products",
    mode=mode,
    properties=config,
)

In [0]:
# Append the customers dataframe to the "customers" table over JDBC
customers_df.write.jdbc(
    url=jdbc_url,
    table="customers",
    mode=mode,
    properties=config,
)

In [0]:
# Append the vine dataframe to the "vine_table" table over JDBC
vine_df.write.jdbc(
    url=jdbc_url,
    table="vine_table",
    mode=mode,
    properties=config,
)