In [None]:
import os

# Spark release to install. Pin an exact version so the run is reproducible;
# every published release is listed at https://archive.apache.org/dist/spark/
# For example:
# spark_version = 'spark-3.0.2'
spark_version = 'spark-3.1.1'
# Exported so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
# Download from the Apache archive rather than a dist mirror: the mirrors only
# keep the current releases, so a pinned older version eventually disappears
# there and `wget -q` would fail without printing anything.
!wget -q https://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
# PostgreSQL JDBC driver, needed later for the JDBC reads and writes.
!wget -q https://jdbc.postgresql.org/download/postgresql-42.2.19.jar
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"
os.environ["SPARK_CLASSPATH"] = "/content/postgresql-42.2.19.jar"

# Locate the unpacked Spark distribution (via SPARK_HOME) and make `pyspark`
# importable. This does not start a session; that happens in the next cell.
import findspark
findspark.init()

0% [Working]            Get:1 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or                                                                               Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Waiting for headers] [2 InRelease 14.2 kB/88.7 kB 16%] [Connected to cloud.                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [2 InRelease 88.7 kB/88.7 kB 100%] [Connected to cloud.r-project.org (13.227                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Hit:5 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [Waiting for headers] [Connected to cloud.r-p

In [None]:
# Start Spark session
from pyspark.sql import SparkSession

# Build (or reuse) the session. The PostgreSQL JDBC driver jar is attached via
# spark.jars so the JDBC reads/writes later in the notebook can load it.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", True)
    .config("spark.memory.offHeap.size", "16g")
    .appName("Extract")
    .config("spark.jars", "/content/postgresql-42.2.19.jar")
    .getOrCreate()
)

# Enable Arrow-based columnar data transfers.
# The key used to be misspelled ("park.sqls..."), which Spark accepted silently
# as an unknown setting, so Arrow was never actually enabled. On Spark 3.x the
# supported key is spark.sql.execution.arrow.pyspark.enabled.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Increase timeout values
spark.conf.set("spark.sql.broadcastTimeout", "1000")
# disable the broadcasting
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [None]:
from pyspark.sql.functions import col, lit
from pyspark import SparkFiles

# Source: Amazon customer reviews, Home Improvement category (gzipped TSV).
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"

# Register the remote file with Spark; SparkFiles.get() then resolves the
# local path of the fetched copy.
spark.sparkContext.addFile(url)

# Read the TSV with a header row, cast the date/numeric columns to proper
# types, then discard every row that has a null in any column.
spark_df = (
    spark.read.csv(
        SparkFiles.get("amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"),
        sep="\t",
        header=True,
    )
    .withColumn("review_date", col("review_date").cast("date"))
    .withColumn("star_rating", col("star_rating").cast("int"))
    .withColumn("helpful_votes", col("helpful_votes").cast("int"))
    .withColumn("total_votes", col("total_votes").cast("int"))
    .na.drop("any")
)
spark_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   48881148|R215C9BDXTDQOW|B00FR4YQYK|     381800308|SadoTech Model C ...|Home Improvement|          4|            0|          0|   N|                Y|          Four Stars|        good product| 2015-08-31|
|         US|   47882936|R1DTPUV1J57YHA|B00439MYYE|     921341748|iSpring T32M 3.2 ...|Home Improvement|          5|    

In [None]:
# Sanity check: list the distinct product categories present in the data.
(
    spark_df
    .select('product_category')
    .distinct()
    .show()
)

+----------------+
|product_category|
+----------------+
|Home Improvement|
+----------------+



In [None]:
# Print column names and types to confirm the casts applied when loading.
spark_df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: date (nullable = true)



In [None]:
# Number of rows remaining after the casts and the null-row drop.
spark_df.count()

2634255

In [None]:
# Drop rows that are exact duplicates across all columns, then re-count so the
# result can be compared with the count from before de-duplication.
spark_df = (
    spark_df
    .dropDuplicates()
)
spark_df.count()

2634255

# Transform dataframes to fit the schema 

In [None]:
# Review ID table: keep only the identifying columns of each review and tag
# every row with a constant batch_id of 1.
review_id_df = (
    spark_df
    .select(["review_id", "customer_id", "product_id", "product_parent", "review_date"])
    .withColumn('batch_id', lit(1))
)
review_id_df.show()

+--------------+-----------+----------+--------------+-----------+--------+
|     review_id|customer_id|product_id|product_parent|review_date|batch_id|
+--------------+-----------+----------+--------------+-----------+--------+
|R215C9BDXTDQOW|   48881148|B00FR4YQYK|     381800308| 2015-08-31|       1|
|R1DTPUV1J57YHA|   47882936|B00439MYYE|     921341748| 2015-08-31|       1|
| RFAZK5EWKJWOU|   44435471|B00002N762|      56053291| 2015-08-31|       1|
|R2XT8X0O0WS1AL|   28377689|B000QFCP1G|     595928517| 2015-08-31|       1|
|R14GRNANKO2Y2J|   50134766|B00WRCRKOI|     417053744| 2015-08-31|       1|
|R2BLF9VYL24LCQ|   14066511|B00NIH88EW|     275395071| 2015-08-31|       1|
|R1GI9UW5KJ671O|   15211046|B005B9CI96|     856617815| 2015-08-31|       1|
|R2H5CEJN863M86|   14862498|B008L0OMWI|     125102494| 2015-08-31|       1|
| R5PPDHFOZ3SMU|   23617292|B00P9FTC6O|     523110842| 2015-08-31|       1|
| RE1L9IENKJJ7Y|   35820485|B00K6BQEHQ|     797306964| 2015-08-31|       1|
|R3CZ0990QC2

In [None]:
# For Products table
# One row per product_id, tagged with a constant batch_id of 1.
# dropDuplicates(["product_id"]) already leaves every row unique, so no
# further full-row de-duplication is needed afterwards (it would be a no-op
# that only adds another pass over the data).
# NOTE: if a product_id appears with several titles, Spark does not guarantee
# which one is kept.
products_df = (
    spark_df
    .select(["product_id", "product_title", "product_category"])
    .dropDuplicates(["product_id"])
    .withColumn('batch_id', lit(1))
)
products_df.show(truncate= False)

+----------+----------------------------------------------------------------------------------------+----------------+--------+
|product_id|product_title                                                                           |product_category|batch_id|
+----------+----------------------------------------------------------------------------------------+----------------+--------+
|7800807606|CBconcept Brand Halogen Light Bulb Jc G4 6volt 5watt - 10 Bulbs                         |Home Improvement|1       |
|9178907519|IKEA - GAVIK Table lamp, purple, frosted glass                                          |Home Improvement|1       |
|B00002N70K|Kwikset 293 Small Round Escutcheon Plate,                                               |Home Improvement|1       |
|B00002N7BC|Westinghouse 6720800 One-Light Flush-Mount Interior Ceiling Fixture with Pull Chain     |Home Improvement|1       |
|B00002NAEP|Westinghouse One-Light Exterior Wall Lantern                                            |Hom

In [None]:
# Customers table: one row per customer with the number of reviews they
# wrote (customer_count), tagged with a constant batch_id of 1.
customers_df = (
    spark_df
    .groupby("customer_id")
    .agg({"customer_id": "count"})
    .withColumnRenamed("count(customer_id)", "customer_count")
    .withColumn('batch_id', lit(1))
)
customers_df.show()

+-----------+--------------+--------+
|customer_id|customer_count|batch_id|
+-----------+--------------+--------+
|   10145717|             1|       1|
|   30238476|             1|       1|
|     856598|             1|       1|
|   45046643|             3|       1|
|   36775983|            14|       1|
|   10245571|            16|       1|
|   23089404|             1|       1|
|   13114084|             4|       1|
|   25415089|             6|       1|
|   51970980|             1|       1|
|   38676202|             1|       1|
|   47118816|             1|       1|
|   14286306|             4|       1|
|   31389365|             1|       1|
|   52113137|             1|       1|
|   32841016|             1|       1|
|   28292593|             1|       1|
|   51105439|             1|       1|
|   41681532|             1|       1|
|   35665618|             5|       1|
+-----------+--------------+--------+
only showing top 20 rows



In [None]:
# Reviews table: the textual part of each review plus the verified-purchase
# flag, de-duplicated and tagged with a constant batch_id of 1.
reviews_df = (
    spark_df
    .select(["review_id", "verified_purchase", "review_headline", "review_body"])
    .dropDuplicates()
    .withColumn('batch_id', lit(1))
)
reviews_df.show()

+--------------+-----------------+--------------------+--------------------+--------+
|     review_id|verified_purchase|     review_headline|         review_body|batch_id|
+--------------+-----------------+--------------------+--------------------+--------+
|R2H85HQ4J3OVWF|                Y| Quick, Easy, Sturdy|The lid to my las...|       1|
|R28XCA0KI7GT15|                Y|          Five Stars|fixed my dishwash...|       1|
|R1DQV59X4D3D8F|                Y|          Five Stars|Came in super fas...|       1|
|R1O1G3ST0LH6N6|                Y|          Five Stars|So far so good. I...|       1|
|R14VV3JZD8S87J|                Y|               Works|Gets the bad ring...|       1|
|R1C66XDVVGUWIR|                Y|             Ok Lock|Cant rate them 5 ...|       1|
|R2G3B2Q8R62M14|                Y|          Good value|I like the string...|       1|
|R1OKTANJ7L6GV4|                Y|High quality, bright|There are some ch...|       1|
| RV5B9J07DPZ1I|                Y|          Four Stars

In [None]:
# Vine table: rating and vote counts per review together with the vine flag,
# tagged with a constant batch_id of 1.
vine_df = (
    spark_df
    .select(["review_id", "star_rating", "helpful_votes", "total_votes", "vine"])
    .withColumn('batch_id', lit(1))
)
vine_df.show()

+--------------+-----------+-------------+-----------+----+--------+
|     review_id|star_rating|helpful_votes|total_votes|vine|batch_id|
+--------------+-----------+-------------+-----------+----+--------+
|R215C9BDXTDQOW|          4|            0|          0|   N|       1|
|R1DTPUV1J57YHA|          5|            0|          0|   N|       1|
| RFAZK5EWKJWOU|          5|            0|          0|   N|       1|
|R2XT8X0O0WS1AL|          5|            0|          0|   N|       1|
|R14GRNANKO2Y2J|          5|            0|          0|   N|       1|
|R2BLF9VYL24LCQ|          5|            1|          1|   N|       1|
|R1GI9UW5KJ671O|          5|            0|          0|   N|       1|
|R2H5CEJN863M86|          5|            0|          1|   N|       1|
| R5PPDHFOZ3SMU|          5|            0|          0|   N|       1|
| RE1L9IENKJJ7Y|          1|            0|          0|   N|       1|
|R3CZ0990QC2Z0H|          5|            0|          0|   N|       1|
|R3UMMD2IO29QSP|          5|      

# Load data to AWS RDS

In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# force_remount=True asks for a fresh mount even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Put the Drive homework folder first on the import path so Python modules
# stored there can be imported (presumably the config module imported next).
import sys
sys.path.insert(0,'/content/drive/My Drive/Colab_Notebooks/Homework/')

In [None]:
from config import server, database, port, username, password

In [None]:
# Review ID table: append review_id_df to the `review_id_table` table in
# PostgreSQL over JDBC, using the connection settings imported from config.
(
    review_id_df.write
    .mode("append")
    .format("jdbc")
    .option("url", f"jdbc:postgresql://{server}:{port}/{database}")
    .option("dbtable", "review_id_table")
    .option("user", f"{username}")
    .option("password", f"{password}")
    .option("driver", "org.postgresql.Driver")
    .save()
)

In [None]:
# Product table: append products_df to the `products` table in PostgreSQL over
# JDBC, using the connection settings imported from config.
# The save mode is set once; a second identical .mode("append") call adds
# nothing.
(
    products_df.write
    .mode("append")
    .format("jdbc")
    .option("url", f"jdbc:postgresql://{server}:{port}/{database}")
    .option("dbtable", "products")
    .option("user", f"{username}")
    .option("password", f"{password}")
    .option("driver", "org.postgresql.Driver")
    .save()
)

In [None]:
# Customers table: append customers_df to the `customers` table in PostgreSQL
# over JDBC, using the connection settings imported from config.
(
    customers_df.write
    .mode("append")
    .format("jdbc")
    .option("url", f"jdbc:postgresql://{server}:{port}/{database}")
    .option("dbtable", "customers")
    .option("user", f"{username}")
    .option("password", f"{password}")
    .option("driver", "org.postgresql.Driver")
    .save()
)

In [None]:
# Reviews table: append reviews_df to the `reviews` table in PostgreSQL over
# JDBC, using the connection settings imported from config.
(
    reviews_df.write
    .mode("append")
    .format("jdbc")
    .option("url", f"jdbc:postgresql://{server}:{port}/{database}")
    .option("dbtable", "reviews")
    .option("user", f"{username}")
    .option("password", f"{password}")
    .option("driver", "org.postgresql.Driver")
    .save()
)

In [None]:
# Vine table: append vine_df to the `vine_table` table in PostgreSQL over
# JDBC, using the connection settings imported from config.
(
    vine_df.write
    .mode("append")
    .format("jdbc")
    .option("url", f"jdbc:postgresql://{server}:{port}/{database}")
    .option("dbtable", "vine_table")
    .option("user", f"{username}")
    .option("password", f"{password}")
    .option("driver", "org.postgresql.Driver")
    .save()
)

# Query Test

In [None]:
# Round-trip check: read back a join of `reviews` and `vine_table` on
# review_id. The parenthesised sub-select with an alias is passed where a
# table name is expected, so the join runs inside PostgreSQL.
df = spark.read.jdbc(
    url=f"jdbc:postgresql://{server}:{port}/{database}",
    table="(SELECT t1.*, t2.vine \
                     FROM reviews AS t1 INNER JOIN vine_table AS t2 ON t1.review_id = t2.review_id) AS my_table",
    properties={
        "user": f"{username}",
        "password": f"{password}",
        "driver": 'org.postgresql.Driver',
    },
)
df.show()