In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.1'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!pip install pyspark
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [1 In0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com] [Connecting to                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:6 http://ppa.laun

In [3]:
# Start a SparkSession and and link to Postgres
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("VineReviews").getOrCreate()

In [27]:
# Load in tsv.gz file from S3 into a DataFrame 
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

df = spark.read.option('header', 'true').csv(SparkFiles.get("amazon_reviews_us_Video_Games_v1_00.tsv.gz"), inferSchema=True, sep='\t', timestampFormat='yyyy-mm-dd')
df.show(10)

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   12039526| RTIS3L2M1F5SM|B001CXYMFS|     737716809|Thrustmaster T-Fl...|     Video Games|          5|            0|          0|   N|                Y|an amazing joysti...|Used this for Eli...|2015-01-31 00:08:00|
|         US|    9636577| R1ZV7R40OLHKD|B00M920ND6|     569686175|Tonsee 6 buttons ...| 

In [28]:
# Ouput number of rows in the DataFrame
print(df.count())

1785997


In [29]:
# Drop null and duplicate rows and output the resulting number of rows in the DataFrame
df = df.dropna().dropDuplicates()
print(df.count())

1785886


In [30]:
vine_df = df.select(["review_id", "star_rating", "helpful_votes", "total_votes", "vine"])
vine_df.show(10)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R101BFBMRAOEDT|          3|            0|          1|   N|
|R101OYKK90GCD3|          5|            0|          1|   N|
|R1025IN3G40UR0|          4|           36|         38|   N|
|R102FS86G4B4N3|          1|            0|          0|   N|
|R102MY1UF3EV41|          5|            0|          0|   N|
|R102YIWZEIAXT9|          5|            2|          4|   N|
|R104KX3QSYZDMQ|          5|            0|          0|   N|
|R104LQ2H4P36H5|          5|            0|          0|   N|
|R105RLBQN5OIH7|          4|            0|          1|   N|
|R1061FAGHHER0R|          5|            0|          0|   N|
+--------------+-----------+-------------+-----------+----+
only showing top 10 rows



In [49]:
vine_summ_df = vine_df.groupBy("vine").agg({"review_id":"count", "star_rating":"avg", "helpful_votes":"sum", "total_votes":"sum"})
vine_summ_df.show()

+----+----------------+----------------+------------------+------------------+
|vine|count(review_id)|sum(total_votes)|  avg(star_rating)|sum(helpful_votes)|
+----+----------------+----------------+------------------+------------------+
|   Y|            4290|           14064|4.0748251748251745|             10076|
|   N|         1781596|         6696252|  4.05985981109073|           4024920|
+----+----------------+----------------+------------------+------------------+



In [46]:
fivestar_df = vine_df.filter(vine_df["star_rating"] == 5).groupBy("vine").agg({"star_rating":"count"})
fivestar_df.show(10)

+----+------------------+
|vine|count(star_rating)|
+----+------------------+
|   Y|              1607|
|   N|           1025249|
+----+------------------+



From the data above it can be said that vine reviews are trustworthy, as the average star rating is relatively similar between vine reviews and non-vine reviews (4.07 vs. 4.05). In addition, when looking at the average number of helpful_votes per review, the figures are also similar. Vine reviews have c.2 helpful votes per review, while non-vine reviews also have c.2 helpful votes per review. This means that vine reviews have an equal amount of detail as non-vine reviews. Finally, 37% of vine reviews were 5-star reviews, compared to 58% of non-vine reviews. As vine reviews have a smaller proportion of 5-star reviews, it shows that there is little to no bias amongst vine reviewers to post positive reviews.