In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:10 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [62.9 kB]
Get:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/cran/

### Loading Reviews Dataframe

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the session for this notebook; the PostgreSQL JDBC driver jar
# is placed on the driver classpath.
spark = (
    SparkSession.builder
    .appName("Vine-Review")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [4]:
from pyspark import SparkFiles

# Amazon customer reviews (French marketplace), gzip-compressed TSV.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_FR_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# Resolve the downloaded copy by its file name. Passing "" to SparkFiles.get
# yields the SparkFiles root directory, so the reader would load every file in
# that directory instead of just this dataset.
file_name = url.rsplit("/", 1)[-1]
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(file_name), sep="\t", header=True, inferSchema=True)
df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+--------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|    product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+--------------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         FR|      14952|R32VYUWDIB5LKE|0552774294|     362925721|    The God Delusion|               Books|          5|            0|          0|   N|                Y|a propos de ce livre|je conseille fort...| 2013-02-13|
|         FR|      14952|R3CCMP4EV6HAVL|B004GJXQ20|     268067011|A Game of Thrones...|Digital_Ebook_Pur

In [5]:
# Build the vine_table DataFrame: keep only the columns needed for the Vine analysis
vine_columns = [
    "review_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
    "verified_purchase",
]
vine_df = df.select(vine_columns)

vine_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R32VYUWDIB5LKE|          5|            0|          0|   N|                Y|
|R3CCMP4EV6HAVL|          5|            0|          0|   N|                Y|
|R14NAE6UGTVTA2|          3|            1|          3|   N|                Y|
|R2E7QEWSC6EWFA|          4|            0|          1|   N|                Y|
|R26E6I47GQRYKR|          2|            3|          5|   N|                N|
|R1RJMTSNCKB9LP|          2|            0|          0|   N|                Y|
|R2P2XF84YELQBZ|          5|            0|          3|   N|                Y|
| RJKULSX2Y5R07|          5|            0|          0|   N|                Y|
|R3UYE0U7SQCI8Q|          5|            2|          3|   N|                Y|
|R1TKJ7XFS3RDEB|          5|            0|          1|   N|     

### Creating Dataframes for Comparison

In [6]:
# Restrict to reviews that received at least 20 votes in total
# (`where` is an alias of `filter`)
new_vine_df = vine_df.where("total_votes >= 20")

In [7]:
# Keep the reviews where at least half of all votes were "helpful" votes
helpful_ratio = new_vine_df["helpful_votes"] / new_vine_df["total_votes"]
helpful_votes_df = new_vine_df.where(helpful_ratio >= 0.5)
helpful_votes_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2LX6LJ7BA92OO|          1|           21|         27|   N|                Y|
|R2612UZ2GU2K29|          5|           25|         32|   N|                Y|
|R28X8YI8B00C6S|          4|           52|         63|   N|                Y|
|R3O3RDGDHHINF2|          5|           41|         53|   N|                Y|
|R1FSDCDV0XV3X6|          4|           27|         33|   N|                Y|
|R3QBW3JMQ4WOOW|          5|           14|         20|   N|                Y|
| RRZXAHEVTT2HC|          5|           69|         92|   N|                Y|
|R3AUV7LA2MZTEO|          5|           16|         21|   N|                N|
| R3NOR8QP5W657|          1|           15|         29|   N|                N|
|R1XM8IQUDFKBQZ|          2|           19|         27|   N|     

In [9]:
# Helpful reviews written as part of the Vine program (vine == 'Y')
is_vine_review = helpful_votes_df["vine"] == 'Y'
vine_helpful_df = helpful_votes_df.where(is_vine_review)
vine_helpful_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R22SUWPP3KRZT6|          3|           21|         26|   Y|                N|
|R3O1MAY03ZF3MP|          3|           19|         20|   Y|                N|
|R30LQPCSI3ELHN|          5|           18|         23|   Y|                N|
|R3QIS7D8004AD2|          3|           76|         80|   Y|                N|
|R3R6CDRZU5U2XW|          4|           41|         47|   Y|                N|
|R16J9CTJQ41IC2|          5|           18|         22|   Y|                N|
| RJNGHTZD3NT6F|          3|           30|         35|   Y|                N|
|R1QVLVW5QS39ZF|          4|           21|         25|   Y|                N|
| R4SMY6Y2J9897|          5|           24|         28|   Y|                N|
|R1JKVISEGW1GXV|          5|           37|         41|   Y|     

In [10]:
# Helpful reviews written outside the Vine program (vine == 'N')
is_non_vine_review = helpful_votes_df["vine"] == 'N'
non_vine_helpful_df = helpful_votes_df.where(is_non_vine_review)
non_vine_helpful_df.show()

+--------------+-----------+-------------+-----------+----+-----------------+
|     review_id|star_rating|helpful_votes|total_votes|vine|verified_purchase|
+--------------+-----------+-------------+-----------+----+-----------------+
|R2LX6LJ7BA92OO|          1|           21|         27|   N|                Y|
|R2612UZ2GU2K29|          5|           25|         32|   N|                Y|
|R28X8YI8B00C6S|          4|           52|         63|   N|                Y|
|R3O3RDGDHHINF2|          5|           41|         53|   N|                Y|
|R1FSDCDV0XV3X6|          4|           27|         33|   N|                Y|
|R3QBW3JMQ4WOOW|          5|           14|         20|   N|                Y|
| RRZXAHEVTT2HC|          5|           69|         92|   N|                Y|
|R3AUV7LA2MZTEO|          5|           16|         21|   N|                N|
| R3NOR8QP5W657|          1|           15|         29|   N|                N|
|R1XM8IQUDFKBQZ|          2|           19|         27|   N|     

### Analysis of Paid (Vine) Users

In [14]:
# total number of reviews in vine_helpful_df (helpful reviews flagged vine == 'Y');
# count() is a Spark action, so this triggers a job over the filtered data
vine_total = vine_helpful_df.count()
vine_total

19

In [15]:
# How many of the helpful Vine reviews gave 5 stars
vine_five_star_df = vine_helpful_df.where("star_rating = 5")
vine_five_star = vine_five_star_df.count()
vine_five_star

9

In [17]:
# percentage of 5 star reviews among the helpful Vine reviews
# If the Vine subset is empty the percentage is undefined: report NaN instead of
# letting the cell fail with ZeroDivisionError.
vine_five_star_pct = (vine_five_star / vine_total) * 100 if vine_total else float("nan")
vine_five_star_pct

47.368421052631575

### Analysis of Non-Paid Users

In [18]:
# total number of reviews in non_vine_helpful_df (helpful reviews flagged vine == 'N');
# count() is a Spark action, so this triggers a job over the filtered data
non_vine_total = non_vine_helpful_df.count()
non_vine_total

8332

In [19]:
# How many of the helpful non-Vine reviews gave 5 stars
non_vine_five_star_df = non_vine_helpful_df.where("star_rating = 5")
non_vine_five_star = non_vine_five_star_df.count()
non_vine_five_star

4783

In [20]:
# percentage of 5 star reviews among the helpful non-Vine reviews
# If the non-Vine subset is empty the percentage is undefined: report NaN instead
# of letting the cell fail with ZeroDivisionError.
non_vine_five_star_pct = (non_vine_five_star / non_vine_total) * 100 if non_vine_total else float("nan")
non_vine_five_star_pct

57.40518482957273