In [1]:
import pandas as pd
import gzip, os, json, time

from dotenv import load_dotenv
from pyspark.sql import SparkSession

import pyspark

## Demo without Spark

In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [3]:
load_dotenv("./env.env")
path = os.getenv("DATA_PATH")

In [4]:
start_time = time.time()

ratings = []
for review in parse(path):
  ratings.append(review['overall'])

end_time = time.time()
elapsed_time = end_time - start_time

In [5]:
print(sum(ratings) / len(ratings))
print(f"Time taken: {elapsed_time:.6f} seconds")

4.0220948494727224
Time taken: 22.620104 seconds


## Demo with Spark (Spark Dataframe)

In [6]:
spark = SparkSession.builder \
    .appName("AmazonReviewsAnalysis") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/03 11:21:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
start_time = time.time()

reviews_df = spark.read.json(path)
valid_reviews_df = reviews_df.filter(reviews_df.overall.isNotNull())
average_rating = valid_reviews_df.selectExpr("avg(overall) as avg_rating").collect()[0]['avg_rating']

end_time = time.time()

elapsed_time = end_time - start_time

                                                                                

In [8]:
print(f"Average Rating: {average_rating}")
print(f"Time taken: {elapsed_time:.6f} seconds")
spark.stop()

Average Rating: 4.0220948494727224
Time taken: 41.305547 seconds


## Demo with Spark (RDD)

In [9]:
sc = pyspark.SparkContext(appName="AdvDataEng-03-2-Spark")

start_time = time.time()

rdd = sc.textFile(path)
parsed_rdd = rdd.map(lambda line: json.loads(line))
ratings_rdd = parsed_rdd.map(lambda review: review.get('overall')).filter(lambda x: x is not None)

rating_sum = ratings_rdd.reduce(lambda x, y: x + y)
rating_count = ratings_rdd.count()
average_rating = rating_sum / rating_count

end_time = time.time()
elapsed_time = end_time - start_time

                                                                                

In [10]:
print(f"Average Rating: {average_rating}")
print(f"Time taken: {elapsed_time:.6f} seconds")

sc.stop()

Average Rating: 4.0220948494727224
Time taken: 27.581993 seconds
