In [23]:
import pandas as pd
import gzip, os, json, time
from pyspark.sql import SparkSession

os.environ["SPARK_HOME"] = "/usr/local/spark-3.5.3-bin-hadoop3"
import findspark
findspark.init()
import pyspark

## Demo without Spark

In [24]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [3]:
path = os.path.abspath('/Users/lukasheini/Desktop/Advanced Data Engineering/Projekt/Data/Video_Games.json.gz')

start_time = time.time()

df = getDF(path)
ratings = []
for review in parse(path):
  ratings.append(review['overall'])

end_time = time.time()
elapsed_time = end_time - start_time

In [4]:
print(sum(ratings) / len(ratings))
print(f"Time taken: {elapsed_time:.6f} seconds")

4.0220948494727224
Time taken: 151.415379 seconds


## Demo with Spark (Spark Dataframe)

In [12]:
spark = SparkSession.builder \
    .appName("AmazonReviewsAnalysis") \
    .getOrCreate()

In [8]:
start_time = time.time()

reviews_df = spark.read.json(path)
valid_reviews_df = reviews_df.filter(reviews_df.overall.isNotNull())
average_rating = valid_reviews_df.selectExpr("avg(overall) as avg_rating").collect()[0]['avg_rating']

end_time = time.time()

elapsed_time = end_time - start_time

                                                                                

In [9]:
print(f"Average Rating: {average_rating}")
print(f"Time taken: {elapsed_time:.6f} seconds")
spark.stop()

Average Rating: 4.0220948494727224
Time taken: 24.500437 seconds


## Demo with Spark (RDD)

In [27]:
sc = pyspark.SparkContext.getOrCreate("AmazonReviewsAnalysisRDD")

start_time = time.time()

rdd = sc.textFile(path)
parsed_rdd = rdd.map(lambda line: json.loads(line))
ratings_rdd = parsed_rdd.map(lambda review: review.get('overall')).filter(lambda x: x is not None)

rating_sum = ratings_rdd.reduce(lambda x, y: x + y)
rating_count = ratings_rdd.count()
average_rating = rating_sum / rating_count

end_time = time.time()
elapsed_time = end_time - start_time

                                                                                

In [28]:
print(f"Average Rating: {average_rating}")
print(f"Time taken: {elapsed_time:.6f} seconds")

sc.stop()

Average Rating: 4.0220948494727224
Time taken: 17.930070 seconds
