In [None]:
import findspark
# Locate the Spark installation
findspark.init()

import pyspark
from pyspark import StorageLevel
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql import *

# Build (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("IngestionR")
    # Run Spark locally, using all available cores
    .master("local[*]")
    .config("spark.driver.memory", "5g")
    # NOTE(review): with a local[*] master the executors presumably run inside
    # the driver JVM, so this value likely has no effect - confirm before tuning.
    .config("spark.executor.memory", "4g")
    # The legacy "spark.storage.memoryFraction" / "spark.shuffle.memoryFraction"
    # settings belong to the static memory manager and are no longer read by
    # Spark's unified memory manager. The unified equivalent of the intended
    # 50/50 split between storage and execution memory is storageFraction=0.5.
    .config("spark.memory.storageFraction", "0.5")
    # "0" removes the size limit on results collected back to the driver
    .config("spark.driver.maxResultSize", "0")
    .getOrCreate()
)

23/06/25 18:09:52 WARN Utils: Your hostname, DSaDBA resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/06/25 18:09:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/25 18:09:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType

# Explicit schema for the review records (every field is nullable), so Spark
# does not have to infer it from the JSON. `helpful` is read as an array of
# strings, while `spoiler_tag` is read as a long integer.
schema = (
    StructType()
    .add('helpful', ArrayType(StringType(), True), True)
    .add('movie', StringType(), True)
    .add('rating', StringType(), True)
    .add('review_date', StringType(), True)
    .add('review_detail', StringType(), True)
    .add('review_id', StringType(), True)
    .add('review_summary', StringType(), True)
    .add('reviewer', StringType(), True)
    .add('spoiler_tag', LongType(), True)
)

# Load the first JSON part from HDFS with that schema
df = spark.read.schema(schema).json('hdfs://localhost:54310/user/reviews/kaggledata/part-01.json')

# Preview the first five rows as a pandas DataFrame
df.limit(5).toPandas()

                                                                                

Unnamed: 0,helpful,movie,rating,review_date,review_detail,review_id,review_summary,reviewer,spoiler_tag
0,"[1, 1]",After Life (2019– ),9,3 May 2020,"I enjoyed the first season, but I must say I t...",rw5704482,Very Strong Season 2,raeldor-96879,0
1,"[2, 2]",The Valhalla Murders (2019– ),6,3 May 2020,I know Iceland is a small country and police d...,rw5704483,Icelandic detectives?,dosleeb,0
2,"[0, 0]",Special OPS (2020– ),7,3 May 2020,"Except K K , no other actor looks comfortable ...",rw5704484,Nothing special,brightconscious,0
3,"[5, 9]",#BlackAF (2020– ),8,3 May 2020,I'm guessing that as a 62 year old white woman...,rw5704485,Good but,gasconyway,0
4,"[26, 41]",The Droving (2020),2,3 May 2020,Here's the truth. There's not much to this mov...,rw5704487,An honest review,mmason-15867,0


In [None]:
# Characters that would corrupt the tab-separated output written afterwards:
# tab is the field separator, line feed AND carriage return both terminate a
# record for line-based readers, and double quote / backslash are the CSV
# writer's default quote and escape characters.
# A single character class strips all of them in one regex pass per column
# instead of one pass per character.
ILLEGAL_CHARS = r'[\t\n\r"\\]'

# Replace every illegal character in both free-text columns with a space
for text_column in ("review_detail", "review_summary"):
    df = df.withColumn(text_column, regexp_replace(text_column, ILLEGAL_CHARS, " "))

# Separate the helpfulness array into two scalar columns and drop the original.
# NOTE(review): element 0 is taken as the upvote count and element 1 as the
# total vote count, matching pairs such as [26, 41] in the preview - confirm
# against the dataset description.
df = (
    df.withColumn('helpful_upvotes', col('helpful').getItem(0))
      .withColumn('helpful_total', col('helpful').getItem(1))
      .drop('helpful')
)

# Display the resulting table
df.limit(5).toPandas()

                                                                                

Unnamed: 0,movie,rating,review_date,review_detail,review_id,review_summary,reviewer,spoiler_tag,helpful_upvotes,helpful_total
0,After Life (2019– ),9,3 May 2020,"I enjoyed the first season, but I must say I t...",rw5704482,Very Strong Season 2,raeldor-96879,0,1,1
1,The Valhalla Murders (2019– ),6,3 May 2020,I know Iceland is a small country and police d...,rw5704483,Icelandic detectives?,dosleeb,0,2,2
2,Special OPS (2020– ),7,3 May 2020,"Except K K , no other actor looks comfortable ...",rw5704484,Nothing special,brightconscious,0,0,0
3,#BlackAF (2020– ),8,3 May 2020,I'm guessing that as a 62 year old white woman...,rw5704485,Good but,gasconyway,0,5,9
4,The Droving (2020),2,3 May 2020,Here's the truth. There's not much to this mov...,rw5704487,An honest review,mmason-15867,0,26,41


In [None]:
# Persist the cleaned first part to HDFS as header-less, tab-separated text,
# replacing any output already present at that location
(
    df.write
    .mode('overwrite')
    .options(header=False, sep="\t")
    .csv('hdfs://localhost:54310/user/reviews/data/reviews/part1')
)

                                                                                

In [None]:
# Remaining input files. part-01 is ingested separately and written to
# .../reviews/part1, so the output numbering for these files starts at 2.
names = ["part-02","part-03","part-04","part-05","part-06"]

# Characters that would corrupt the tab-separated output: tab is the field
# separator, line feed AND carriage return both terminate a record for
# line-based readers, and double quote / backslash are the CSV writer's
# default quote and escape characters. One character class strips them all in
# a single regex pass per column.
ILLEGAL_CHARS = r'[\t\n\r"\\]'

# enumerate() keeps the output part number in step with the input file,
# replacing a hand-maintained counter.
for part_number, name in enumerate(names, start=2):
    # Read the JSON file from HDFS using the shared schema
    df = spark.read.json('hdfs://localhost:54310/user/reviews/kaggledata/'+name+'.json', schema=schema)

    # Replace every illegal character in both free-text columns with a space
    for text_column in ("review_detail", "review_summary"):
        df = df.withColumn(text_column, regexp_replace(text_column, ILLEGAL_CHARS, " "))

    # Separate the helpfulness array into two scalar columns and drop the original
    df = (
        df.withColumn('helpful_upvotes', col('helpful').getItem(0))
          .withColumn('helpful_total', col('helpful').getItem(1))
          .drop('helpful')
    )

    # Save the DataFrame as header-less, tab-separated text in HDFS,
    # replacing any previous output for this part
    df.write.csv(path='hdfs://localhost:54310/user/reviews/data/reviews/part'+str(part_number), mode='overwrite', header=False, sep="\t")

                                                                                

In [None]:
# Shut down the SparkSession now that the ingestion cells above have run
spark.stop()