In [None]:
import findspark
# Locate the Spark installation so the pyspark imports below can be resolved.
# Keep this call ahead of those imports.
findspark.init()

import pyspark
from pyspark import StorageLevel
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql import *

# Create (or reuse) the SparkSession for this notebook, running in local mode.
# NOTE(review): spark.storage.memoryFraction and spark.shuffle.memoryFraction
# look like legacy memory-manager settings -- confirm they still have an
# effect on the Spark version in use.
spark = (
    SparkSession.builder
    .appName("Filtering")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "5g")
    .config("spark.storage.memoryFraction", "0.5")
    .config("spark.shuffle.memoryFraction", "0.5")
    .config("spark.driver.maxResultSize", "0")
    .getOrCreate()
)

23/06/26 00:59:07 WARN Utils: Your hostname, DSaDBA resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/06/26 00:59:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/26 00:59:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType

# Schema of the tab-separated review records: eight nullable string columns
# followed by three nullable long columns, in file order.
schema = (
    StructType()
    .add('movie', StringType(), True)
    .add('rating', StringType(), True)
    .add('genre', StringType(), True)
    .add('review_date', StringType(), True)
    .add('review_detail', StringType(), True)
    .add('review_id', StringType(), True)
    .add('review_summary', StringType(), True)
    .add('reviewer', StringType(), True)
    .add('spoiler_tag', LongType(), True)
    .add('helpful_upvotes', LongType(), True)
    .add('helpful_total', LongType(), True)
)

In [None]:
# Load the tab-separated reviews from HDFS using the explicit schema.
# Rows that do not fit the schema are dropped (mode DROPMALFORMED).
df = (
    spark.read
    .schema(schema)
    .option("sep", "\t")
    .option("mode", "DROPMALFORMED")
    .csv('hdfs://localhost:54310/user/reviews/data/mapreduce/part-00000')
)
# Preview the first five rows
df.limit(5).toPandas()

                                                                                

Unnamed: 0,movie,rating,genre,review_date,review_detail,review_id,review_summary,reviewer,spoiler_tag,helpful_upvotes,helpful_total
0,#1 Cheerleader Camp (2010 Video),1,-,3 May 2014,I couldn't give this movie more than zero hear...,rw3008997,Zero hearts,berzinam,0,0,4
1,#1 Cheerleader Camp (2010 Video),3,-,29 July 2010,I was expecting a film much like fired up (whi...,rw2287638,Not very good.,jaffaq,0,11,21
2,#1 Cheerleader Camp (2010 Video),5,-,28 July 2010,If you are an 12-16 year old boy you will love...,rw2287295,"Starts good, then very predictable",mognam,0,30,39
3,#1 Cheerleader Camp (2010 Video),4,-,21 March 2018,This is another teen sex comedy. The basic plo...,rw4099770,Count your chickens for a rainy day,nogodnomasters,1,0,0
4,#1 Cheerleader Camp (2010 Video),2,-,30 December 2012,"I have never hated a movie, or rated it below ...",rw2727990,"An awfully predictable, yet meaningless movie.",muslim-4,0,1,7


In [None]:
# Keep only reviews whose rating is one of 0..10 and whose movie is not null.
# NOTE(review): `rating` is read as a string column while the allowed values
# are ints, so the match relies on Spark's implicit type coercion -- confirm.
filtered_df = (
    df
    .filter(col('rating').isin(list(range(11))))
    .filter(col('movie').isNotNull())
)
# Preview the first five surviving rows
filtered_df.limit(5).toPandas()

Unnamed: 0,movie,rating,genre,review_date,review_detail,review_id,review_summary,reviewer,spoiler_tag,helpful_upvotes,helpful_total
0,#1 Cheerleader Camp (2010 Video),1,-,3 May 2014,I couldn't give this movie more than zero hear...,rw3008997,Zero hearts,berzinam,0,0,4
1,#1 Cheerleader Camp (2010 Video),3,-,29 July 2010,I was expecting a film much like fired up (whi...,rw2287638,Not very good.,jaffaq,0,11,21
2,#1 Cheerleader Camp (2010 Video),5,-,28 July 2010,If you are an 12-16 year old boy you will love...,rw2287295,"Starts good, then very predictable",mognam,0,30,39
3,#1 Cheerleader Camp (2010 Video),4,-,21 March 2018,This is another teen sex comedy. The basic plo...,rw4099770,Count your chickens for a rainy day,nogodnomasters,1,0,0
4,#1 Cheerleader Camp (2010 Video),2,-,30 December 2012,"I have never hated a movie, or rated it below ...",rw2727990,"An awfully predictable, yet meaningless movie.",muslim-4,0,1,7


In [None]:
# Count rows before and after filtering and report how many were dropped
rows, filtered_rows = df.count(), filtered_df.count()
print(f'{rows - filtered_rows} reviews were removed out of {rows}.')



549412 reviews were removed out of 3562336.


                                                                                

In [None]:
# Save the filtered data as headerless tab-separated files on HDFS,
# overwriting any previous output at this path.
# This writes `filtered_df`, not the unfiltered `df`, so reviews without a
# valid rating or movie are left out of the saved result.
filtered_df.write.csv(path='hdfs://localhost:54310/user/data/filtered', mode='overwrite', header=False, sep="\t")

                                                                                

In [None]:
# Stop the SparkSession created at the start of the notebook; this is the
# last step, after the output has been written.
spark.stop()