In [None]:
import findspark
# Locate the Spark installation
findspark.init()

import pyspark
from pyspark import StorageLevel
from pyspark.sql.functions import col
from pyspark.sql.functions import regexp_replace
from pyspark.sql import *

# Extra Spark settings applied to the session builder below.
# NOTE(review): spark.storage.memoryFraction and spark.shuffle.memoryFraction look like
# legacy (pre-unified-memory-management) keys — confirm whether they still have any
# effect on the Spark version in use.
spark_settings = {
    "spark.driver.memory": "4g",
    "spark.executor.memory": "5g",
    "spark.storage.memoryFraction": "0.5",
    "spark.shuffle.memoryFraction": "0.5",
    "spark.driver.maxResultSize": "0",
}

# Configure a local session named "MissingRatings" with master local[*]
session_builder = SparkSession.builder.appName("MissingRatings").master("local[*]")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Create the session, or reuse one that already exists
spark = session_builder.getOrCreate()

23/06/26 16:24:18 WARN Utils: Your hostname, DSaDBA resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/06/26 16:24:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/26 16:24:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, LongType

# Explicit schema for the reviews file: the string columns come first,
# followed by the integer (long) columns; every field is nullable.
string_columns = [
    'movie',
    'rating',
    'genre',
    'review_date',
    'review_detail',
    'review_id',
    'review_summary',
    'reviewer',
]
long_columns = ['spoiler_tag', 'helpful_upvotes', 'helpful_total']

schema = StructType(
    [StructField(name, StringType(), True) for name in string_columns]
    + [StructField(name, LongType(), True) for name in long_columns]
)

In [None]:
# Load the tab-separated reviews file from HDFS, applying the explicit schema
reviews_path = 'hdfs://localhost:54310/user/reviews/mr_data_3456'
df = spark.read.schema(schema).option("sep", "\t").csv(reviews_path)

In [None]:
# Retain the reviews that name a movie but carry no rating
rating_is_missing = df['rating'].isNull()
movie_is_present = df['movie'].isNotNull()
filtered_df = df.filter(rating_is_missing & movie_is_present)

# Preview the first five matching rows as a pandas DataFrame
filtered_df.limit(5).toPandas()

In [None]:
# Randomly subsample the data to work on a manageable sample in the sandbox environment
number_of_samples = 10000
# Fixed seed so that re-running the notebook draws the same sample
sample_seed = 42

# Count once: this triggers a full pass over the filtered data
unrated_count = filtered_df.count()

if unrated_count <= number_of_samples:
    # Nothing to subsample (this also covers an empty result, which would otherwise
    # divide by zero, and small results, whose fraction would exceed 1.0)
    sample_df = filtered_df
else:
    # The fraction is a per-row inclusion probability, so the resulting sample
    # holds roughly, not exactly, number_of_samples rows
    sample_df = filtered_df.sample(
        withReplacement=False,
        fraction=number_of_samples / unrated_count,
        seed=sample_seed,
    )

In [None]:
import pymongo

# Open a client with the default connection settings and select the "project"
# database, which serves as the repository for the dataset
mongo = pymongo.MongoClient()
mongo_db = mongo["project"]
# Remove every document currently stored in the noRatings collection
mongo_db["noRatings"].delete_many({})

In [None]:
# Collect the sample on the driver as a list with one dict per review.
# The list is named review_records rather than `dict` so the builtin is not shadowed.
review_records = sample_df.toPandas().to_dict(orient='records')

# Insert the reviews in the noRatings collection of the project MongoDB database.
# insert_many rejects an empty document list, so skip the call when the sample is empty.
if review_records:
    mongo_db.noRatings.insert_many(review_records)

In [None]:
# Shut down the Spark session together with its underlying Spark context
spark.stop()

# Release the client connection to the local MongoDB instance
mongo.close()