In [1]:
import os
os.environ["PYSPARK_PYTHON"] = "C:\\Users\\aryen\\anaconda3\\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = "C:\\Users\\aryen\\anaconda3\\python.exe"


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("test").getOrCreate()
print(spark.version)


3.5.5


In [3]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("ReviewsData").getOrCreate()


In [4]:
file_path = "final_reviews_data.csv"  # Update this path
df_spark = spark.read.option("header", True).option("inferSchema", True).csv(file_path)


In [5]:
from pyspark.sql.functions import col, count, when

# Count missing values in each column
df_spark.select([count(when(col(c).isNull(), c)).alias(c) for c in df_spark.columns]).show()


+------+-----------+--------+----+-----+------------------+-----------+-----+------------+------+
|tconst|movie_title|numVotes|year|label|tomatometer_status|review_type|genre|review_label|review|
+------+-----------+--------+----+-----+------------------+-----------+-----+------------+------+
|     0|          0|       0|   0|    0|              5836|       5836| 5836|        5836|    87|
+------+-----------+--------+----+-----+------------------+-----------+-----+------------+------+



In [6]:
from pyspark.sql.functions import when

# Convert tomatometer_status to numerical values
df_spark = df_spark.withColumn(
    "tomatometer_status",
    when(df_spark["tomatometer_status"] == "Fresh", 1)
    .when(df_spark["tomatometer_status"] == "Rotten", 0)
    .otherwise(-1)  # Mark missing values as -1
)

df_spark.select("tomatometer_status").show(10)


+------------------+
|tomatometer_status|
+------------------+
|                -1|
|                -1|
|                 1|
|                -1|
|                 1|
|                 1|
|                -1|
|                 1|
|                -1|
|                 1|
+------------------+
only showing top 10 rows



In [7]:
# we KEEP review_type but convert to a feature
# we DROP review_label because it seems seems to 
# duplicate tomatometer_status, it doesn’t add extra information.

from pyspark.sql.functions import col, size, when, expr, split

# Convert `review_type` into an array (assuming it’s stored as a comma-separated string)
df_spark = df_spark.withColumn("review_type", split(col("review_type"), ", "))

# Calculate the Fresh review ratio
df_spark = df_spark.withColumn(
    "fresh_ratio",
    when(col("review_type").isNull(), -1).otherwise(
        size(expr("filter(review_type, x -> x = 'Fresh')")) / size(col("review_type"))
    )
)

# Drop the original `review_type` and `review_label` columns
df_spark = df_spark.drop("review_type", "review_label")

# Show results
df_spark.select("tomatometer_status", "fresh_ratio").show(10)

+------------------+-----------+
|tomatometer_status|fresh_ratio|
+------------------+-----------+
|                -1|       -1.0|
|                -1|       -1.0|
|                 1|        0.0|
|                -1|       -1.0|
|                 1|        0.0|
|                 1|        0.0|
|                -1|       -1.0|
|                 1|        0.0|
|                -1|       -1.0|
|                 1|        0.0|
+------------------+-----------+
only showing top 10 rows



In [8]:
# Reload the dataset to restore all columns
df_spark = spark.read.option("header", True).option("inferSchema", True).csv("final_reviews_data.csv")

# Show `review_type` to check its format
df_spark.select("review_type").show(10, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|review_type                                                                                                                                                                                                                                        |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|NULL                                                                                                                                                                                                                                               |
|NULL           

In [9]:
from pyspark.sql.functions import col, size, when, expr, split, regexp_replace

# Convert `review_type` from string to an actual array
df_spark = df_spark.withColumn(
    "review_type",
    split(regexp_replace(col("review_type"), "[\\[\\]']", ""), ", ")
)

# Calculate the Fresh review ratio correctly
df_spark = df_spark.withColumn(
    "fresh_ratio",
    when(size(col("review_type")) == 0, -1)  # Handle empty values
    .otherwise(size(expr("filter(review_type, x -> x = 'Fresh')")) / size(col("review_type")))
)

# Drop the original `review_type` and `review_label` columns
df_spark = df_spark.drop("review_type", "review_label")

# Show the results
df_spark.select("tomatometer_status", "fresh_ratio").show(10)


+------------------+------------------+
|tomatometer_status|       fresh_ratio|
+------------------+------------------+
|              NULL|               1.0|
|              NULL|               1.0|
|             Fresh|0.8571428571428571|
|              NULL|               1.0|
|             Fresh|               0.9|
|             Fresh|               1.0|
|              NULL|               1.0|
|             Fresh|0.8333333333333334|
|              NULL|               1.0|
|             Fresh|               1.0|
+------------------+------------------+
only showing top 10 rows



In [10]:
df_spark.filter(col("tomatometer_status") == "Fresh").select("tomatometer_status", "fresh_ratio").show(10)


+------------------+------------------+
|tomatometer_status|       fresh_ratio|
+------------------+------------------+
|             Fresh|0.8571428571428571|
|             Fresh|               0.9|
|             Fresh|               1.0|
|             Fresh|0.8333333333333334|
|             Fresh|               1.0|
|             Fresh|               1.0|
|             Fresh|0.9444444444444444|
|             Fresh|               1.0|
|             Fresh|               1.0|
|             Fresh|               1.0|
+------------------+------------------+
only showing top 10 rows



In [11]:
from pyspark.sql.functions import when

# Create a binary flag for missing tomatometer_status
df_spark = df_spark.withColumn(
    "missing_tomatometer",
    when(col("tomatometer_status").isNull(), 1).otherwise(0)
)

# Replace NULL tomatometer_status with "Unknown"
df_spark = df_spark.withColumn(
    "tomatometer_status",
    when(col("tomatometer_status").isNull(), "Unknown").otherwise(col("tomatometer_status"))
)

# Show results
df_spark.select("tomatometer_status", "missing_tomatometer").distinct().show()


+------------------+-------------------+
|tomatometer_status|missing_tomatometer|
+------------------+-------------------+
|           Unknown|                  1|
|            Rotten|                  0|
|             Fresh|                  0|
|               NaN|                  0|
+------------------+-------------------+



In [12]:
df_spark = df_spark.filter((col("tomatometer_status") != "NaN") & (col("tomatometer_status").isNotNull()))

# Show distinct labels again
df_spark.select("tomatometer_status").distinct().show()


+------------------+
|tomatometer_status|
+------------------+
|            Rotten|
|           Unknown|
|             Fresh|
+------------------+



In [13]:
# Remove Unknown rows
df_spark = df_spark.filter(col("tomatometer_status").isin(["Fresh", "Rotten"]))

# Show distinct labels again to confirm
df_spark.select("tomatometer_status").distinct().show()


+------------------+
|tomatometer_status|
+------------------+
|            Rotten|
|             Fresh|
+------------------+



In [14]:
print("Row count:", df_spark.count())


Row count: 2087


 Convert genre into Multi-Hot Encoding
Since each movie can have multiple genres (e.g., "Action, Comedy"), we need to:

Split genre into a list of genres.
Create a separate column for each unique genre.
Mark 1 if the movie belongs to that genre, otherwise 0.

In [15]:
df_spark.select("genre").show(5, truncate=False)


+----------------------------------------------------+
|genre                                               |
+----------------------------------------------------+
|Action & Adventure, Classics, Western, Romance      |
|Classics, Drama                                     |
|Classics, Comedy                                    |
|Drama                                               |
|Action & Adventure, Classics, Kids & Family, Romance|
+----------------------------------------------------+
only showing top 5 rows



In [16]:
from pyspark.sql.functions import split, explode, when, array_contains, col

# Convert `genre` from a string to an array
df_spark = df_spark.withColumn("genre", split(col("genre"), ", "))

# Extract all unique genres
unique_genres = df_spark.select(explode(col("genre"))).distinct().rdd.flatMap(lambda x: x).collect()

# Create binary columns for each genre
for genre in unique_genres:
    df_spark = df_spark.withColumn(genre, when(array_contains(col("genre"), genre), 1).otherwise(0))

# Drop the original genre column
df_spark = df_spark.drop("genre")

# Show results
df_spark.show(5)


+---------+--------------------+--------+----+-----+------------------+--------------------+------------------+-------------------+------------------+-------+----------------+-----+-----------+-------------------------+-------------+-------------------------+--------+-------------------------+--------------------+---------+----------------+------------------+---+----------+------+-------+------+-----------+-------------+
|   tconst|         movie_title|numVotes|year|label|tomatometer_status|              review|       fresh_ratio|missing_tomatometer|Action & Adventure|Romance|Sports & Fitness|Drama|Documentary|Art House & International|Kids & Family|Science Fiction & Fantasy|Classics|Musical & Performing Arts|Faith & Spirituality|Animation|Special Interest|Mystery & Suspense|NaN|Television|Horror|Western|Comedy|Cult Movies|Gay & Lesbian|
+---------+--------------------+--------+----+-----+------------------+--------------------+------------------+-------------------+------------------+

In [17]:
from pyspark.sql.functions import regexp_replace, lower

# Remove unwanted characters & convert to lowercase
df_spark = df_spark.withColumn("clean_review", lower(regexp_replace(col("review"), "[^a-zA-Z0-9\s]", "")))

# Show results
df_spark.select("clean_review").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [18]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Tokenize text (split into words)
tokenizer = Tokenizer(inputCol="clean_review", outputCol="words")
df_spark = tokenizer.transform(df_spark)

# Remove stopwords (filter out common words)
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df_spark = stopwords_remover.transform(df_spark)

# Show results
df_spark.select("filtered_words").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
from pyspark.ml.feature import HashingTF, IDF

# Convert words into term frequency (TF) vectors
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=1000)
df_spark = hashing_tf.transform(df_spark)

# Compute TF-IDF scores
idf = IDF(inputCol="raw_features", outputCol="tfidf_features")
df_spark = idf.fit(df_spark).transform(df_spark)

# Show results
df_spark.select("tfidf_features").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Select Final Features for Training

keep the following:
- Genres (multi-hot encoded)
- NumVotes (movie popularity)
- Year (release year)
- TF-IDF features (from reviews)
- Tomatometer Status (Fresh/Rotten) → This is the target variable

In [20]:
from pyspark.ml.feature import VectorAssembler

# Define final feature columns (excluding categorical ones like "tomatometer_status")
feature_columns = ["numVotes", "year", "fresh_ratio", "missing_tomatometer"] + ["tfidf_features"]

# Assemble into a single vector
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features", handleInvalid="skip")
df_spark = assembler.transform(df_spark)

# Show results
df_spark.select("features", "tomatometer_status").show(5, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
df_spark.select("features", "tomatometer_status").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
train_data, test_data = df_spark.randomSplit([0.8, 0.2], seed=42)

print(f"Training data count: {train_data.count()}, Test data count: {test_data.count()}")


Training data count: 1502, Test data count: 366


In [23]:
from pyspark.ml.feature import StringIndexer

# Convert tomatometer_status from string to numeric labels
indexer = StringIndexer(inputCol="tomatometer_status", outputCol="label_index")
df_spark = indexer.fit(df_spark).transform(df_spark)

# Show transformed data
df_spark.select("tomatometer_status", "label_index").show(10)


+------------------+-----------+
|tomatometer_status|label_index|
+------------------+-----------+
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
|             Fresh|        0.0|
+------------------+-----------+
only showing top 10 rows



## Why Rotten is 1.0
Having Rotten = 1.0 and Fresh = 0.0 is completely fine! 

## Why is it OK?
Machine learning models don’t care about which class is 0 or 1, as long as there’s a clear distinction between labels.
The important part is consistency—every "Rotten" is labeled as 1.0, and every "Fresh" is labeled as 0.0.

In [24]:
df_spark.select("tomatometer_status", "label_index").distinct().show()


+------------------+-----------+
|tomatometer_status|label_index|
+------------------+-----------+
|            Rotten|        1.0|
|             Fresh|        0.0|
+------------------+-----------+



In [25]:
# Re-split the data
train_data, test_data = df_spark.select("features", "label_index").randomSplit([0.8, 0.2], seed=42)

# Verify train_data again
train_data.printSchema()


root
 |-- features: vector (nullable = true)
 |-- label_index: double (nullable = false)



In [26]:
df_spark.select("features", "tomatometer_status").show(10, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [27]:
# Keep the tconst column in test_data for submission
selected_cols = ["tconst", "features", "label_index"]
df_spark = df_spark.select(*selected_cols)

# Split the dataset into training and test sets
train_data, test_data = df_spark.randomSplit([0.8, 0.2], seed=42)


In [28]:
# Train-Test Split
train_data, test_data = df_spark.select("features", "label_index").randomSplit([0.8, 0.2], seed=42)

# Train Random Forest
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol="features", labelCol="label_index", numTrees=100)
rf_model = rf.fit(train_data)

# Make Predictions
rf_predictions = rf_model.transform(test_data)

# Show Predictions
rf_predictions.select("label_index", "prediction", "probability").show(10)


+-----------+----------+--------------------+
|label_index|prediction|         probability|
+-----------+----------+--------------------+
|        0.0|       0.0|[0.77384638133476...|
|        0.0|       0.0|[0.60167592019632...|
|        0.0|       0.0|[0.72805185130769...|
|        0.0|       0.0|[0.71004838030793...|
|        0.0|       0.0|[0.67609265769792...|
|        0.0|       0.0|[0.72316831860798...|
|        0.0|       0.0|[0.71285603424973...|
|        0.0|       0.0|[0.71520443573869...|
|        0.0|       0.0|[0.77611634324731...|
|        0.0|       0.0|[0.71967028489366...|
+-----------+----------+--------------------+
only showing top 10 rows



In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rf_predictions)

print(f"🎯 Model Accuracy: {accuracy:.4f}")


🎯 Model Accuracy: 0.9481
