## Big Data Project - Model2: Random Forest
### This file contain code and result of Random Forest trained on review of books.jsonl file with 2 executor on 100 percent of actual dataset

In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col, count, when

### --Creating and building spark session with 2 maxExecutors with 50 partition

In [2]:
from pyspark.sql import SparkSession

# Creating a SparkSession
spark = (
  SparkSession.builder
    .appName("AmazonReviewsUsingBERT")
    .master("yarn")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled",   "true")
    .config("spark.dynamicAllocation.minExecutors","1")
    .config("spark.dynamicAllocation.maxExecutors","2")
    .config("spark.sql.shuffle.partitions",       "50")
    .getOrCreate()
)

25/05/04 20:40:21 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### --Data Loading and preprocessing
#### I have turned my data as public so you should be able to run the below code, if there is problem while running then please let me know.

In [3]:
# Loadig the JSONL file into a Spark DataFrame
df = spark.read.json("gs://bigdataprojectdata/notebooks/jupyter/Books.jsonl")


                                                                                

In [4]:
#HANDLING MISSING VALUES
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()



+----+------------+------+-----------+------+----+---------+-----+-------+-----------------+
|asin|helpful_vote|images|parent_asin|rating|text|timestamp|title|user_id|verified_purchase|
+----+------------+------+-----------+------+----+---------+-----+-------+-----------------+
|   0|           0|     0|          0|     0|   0|        0|    0|      0|                0|
+----+------------+------+-----------+------+----+---------+-----+-------+-----------------+



                                                                                

In [5]:
from pyspark.sql.functions import when
df = df.withColumn(
    "sentiment", 
    when(df["rating"] >= 4, "positive")
    .when(df["rating"] <= 3, "negative")
)

In [6]:
df = df.dropDuplicates(["title", "user_id"])

In [7]:
df = df.select("text","rating", "sentiment")

In [8]:
#checking if reviews "text" is a proper review i.e. contains more than 3 words. Dropping all the rows which has text containg less that 4 words
df = (df
                         .withColumn("word_count", F.size(F.split(F.col("text"),r"\s+")))
                         .filter(F.col("word_count")>=4)
                         .drop("word_count")
    )
df

DataFrame[text: string, rating: double, sentiment: string]

In [9]:
#checkign the counts of sentiment rows if they are balanced or not
counts_sentiments = (
    df
        .groupBy("sentiment")
        .agg(F.count("*").alias("review_count"))
        .orderBy("sentiment")
)
counts_sentiments.show()



+---------+------------+
|sentiment|review_count|
+---------+------------+
| negative|     4144153|
| positive|    21092377|
+---------+------------+



                                                                                

In [10]:
fractions = {"positive": 0.4, "negative": 1.0}

df = df.sampleBy("sentiment", fractions, seed=42)



In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import RandomForestClassifier

In [12]:
# Text preprocessing
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [13]:

# Label encoding 
indexer = StringIndexer(inputCol="sentiment", outputCol="label")

In [14]:
# Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50)

In [15]:
# Combine all into a pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, indexer, rf])

In [16]:
# Split data
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

In [17]:
import time
start = time.time()
model = pipeline.fit(train_df)
predictions = model.transform(test_df)
end = time.time()
print(f"Training Time: {end - start:.2f} seconds")


25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_9 !
25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_5 !
25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_17 !
25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_25 !
25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_1 !
25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_13 !
25/05/04 21:31:34 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_21 !
25/05/04 21:31:35 WARN YarnAllocator: Container from a bad node: container_1746391117347_0001_01_000003 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 21:31:35.139]Container killed on request. Exit code is 143
[2025-05-04 21:31:35.141]Container exited with a non-

25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_32 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_35 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_16 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_8 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_39 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_46 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_12 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_20 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_28 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_4 !
25/05/04 21:42:39 WARN BlockManagerMasterEndpoint: No more rep

25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_5 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_41 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_26 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_9 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_18 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_0 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_17 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_22 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_48 !
25/05/04 21:56:32 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_13 !
25/05/04 21:56:33 WARN YarnAllocator: Container from a bad node

25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_10 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_25 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_14 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_21 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_37 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_31 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_3 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_39 !
25/05/04 22:07:58 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_7 !
25/05/04 22:07:59 WARN YarnAllocator: Container from a bad node: container_1746391117347_0001_01_000009 on host: cluster-c2aa-w-0.northamerica-south1-b.c.ac

25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_8 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_41 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_7 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_34 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_12 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_18 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_19 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_48 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_13 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_3 !
25/05/04 22:25:43 WARN BlockManagerMasterEndpoint: No more repl

25/05/04 22:35:08 WARN DAGScheduler: Broadcasting large task binary with size 1103.0 KiB
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_26 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_33 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_0 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_10 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_40 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_4 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_15 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_21 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_89_30 !
25/05/04 22:35:10 WARN BlockManagerMasterEndpoint: No more replicas

25/05/04 22:46:35 WARN YarnAllocator: Container from a bad node: container_1746391117347_0001_01_000022 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 22:46:35.427]Container killed on request. Exit code is 143
[2025-05-04 22:46:35.427]Container exited with a non-zero exit code 143. 
[2025-05-04 22:46:35.427]Killed by external signal
.
25/05/04 22:46:35 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 22 for reason Container from a bad node: container_1746391117347_0001_01_000022 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 22:46:35.427]Container killed on request. Exit code is 143
[2025-05-04 22:46:35.427]Container exited with a non-zero exit code 143. 
[2025-05-04 22:46:35.427]Killed by external signal
.
25/05/04 22:46:35 ERROR YarnScheduler: Lost executor 22 on cluster-c2aa-w-1.northa

25/05/04 22:55:10 WARN YarnAllocator: Container from a bad node: container_1746391117347_0001_01_000025 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 22:55:09.961]Container killed on request. Exit code is 143
[2025-05-04 22:55:09.961]Container exited with a non-zero exit code 143. 
[2025-05-04 22:55:09.962]Killed by external signal
.
25/05/04 22:55:10 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 25 for reason Container from a bad node: container_1746391117347_0001_01_000025 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 22:55:09.961]Container killed on request. Exit code is 143
[2025-05-04 22:55:09.961]Container exited with a non-zero exit code 143. 
[2025-05-04 22:55:09.962]Killed by external signal
.
25/05/04 22:55:10 ERROR YarnScheduler: Lost executor 25 on cluster-c2aa-w-0.northa

Training Time: 7982.00 seconds


In [18]:
predictions.select("text", "sentiment", "prediction").show(10)

[Stage 48:>                                                         (0 + 1) / 1]

+--------------------+---------+----------+
|                text|sentiment|prediction|
+--------------------+---------+----------+
|!<br />How to des...| positive|       0.0|
|" 'Keeping it Rea...| positive|       0.0|
|" I found the boo...| positive|       0.0|
|"$30 Writing Scho...| positive|       0.0|
|"...Most people a...| negative|       0.0|
|"...war is a secu...| positive|       0.0|
|"22 Britannia Roa...| positive|       0.0|
|"A Dragons Tale" ...| positive|       0.0|
|"A Summer Tale of...| positive|       0.0|
|"A Tree Grows in ...| positive|       0.0|
+--------------------+---------+----------+
only showing top 10 rows



                                                                                

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.6705


                                                                                

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bce = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                    labelCol="label",
                                    metricName="areaUnderROC")
roc_auc = bce.evaluate(predictions)

                                                                                

In [21]:
f1 = evaluator.setMetricName("f1").evaluate(predictions)


                                                                                

In [22]:
print(f"Test ROC AUC = {roc_auc:.4f}")
print(f"Test Accuracy = {accuracy:.4f}")
print(f"Test F1 Score = {f1:.4f}")

Test ROC AUC = 0.7637
Test Accuracy = 0.6705
Test F1 Score = 0.5383


In [23]:
from pyspark import SparkContext

sc = spark.sparkContext
executors = sc._jsc.sc().getExecutorMemoryStatus().keySet()
print(f"Active Executors: {executors}")

Active Executors: Set(cluster-c2aa-m.northamerica-south1-b.c.academic-timing-458516-v2.internal:45405, cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal:32947, cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal:35549, cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal:40115, cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal:40173)
