## Big Data Project - Model2: Random Forest
### This file contain code and result of Random Forest trained on review of books.jsonl file with 1 executor on 100 percent of actual dataset

In [2]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col, count, when

### --Creating and building spark session with 1 maxExecutors with 50 partition

In [3]:
from pyspark.sql import SparkSession

# Creating a SparkSession
spark = (
  SparkSession.builder
    .appName("AmazonReviewsUsingBERT")
    .master("yarn")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.shuffle.service.enabled",   "true")
    .config("spark.dynamicAllocation.minExecutors","1")
    .config("spark.dynamicAllocation.maxExecutors","1")
    .config("spark.sql.shuffle.partitions",       "50")
    .getOrCreate()
)

25/05/04 20:30:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### --Data Loading and preprocessing
#### I have turned my data as public so you should be able to run the below code, if there is problem while running then please let me know.

In [None]:
# Loadig the JSONL file into a Spark DataFrame
df = spark.read.json("gs://bigdataprojectdata/notebooks/jupyter/Books.jsonl")


In [None]:
#HANDLING MISSING VALUES
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

In [None]:
from pyspark.sql.functions import when
df = df.withColumn(
    "sentiment", 
    when(df["rating"] >= 4, "positive")
    .when(df["rating"] <= 3, "negative")
)

In [None]:
df = df.dropDuplicates(["title", "user_id"])

In [None]:
df = df.select("text","rating", "sentiment")

In [None]:
#checking if reviews "text" is a proper review i.e. contains more than 3 words. Dropping all the rows which has text containg less that 4 words
df = (df
                         .withColumn("word_count", F.size(F.split(F.col("text"),r"\s+")))
                         .filter(F.col("word_count")>=4)
                         .drop("word_count")
    )
df

In [None]:
#checkign the counts of sentiment rows if they are balanced or not
counts_sentiments = (
    df
        .groupBy("sentiment")
        .agg(F.count("*").alias("review_count"))
        .orderBy("sentiment")
)
counts_sentiments.show()

In [None]:
fractions = {"positive": 0.4, "negative": 1.0}

df = df.sampleBy("sentiment", fractions, seed=42)



In [30]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import RandomForestClassifier

In [31]:
# Text preprocessing
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [32]:

# Label encoding 
indexer = StringIndexer(inputCol="sentiment", outputCol="label")

In [33]:
# Random Forest Classifier
rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=50)

In [34]:
# Combine all into a pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, indexer, rf])

In [35]:
# Split data
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

In [36]:
import time
start = time.time()
model = pipeline.fit(train_df)
predictions = model.transform(test_df)
end = time.time()
print(f"Training Time: {end - start:.2f} seconds")


25/05/04 18:37:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_6 !
25/05/04 18:37:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_1 !
25/05/04 18:37:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_10 !
25/05/04 18:37:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_22 !
25/05/04 18:37:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_14 !
25/05/04 18:37:21 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_18 !
25/05/04 18:37:22 WARN YarnAllocator: Container from a bad node: container_1746379126446_0001_01_000009 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 18:37:21.988]Container killed on request. Exit code is 137
[2025-05-04 18:37:21.989]Container exited with a non-zero exit code 137. 
[2025-05-04 18:37:21.989]Killed by external signal
.
25/05/04 18:3

25/05/04 18:50:26 WARN YarnAllocator: Container from a bad node: container_1746379126446_0001_01_000008 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 18:50:26.451]Container killed on request. Exit code is 137
[2025-05-04 18:50:26.451]Container exited with a non-zero exit code 137. 
[2025-05-04 18:50:26.451]Killed by external signal
.
25/05/04 18:50:26 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 8 for reason Container from a bad node: container_1746379126446_0001_01_000008 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 18:50:26.451]Container killed on request. Exit code is 137
[2025-05-04 18:50:26.451]Container exited with a non-zero exit code 137. 
[2025-05-04 18:50:26.451]Killed by external signal
.
25/05/04 18:50:26 ERROR YarnScheduler: Lost executor 8 on cluster-c2aa-w-0.northame

25/05/04 19:04:12 WARN YarnAllocator: Container from a bad node: container_1746379126446_0001_01_000011 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 19:04:12.412]Container killed on request. Exit code is 143
[2025-05-04 19:04:12.413]Container exited with a non-zero exit code 143. 
[2025-05-04 19:04:12.416]Killed by external signal
.
25/05/04 19:04:12 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 11 for reason Container from a bad node: container_1746379126446_0001_01_000011 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 19:04:12.412]Container killed on request. Exit code is 143
[2025-05-04 19:04:12.413]Container exited with a non-zero exit code 143. 
[2025-05-04 19:04:12.416]Killed by external signal
.
25/05/04 19:04:12 ERROR YarnScheduler: Lost executor 11 on cluster-c2aa-w-1.northa

25/05/04 19:16:25 WARN YarnAllocator: Container from a bad node: container_1746379126446_0001_01_000017 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 19:16:25.090]Container killed on request. Exit code is 137
[2025-05-04 19:16:25.090]Container exited with a non-zero exit code 137. 
[2025-05-04 19:16:25.091]Killed by external signal
.
25/05/04 19:16:25 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 17 for reason Container from a bad node: container_1746379126446_0001_01_000017 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 19:16:25.090]Container killed on request. Exit code is 137
[2025-05-04 19:16:25.090]Container exited with a non-zero exit code 137. 
[2025-05-04 19:16:25.091]Killed by external signal
.
25/05/04 19:16:25 ERROR YarnScheduler: Lost executor 17 on cluster-c2aa-w-0.northa

25/05/04 19:29:34 WARN YarnAllocator: Container from a bad node: container_1746379126446_0001_01_000019 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 19:29:34.601]Container killed on request. Exit code is 143
[2025-05-04 19:29:34.602]Container exited with a non-zero exit code 143. 
[2025-05-04 19:29:34.602]Killed by external signal
.
25/05/04 19:29:34 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 19 for reason Container from a bad node: container_1746379126446_0001_01_000019 on host: cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 143. Diagnostics: [2025-05-04 19:29:34.601]Container killed on request. Exit code is 143
[2025-05-04 19:29:34.602]Container exited with a non-zero exit code 143. 
[2025-05-04 19:29:34.602]Killed by external signal
.
25/05/04 19:29:34 ERROR YarnScheduler: Lost executor 19 on cluster-c2aa-w-1.northa

25/05/04 19:41:05 WARN YarnAllocator: Container from a bad node: container_1746379126446_0001_01_000023 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 19:41:04.904]Container killed on request. Exit code is 137
[2025-05-04 19:41:04.904]Container exited with a non-zero exit code 137. 
[2025-05-04 19:41:04.905]Killed by external signal
.
25/05/04 19:41:05 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Requesting driver to remove executor 23 for reason Container from a bad node: container_1746379126446_0001_01_000023 on host: cluster-c2aa-w-0.northamerica-south1-b.c.academic-timing-458516-v2.internal. Exit status: 137. Diagnostics: [2025-05-04 19:41:04.904]Container killed on request. Exit code is 137
[2025-05-04 19:41:04.904]Container exited with a non-zero exit code 137. 
[2025-05-04 19:41:04.905]Killed by external signal
.
25/05/04 19:41:05 ERROR YarnScheduler: Lost executor 23 on cluster-c2aa-w-0.northa

25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_1 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_49 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_20 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_16 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_31 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_5 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_8 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_11 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_27 !
25/05/04 19:54:42 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_127_38 !
25/05/04 19:54:43 WARN YarnAllocator: Container from 

Training Time: 7432.20 seconds


In [38]:
predictions.select("text", "sentiment", "prediction").show(10)

[Stage 64:>                                                         (0 + 1) / 1]

+--------------------+---------+----------+
|                text|sentiment|prediction|
+--------------------+---------+----------+
|!<br />How to des...| positive|       0.0|
|" 'Keeping it Rea...| positive|       0.0|
|" I found the boo...| positive|       0.0|
|"$30 Writing Scho...| positive|       0.0|
|"...Most people a...| negative|       0.0|
|"...war is a secu...| positive|       0.0|
|"22 Britannia Roa...| positive|       0.0|
|"A Dragons Tale" ...| positive|       0.0|
|"A Summer Tale of...| positive|       0.0|
|"A Tree Grows in ...| positive|       0.0|
+--------------------+---------+----------+
only showing top 10 rows



                                                                                

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")



Test Accuracy: 0.6702


                                                                                

In [40]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
bce = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",
                                    labelCol="label",
                                    metricName="areaUnderROC")
roc_auc = bce.evaluate(predictions)

                                                                                

In [42]:
f1 = evaluator.setMetricName("f1").evaluate(predictions)


                                                                                

In [43]:
print(f"Test ROC AUC = {roc_auc:.4f}")
print(f"Test Accuracy = {accuracy:.4f}")
print(f"Test F1 Score = {f1:.4f}")

Test ROC AUC = 0.7609
Test Accuracy = 0.6702
Test F1 Score = 0.5383


In [46]:
from pyspark import SparkContext

sc = spark.sparkContext
executors = sc._jsc.sc().getExecutorMemoryStatus().keySet()
print(f"Active Executors: {executors}")

Active Executors: Set(cluster-c2aa-m.northamerica-south1-b.c.academic-timing-458516-v2.internal:41221, cluster-c2aa-w-1.northamerica-south1-b.c.academic-timing-458516-v2.internal:36857)
