#### Random Forest Model Fine-Tuning

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, isnan, count, when, isnull, size, split
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, FloatType, DateType
from pyspark.sql.functions import col, regexp_replace
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('fine_tune_random_forest_attempt3')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/13 07:22:44 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/04/13 07:22:44 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/04/13 07:22:44 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/04/13 07:22:44 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [4]:
# GCS locations of the already-transformed parquet datasets.
# Note: despite its name, testDataPath points at the zero-review dataset,
# which is loaded into zero_review_data below.
trainingDataPath = "gs://ds5460-tlee-spring2024/notebooks/jupyter/data/usa/combined_datasets/trainingDataTransformed.parquet"
testDataPath = "gs://ds5460-tlee-spring2024/notebooks/jupyter/data/usa/combined_datasets/dataWithZeroReviewTransformed.parquet"

# Read both datasets, in the same order as the paths are listed.
reviews_data, zero_review_data = (
    spark.read.parquet(path) for path in (trainingDataPath, testDataPath)
)

                                                                                

In [5]:
# Split the reviews 70% train / 30% test, with a fixed seed (42) for consistency.
train_data, test_data = reviews_data.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

In [5]:
# Initialize a new RandomForestClassifier.
# - maxBins=2200: presumably sized to cover the highest-cardinality categorical
#   feature inside `features` -- TODO confirm against the feature pipeline.
# - seed=42: pins the forest's random sampling so repeated runs train the same
#   model; without it the reported accuracy changes from run to run. The value
#   matches the seed used for the train/test split.
rfc = RandomForestClassifier(
    labelCol='target_label',
    featuresCol='features',
    maxBins=2200,
    seed=42,
)

In [6]:
# Define the parameter grid of fine-tuning candidates.
# NOTE: the grid currently holds a single combination (numTrees=20,
# maxDepth=12), so cross-validation only scores that one configuration.
# Add more values to either list to actually search.
paramGrid = (ParamGridBuilder()
             .addGrid(rfc.numTrees, [20])
             .addGrid(rfc.maxDepth, [12])
             .build())

# Score candidates by plain accuracy of `prediction` against `target_label`.
evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

# Setup the CrossValidator.
# seed=42 fixes how rows are assigned to the 3 folds; without it the fold
# assignment is random on every run and the results are not reproducible.
cv = CrossValidator(estimator=rfc,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    numFolds=3,
                    seed=42)

In [7]:
# Run cross-validation on the training split; the fitted result exposes the
# best model found over the parameter grid.
cv_model = cv.fit(dataset=train_data)

24/04/13 06:35:40 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1482.0 KiB
24/04/13 06:35:49 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
24/04/13 06:35:59 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/04/13 06:36:09 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 6.2 MiB
24/04/13 06:36:17 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1121.2 KiB
24/04/13 06:36:21 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.4 MiB
24/04/13 06:37:20 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 1484.8 KiB
24/04/13 06:37:28 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
24/04/13 06:37:37 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task

In [8]:
# Take the winning model from the cross-validation result and score the
# held-out test split with it.
best_model = cv_model.bestModel
predictions = best_model.transform(test_data)

# Compute accuracy on the test predictions and report it as a percentage.
accuracy = evaluator.evaluate(predictions)
print(f'Best Model Accuracy: {accuracy * 100:.2f}%')

24/04/13 06:42:47 WARN org.apache.spark.scheduler.DAGScheduler: Broadcasting large task binary with size 4.9 MiB

Best Model Accuracy: 60.01%


                                                                                

In [9]:
# Specify model save path to use it for the load model notebook.
# NOTE(review): this path contains a double slash ("jupyter//data"), unlike
# the dataset paths used for loading. It is left unchanged because the loading
# notebook presumably uses the same string -- confirm before normalizing.
model_path = "gs://ds5460-tlee-spring2024/notebooks/jupyter//data/usa/combined_datasets/rfc_attempt_3"

# Save the model. A plain .save() raises if the path already exists, which
# makes this cell fail on every re-run of the notebook; going through the
# writer with overwrite() replaces the previous artifact instead.
best_model.write().overwrite().save(model_path)

24/04/13 06:43:27 WARN org.apache.spark.scheduler.TaskSetManager: Stage 124 contains a task of very large size (2476 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [6]:
# Stop the Spark session now that the notebook's work is done.
spark.stop()