# Milestone 3: Modeling

In [56]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, isnan, count, when, isnull, size, split
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, FloatType, DateType
from pyspark.sql.functions import col, regexp_replace
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [57]:
spark = SparkSession.builder.appName('final_project').getOrCreate()

In [58]:
# Put in your bucket and folder path without the csv file
BUCKET = 'gs://bdbucket27/notebooks/jupyter/' 

data_with_review = spark.read.csv(BUCKET + 'target_column_with_review.csv',inferSchema=True,header=True)
data_with_zero_review = spark.read.csv(BUCKET + 'target_column_with_zero_review.csv',inferSchema=True,header=True)

In [59]:
# Double check the data was read in properly
data_with_review.head()

Row(id=772438920837360569, host_id=382318476, host_since='2020-12-30', host_location='Unknown', host_is_superhost='t', host_listings_count=1.0, host_total_listings_count=3.0, host_has_profile_pic='t', host_identity_verified='t', neighborhood='Southwest Ranches', latitude=26.0338992, longitude=-80.3346054, room_type='Entire home/apt', accommodates=8, num_bath=3.0, bedrooms=4.0, beds=6.0, price=500.0, number_of_reviews=2, review_scores_value=5.0, calculated_host_listings_count=1, city='Broward County', amenities_count=14, neighborhood_city='Southwest Ranches Broward County', full_time_host='f', host_verifications_clean='ep', essential_amenities=3, target='Exceptional')

In [60]:
# Double check the data was read in properly
data_with_zero_review.head()

Row(id=827736378366911479, host_id=475630606, host_since='2022-08-18', host_location='Unknown', host_is_superhost='f', host_listings_count=1.0, host_total_listings_count=3.0, host_has_profile_pic='t', host_identity_verified='t', neighborhood='Fort Lauderdale', latitude=26.09393643124416, longitude=-80.13759087771177, room_type='Entire home/apt', accommodates=2, num_bath=1.0, bedrooms=1.0, beds=1.0, price=222.0, number_of_reviews=0, review_scores_value=None, calculated_host_listings_count=1, city='Broward County', amenities_count=10, neighborhood_city='Fort Lauderdale Broward County', full_time_host='f', host_verifications_clean='p', essential_amenities=3)

In [61]:
# Drop columns that can't be used in the feature pipeline as not applicable to zero reviews
data_with_review = data_with_review.drop('review_scores_value')
data_with_review = data_with_review.drop('number_of_reviews')
data_with_zero_review = data_with_zero_review.drop('review_scores_value')
data_with_zero_review = data_with_zero_review.drop('number_of_reviews')

In [62]:
# Double check the columns in data_with_review
data_with_review.columns

['id',
 'host_id',
 'host_since',
 'host_location',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighborhood',
 'latitude',
 'longitude',
 'room_type',
 'accommodates',
 'num_bath',
 'bedrooms',
 'beds',
 'price',
 'calculated_host_listings_count',
 'city',
 'amenities_count',
 'neighborhood_city',
 'full_time_host',
 'host_verifications_clean',
 'essential_amenities',
 'target']

### Create the Feature Pipeline

In [63]:
# List of string features to be indexed and the features that are already numeric
stringFeatures = ['host_location', 'host_is_superhost', 'host_has_profile_pic',
                  'host_identity_verified', 'city', 'room_type', 'full_time_host', 
                  'host_verifications_clean']

numericFeatures = ['host_listings_count', 'host_total_listings_count', 'accommodates', 
                   'num_bath','bedrooms', 'beds', 'price', 'calculated_host_listings_count',
                   'amenities_count', 'essential_amenities']

# Create StringIndexer stages for the stringFeatures - call the numeric version as _indexed
indexers = [StringIndexer(inputCol=column, outputCol=column+"_indexed", stringOrderType="alphabetAsc").setHandleInvalid("skip")
            for column in stringFeatures]

# Create a StringIndexer for the target column and naming it as target_label after converting it to numeric
labelIndexer = StringIndexer(inputCol='target', outputCol='target_label').setHandleInvalid("skip")

# Create VectorAssembler stage for the features
assemblerInputs = [column+"_indexed" for column in stringFeatures] + numericFeatures
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Create and run the pipeline with the target indexer added to pipeline
pipeline = Pipeline(stages=indexers + [assembler, labelIndexer])

# Fit the pipeline on the training data - which is all of data that has reviews
pipelineModel = pipeline.fit(data_with_review)

# Alternative code if we wanted to do the train/test split earlier as it depends if we want to do 80/20 or 70/30.
# trainingDataTransformed = pipelineModel.transform(trainingData)
# testDataTransformed = pipelineModel.transform(testData)

# Apply the pipeline on the data_with_review - trainingDataTransformed seen below will have to go through a train/test split
trainingDataTransformed = pipelineModel.transform(data_with_review)


# To apply the pipeline on the data_with_zero_review, which is missing the target, we need to do the following:
# Extracting the transformation stages from the fitted pipeline model
# Excldue the LabelIndexer from final pipeline stage as we are applying on the prediction dataset
transformationStages = pipelineModel.stages[:-1]

# Manually apply each transformation stage to the data_with_zero_review
transformedData = data_with_zero_review
for stage in transformationStages:
    transformedData = stage.transform(transformedData)

# We now have dataWithZeroReviewTransformed which doesn't have the target_label column to see what is our final model predictions
dataWithZeroReviewTransformed = transformedData

### Check if Feature Pipeline was done properly

In [64]:
trainingDataTransformed.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: double (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- num_bath: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- price: double (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amenities_count: integer (nullable = true)
 |-- neighborhood_city: string (nullable = true)
 |-- full_tim

In [65]:
dataWithZeroReviewTransformed.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: double (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- num_bath: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- price: double (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amenities_count: integer (nullable = true)
 |-- neighborhood_city: string (nullable = true)
 |-- full_tim

In [66]:
trainingDataTransformed.show(n=1, truncate=False)

+------------------+---------+----------+-------------+-----------------+-------------------+-------------------------+--------------------+----------------------+-----------------+----------+-----------+---------------+------------+--------+--------+----+-----+------------------------------+--------------+---------------+--------------------------------+--------------+------------------------+-------------------+-----------+---------------------+-------------------------+----------------------------+------------------------------+------------+-----------------+----------------------+--------------------------------+-------------------------------------------------------------------------------+------------+
|id                |host_id  |host_since|host_location|host_is_superhost|host_listings_count|host_total_listings_count|host_has_profile_pic|host_identity_verified|neighborhood     |latitude  |longitude  |room_type      |accommodates|num_bath|bedrooms|beds|price|calculated_host_list

In [67]:
# Doing a quick check that the string indexer worked and converted the categorical values into an index
trainingDataTransformed.select('host_location', 'host_location_indexed').show(5)

trainingDataTransformed.select('room_type', 'room_type_indexed').show(10)

+--------------------+---------------------+
|       host_location|host_location_indexed|
+--------------------+---------------------+
|             Unknown|               2227.0|
|  Fort Lauderdale FL|                725.0|
|Buenos Aires Arge...|                293.0|
|           Irvine CA|                973.0|
|            Miami FL|               1351.0|
+--------------------+---------------------+
only showing top 5 rows

+---------------+-----------------+
|      room_type|room_type_indexed|
+---------------+-----------------+
|Entire home/apt|              0.0|
|Entire home/apt|              0.0|
|Entire home/apt|              0.0|
|Entire home/apt|              0.0|
|Entire home/apt|              0.0|
|Entire home/apt|              0.0|
|   Private room|              2.0|
|   Private room|              2.0|
|   Private room|              2.0|
|   Private room|              2.0|
+---------------+-----------------+
only showing top 10 rows



In [68]:
# Checking that the target_label got created successfully
trainingDataTransformed.select('target', 'target_label').show(5)

+-----------+------------+
|     target|target_label|
+-----------+------------+
|Exceptional|         2.0|
|   Mediocre|         1.0|
|   Mediocre|         1.0|
|   Mediocre|         1.0|
|       Good|         0.0|
+-----------+------------+
only showing top 5 rows



In [69]:
# Examine all of the features with a particular target_label
trainingDataTransformed.select("features", "target_label").show(truncate=False)

+-----------------------------------------------------------------------------------+------------+
|features                                                                           |target_label|
+-----------------------------------------------------------------------------------+------------+
|[2227.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,3.0,8.0,3.0,4.0,6.0,500.0,1.0,14.0,3.0]    |2.0         |
|[725.0,0.0,1.0,0.0,1.0,0.0,0.0,8.0,1.0,12.0,6.0,2.0,2.0,4.0,186.0,3.0,22.0,4.0]    |1.0         |
|[293.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,20.0,7.0,2.0,2.0,5.0,297.0,6.0,17.0,3.0]    |1.0         |
|[973.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,4.0,1.0,1.0,2.0,162.0,5.0,69.0,5.0]     |1.0         |
|[1351.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,17.0,2.0,1.0,1.0,1.0,92.0,15.0,17.0,4.0]   |0.0         |
|[1941.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1930.0,6.0,2.0,2.0,4.0,258.0,23.0,38.0,5.0]|0.0         |
|[1351.0,1.0,1.0,1.0,1.0,2.0,0.0,4.0,1.0,1.0,4.0,1.0,1.0,2.0,100.0,1.0,72.0,4.0]    |0.0         |
|[542.0,0.

In [70]:
# Examine if the dataWithZeroReviewTransformed features look like it was done correctly
dataWithZeroReviewTransformed.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[2227.0,0.0,1.0,1...|
|[39.0,0.0,1.0,1.0...|
|[223.0,0.0,1.0,1....|
|[1719.0,0.0,1.0,0...|
|[1237.0,1.0,1.0,1...|
+--------------------+
only showing top 5 rows



In [71]:
# See the count of each dataset to get an understanding on sizes:
num_rows = trainingDataTransformed.count()
num_columns = len(trainingDataTransformed.first())

print("For trainingDataTransformed:")
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

num_rows = dataWithZeroReviewTransformed.count()
num_columns = len(dataWithZeroReviewTransformed.first())

print("For dataWithZeroReviewTransformed:")
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

For trainingDataTransformed:
Number of Rows: 143995, Number of Columns: 36
For dataWithZeroReviewTransformed:
Number of Rows: 40408, Number of Columns: 34


# Decision Tree Model

In [72]:
# Do the train test split - we need to discuss what are we settng for the threshold
train_data,test_data = trainingDataTransformed.randomSplit([0.7,0.3], seed=42)

In [73]:
# Decision Tree model 
dtc = DecisionTreeClassifier(labelCol='target_label',featuresCol='features',maxBins = 2432)

In [74]:
# Fit to the train_data
dtc_model = dtc.fit(train_data)

In [75]:
# Get the predictions for the test_data, which is the data with the ground truth known
dtc_predictions = dtc_model.transform(test_data)

In [76]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

In [77]:
dtc_acc = acc_evaluator.evaluate(dtc_predictions)

In [78]:
print('A Decision Tree had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))

A Decision Tree had an accuracy of: 50.90%


In [79]:
# Just seeing what is inside the rfc_predictions
dtc_predictions.select("features", "target_label", "rawPrediction", "probability", "prediction").show(truncate=False, n=2)

+--------------------------------------------------------------------------------+------------+-----------------------------+--------------------------------------------------------------------------------+----------+
|features                                                                        |target_label|rawPrediction                |probability                                                                     |prediction|
+--------------------------------------------------------------------------------+------------+-----------------------------+--------------------------------------------------------------------------------+----------+
|[1509.0,0.0,1.0,1.0,10.0,0.0,0.0,4.0,7.0,9.0,1.0,1.0,1.0,1.0,150.0,3.0,32.0,5.0]|1.0         |[4574.0,7121.0,2810.0,1787.0]|[0.2807512889761846,0.4370856862263688,0.17247728946722318,0.10968573533022342] |1.0       |
|[1237.0,1.0,1.0,1.0,7.0,2.0,0.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,118.0,2.0,66.0,5.0] |0.0         |[6977.0,1082.0,1984.0,87.0]  |[0.

## Model Evaluation

In [80]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(dtc_predictions)
print("Test Accuracy = %g" % (accuracy))

from pyspark.sql.functions import col

# Count predictions
confusion_matrix = dtc_predictions.groupBy('target_label').pivot('prediction', [0,1,2,3]).count().na.fill(0).orderBy('target_label')
confusion_matrix.show()


Test Accuracy = 0.509004
+------------+-----+----+----+---+
|target_label|    0|   1|   2|  3|
+------------+-----+----+----+---+
|         0.0|14291|2815|1331| 48|
|         1.0| 4499|4930|1500|179|
|         2.0| 5572|2251|2560|155|
|         3.0|  591|1437| 808|182|
+------------+-----+----+----+---+



## Hyperparameter Tuning

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit

# Initialize the DecisionTreeClassifier
dtc = DecisionTreeClassifier(labelCol='target_label', featuresCol='features', maxBins=2432)


# Create the parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(dtc.maxDepth, [5, 10, 20]) \
    .addGrid(dtc.maxBins, [2432]) \
    .addGrid(dtc.minInstancesPerNode, [1, 2, 4]) \
    .build()

# Create the evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

# Create the CrossValidator
# cv = CrossValidator(estimator=dtc,
#                     estimatorParamMaps=paramGrid,
#                     evaluator=evaluator,
#                     numFolds=3,
#                     parallelism=4)  # Use 3+ folds in practice

cv = TrainValidationSplit(estimator=dtc,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,
                    trainRatio=0.8,
                    parallelism=4)  

# Run cross-validation, and choose the best set of parameters.
cvModel = cv.fit(train_data)

# Make predictions on test data. cvModel uses the best model found.
prediction = cvModel.transform(test_data)

# Evaluate the best model's performance
#f1_score = evaluator.evaluate(prediction)
acc = acc_evaluator.evaluate(prediction)
print("Test set Accuracy = " + str(acc))

# Print best model's parameters
bestModel = cvModel.bestModel
print(f"Best Param (maxDepth): {bestModel.getMaxDepth()}")
print(f"Best Param (maxBins): {bestModel.getMaxBins()}")
print(f"Best Param (minInstancesPerNode): {bestModel.getMinInstancesPerNode()}")


Test set Accuracy = 0.5138705416116248


AttributeError: 'DecisionTreeClassificationModel' object has no attribute 'getMaxDepth'

In [83]:
# Get the best model from CrossValidator
bestModel = cvModel.bestModel

# Print the parameters of the best model
print("Best model's parameters:")
print(f" - maxDepth: {bestModel._java_obj.getMaxDepth()}")
print(f" - maxBins: {bestModel._java_obj.getMaxBins()}")
print(f" - minInstancesPerNode: {bestModel._java_obj.getMinInstancesPerNode()}")

Best model's parameters:
 - maxDepth: 10
 - maxBins: 2432
 - minInstancesPerNode: 1


In [82]:
# Use the best model found by CrossValidator to make predictions on the test data
predictions = cvModel.transform(test_data)

# Evaluate the model's accuracy on the test data
evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

# Compute the accuracy on the test data
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")

# Generate confusion matrix
confusion_matrix = predictions.groupBy('target_label').pivot('prediction', [0, 1, 2, 3]).count().na.fill(0).orderBy('target_label')
confusion_matrix.show()


Test Accuracy: 0.5138705416116248
+------------+-----+----+----+---+
|target_label|    0|   1|   2|  3|
+------------+-----+----+----+---+
|         0.0|13841|2787|1845| 12|
|         1.0| 4317|5052|1675| 64|
|         2.0| 5092|2197|3131|118|
|         3.0|  575|1399| 895|149|
+------------+-----+----+----+---+



## Applying Model on 0 Review Listings

In [94]:
# Applying the trained Random Forest model to the prediction dataset - the hold out zero reviews data
zeroReviewPredictions = cvModel.transform(dataWithZeroReviewTransformed)

In [95]:
# Displaying predictions, probabilities, and features
zeroReviewPredictions.select("features", "rawPrediction", "probability", "prediction").show(truncate=False, n=5)

+----------------------------------------------------------------------------------+-------------------------+--------------------------------------------------------------------------------+----------+
|features                                                                          |rawPrediction            |probability                                                                     |prediction|
+----------------------------------------------------------------------------------+-------------------------+--------------------------------------------------------------------------------+----------+
|[2227.0,0.0,1.0,1.0,1.0,0.0,0.0,7.0,1.0,3.0,2.0,1.0,1.0,1.0,222.0,1.0,10.0,3.0]   |[277.0,479.0,751.0,262.0]|[0.1565856416054268,0.2707744488411532,0.4245336348219333,0.14810627473148671]  |2.0       |
|[39.0,0.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,112.0,2.0,2.0,2.0,4.0,500.0,5.0,29.0,3.0]   |[104.0,181.0,94.0,53.0]  |[0.24074074074074073,0.41898148148148145,0.2175925925925926,0.1226851851851851

# Random Forest Model

In [84]:
# Do the train test split - we need to discuss what are we settng for the threshold
train_data,test_data = trainingDataTransformed.randomSplit([0.7,0.3], seed=42)

In [87]:
rfc = RandomForestClassifier(labelCol='target_label',featuresCol='features', maxBins = 2432)

In [88]:
# Fit to the train_data
rfc_model = rfc.fit(train_data)

In [89]:
# Get the predictions for the test_data, which is the data with the ground truth known
rfc_predictions = rfc_model.transform(test_data)

In [90]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

In [91]:
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

In [92]:
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

A random forest ensemble had an accuracy of: 50.81%


## Model Evaluation

In [99]:
evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(rfc_predictions)
print("Test Accuracy = %g" % (accuracy))

from pyspark.sql.functions import col

# Count predictions
confusion_matrix = rfc_predictions.groupBy('target_label').pivot('prediction', [0,1,2,3]).count().na.fill(0).orderBy('target_label')
confusion_matrix.show()

Test Accuracy = 0.5081
+------------+-----+----+----+---+
|target_label|    0|   1|   2|  3|
+------------+-----+----+----+---+
|         0.0|15381|2115| 986|  3|
|         1.0| 5636|4285|1162| 25|
|         2.0| 6473|1835|2197| 33|
|         3.0|  826|1359| 772| 61|
+------------+-----+----+----+---+



## Hyperparameter Tuning

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Initialize the RandomForestClassifier
rfc = RandomForestClassifier(labelCol='target_label', featuresCol='features', maxBins=2432)

# Create the parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(rfc.maxDepth, [5, 10, 20]) \
    .addGrid(rfc.maxBins, [2432]) \
    .addGrid(rfc.minInstancesPerNode, [1, 2, 4]) \
    .addGrid(rfc.numTrees, [10, 20, 30]) \
    .build()  


# Assuming evaluator is already defined as MulticlassClassificationEvaluator with accuracy as the metric

# Use TrainValidationSplit for hyperparameter tuning
tvs = TrainValidationSplit(estimator=rfc,
                           estimatorParamMaps=paramGrid,
                           evaluator=evaluator,
                           trainRatio=0.8,  # 80% for training, 20% for validation
                           parallelism=4)  # Adjust based on your available resources

# Run TrainValidationSplit, and choose the best set of parameters.
tvsModel = tvs.fit(train_data)

# Make predictions on test data. tvsModel uses the best model found.
predictions = tvsModel.transform(test_data)

# Evaluate the best model's performance
accuracy = evaluator.evaluate(predictions)
print("Test set Accuracy = " + str(accuracy))

# Print best model's parameters
bestModel = tvsModel.bestModel
print(f"Best Param (maxDepth): {bestModel.getMaxDepth()}")
print(f"Best Param (maxBins): {bestModel.getMaxBins()}")
print(f"Best Param (minInstancesPerNode): {bestModel.getMinInstancesPerNode()}")
print(f"Best Param (numTrees): {bestModel.getNumTrees()}")


In [None]:
# Use the best model found by CrossValidator to make predictions on the test data
predictions = tvsModel.transform(test_data)

# Evaluate the model's accuracy on the test data
evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

# Compute the accuracy on the test data
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy}")

# Generate confusion matrix
confusion_matrix = predictions.groupBy('target_label').pivot('prediction', [0, 1, 2, 3]).count().na.fill(0).orderBy('target_label')
confusion_matrix.show()


## Applying Model on 0 Review Listings

In [93]:
# Just seeing what is inside the rfc_predictions
rfc_predictions.select("features", "target_label", "rawPrediction", "probability", "prediction").show(truncate=False, n=2)

+--------------------------------------------------------------------------------+------------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|features                                                                        |target_label|rawPrediction                                                                |probability                                                                      |prediction|
+--------------------------------------------------------------------------------+------------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|[1509.0,0.0,1.0,1.0,10.0,0.0,0.0,4.0,7.0,9.0,1.0,1.0,1.0,1.0,150.0,3.0,32.0,5.0]|1.0         |[6.838360705466454,7.29565682035771,4.060407784157887,1.8055746900179512]    |[0.34191803527332265,0.364

In [96]:
# Applying the trained Random Forest model to the prediction dataset - the hold out zero reviews data
zeroReviewPredictions = rfc_model.transform(dataWithZeroReviewTransformed)

In [97]:
# Displaying predictions, probabilities, and features
zeroReviewPredictions.select("features", "rawPrediction", "probability", "prediction").show(truncate=False, n=5)

+----------------------------------------------------------------------------------+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|features                                                                          |rawPrediction                                                              |probability                                                                      |prediction|
+----------------------------------------------------------------------------------+---------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|[2227.0,0.0,1.0,1.0,1.0,0.0,0.0,7.0,1.0,3.0,2.0,1.0,1.0,1.0,222.0,1.0,10.0,3.0]   |[4.717063937420779,4.928950811128885,8.068563945734876,2.2854213057154587] |[0.23585319687103895,0.24644754055644427,0.40342819728674384,0.114271065285772

# Gradient-Boosted Trees

In [102]:
gbt = GBTClassifier(labelCol="target_label", featuresCol="features")

In [104]:
## GBTClassifier is designed for binary classification tasks, which means it expects the target column (labelCol) to have exactly two unique values

In [105]:
# gbt_model = gbt.fit(train_data)

In [95]:
spark.stop()