#### Milestone 4 - Random Forest Test on New Feature Importance with 7 Features

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, isnan, count, when, isnull, size, split
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, FloatType, DateType
from pyspark.sql.functions import col, regexp_replace
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName('final_project_feature_importance').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/14 03:57:54 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/04/14 03:57:54 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/04/14 03:57:54 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/04/14 03:57:54 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [3]:
# Put in your bucket and folder path without the csv file
BUCKET = 'gs://ds5460-tlee-spring2024/notebooks/jupyter//data/usa/combined_datasets/' 

data_with_review = spark.read.csv(BUCKET + 'real-final-data.csv',inferSchema=True,header=True)
data_with_zero_review = spark.read.csv(BUCKET + 'target_column_with_zero_review.csv',inferSchema=True,header=True)

                                                                                

In [4]:
# Double check the data was read in properly
data_with_review.head()

Row(host_total_listings_count=3.0, accommodates=8, num_bath=3.0, bedrooms=4.0, beds=6.0, price=500.0, amenities_count=14, essential_amenities=3, host_is_superhost='t', city='Broward County', room_type='Entire home/apt', full_time_host='f', host_verifications_clean='ep', target='Great')

In [5]:
# Double check the data was read in properly
data_with_zero_review.head()

24/04/14 03:58:17 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Row(id=827736378366911479, host_id=475630606, host_since='2022-08-18', host_location='Unknown', host_is_superhost='f', host_listings_count=1.0, host_total_listings_count=3.0, host_has_profile_pic='t', host_identity_verified='t', neighborhood='Fort Lauderdale', latitude=26.09393643124416, longitude=-80.13759087771177, room_type='Entire home/apt', accommodates=2, num_bath=1.0, bedrooms=1.0, beds=1.0, price=222.0, number_of_reviews=0, review_scores_value=None, calculated_host_listings_count=1, city='Broward County', amenities_count=10, neighborhood_city='Fort Lauderdale Broward County', full_time_host='f', host_verifications_clean='p', essential_amenities=3)

In [6]:
# Drop columns that can't be used in the feature pipeline as not applicable to zero reviews
data_with_zero_review = data_with_zero_review.drop('review_scores_value')
data_with_zero_review = data_with_zero_review.drop('number_of_reviews')

In [7]:
# Double check the columns in data_with_review
data_with_review.columns

['host_total_listings_count',
 'accommodates',
 'num_bath',
 'bedrooms',
 'beds',
 'price',
 'amenities_count',
 'essential_amenities',
 'host_is_superhost',
 'city',
 'room_type',
 'full_time_host',
 'host_verifications_clean',
 'target']

### Create the Feature Pipeline

In [8]:
# List of string features to be indexed and the features that are already numeric
stringFeatures = ['host_is_superhost', 'city', 'full_time_host']

numericFeatures = ['host_total_listings_count', 'accommodates', 'price', 'amenities_count']

# Create StringIndexer stages for the stringFeatures - call the numeric version as _indexed
indexers = [StringIndexer(inputCol=column, outputCol=column+"_indexed", stringOrderType="alphabetAsc").setHandleInvalid("skip")
            for column in stringFeatures]

# Create a StringIndexer for the target column and naming it as target_label after converting it to numeric
labelIndexer = StringIndexer(inputCol='target', outputCol='target_label').setHandleInvalid("skip")

# Create VectorAssembler stage for the features
assemblerInputs = [column+"_indexed" for column in stringFeatures] + numericFeatures
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Create and run the pipeline with the target indexer added to pipeline
pipeline = Pipeline(stages=indexers + [assembler, labelIndexer])

# Fit the pipeline on the training data - which is all of data that has reviews
pipelineModel = pipeline.fit(data_with_review)

# Alternative code if we wanted to do the train/test split earlier
# trainingDataTransformed = pipelineModel.transform(trainingData)
# testDataTransformed = pipelineModel.transform(testData)

# Apply the pipeline on the data_with_review - trainingDataTransformed seen below will have to go through a train/test split
trainingDataTransformed = pipelineModel.transform(data_with_review)


# To apply the pipeline on the data_with_zero_review, which is missing the target, we need to do the following:
# Extracting the transformation stages from the fitted pipeline model
# Excldue the LabelIndexer from final pipeline stage as we are applying on the prediction dataset
transformationStages = pipelineModel.stages[:-1]

# Manually apply each transformation stage to the data_with_zero_review
transformedData = data_with_zero_review
for stage in transformationStages:
    transformedData = stage.transform(transformedData)

# We now have dataWithZeroReviewTransformed which doesn't have the target_label column to see what is our final model predictions
dataWithZeroReviewTransformed = transformedData

                                                                                

### Check if Feature Pipeline was done properly

In [9]:
trainingDataTransformed.printSchema()

root
 |-- host_total_listings_count: double (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- num_bath: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- price: double (nullable = true)
 |-- amenities_count: integer (nullable = true)
 |-- essential_amenities: integer (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- city: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- full_time_host: string (nullable = true)
 |-- host_verifications_clean: string (nullable = true)
 |-- target: string (nullable = true)
 |-- host_is_superhost_indexed: double (nullable = false)
 |-- city_indexed: double (nullable = false)
 |-- full_time_host_indexed: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- target_label: double (nullable = false)



In [10]:
dataWithZeroReviewTransformed.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: double (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- num_bath: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- price: double (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amenities_count: integer (nullable = true)
 |-- neighborhood_city: string (nullable = true)
 |-- full_tim

In [11]:
trainingDataTransformed.show(n=1, truncate=False)

+-------------------------+------------+--------+--------+----+-----+---------------+-------------------+-----------------+--------------+---------------+--------------+------------------------+------+-------------------------+------------+----------------------+--------------------------------+------------+
|host_total_listings_count|accommodates|num_bath|bedrooms|beds|price|amenities_count|essential_amenities|host_is_superhost|city          |room_type      |full_time_host|host_verifications_clean|target|host_is_superhost_indexed|city_indexed|full_time_host_indexed|features                        |target_label|
+-------------------------+------------+--------+--------+----+-----+---------------+-------------------+-----------------+--------------+---------------+--------------+------------------------+------+-------------------------+------------+----------------------+--------------------------------+------------+
|3.0                      |8           |3.0     |4.0     |6.0 |500.0|1

In [12]:
# Doing a quick check that the string indexer worked and converted the categorical values into an index
trainingDataTransformed.select('host_is_superhost', 'host_is_superhost_indexed').show(5)

trainingDataTransformed.select('city', 'city_indexed').show(5)

+-----------------+-------------------------+
|host_is_superhost|host_is_superhost_indexed|
+-----------------+-------------------------+
|                t|                      1.0|
|                f|                      0.0|
|                f|                      0.0|
|                f|                      0.0|
|                f|                      0.0|
+-----------------+-------------------------+
only showing top 5 rows

+--------------+------------+
|          city|city_indexed|
+--------------+------------+
|Broward County|         1.0|
|Broward County|         1.0|
|Broward County|         1.0|
|Broward County|         1.0|
|Broward County|         1.0|
+--------------+------------+
only showing top 5 rows



In [13]:
# Checking that the target_label got created successfully
trainingDataTransformed.select('target', 'target_label').show(20)

+-------+------------+
| target|target_label|
+-------+------------+
|  Great|         0.0|
|Average|         1.0|
|Average|         1.0|
|Average|         1.0|
|  Great|         0.0|
|Average|         1.0|
|  Great|         0.0|
|Average|         1.0|
|Average|         1.0|
|Average|         1.0|
|  Great|         0.0|
|Average|         1.0|
|  Great|         0.0|
|  Great|         0.0|
|Average|         1.0|
|Average|         1.0|
|   Poor|         2.0|
|  Great|         0.0|
|  Great|         0.0|
|Average|         1.0|
+-------+------------+
only showing top 20 rows



In [14]:
# Examine all of the features with a particular target_label
trainingDataTransformed.select("features", "target_label").show(truncate=False)

+-----------------------------------+------------+
|features                           |target_label|
+-----------------------------------+------------+
|[1.0,1.0,0.0,3.0,8.0,500.0,14.0]   |0.0         |
|[0.0,1.0,0.0,12.0,6.0,186.0,22.0]  |1.0         |
|[0.0,1.0,0.0,20.0,7.0,297.0,17.0]  |1.0         |
|[0.0,1.0,0.0,5.0,4.0,162.0,69.0]   |1.0         |
|[0.0,1.0,0.0,17.0,2.0,92.0,17.0]   |0.0         |
|[1.0,1.0,0.0,1930.0,6.0,258.0,38.0]|1.0         |
|[1.0,1.0,0.0,1.0,4.0,100.0,72.0]   |0.0         |
|[0.0,1.0,0.0,19.0,4.0,189.0,60.0]  |1.0         |
|[1.0,1.0,0.0,2.0,3.0,63.0,67.0]    |1.0         |
|[0.0,1.0,0.0,13.0,2.0,127.0,51.0]  |1.0         |
|[0.0,1.0,0.0,112.0,4.0,300.0,81.0] |0.0         |
|[0.0,1.0,0.0,14.0,6.0,218.0,16.0]  |1.0         |
|[0.0,1.0,0.0,2.0,4.0,155.0,52.0]   |0.0         |
|[1.0,1.0,0.0,32.0,9.0,1764.0,57.0] |0.0         |
|[1.0,1.0,0.0,10.0,2.0,91.0,52.0]   |1.0         |
|[1.0,1.0,0.0,2.0,3.0,60.0,66.0]    |1.0         |
|[0.0,1.0,0.0,292.0,6.0,313.0,2

In [15]:
# Examine if the dataWithZeroReviewTransformed features look like it was done correctly
dataWithZeroReviewTransformed.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[0.0,1.0,0.0,3.0,...|
|[0.0,1.0,0.0,112....|
|[0.0,1.0,0.0,6.0,...|
|[0.0,1.0,0.0,3.0,...|
|[1.0,1.0,0.0,465....|
+--------------------+
only showing top 5 rows



In [16]:
# See the count of each dataset to get an understanding on sizes:
num_rows = trainingDataTransformed.count()
num_columns = len(trainingDataTransformed.first())

print("For trainingDataTransformed:")
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

num_rows = dataWithZeroReviewTransformed.count()
num_columns = len(dataWithZeroReviewTransformed.first())

print("For dataWithZeroReviewTransformed:")
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

For trainingDataTransformed:
Number of Rows: 143995, Number of Columns: 19
For dataWithZeroReviewTransformed:
Number of Rows: 40989, Number of Columns: 29


## Random Forest Model

In [17]:
# Do the train test split - we need to discuss what are we settng for the threshold
train_data,test_data = trainingDataTransformed.randomSplit([0.7,0.3], seed=42)

In [18]:
rfc = RandomForestClassifier(labelCol='target_label',featuresCol='features', maxBins = 2200)

In [19]:
# Fit to the train_data
rfc_model = rfc.fit(train_data)

                                                                                

In [20]:
# Get the predictions for the test_data, which is the data with the ground truth known
rfc_predictions = rfc_model.transform(test_data)

In [21]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

In [22]:
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

                                                                                

In [23]:
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

A random forest ensemble had an accuracy of: 58.02%


In [24]:
# To calculate F1 Score
f1_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", 
                                                 predictionCol="prediction", 
                                                 metricName="f1")

f1_score = f1_evaluator.evaluate(rfc_predictions)  # Assuming 'predictions' is your dataframe with predictions

print('A random forest ensemble had an f1 score of: {0:2.2f}%'.format(f1_score*100))

# To calculate weighted Precision
precision_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", 
                                                        predictionCol="prediction", 
                                                        metricName="weightedPrecision")

precision_score = precision_evaluator.evaluate(rfc_predictions)

print('A random forest ensemble had an precision of: {0:2.2f}%'.format(precision_score*100))

# To calculate weighted Recall
recall_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", 
                                                     predictionCol="prediction", 
                                                     metricName="weightedRecall")

recall_score = recall_evaluator.evaluate(rfc_predictions)

print('A random forest ensemble had an recall of: {0:2.2f}%'.format(recall_score*100))

                                                                                

A random forest ensemble had an f1 score of: 54.91%


                                                                                

A random forest ensemble had an precision of: 57.28%




A random forest ensemble had an recall of: 58.02%


                                                                                

In [25]:
# Just seeing what is inside the rfc_predictions
rfc_predictions.select("features", "target_label", "rawPrediction", "probability", "prediction").show(truncate=False, n=2)

[Stage 54:>                                                         (0 + 1) / 1]

+--------------------------------+------------+--------------------------------------------------------+-----------------------------------------------------------+----------+
|features                        |target_label|rawPrediction                                           |probability                                                |prediction|
+--------------------------------+------------+--------------------------------------------------------+-----------------------------------------------------------+----------+
|[0.0,10.0,0.0,1.0,1.0,60.0,11.0]|0.0         |[10.609483424253412,7.221138055444236,2.169378520302353]|[0.5304741712126706,0.3610569027722118,0.10846892601511764]|0.0       |
|[0.0,7.0,0.0,1.0,1.0,59.0,27.0] |0.0         |[10.670082797606538,8.09793621543918,1.231980986954283] |[0.5335041398803269,0.404896810771959,0.06159904934771415] |0.0       |
+--------------------------------+------------+--------------------------------------------------------+----------------

                                                                                

In [26]:
# Applying the trained Random Forest model to the prediction dataset - the hold out zero reviews data
zeroReviewPredictions = rfc_model.transform(dataWithZeroReviewTransformed)

In [27]:
# Displaying predictions, probabilities, and features
zeroReviewPredictions.select("features", "rawPrediction", "probability", "prediction").show(truncate=False, n=5)

+----------------------------------+----------------------------------------------------------+------------------------------------------------------------+----------+
|features                          |rawPrediction                                             |probability                                                 |prediction|
+----------------------------------+----------------------------------------------------------+------------------------------------------------------------+----------+
|[0.0,1.0,0.0,3.0,2.0,222.0,10.0]  |[10.04496277092483,7.637371903358474,2.3176653257166953]  |[0.5022481385462415,0.3818685951679237,0.11588326628583476] |0.0       |
|[0.0,1.0,0.0,112.0,2.0,500.0,29.0]|[5.957241928496629,11.542271363139188,2.5004867083641833] |[0.29786209642483147,0.5771135681569595,0.12502433541820918]|1.0       |
|[0.0,1.0,0.0,6.0,16.0,868.0,68.0] |[6.683044944206464,11.138327373664245,2.1786276821292887] |[0.33415224721032327,0.5569163686832124,0.10893138410646445]|1.0 

In [30]:
# # Saving the baseline RandomForest model
# baseline_model_path = "gs://ds5460-tlee-spring2024/notebooks/jupyter//data/usa/combined_datasets/random_forest"
# rfc_model.save(baseline_model_path)

                                                                                

In [28]:
spark.stop()