#### Milestone 3 - Random Forest Test

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.sql.functions import col, isnan, count, when, isnull, size, split
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, FloatType, DateType
from pyspark.sql.functions import col, regexp_replace
from pyspark.ml.linalg import Vectors
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
spark = SparkSession.builder.appName('final_project').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/04 19:08:37 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/04/04 19:08:37 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/04/04 19:08:37 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/04/04 19:08:37 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [69]:
# Put in your bucket and folder path without the csv file
BUCKET = 'gs://ds5460-tlee-spring2024/notebooks/jupyter//data/usa/combined_datasets/' 

data_with_review = spark.read.csv(BUCKET + 'target_column_with_review.csv',inferSchema=True,header=True)
data_with_zero_review = spark.read.csv(BUCKET + 'target_column_with_zero_review.csv',inferSchema=True,header=True)

In [70]:
# Double check the data was read in properly
data_with_review.head()

Row(id=772438920837360569, host_id=382318476, host_since='2020-12-30', host_location='Unknown', host_is_superhost='t', host_listings_count=1.0, host_total_listings_count=3.0, host_has_profile_pic='t', host_identity_verified='t', neighborhood='Southwest Ranches', latitude=26.0338992, longitude=-80.3346054, room_type='Entire home/apt', accommodates=8, num_bath=3.0, bedrooms=4.0, beds=6.0, price=500.0, number_of_reviews=2, review_scores_value=5.0, calculated_host_listings_count=1, city='Broward County', amenities_count=14, neighborhood_city='Southwest Ranches Broward County', full_time_host='f', host_verifications_clean='ep', essential_amenities=3, target='Exceptional')

In [71]:
# Double check the data was read in properly
data_with_zero_review.head()

Row(id=827736378366911479, host_id=475630606, host_since='2022-08-18', host_location='Unknown', host_is_superhost='f', host_listings_count=1.0, host_total_listings_count=3.0, host_has_profile_pic='t', host_identity_verified='t', neighborhood='Fort Lauderdale', latitude=26.09393643124416, longitude=-80.13759087771177, room_type='Entire home/apt', accommodates=2, num_bath=1.0, bedrooms=1.0, beds=1.0, price=222.0, number_of_reviews=0, review_scores_value=None, calculated_host_listings_count=1, city='Broward County', amenities_count=10, neighborhood_city='Fort Lauderdale Broward County', full_time_host='f', host_verifications_clean='p', essential_amenities=3)

In [72]:
# Drop columns that can't be used in the feature pipeline as not applicable to zero reviews
data_with_review = data_with_review.drop('review_scores_value')
data_with_review = data_with_review.drop('number_of_reviews')
data_with_zero_review = data_with_zero_review.drop('review_scores_value')
data_with_zero_review = data_with_zero_review.drop('number_of_reviews')

In [73]:
# Double check the columns in data_with_review
data_with_review.columns

['id',
 'host_id',
 'host_since',
 'host_location',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'neighborhood',
 'latitude',
 'longitude',
 'room_type',
 'accommodates',
 'num_bath',
 'bedrooms',
 'beds',
 'price',
 'calculated_host_listings_count',
 'city',
 'amenities_count',
 'neighborhood_city',
 'full_time_host',
 'host_verifications_clean',
 'essential_amenities',
 'target']

### Create the Feature Pipeline

In [75]:
# List of string features to be indexed and the features that are already numeric
stringFeatures = ['host_location', 'host_is_superhost', 'host_has_profile_pic',
                  'host_identity_verified', 'city', 'room_type', 'full_time_host', 
                  'host_verifications_clean']

numericFeatures = ['host_listings_count', 'host_total_listings_count', 'accommodates', 
                   'num_bath','bedrooms', 'beds', 'price', 'calculated_host_listings_count',
                   'amenities_count', 'essential_amenities']

# Create StringIndexer stages for the stringFeatures - call the numeric version as _indexed
indexers = [StringIndexer(inputCol=column, outputCol=column+"_indexed", stringOrderType="alphabetAsc").setHandleInvalid("skip")
            for column in stringFeatures]

# Create a StringIndexer for the target column and naming it as target_label after converting it to numeric
labelIndexer = StringIndexer(inputCol='target', outputCol='target_label').setHandleInvalid("skip")

# Create VectorAssembler stage for the features
assemblerInputs = [column+"_indexed" for column in stringFeatures] + numericFeatures
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

# Create and run the pipeline with the target indexer added to pipeline
pipeline = Pipeline(stages=indexers + [assembler, labelIndexer])

# Fit the pipeline on the training data - which is all of data that has reviews
pipelineModel = pipeline.fit(data_with_review)

# Alternative code if we wanted to do the train/test split earlier as it depends if we want to do 80/20 or 70/30.
# trainingDataTransformed = pipelineModel.transform(trainingData)
# testDataTransformed = pipelineModel.transform(testData)

# Apply the pipeline on the data_with_review - trainingDataTransformed seen below will have to go through a train/test split
trainingDataTransformed = pipelineModel.transform(data_with_review)


# To apply the pipeline on the data_with_zero_review, which is missing the target, we need to do the following:
# Extracting the transformation stages from the fitted pipeline model
# Excldue the LabelIndexer from final pipeline stage as we are applying on the prediction dataset
transformationStages = pipelineModel.stages[:-1]

# Manually apply each transformation stage to the data_with_zero_review
transformedData = data_with_zero_review
for stage in transformationStages:
    transformedData = stage.transform(transformedData)

# We now have dataWithZeroReviewTransformed which doesn't have the target_label column to see what is our final model predictions
dataWithZeroReviewTransformed = transformedData

                                                                                

### Check if Feature Pipeline was done properly

In [76]:
trainingDataTransformed.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: double (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- num_bath: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- price: double (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amenities_count: integer (nullable = true)
 |-- neighborhood_city: string (nullable = true)
 |-- full_tim

In [77]:
dataWithZeroReviewTransformed.printSchema()

root
 |-- id: long (nullable = true)
 |-- host_id: integer (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_listings_count: double (nullable = true)
 |-- host_total_listings_count: double (nullable = true)
 |-- host_has_profile_pic: string (nullable = true)
 |-- host_identity_verified: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- room_type: string (nullable = true)
 |-- accommodates: integer (nullable = true)
 |-- num_bath: double (nullable = true)
 |-- bedrooms: double (nullable = true)
 |-- beds: double (nullable = true)
 |-- price: double (nullable = true)
 |-- calculated_host_listings_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- amenities_count: integer (nullable = true)
 |-- neighborhood_city: string (nullable = true)
 |-- full_tim

In [78]:
trainingDataTransformed.show(n=1, truncate=False)

+------------------+---------+----------+-------------+-----------------+-------------------+-------------------------+--------------------+----------------------+-----------------+----------+-----------+---------------+------------+--------+--------+----+-----+------------------------------+--------------+---------------+--------------------------------+--------------+------------------------+-------------------+-----------+---------------------+-------------------------+----------------------------+------------------------------+------------+-----------------+----------------------+--------------------------------+-------------------------------------------------------------------------------+------------+
|id                |host_id  |host_since|host_location|host_is_superhost|host_listings_count|host_total_listings_count|host_has_profile_pic|host_identity_verified|neighborhood     |latitude  |longitude  |room_type      |accommodates|num_bath|bedrooms|beds|price|calculated_host_list

In [33]:
# Doing a quick check that the string indexer worked and converted the categorical values into an index
trainingDataTransformed.select('host_location', 'host_location_indexed').show(5)

trainingDataTransformed.select('room_type', 'room_type_indexed').show(10)

+----------------+---------------------+
|   host_location|host_location_indexed|
+----------------+---------------------+
|San Francisco CA|               1723.0|
|     New York NY|               1350.0|
| Santa Monica CA|               1754.0|
|   Washington DC|               2066.0|
|   Washington DC|               2066.0|
+----------------+---------------------+
only showing top 5 rows

+---------------+-----------------+
|      room_type|room_type_indexed|
+---------------+-----------------+
|Entire home/apt|              0.0|
|Entire home/apt|              0.0|
|   Private room|              2.0|
|   Private room|              2.0|
|   Private room|              2.0|
+---------------+-----------------+
only showing top 5 rows



In [81]:
# Checking that the target_label got created successfully
trainingDataTransformed.select('target', 'target_label').show(5)

+-----------+------------+
|     target|target_label|
+-----------+------------+
|Exceptional|         2.0|
|   Mediocre|         1.0|
|   Mediocre|         1.0|
|   Mediocre|         1.0|
|       Good|         0.0|
+-----------+------------+
only showing top 5 rows



In [41]:
# Examine all of the features with a particular target_label
trainingDataTransformed.select("features", "target_label").show(truncate=False)

+---------------------------------------------------------------------------------------------+------------+
|features                                                                                     |target_label|
+---------------------------------------------------------------------------------------------+------------+
|[2000.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,3.0,8.0,3.0,4.0,6.0,500.0,2.0,5.0,1.0,14.0,3.0]      |2.0         |
|[644.0,0.0,1.0,0.0,1.0,0.0,0.0,8.0,1.0,12.0,6.0,2.0,2.0,4.0,186.0,129.0,4.68,3.0,22.0,4.0]   |1.0         |
|[262.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,20.0,7.0,2.0,2.0,5.0,297.0,27.0,4.44,6.0,17.0,3.0]    |1.0         |
|[869.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,4.0,1.0,1.0,2.0,162.0,162.0,4.64,5.0,69.0,5.0]    |1.0         |
|[1205.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,17.0,2.0,1.0,1.0,1.0,92.0,36.0,4.83,15.0,17.0,4.0]   |0.0         |
|[1747.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1930.0,6.0,2.0,2.0,4.0,258.0,35.0,4.71,23.0,38.0,5.0]|0.0         |
|[1205.0,1.0,1.0,1.

In [79]:
# Examine if the dataWithZeroReviewTransformed features look like it was done correctly
dataWithZeroReviewTransformed.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|[2000.0,0.0,1.0,1...|
|[34.0,0.0,1.0,1.0...|
|[198.0,0.0,1.0,1....|
|[1547.0,0.0,1.0,0...|
|[1104.0,1.0,1.0,1...|
+--------------------+
only showing top 5 rows



In [82]:
# See the count of each dataset to get an understanding on sizes:
num_rows = trainingDataTransformed.count()
num_columns = len(trainingDataTransformed.first())

print("For trainingDataTransformed:")
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

num_rows = dataWithZeroReviewTransformed.count()
num_columns = len(dataWithZeroReviewTransformed.first())

print("For dataWithZeroReviewTransformed:")
print(f"Number of Rows: {num_rows}, Number of Columns: {num_columns}")

                                                                                

For trainingDataTransformed:
Number of Rows: 143728, Number of Columns: 36
For dataWithZeroReviewTransformed:
Number of Rows: 40351, Number of Columns: 34


## Random Forest Model

In [83]:
# Do the train test split - we need to discuss what are we settng for the threshold
train_data,test_data = trainingDataTransformed.randomSplit([0.7,0.3], seed=42)

In [84]:
rfc = RandomForestClassifier(labelCol='target_label',featuresCol='features', maxBins = 2200)

In [85]:
# Fit to the train_data
rfc_model = rfc.fit(train_data)

                                                                                

In [86]:
# Get the predictions for the test_data, which is the data with the ground truth known
rfc_predictions = rfc_model.transform(test_data)

In [88]:
acc_evaluator = MulticlassClassificationEvaluator(labelCol="target_label", predictionCol="prediction", metricName="accuracy")

In [89]:
rfc_acc = acc_evaluator.evaluate(rfc_predictions)

                                                                                

In [90]:
print('A random forest ensemble had an accuracy of: {0:2.2f}%'.format(rfc_acc*100))

A random forest ensemble had an accuracy of: 50.93%


In [91]:
# Just seeing what is inside the rfc_predictions
rfc_predictions.select("features", "target_label", "rawPrediction", "probability", "prediction").show(truncate=False, n=2)

[Stage 296:>                                                        (0 + 1) / 1]

+-------------------------------------------------------------------------------+------------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|features                                                                       |target_label|rawPrediction                                                                |probability                                                                      |prediction|
+-------------------------------------------------------------------------------+------------+-----------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|[1104.0,1.0,1.0,1.0,7.0,2.0,0.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,118.0,2.0,66.0,5.0]|0.0         |[13.06955253499168,2.472512429280414,4.17763168874424,0.28030334698366727]   |[0.6534776267495839,0.12362562

                                                                                

In [93]:
# Applying the trained Random Forest model to the prediction dataset - the hold out zero reviews data
zeroReviewPredictions = rfc_model.transform(dataWithZeroReviewTransformed)

In [94]:
# Displaying predictions, probabilities, and features
zeroReviewPredictions.select("features", "rawPrediction", "probability", "prediction").show(truncate=False, n=5)

+----------------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|features                                                                          |rawPrediction                                                             |probability                                                                      |prediction|
+----------------------------------------------------------------------------------+--------------------------------------------------------------------------+---------------------------------------------------------------------------------+----------+
|[2000.0,0.0,1.0,1.0,1.0,0.0,0.0,7.0,1.0,3.0,2.0,1.0,1.0,1.0,222.0,1.0,10.0,3.0]   |[4.807130387152922,5.019180848291375,7.831948446366062,2.34174031818964]  |[0.2403565193576461,0.2509590424145688,0.3915974223183031,0.117087015909482]     |

In [95]:
spark.stop()