# New York Rent Analysis Model

### Initialize Spark Session

In [1]:
# libraries
import os
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator


In [2]:
# spark session initialization
spark = SparkSession.builder\
    .master("local[5]")\
    .appName("main")\
    .config("spark.sql.debug.maxToStringFields", 100)\
    .config("spark.driver.memory", '4g')\
    .config("spark.executor.instances", 5)\
    .getOrCreate()

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/08 22:18:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/08 22:18:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/08 22:18:59 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Dataset

The model here deals with daily rental price and require latitute and longitude to predict the price

In [3]:
# import dataset
raw_rent_df = spark.read.csv('new_york.csv', header=True, inferSchema=True)
raw_ss_df = spark.read.csv('results/ss_df.csv', header=True, inferSchema=True)

In [4]:
# rent schema
raw_rent_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_reviews: string (nullable = true)
 |-- last_review: string (nullable = true)
 |-- reviews_per_month: string (nullable = true)
 |-- calculated_host_listings_count: string (nullable = true)
 |-- availability_365: integer (nullable = true)



### Data Preprocessing

In [5]:
raw_ss_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- line: string (nullable = true)
 |-- stop_name: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- n_label: string (nullable = true)
 |-- s_label: string (nullable = true)
 |-- entries: double (nullable = true)
 |-- exits: double (nullable = true)
 |-- arrests: integer (nullable = true)
 |-- complaints: integer (nullable = true)
 |-- summons: integer (nullable = true)



In [6]:
ss_df = raw_ss_df.withColumn('latitude', F.col('lat'))\
    .withColumn('longitude', F.col('long'))\
    .drop('lat')\
    .drop('long')
ss_df.toPandas().head()

Unnamed: 0,id,line,stop_name,borough,n_label,s_label,entries,exits,arrests,complaints,summons,latitude,longitude
0,F12,Queens Blvd,5 Av/53 St,M,Queens,Downtown & Brooklyn,171540100000.0,183989500000.0,1811,3603,1056,40.760167,-73.975224
1,637,Lexington Av,Bleecker St,M,Uptown & The Bronx,Downtown,196565200000.0,153466700000.0,3444,5079,1046,40.725915,-73.994659
2,603,Pelham,Middletown Rd,Bx,Pelham Bay Park,Manhattan,16027570000.0,9955782000.0,442,1094,842,40.843863,-73.836322
3,725,Flushing,Times Sq-42 St,M,Queens,34 St - Hudson Yards,494357900000.0,184005900000.0,11199,12406,14342,40.755477,-73.987691
4,606,Pelham,Zerega Av,Bx,Pelham Bay Park,Manhattan,9471628000.0,5313407000.0,649,1661,996,40.836488,-73.847036


In [7]:
# select required columns
rent_df = raw_rent_df.select(F.col('latitude'), F.col('longitude'), F.col('price'))\
    .filter(F.col('latitude').isNotNull()\
        & F.col('longitude').isNotNull()\
        & F.col('price').isNotNull())\
    .withColumn("longitude", F.col("longitude").cast("double"))\
    .withColumn("latitude", F.col("latitude").cast("double"))\
    .withColumn("price", F.col("price").cast("double"))\
    .na.drop()

# print selected schema
rent_df.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- price: double (nullable = true)



In [8]:
# top 5 rows of dataset
rent_df.show(5)

+--------+---------+-----+
|latitude|longitude|price|
+--------+---------+-----+
|40.64749|-73.97237|149.0|
|40.75362|-73.98377|225.0|
|40.80902| -73.9419|150.0|
|40.68514|-73.95976| 89.0|
|40.79851|-73.94399| 80.0|
+--------+---------+-----+
only showing top 5 rows



### Analysis Model
Here we will use PySpark native library to develop a model called Vector Assembler

In [9]:
# required features
features = ['latitude', 'longitude']
target = 'features'

In [10]:
# vectorize data
rent_assembler = VectorAssembler(inputCols=features, outputCol=target)

In [11]:
# regression model
rent_regress = RandomForestRegressor(featuresCol = target, labelCol = 'price')

In [12]:
# create pipeline
rent_pipeline  = Pipeline(stages = [rent_assembler,rent_regress])


In [13]:
# save pipeline
rent_pipeline.write().overwrite().save('rent_pipeline')

In [14]:
# get saved pipeline
rent_pipeline_model = Pipeline.load('./rent_pipeline')

In [15]:
# hyperparameter space
rent_hyper_space = ParamGridBuilder()\
    .addGrid(rent_regress.numTrees,[100,500])\
    .build()

In [16]:
# define cross validator
rent_cross_val = CrossValidator(estimator = rent_pipeline_model,\
                                estimatorParamMaps = rent_hyper_space,\
                                evaluator = RegressionEvaluator(labelCol='price'),\
                                numFolds = 3)

In [17]:
# divide train and test data
rent_train_df , rent_test_df = rent_df.randomSplit([0.8,0.2], seed = 1432)

In [18]:
# train data using cross valuation
rent_model = rent_cross_val.fit(rent_train_df)

23/05/08 22:19:10 WARN DAGScheduler: Broadcasting large task binary with size 1143.9 KiB
23/05/08 22:19:11 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/05/08 22:19:11 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
23/05/08 22:19:18 WARN DAGScheduler: Broadcasting large task binary with size 1141.7 KiB
23/05/08 22:19:19 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
23/05/08 22:19:26 WARN DAGScheduler: Broadcasting large task binary with size 1146.4 KiB
23/05/08 22:19:27 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
                                                                                

In [19]:
# get best fitted model
rent_best_model = rent_model.bestModel
rent_best_model.stages

[VectorAssembler_2941afb1fce4,
 RandomForestRegressionModel: uid=RandomForestRegressor_ded5f2a83f1a, numTrees=100, numFeatures=2]

In [20]:
# transform test data
rent_pred = rent_model.transform(rent_test_df)


In [21]:
# evaluate model performance
rent_eval = RegressionEvaluator(labelCol='price')

In [22]:
# rmse error calculation
rent_model_rmse = rent_eval.evaluate(rent_pred, {rent_eval.metricName:'rmse'})
print("RMSE: %.3f" % rent_model_rmse)

RMSE: 233.906


In [23]:
# mae score
rent_model_mae = rent_eval.evaluate(rent_pred, {rent_eval.metricName:"mae"})
print("MAE: %.3f" %rent_model_mae)

MAE: 79.350


In [24]:
# r2 score
rent_model_r2 = rent_eval.evaluate(rent_pred, {rent_eval.metricName:'r2'})
print("R2: %.3f" %rent_model_r2)

R2: 0.063


### Predict price

In [25]:
# predict prices
final_rent_eval = rent_model.transform(ss_df)\
    .withColumn('pred_price', F.col('prediction'))\
    .drop('prediction')\
    .drop('features')

In [26]:
# save prediction prices
final_rent_eval.write.option('header', True).mode('overwrite').csv(os.path.join('results', 'prediction.csv'))
final_rent_eval.show()

+---+-------------------+--------------------+-------+------------------+--------------------+-----------------+-----------------+-------+----------+-------+---------+----------+------------------+
| id|               line|           stop_name|borough|           n_label|             s_label|          entries|            exits|arrests|complaints|summons| latitude| longitude|        pred_price|
+---+-------------------+--------------------+-------+------------------+--------------------+-----------------+-----------------+-------+----------+-------+---------+----------+------------------+
|F12|        Queens Blvd|          5 Av/53 St|      M|            Queens| Downtown & Brooklyn| 1.71540057484E11| 1.83989511603E11|   1811|      3603|   1056|40.760167|-73.975224| 229.7236260322771|
|637|       Lexington Av|         Bleecker St|      M|Uptown & The Bronx|            Downtown| 1.96565180833E11| 1.53466659135E11|   3444|      5079|   1046|40.725915|-73.994659|215.20943593671655|
|603|     