In [1]:
# useful libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# pyspark libs
from pyspark import SparkContext

from pyspark.sql.types import *
import pyspark.sql.functions as F

from pyspark.mllib.feature import StandardScaler, Normalizer, HashingTF, IDF
from pyspark.mllib.linalg import Vectors, SparseVector
from pyspark.mllib.stat import Statistics

from pyspark.ml.feature import StringIndexer, VectorIndexer, Imputer

In [2]:
from pyspark.ml.linalg import DenseMatrix, Vectors
from pyspark.ml.stat import Correlation

In [3]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql import SQLContext, SparkSession, Row, Window

In [4]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, Tokenizer, HashingTF, IDF
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql.functions import when

#### Load the pre-processed dataframe

In [5]:
ss = SparkSession.builder.appName("ML").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/19 19:21:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Read the CSV file with multiline values (description and facilities)
data = ss.read.options(header = True, inferSchema = True, multiline = True, escape = "\"").csv("dataframe_totalCleaned.csv")
# Show the corrected description and facilities fields
data.count()
len(data.columns)

                                                                                

27

In [7]:
from pyspark.sql.functions import col
data = data.withColumn("geo_plz", col("geo_plz").cast("string"))

In [8]:
print(data.dtypes)

[('regio1', 'string'), ('heatingType', 'string'), ('balcony', 'boolean'), ('firingTypes', 'string'), ('hasKitchen', 'boolean'), ('cellar', 'boolean'), ('condition', 'string'), ('interiorQual', 'string'), ('petsAllowed', 'string'), ('streetPlain', 'string'), ('lift', 'boolean'), ('typeOfFlat', 'string'), ('geo_plz', 'string'), ('garden', 'boolean'), ('regio2', 'string'), ('regio3', 'string'), ('description', 'string'), ('facilities', 'string'), ('log_pictureCount', 'double'), ('log_yearConstructed', 'double'), ('log_noRooms', 'double'), ('log_floor', 'double'), ('log_numberOfFloors', 'double'), ('log_serviceCharge', 'double'), ('log_baseRent', 'double'), ('log_livingSpace', 'double'), ('log_thermalChar', 'double')]


### Feature Importance

In [9]:


# List of categorical columns with string values
categorical_cols = ['regio2', 'geo_plz', 'regio3', 'regio1', 'heatingType', 'firingTypes', 'condition', 'interiorQual', 'petsAllowed', 'typeOfFlat']
# Fill null values in 'description' column with an empty string
data = data.withColumn('description', when(data['description'].isNull(), '').otherwise(data['description']))
data = data.withColumn('facilities', when(data['facilities'].isNull(), '').otherwise(data['facilities']))
data = data.withColumn('streetPlain', when(data['streetPlain'].isNull(), '').otherwise(data['streetPlain']))

# Convert categorical columns to numerical indices
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index").fit(data) for col in categorical_cols]
indexed_data = data
for indexer in indexers:
    indexed_data = indexer.transform(indexed_data)

# List of text columns
text_columns = ['description', 'facilities', 'streetPlain']

# Tokenize text columns
tokenizer_desc = Tokenizer(inputCol="description", outputCol="words_desc")
tokenizer_fac = Tokenizer(inputCol="facilities", outputCol="words_fac")
tokenizer_street = Tokenizer(inputCol="streetPlain", outputCol="words_street")

data_with_text = tokenizer_desc.transform(indexed_data)
data_with_text = tokenizer_fac.transform(data_with_text)
data_with_text = tokenizer_street.transform(data_with_text)

# Apply TF-IDF on text columns
hashingTF_desc = HashingTF(inputCol="words_desc", outputCol="rawFeatures_desc", numFeatures=100)
hashingTF_fac = HashingTF(inputCol="words_fac", outputCol="rawFeatures_fac", numFeatures=100)
hashingTF_street = HashingTF(inputCol="words_street", outputCol="rawFeatures_street", numFeatures=100)

featurized_data = hashingTF_desc.transform(data_with_text)
featurized_data = hashingTF_fac.transform(featurized_data)
featurized_data = hashingTF_street.transform(featurized_data)

idf_desc = IDF(inputCol="rawFeatures_desc", outputCol="features_desc")
idf_fac = IDF(inputCol="rawFeatures_fac", outputCol="features_fac")
idf_street = IDF(inputCol="rawFeatures_street", outputCol="features_street")

idfModel_desc = idf_desc.fit(featurized_data)
idfModel_fac = idf_fac.fit(featurized_data)
idfModel_street = idf_street.fit(featurized_data)

rescaled_data = idfModel_desc.transform(featurized_data)
rescaled_data = idfModel_fac.transform(rescaled_data)
rescaled_data = idfModel_street.transform(rescaled_data)

# List of columns except the target column (log_baseRent)
columns =  ['regio2_index','geo_plz_index','regio3_index','regio1_index','heatingType_index', 'balcony', 'firingTypes_index', 'hasKitchen', 'cellar', 'condition_index', 
           'interiorQual_index', 'petsAllowed_index', 'lift', 'typeOfFlat_index', 'garden',
           'log_pictureCount', 'log_yearConstructed', 'log_noRooms', 'log_floor',
           'log_numberOfFloors', 'log_serviceCharge', 'log_livingSpace', 'log_thermalChar']

# List of new text columns with numerical features
text_features = ['features_desc', 'features_fac', 'features_street']

# Extend the columns list
columns.extend(text_features)

# Assemble all features into a single vector column
assembler = VectorAssembler(inputCols=columns, outputCol="final_features")
assembled_df = assembler.transform(rescaled_data)

# Define the DecisionTreeRegressor model
dt = DecisionTreeRegressor(featuresCol="final_features", labelCol="log_baseRent", maxBins=10000)

# Fit the DecisionTreeRegressor model
dt_model = dt.fit(assembled_df)

# Get feature importances
feature_importances = dt_model.featureImportances

# Show feature importances
for i, col in enumerate(columns):
    print(f"{col}: {feature_importances[i]}")

23/12/19 19:27:02 WARN MemoryStore: Not enough space to cache rdd_120_0 in memory! (computed 245.3 MiB so far)
23/12/19 19:27:02 WARN BlockManager: Persisting block rdd_120_0 to disk instead.
23/12/19 19:27:06 WARN MemoryStore: Not enough space to cache rdd_120_0 in memory! (computed 249.5 MiB so far)
23/12/19 19:27:07 WARN DAGScheduler: Broadcasting large task binary with size 1049.7 KiB
23/12/19 19:27:09 WARN MemoryStore: Not enough space to cache rdd_120_0 in memory! (computed 249.5 MiB so far)
23/12/19 19:27:11 WARN DAGScheduler: Broadcasting large task binary with size 1050.3 KiB
23/12/19 19:27:13 WARN MemoryStore: Not enough space to cache rdd_120_0 in memory! (computed 249.5 MiB so far)
23/12/19 19:27:16 WARN DAGScheduler: Broadcasting large task binary with size 1220.1 KiB
23/12/19 19:27:18 WARN MemoryStore: Not enough space to cache rdd_120_0 in memory! (computed 249.5 MiB so far)
23/12/19 19:27:23 WARN DAGScheduler: Broadcasting large task binary with size 1233.6 KiB
23/12/19

regio2_index: 0.0
geo_plz_index: 0.6398650396266593
regio3_index: 0.02155902264133965
regio1_index: 0.0
heatingType_index: 0.0
balcony: 0.0
firingTypes_index: 0.0
hasKitchen: 0.0
cellar: 0.0
condition_index: 0.0
interiorQual_index: 0.0
petsAllowed_index: 0.0
lift: 0.0
typeOfFlat_index: 0.0
garden: 0.0
log_pictureCount: 0.0
log_yearConstructed: 0.0032075929095949566
log_noRooms: 0.0
log_floor: 0.0
log_numberOfFloors: 0.0
log_serviceCharge: 0.0
log_livingSpace: 0.3103269497855507
log_thermalChar: 0.025041395036855443
features_desc: 0.0
features_fac: 0.0
features_street: 0.0


Select only important variables

In [10]:
data_modelling=data.select('geo_plz','regio3','log_yearConstructed','log_livingSpace','log_thermalChar','log_baseRent')

In [11]:
print(data_modelling.dtypes)

[('geo_plz', 'string'), ('regio3', 'string'), ('log_yearConstructed', 'double'), ('log_livingSpace', 'double'), ('log_thermalChar', 'double'), ('log_baseRent', 'double')]


Create assembled dataset and split train and test

In [12]:
# Convert categorical columns to numerical indices if needed
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index").fit(data_modelling) for col in ['geo_plz', 'regio3']]
indexed_data = data_modelling
for indexer in indexers:
    indexed_data = indexer.transform(indexed_data)

    #DROP geo_plz and regio3 because we use _index version
indexed_data = indexed_data.drop('geo_plz','regio3')
    
indexed_data.select('*').show()



                                                                                

+-------------------+--------------------+--------------------+--------------------+-------------+------------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|geo_plz_index|regio3_index|
+-------------------+--------------------+--------------------+--------------------+-------------+------------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824|        159.0|       176.0|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|          0.0|         3.0|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701|       1660.0|      2143.0|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|         35.0|        35.0|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|        207.0|      1175.0|
|  0.608967486981566| 0.03440142671733232| 0.08406522211322813| 0.03246719013750141|        103.0|      

In [13]:
#Only once

#We try to calculate correlation by using string indexing values. After, we try models with and without one of this features to see differences
indexed_data.stat.corr('geo_plz_index','regio3_index') #0.7802

                                                                                

0.780274345102174

In [14]:
#highly correlated
indexed_data=indexed_data.drop('regio3_index')
indexed_data.select('*').show()

+-------------------+--------------------+--------------------+--------------------+-------------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|geo_plz_index|
+-------------------+--------------------+--------------------+--------------------+-------------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824|        159.0|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|          0.0|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701|       1660.0|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|         35.0|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|        207.0|
|  0.608967486981566| 0.03440142671733232| 0.08406522211322813| 0.03246719013750141|        103.0|
| 0.6159292256903649|0.042992437404078994|0.040472949799644324| 0.04802796901058147|         62.0|
|  0.64474

We also tried to rerun feature importance by excluding regio1/2/3 columns, in order to see if some other variables have importance, but nothing is figured out: just this features have importance.

In [15]:
#Scale also geo_plz_index -> geo_plz 
#Only once
from pyspark.ml.feature import StandardScaler,StandardScalerModel

col_to_scale=['geo_plz_index']

assembler = VectorAssembler().setInputCols(col_to_scale).setOutputCol("features")
indexed_data = assembler.transform(indexed_data)

In [16]:
indexed_data.select('*').show()

+-------------------+--------------------+--------------------+--------------------+-------------+--------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|geo_plz_index|features|
+-------------------+--------------------+--------------------+--------------------+-------------+--------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824|        159.0| [159.0]|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|          0.0|   [0.0]|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701|       1660.0|[1660.0]|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|         35.0|  [35.0]|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|        207.0| [207.0]|
|  0.608967486981566| 0.03440142671733232| 0.08406522211322813| 0.03246719013750141|        103.0| [103.0]|
| 0.6159292256903649|0.04299

In [17]:
from pyspark.ml.feature import StandardScaler,StandardScalerModel,MinMaxScaler
from pyspark.ml.linalg import Vectors
scaler = MinMaxScaler().setInputCol("features").setOutputCol("geo_plz_scaled")


In [18]:
scaler_model = scaler.fit(indexed_data.select("features"))
indexed_data = scaler_model.transform(indexed_data)
indexed_data.show()

                                                                                

+-------------------+--------------------+--------------------+--------------------+-------------+--------+--------------------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|geo_plz_index|features|      geo_plz_scaled|
+-------------------+--------------------+--------------------+--------------------+-------------+--------+--------------------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824|        159.0| [159.0]|[0.02300014465499...|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|          0.0|   [0.0]|               [0.0]|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701|       1660.0|[1660.0]|[0.24012729639809...|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|         35.0|  [35.0]|[0.00506292492405...|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|        207.0

In [19]:
#drop the not needed columns (geo_plz_index and features)
indexed_data=indexed_data.drop('geo_plz_index','features')
indexed_data.select('*').show()

+-------------------+--------------------+--------------------+--------------------+--------------------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|      geo_plz_scaled|
+-------------------+--------------------+--------------------+--------------------+--------------------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824|[0.02300014465499...|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|               [0.0]|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701|[0.24012729639809...|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|[0.00506292492405...|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|[0.02994358455084...|
|  0.608967486981566| 0.03440142671733232| 0.08406522211322813| 0.03246719013750141|[0.01489946477650...|
| 0.6159292256903649|0.042992437404078994|0.04

In [20]:
print(indexed_data.dtypes)

[('log_yearConstructed', 'double'), ('log_livingSpace', 'double'), ('log_thermalChar', 'double'), ('log_baseRent', 'double'), ('geo_plz_scaled', 'vector')]


In [21]:
###Change type from vector to double
from pyspark.sql.functions import udf
change_type = udf(lambda x: float(list(x)[0]), DoubleType()) #take the only element making
#the list

indexed_data = indexed_data.withColumn("geo_plz_scaled", change_type("geo_plz_scaled")) 
#output / input

indexed_data.show()

[Stage 69:>                                                         (0 + 1) / 1]

+-------------------+--------------------+--------------------+--------------------+--------------------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|      geo_plz_scaled|
+-------------------+--------------------+--------------------+--------------------+--------------------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824| 0.02300014465499783|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|                 0.0|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701| 0.24012729639809058|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|0.005062924924056127|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|0.029943584550846232|
|  0.608967486981566| 0.03440142671733232| 0.08406522211322813| 0.03246719013750141| 0.01489946477650803|
| 0.6159292256903649|0.042992437404078994|0.04

                                                                                

In [22]:
indexed_data.show()

[Stage 70:>                                                         (0 + 1) / 1]

+-------------------+--------------------+--------------------+--------------------+--------------------+
|log_yearConstructed|     log_livingSpace|     log_thermalChar|        log_baseRent|      geo_plz_scaled|
+-------------------+--------------------+--------------------+--------------------+--------------------+
| 0.6282416195944327|0.058268908123975824| 0.09956940864919347|0.058268908123975824| 0.02300014465499783|
| 0.6169198151720611| 0.03941300235683506| 0.06013502281789924| 0.04286790022717587|                 0.0|
| 0.6461901974721743| 0.03597506712258816| 0.03901896805752461| 0.04305948946044701| 0.24012729639809058|
| 0.6004482673286274| 0.04879016416943205|  0.1220493658481239| 0.03922071315328133|0.005062924924056127|
| 0.6399205844585789|0.055112990052911745|0.041939328615527835| 0.05832551029562754|0.029943584550846232|
|  0.608967486981566| 0.03440142671733232| 0.08406522211322813| 0.03246719013750141| 0.01489946477650803|
| 0.6159292256903649|0.042992437404078994|0.04

                                                                                

In [23]:
print(indexed_data.dtypes)

[('log_yearConstructed', 'double'), ('log_livingSpace', 'double'), ('log_thermalChar', 'double'), ('log_baseRent', 'double'), ('geo_plz_scaled', 'double')]


In [24]:
# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=['geo_plz_scaled', 'log_yearConstructed', 'log_livingSpace', 'log_thermalChar'],
                            outputCol="features")
assembled_data = assembler.transform(indexed_data)

In [25]:
print(assembled_data.dtypes)

[('log_yearConstructed', 'double'), ('log_livingSpace', 'double'), ('log_thermalChar', 'double'), ('log_baseRent', 'double'), ('geo_plz_scaled', 'double'), ('features', 'vector')]


In [26]:
assembled_data.show(3)

[Stage 71:>                                                         (0 + 1) / 1]

+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+
|log_yearConstructed|     log_livingSpace|    log_thermalChar|        log_baseRent|     geo_plz_scaled|            features|
+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+
| 0.6282416195944327|0.058268908123975824|0.09956940864919347|0.058268908123975824|0.02300014465499783|[0.02300014465499...|
| 0.6169198151720611| 0.03941300235683506|0.06013502281789924| 0.04286790022717587|                0.0|[0.0,0.6169198151...|
| 0.6461901974721743| 0.03597506712258816|0.03901896805752461| 0.04305948946044701|0.24012729639809058|[0.24012729639809...|
+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+
only showing top 3 rows



                                                                                

In [27]:
train,test=assembled_data.randomSplit([0.8, 0.2], seed=1234)

In [28]:
max_value=assembled_data.agg({"geo_plz_scaled":"max"}).collect()[0][0]
print("Max value:", max_value)

[Stage 72:>                                                         (0 + 1) / 1]

Max value: 1.0


                                                                                

In [29]:
min_value=assembled_data.agg({"geo_plz_scaled":"min"}).collect()[0][0]
print("Min value:", min_value)

[Stage 75:>                                                         (0 + 1) / 1]

Min value: 0.0


                                                                                

### Linear Regression - target log_baseRent

In [29]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit

In [31]:


# Define the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="log_baseRent", regParam=0.1)

# Train the model
lr_model = lr.fit(train)

# Make predictions on the test set
predictions = lr_model.transform(test)
# List of metrics to evaluate
metrics = ["rmse", "mse", "r2", "mae"]


# Loop through each metric and evaluate the predictions
for metric_name in metrics:
    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol="log_baseRent", predictionCol="prediction", metricName=metric_name)
    msle = evaluator.evaluate(predictions)
    metric_value = evaluator.evaluate(predictions)
    print(f"{metric_name.capitalize()}: {metric_value}")

                                                                                

Rmse: 0.04973005059844042


                                                                                

Mse: 0.002473077932523444


                                                                                

R2: 0.3162508215042621


[Stage 40:>                                                         (0 + 1) / 1]

Mae: 0.03823325040709592


                                                                                

###hyper parameter tuning for Regressor models

In [30]:
train_sample=train.sample(fraction=0.3, seed=42, withReplacement=False)

In [31]:
# Split the data into training and validation sets
train_tuning, validation_tuning = train_sample.randomSplit([0.8, 0.2], seed=1234)


In [30]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import DecisionTreeRegressor

In [33]:
def RegTuning (param,model):

  

    # Define an evaluator
    evaluator = RegressionEvaluator(labelCol="log_baseRent", predictionCol="prediction", metricName="r2")

    # Create CrossValidator
    cv = CrossValidator(estimator=model,
                        estimatorParamMaps=param,
                        evaluator=evaluator,
                        numFolds=3,
                        seed=42,
                        parallelism=20)
    # Fit the model
    cv_model = cv.fit(train_tuning)

    # Make predictions on the test set
    #predictions = cv_model.transform(validation_tuning)

    # Best model from hyperparameter tuning
    best_model = cv_model.bestModel

    # Evaluate the model using R2
    r2 = evaluator.evaluate(best_model.transform(validation_tuning))
    print(f"R2 Score: {r2}")



    return best_model


Decision Tree Regressor Tuning

In [36]:
# Create a DecisionTreeRegressor model
dt = DecisionTreeRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000)

In [37]:
  # Create a parameter grid
param_grid = (ParamGridBuilder()
                .addGrid(dt.maxDepth, [5, 10, 15])
                .build())

In [38]:
best_model=RegTuning(param_grid,dt)

23/12/09 01:09:41 WARN DAGScheduler: Broadcasting large task binary with size 1020.4 KiB
23/12/09 01:09:54 WARN DAGScheduler: Broadcasting large task binary with size 1240.8 KiB
23/12/09 01:10:07 WARN DAGScheduler: Broadcasting large task binary with size 1460.8 KiB
23/12/09 01:10:24 WARN DAGScheduler: Broadcasting large task binary with size 1675.0 KiB
23/12/09 01:10:31 WARN DAGScheduler: Broadcasting large task binary with size 1823.3 KiB
23/12/09 01:10:37 WARN DAGScheduler: Broadcasting large task binary with size 1923.2 KiB
23/12/09 01:10:40 WARN DAGScheduler: Broadcasting large task binary with size 1266.3 KiB
23/12/09 01:11:46 WARN DAGScheduler: Broadcasting large task binary with size 1049.9 KiB
23/12/09 01:11:57 WARN DAGScheduler: Broadcasting large task binary with size 1269.1 KiB
23/12/09 01:12:16 WARN DAGScheduler: Broadcasting large task binary with size 1488.8 KiB
23/12/09 01:12:27 WARN DAGScheduler: Broadcasting large task binary with size 1705.0 KiB
23/12/09 01:12:39 WAR

R2 Score: 0.668878812022607


                                                                                

In [39]:
best_max_depth = best_model.getOrDefault("maxDepth")
print(f"Best maxDepth: {best_max_depth}")

Best maxDepth: 10


In [40]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth)

In [41]:
param_grid = (ParamGridBuilder()
                .addGrid(dt.minInstancesPerNode, [1, 5, 10])
                .build())

In [42]:
best_model=RegTuning(param_grid,dt)

[Stage 588:>                                                        (0 + 1) / 1]

R2 Score: 0.6839531164651129


                                                                                

In [43]:
best_minInstancesPerNode=best_model.getOrDefault("minInstancesPerNode")
print(f"Best minInstancesPerNode: {best_minInstancesPerNode}")


Best minInstancesPerNode: 10


In [44]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth,minInstancesPerNode=best_minInstancesPerNode)

In [46]:
param_grid = (ParamGridBuilder()
                .addGrid(dt.minInfoGain, [0.0, 0.1, 0.2]) 
                .build())

In [45]:
best_model=RegTuning(param_grid,dt)

[Stage 856:>                                                        (0 + 1) / 1]

R2 Score: 0.6839531164651129


                                                                                

In [47]:
best_minInfoGain=best_model.getOrDefault("minInfoGain")
print(f"Best best_minInfoGain: {best_minInfoGain}")

Best best_minInfoGain: 0.0


In [48]:
dt = DecisionTreeRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth,minInstancesPerNode=best_minInstancesPerNode,minInfoGain=best_minInfoGain)

In [49]:
print(f"Best maxDepth: {best_max_depth}")
print(f"Best minInstancesPerNode: {best_minInstancesPerNode}")
print(f"Best best_minInfoGain: {best_minInfoGain}")





Best maxDepth: 10
Best minInstancesPerNode: 10
Best best_minInfoGain: 0.0


Gradient Boosting Tuning

In [50]:
from pyspark.ml.regression import GBTRegressor

In [51]:
gbt = GBTRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000)

In [52]:
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxDepth, [5, 10, 15])
              .build())

In [53]:
best_model=RegTuning(param_grid,gbt)

23/12/09 01:22:53 WARN DAGScheduler: Broadcasting large task binary with size 1020.9 KiB
23/12/09 01:23:07 WARN DAGScheduler: Broadcasting large task binary with size 1241.3 KiB
23/12/09 01:23:25 WARN DAGScheduler: Broadcasting large task binary with size 1461.3 KiB
23/12/09 01:23:44 WARN DAGScheduler: Broadcasting large task binary with size 1675.4 KiB
23/12/09 01:23:44 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
23/12/09 01:23:45 WARN DAGScheduler: Broadcasting large task binary with size 1021.5 KiB
23/12/09 01:23:48 WARN DAGScheduler: Broadcasting large task binary with size 1040.5 KiB
23/12/09 01:23:50 WARN DAGScheduler: Broadcasting large task binary with size 1065.4 KiB
23/12/09 01:23:52 WARN DAGScheduler: Broadcasting large task binary with size 1054.7 KiB
23/12/09 01:23:52 WARN DAGScheduler: Broadcasting large task binary with size 1055.1 KiB
23/12/09 01:23:52 WARN DAGScheduler: Broadcasting large task binary with size 1055.7 KiB
23/12/09 01:23:52 WAR

R2 Score: 0.7061501582215965


                                                                                

In [54]:
best_max_depth = best_model.getOrDefault("maxDepth")
print(f"Best maxDepth: {best_max_depth}")

Best maxDepth: 10


In [55]:
gbt = GBTRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth)

In [56]:
param_grid = (ParamGridBuilder()
              .addGrid(gbt.maxIter, [10, 20, 30])
              .build())

In [57]:
best_model=RegTuning(param_grid,gbt)

23/12/09 01:57:31 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
23/12/09 01:57:31 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
23/12/09 01:57:32 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
23/12/09 01:57:32 WARN DAGScheduler: Broadcasting large task binary with size 1021.5 KiB
23/12/09 01:57:32 WARN DAGScheduler: Broadcasting large task binary with size 1021.5 KiB
23/12/09 01:57:33 WARN DAGScheduler: Broadcasting large task binary with size 1021.5 KiB
23/12/09 01:57:35 WARN DAGScheduler: Broadcasting large task binary with size 1040.5 KiB
23/12/09 01:57:35 WARN DAGScheduler: Broadcasting large task binary with size 1040.5 KiB
23/12/09 01:57:35 WARN DAGScheduler: Broadcasting large task binary with size 1040.5 KiB
23/12/09 01:57:37 WARN DAGScheduler: Broadcasting large task binary with size 1065.4 KiB
23/12/09 01:57:37 WARN DAGScheduler: Broadcasting large task binary with size 1065.4 KiB
23/12/09 01:57:38 WAR

R2 Score: 0.7108845669145437


                                                                                

In [58]:
best_max_iter = best_model.getOrDefault("maxIter")
print(f"Best maxIter: {best_max_iter}")

Best maxIter: 30


In [59]:
gbt = GBTRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth,maxIter=best_max_iter)

In [60]:
param_grid = (ParamGridBuilder()
              .addGrid(gbt.minInstancesPerNode, [1, 5, 10])
              .build())

In [61]:
best_model=RegTuning(param_grid,gbt)

23/12/09 02:15:42 WARN DAGScheduler: Broadcasting large task binary with size 1007.9 KiB
23/12/09 02:15:45 WARN DAGScheduler: Broadcasting large task binary with size 1021.5 KiB
23/12/09 02:15:46 WARN DAGScheduler: Broadcasting large task binary with size 1040.5 KiB
23/12/09 02:15:49 WARN DAGScheduler: Broadcasting large task binary with size 1065.4 KiB
23/12/09 02:15:51 WARN DAGScheduler: Broadcasting large task binary with size 1054.7 KiB
23/12/09 02:15:52 WARN DAGScheduler: Broadcasting large task binary with size 1055.2 KiB
23/12/09 02:15:52 WARN DAGScheduler: Broadcasting large task binary with size 1055.7 KiB
23/12/09 02:15:52 WARN DAGScheduler: Broadcasting large task binary with size 1056.9 KiB
23/12/09 02:15:52 WARN DAGScheduler: Broadcasting large task binary with size 1059.1 KiB
23/12/09 02:15:53 WARN DAGScheduler: Broadcasting large task binary with size 1063.7 KiB
23/12/09 02:15:54 WARN DAGScheduler: Broadcasting large task binary with size 1072.3 KiB
23/12/09 02:15:55 WAR

R2 Score: 0.7373882757730534


                                                                                

In [62]:
best_minInstancesPerNode=best_model.getOrDefault("minInstancesPerNode")
print(f"Best minInstancesPerNode: {best_minInstancesPerNode}")

Best minInstancesPerNode: 10


In [63]:
gbt = GBTRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth,maxIter=best_max_iter,minInstancesPerNode=best_minInstancesPerNode)

In [64]:
param_grid = (ParamGridBuilder()
              .addGrid(gbt.stepSize, [0.1, 0.05, 0.01])
              .build())

In [65]:
best_model=RegTuning(param_grid,gbt)

23/12/09 02:35:28 WARN DAGScheduler: Broadcasting large task binary with size 1001.0 KiB
23/12/09 02:35:34 WARN DAGScheduler: Broadcasting large task binary with size 1006.4 KiB
23/12/09 02:35:36 WARN DAGScheduler: Broadcasting large task binary with size 1018.2 KiB
23/12/09 02:35:37 WARN DAGScheduler: Broadcasting large task binary with size 1032.8 KiB
23/12/09 02:35:40 WARN DAGScheduler: Broadcasting large task binary with size 1051.8 KiB
23/12/09 02:35:43 WARN DAGScheduler: Broadcasting large task binary with size 1000.2 KiB
23/12/09 02:35:44 WARN DAGScheduler: Broadcasting large task binary with size 1038.3 KiB
23/12/09 02:35:44 WARN DAGScheduler: Broadcasting large task binary with size 1004.3 KiB
23/12/09 02:35:44 WARN DAGScheduler: Broadcasting large task binary with size 1038.7 KiB
23/12/09 02:35:44 WARN DAGScheduler: Broadcasting large task binary with size 1039.3 KiB
23/12/09 02:35:44 WARN DAGScheduler: Broadcasting large task binary with size 1000.7 KiB
23/12/09 02:35:44 WAR

R2 Score: 0.7373882757730534


                                                                                

In [66]:
best_stepSize=best_model.getOrDefault("stepSize")
print(f"Best stepSize: {best_stepSize}")

Best stepSize: 0.1


In [67]:
gbt = GBTRegressor(featuresCol="features", labelCol="log_baseRent",maxBins=10000,maxDepth=best_max_depth,maxIter=best_max_iter,
                   minInstancesPerNode=best_minInstancesPerNode,stepSize=best_stepSize)

In [68]:
param_grid = (ParamGridBuilder()
              .addGrid(gbt.subsamplingRate, [0.8, 0.9])
              .build())

In [69]:
best_model=RegTuning(param_grid,gbt)

23/12/09 02:50:46 WARN DAGScheduler: Broadcasting large task binary with size 1008.5 KiB
23/12/09 02:50:47 WARN DAGScheduler: Broadcasting large task binary with size 1001.7 KiB
23/12/09 02:50:48 WARN DAGScheduler: Broadcasting large task binary with size 1002.2 KiB
23/12/09 02:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1002.8 KiB
23/12/09 02:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1003.9 KiB
23/12/09 02:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1005.9 KiB
23/12/09 02:50:51 WARN DAGScheduler: Broadcasting large task binary with size 1008.6 KiB
23/12/09 02:50:52 WARN DAGScheduler: Broadcasting large task binary with size 1003.2 KiB
23/12/09 02:50:52 WARN DAGScheduler: Broadcasting large task binary with size 1012.1 KiB
23/12/09 02:50:52 WARN DAGScheduler: Broadcasting large task binary with size 1011.4 KiB
23/12/09 02:50:52 WARN DAGScheduler: Broadcasting large task binary with size 1015.1 KiB
23/12/09 02:50:53 WAR

R2 Score: 0.7349290234096502


                                                                                

In [70]:
best_subsamplingRate=best_model.getOrDefault("subsamplingRate")
print(f"Best subsamplingRate: {best_subsamplingRate}")

Best subsamplingRate: 0.9


In [71]:

print(f"Best maxDepth: {best_max_depth}")
print(f"Best maxIter: {best_max_iter}")
print(f"Best minInstancesPerNode: {best_minInstancesPerNode}")
print(f"Best stepSize: {best_stepSize}")
print(f"Best subsamplingRate: {best_subsamplingRate}")






Best maxDepth: 10
Best maxIter: 30
Best minInstancesPerNode: 10
Best stepSize: 0.1
Best subsamplingRate: 0.9


Random Forest Regressor Tuning

In [34]:
from pyspark.ml.regression import RandomForestRegressor

In [36]:
rf = RandomForestRegressor(featuresCol="features", labelCol="log_baseRent", maxBins=10000)

In [37]:
# Create a parameter grid
param_grid = (ParamGridBuilder()
              .addGrid(rf.maxDepth, [10])
              .build())

In [38]:
best_model=RegTuning(param_grid,rf)

23/12/09 11:02:36 WARN DAGScheduler: Broadcasting large task binary with size 1044.9 KiB
23/12/09 11:02:47 WARN DAGScheduler: Broadcasting large task binary with size 1085.6 KiB
23/12/09 11:02:55 WARN DAGScheduler: Broadcasting large task binary with size 1212.3 KiB
23/12/09 11:03:07 WARN DAGScheduler: Broadcasting large task binary with size 1246.8 KiB
23/12/09 11:03:19 WARN DAGScheduler: Broadcasting large task binary with size 1242.9 KiB
23/12/09 11:03:31 WARN DAGScheduler: Broadcasting large task binary with size 1033.2 KiB
23/12/09 11:03:47 WARN DAGScheduler: Broadcasting large task binary with size 1082.4 KiB
23/12/09 11:04:28 WARN DAGScheduler: Broadcasting large task binary with size 1013.7 KiB
23/12/09 11:04:38 WARN DAGScheduler: Broadcasting large task binary with size 1084.7 KiB
23/12/09 11:04:50 WARN DAGScheduler: Broadcasting large task binary with size 1211.6 KiB
23/12/09 11:04:59 WARN DAGScheduler: Broadcasting large task binary with size 1171.0 KiB
23/12/09 11:05:09 WAR

R2 Score: 0.712101837099531


                                                                                

In [39]:
best_max_depth = best_model.getOrDefault("maxDepth")
print(f"Best maxDepth: {best_max_depth}")
print(best_model.getOrDefault("numTrees"))

Best maxDepth: 10
20


In [41]:
rf = RandomForestRegressor(featuresCol="features", labelCol="log_baseRent", maxBins=10000,maxDepth=best_max_depth)

In [42]:
param_grid = (ParamGridBuilder()
              .addGrid(rf.numTrees, [20,30])
              .build())

In [43]:
best_model=RegTuning(param_grid,rf)

23/12/09 11:12:23 WARN DAGScheduler: Broadcasting large task binary with size 1044.9 KiB
23/12/09 11:12:34 WARN DAGScheduler: Broadcasting large task binary with size 1014.5 KiB
23/12/09 11:12:40 WARN DAGScheduler: Broadcasting large task binary with size 1085.6 KiB
23/12/09 11:12:50 WARN DAGScheduler: Broadcasting large task binary with size 1087.2 KiB
23/12/09 11:12:57 WARN DAGScheduler: Broadcasting large task binary with size 1212.3 KiB
23/12/09 11:13:08 WARN DAGScheduler: Broadcasting large task binary with size 1206.3 KiB
23/12/09 11:13:14 WARN DAGScheduler: Broadcasting large task binary with size 1246.8 KiB
23/12/09 11:13:26 WARN DAGScheduler: Broadcasting large task binary with size 1187.7 KiB
23/12/09 11:13:34 WARN DAGScheduler: Broadcasting large task binary with size 1242.9 KiB
23/12/09 11:13:51 WARN DAGScheduler: Broadcasting large task binary with size 1279.5 KiB
23/12/09 11:13:57 WARN DAGScheduler: Broadcasting large task binary with size 1033.2 KiB
23/12/09 11:14:09 WAR

R2 Score: 0.712101837099531


                                                                                

In [44]:
best_numTrees = best_model.getOrDefault("numTrees")
print(f"Best numTrees: {best_numTrees}")
print(best_model.getOrDefault("minInstancesPerNode"))

Best numTrees: 20
1


In [36]:
rf = RandomForestRegressor(featuresCol="features", labelCol="log_baseRent", maxBins=10000,maxDepth=10,numTrees=20)

In [37]:
param_grid = (ParamGridBuilder()
              .addGrid(rf.minInstancesPerNode, [50, 100, 150])
              .build())

In [38]:
best_model=RegTuning(param_grid,rf)

23/12/09 11:39:23 WARN DAGScheduler: Broadcasting large task binary with size 1076.7 KiB
23/12/09 11:39:25 WARN DAGScheduler: Broadcasting large task binary with size 1015.8 KiB
23/12/09 11:39:42 WARN DAGScheduler: Broadcasting large task binary with size 1049.7 KiB
23/12/09 11:39:53 WARN DAGScheduler: Broadcasting large task binary with size 1240.4 KiB
23/12/09 11:40:00 WARN DAGScheduler: Broadcasting large task binary with size 1245.9 KiB
23/12/09 11:40:23 WARN DAGScheduler: Broadcasting large task binary with size 1473.3 KiB
23/12/09 11:41:29 WARN DAGScheduler: Broadcasting large task binary with size 1017.0 KiB
23/12/09 11:42:31 WARN DAGScheduler: Broadcasting large task binary with size 1075.4 KiB
23/12/09 11:43:14 WARN DAGScheduler: Broadcasting large task binary with size 1050.9 KiB
23/12/09 11:43:21 WARN DAGScheduler: Broadcasting large task binary with size 1248.5 KiB
23/12/09 11:44:13 WARN DAGScheduler: Broadcasting large task binary with size 1233.3 KiB
23/12/09 11:44:28 WAR

R2 Score: 0.7006580914862413


                                                                                

In [39]:
best_minInstancesPerNode=best_model.getOrDefault("minInstancesPerNode")
print(f"Best minInstancesPerNode: {best_minInstancesPerNode}")

Best minInstancesPerNode: 50


Results of all tunings:

DT best values
Best maxDepth: 10
Best minInstancesPerNode: 10
Best best_minInfoGain: 0.0

GBT best values:
Best maxDepth: 10
Best maxIter: 30
Best minInstancesPerNode: 10
Best stepSize: 0.1
Best subsamplingRate: 0.9



RF best values
maxDepth=10,numTrees=20,minInstancesPerNode=50



### DT Regressor

In [31]:


# Define the DecisionTreeRegressor model
dt = DecisionTreeRegressor(featuresCol="features", labelCol="log_baseRent", maxBins=10000,maxDepth=10,minInstancesPerNode=10,minInfoGain=0.0)

# Train the model
dt_model = dt.fit(train)

# Make predictions on the test set
predictions = dt_model.transform(test)

# List of metrics to evaluate
metrics = ["rmse", "mse", "r2", "mae"]


# Loop through each metric and evaluate the predictions
for metric_name in metrics:
    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol="log_baseRent", predictionCol="prediction", metricName=metric_name)
    metric_value = evaluator.evaluate(predictions)
    print(f"{metric_name.capitalize()}: {metric_value}")


                                                                                

Rmse: 0.032580121077015994


                                                                                

Mse: 0.001061464289393022


                                                                                

R2: 0.7065295329636117


[Stage 58:>                                                         (0 + 1) / 1]

Mae: 0.02119453680548442


                                                                                

### GRADIENT BOOSTING REGRESSOR

In [32]:
from pyspark.ml.regression import GBTRegressor

In [34]:
# Define the DecisionTreeRegressor model
dt = GBTRegressor(featuresCol="features", labelCol="log_baseRent", maxBins=10000,maxDepth=10,maxIter=30,minInstancesPerNode=10,stepSize=0.1,subsamplingRate=0.9)

# Train the model
dt_model = dt.fit(train)

# Make predictions on the test set
predictions = dt_model.transform(test)

# List of metrics to evaluate
metrics = ["rmse", "mse", "r2", "mae"]


# Loop through each metric and evaluate the predictions
for metric_name in metrics:
    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol="log_baseRent", predictionCol="prediction", metricName=metric_name)
    metric_value = evaluator.evaluate(predictions)
    print(f"{metric_name.capitalize()}: {metric_value}")

# View feature importances
feature_importances = dt_model.featureImportances
print("Feature Importances:", feature_importances)

# Display predicted results
predictions.select("feature_columns", "log_baseRent", "prediction").show()


23/12/09 12:16:40 WARN DAGScheduler: Broadcasting large task binary with size 1023.1 KiB
23/12/09 12:16:44 WARN DAGScheduler: Broadcasting large task binary with size 1014.7 KiB
23/12/09 12:16:44 WARN DAGScheduler: Broadcasting large task binary with size 1015.1 KiB
23/12/09 12:16:45 WARN DAGScheduler: Broadcasting large task binary with size 1015.7 KiB
23/12/09 12:16:45 WARN DAGScheduler: Broadcasting large task binary with size 1016.8 KiB
23/12/09 12:16:45 WARN DAGScheduler: Broadcasting large task binary with size 1019.1 KiB
23/12/09 12:16:46 WARN DAGScheduler: Broadcasting large task binary with size 1022.3 KiB
23/12/09 12:16:46 WARN DAGScheduler: Broadcasting large task binary with size 1027.8 KiB
23/12/09 12:16:47 WARN DAGScheduler: Broadcasting large task binary with size 1037.3 KiB
23/12/09 12:16:49 WARN DAGScheduler: Broadcasting large task binary with size 1052.5 KiB
23/12/09 12:16:51 WARN DAGScheduler: Broadcasting large task binary with size 1072.9 KiB
23/12/09 12:16:54 WAR

Rmse: 0.028368201149091683


                                                                                

Mse: 0.0008047548364353267


                                                                                

R2: 0.777503793525152


[Stage 666:>                                                        (0 + 1) / 1]

Mae: 0.018249242460911922
Feature Importances: (4,[0,1,2,3],[0.39404865799076744,0.10152469398389263,0.32379489298596986,0.18063175503937012])


                                                                                

### RANDOM FOREST REGRESSOR

In [33]:
from pyspark.ml.regression import RandomForestRegressor

In [35]:
# Define the DecisionTreeRegressor model
dt = RandomForestRegressor(featuresCol="features", labelCol="log_baseRent", maxBins=10000,maxDepth=10,numTrees=20,minInstancesPerNode=50)

# Train the model
dt_model = dt.fit(train)

# Make predictions on the test set
predictions = dt_model.transform(test)

# List of metrics to evaluate
metrics = ["rmse", "mse", "r2", "mae"]


# Loop through each metric and evaluate the predictions
for metric_name in metrics:
    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol="log_baseRent", predictionCol="prediction", metricName=metric_name)
    metric_value = evaluator.evaluate(predictions)
    print(f"{metric_name.capitalize()}: {metric_value}")

# View feature importances
feature_importances = dt_model.featureImportances
print("Feature Importances:", feature_importances)



23/12/09 12:24:32 WARN DAGScheduler: Broadcasting large task binary with size 1010.8 KiB
23/12/09 12:24:47 WARN DAGScheduler: Broadcasting large task binary with size 1043.2 KiB
                                                                                

Rmse: 0.032051686026789726


                                                                                

Mse: 0.0010273105771599078


                                                                                

R2: 0.7159722490118453


[Stage 720:>                                                        (0 + 1) / 1]

Mae: 0.021045446428159056
Feature Importances: (4,[0,1,2,3],[0.07825539538368845,0.2293726417402749,0.518378284984744,0.17399367789129278])


                                                                                