In [1]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import *

In [2]:
# Load the house-sales CSV into a Spark DataFrame. The first row is treated as
# the header, and column types are inferred from the data instead of all
# being read as strings.
home_data = spark.read.csv(
    'FileStore/tables/home_data.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Render the loaded DataFrame as a table to eyeball the columns and values.
# NOTE(review): `display` is not imported in this notebook -- presumably the
# notebook environment's built-in; confirm before running elsewhere.
display(home_data)

id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
6414100192,20141209T000000,538000,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
5631500400,20150225T000000,180000,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2487200875,20141209T000000,604000,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1954400510,20150218T000000,510000,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
7237550310,20140512T000000,1225000,4,4.5,5420,101930,1.0,0,0,3,11,3890,1530,2001,0,98053,47.6561,-122.005,4760,101930
1321400060,20140627T000000,257500,3,2.25,1715,6819,2.0,0,0,3,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
2008000270,20150115T000000,291850,3,1.5,1060,9711,1.0,0,0,3,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711
2414600126,20150415T000000,229500,3,1.0,1780,7470,1.0,0,0,3,7,1050,730,1960,0,98146,47.5123,-122.337,1780,8113
3793500160,20150312T000000,323000,3,2.5,1890,6560,2.0,0,0,3,7,1890,0,2003,0,98038,47.3684,-122.031,2390,7570


In [4]:
# Baseline model: predict `price` from `sqft_living` alone.
# VectorAssembler packs the single input column into the "features" vector
# column that LinearRegression is configured to read.
sqft_living_assembler = VectorAssembler(inputCols=["sqft_living"], outputCol="features")
sqft_living_df = sqft_living_assembler.transform(home_data)

# 80/20 train/test split with a fixed seed. The seed is a plain int: the
# Python 2 long-literal suffix (`11L`) is a SyntaxError on Python 3, while a
# plain int is accepted on both.
(trainingData, testData) = sqft_living_df.randomSplit([0.8, 0.2], seed=11)

# Fit on the training split, then add a "prediction" column to the test split.
sqft_living_lr = LinearRegression(labelCol="price", featuresCol="features")
sqft_living_lr_model = sqft_living_lr.fit(trainingData)
sqft_living_predictions = sqft_living_lr_model.transform(testData)

In [5]:
# Mean test-set price, printed so the RMSE below can be read against the
# scale of the label. A notebook only echoes a cell's last expression, so the
# aggregate has to be printed explicitly to be visible.
# The global aggregate yields a single row; %s is used so an empty test split
# (mean is None) prints instead of raising.
mean_price_rows = testData.select(mean('price').alias('price_value')).collect()
print("Mean price on test data = %s" % mean_price_rows[0]['price_value'])

# RMSE of the single-feature model on the held-out split.
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
sqft_rmse = evaluator.evaluate(sqft_living_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % sqft_rmse)

In [6]:
# Wider feature set for the second model.
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

# Preview the selected columns. This reads from `home_data`, the DataFrame
# loaded earlier; no `sales` variable is defined in the visible notebook, so
# selecting from it raised NameError.
home_data.select(my_features).show()

# Assemble the selected columns into the "features" vector column.
my_features_assembler = VectorAssembler(inputCols=my_features, outputCol="features")
my_features_df = my_features_assembler.transform(home_data)

# 80/20 split with the same seed value as the single-feature model. Plain int
# seed: the `11L` long literal is a SyntaxError on Python 3.
(my_features_trainingData, my_features_testData) = my_features_df.randomSplit([0.8, 0.2], seed=11)

In [7]:
# Fit a linear regression on the multi-feature training split and score the
# test split.
my_features_lr = LinearRegression(labelCol="price", featuresCol="features")
my_features_model = my_features_lr.fit(my_features_trainingData)
my_features_predictions = my_features_model.transform(my_features_testData)
my_features_predictions.select("prediction", "price", "features").show(5)

# Mean test-set price, printed explicitly: a notebook only echoes a cell's
# last expression, so a bare `.collect()` mid-cell is computed and dropped.
# Value noted by the original author (presumably from an earlier run): 544128.029162747
my_features_mean_rows = my_features_testData.select(mean('price').alias('price_value')).collect()
print("Mean price on test data = %s" % my_features_mean_rows[0]['price_value'])

# RMSE of the multi-feature model on the held-out split.
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(my_features_predictions)
# Author's recorded result: 273356 (previous 281234) -- presumably this model
# vs. the single-feature baseline; confirm by re-running.
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [8]:
# Same multi-feature linear regression, expressed as a Pipeline.
#
# The split is taken on the raw `home_data` (no "features" column yet): the
# pipeline runs the assembler itself during fit() and transform().
# NOTE: this rebinds my_features_trainingData / my_features_testData to the
# un-assembled splits; the cells below rely on that.
# Plain int seed: the `11L` long literal is a SyntaxError on Python 3.
(my_features_trainingData, my_features_testData) = home_data.randomSplit([0.8, 0.2], seed=11)
my_features_assembler = VectorAssembler(inputCols=my_features, outputCol="features")
my_features_lr = LinearRegression(labelCol="price", featuresCol="features")

# Pipeline: assembler -> linear regression.
# `Pipeline` is already imported in the imports cell at the top of the notebook.
pipeline = Pipeline(stages=[my_features_assembler, my_features_lr])
pipelineModel = pipeline.fit(my_features_trainingData)
predictions = pipelineModel.transform(my_features_testData)
predictions.select("price", "prediction", "features").show()

# `evaluator` is the RMSE RegressionEvaluator created in an earlier cell.
rmse = evaluator.evaluate(predictions)
# print() call form: valid on both Python 2 and Python 3.
print("RMSE on our test set: %g" % rmse)


In [9]:
# RandomForestRegressor
#
# Swap the linear model for a random forest, reusing the assembler, the
# train/test splits and the RMSE evaluator defined in earlier cells.
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor(labelCol="price", featuresCol="features")
pipeline_rf = Pipeline(stages=[my_features_assembler, rf])
pipeline_rf_Model = pipeline_rf.fit(my_features_trainingData)
predictions_rf = pipeline_rf_Model.transform(my_features_testData)
predictions_rf.select("price", "prediction", "features").show()

rmse_rf = evaluator.evaluate(predictions_rf)
# print() call form: the Python 2 print statement is a SyntaxError on Python 3.
print("rmse_rf on our test set: %g" % rmse_rf)


In [10]:
# Gradient-boosted trees (GBTs)
#
# Same pipeline shape with a GBT regressor as the final stage, reusing the
# assembler, the train/test splits and the RMSE evaluator from earlier cells.
from pyspark.ml.regression import GBTRegressor

gbt = GBTRegressor(labelCol="price", featuresCol="features")
pipeline_gbt = Pipeline(stages=[my_features_assembler, gbt])
pipeline_gbt_Model = pipeline_gbt.fit(my_features_trainingData)
predictions_gbt = pipeline_gbt_Model.transform(my_features_testData)
predictions_gbt.select("price", "prediction", "features").show()

rmse_gbt = evaluator.evaluate(predictions_gbt)
# print() call form: the Python 2 print statement is a SyntaxError on Python 3.
print("rmse_gbt on our test set: %g" % rmse_gbt)
