In [1]:
#111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000

In [2]:
# Create (or reuse) the Spark context and session.
# getOrCreate() keeps this cell safe to re-run: constructing a second
# SparkContext in a kernel that already has one raises an error.
import pyspark

spark_context = pyspark.SparkContext.getOrCreate()
# The builder picks up the already-running context created above.
spark_session = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Read the housing CSV into a DataFrame: the file is read without a header
# row and Spark is asked to infer the column types.
df = spark_session.read.csv(
    path="../nhousing.csv",
    inferSchema=True,
    header=False,
)
# Preview the loaded rows
df.show()

+-------+----+----+---+-----+-----+-----+------+---+-----+----+------+-----+----+
|    _c0| _c1| _c2|_c3|  _c4|  _c5|  _c6|   _c7|_c8|  _c9|_c10|  _c11| _c12|_c13|
+-------+----+----+---+-----+-----+-----+------+---+-----+----+------+-----+----+
|0.00632|18.0|2.31|  0|0.538|6.575| 65.2|  4.09|  1|296.0|15.3| 396.9| 4.98|24.0|
|0.02731| 0.0|7.07|  0|0.469|6.421| 78.9|4.9671|  2|242.0|17.8| 396.9| 9.14|21.6|
|0.02729| 0.0|7.07|  0|0.469|7.185| 61.1|4.9671|  2|242.0|17.8|392.83| 4.03|34.7|
|0.03237| 0.0|2.18|  0|0.458|6.998| 45.8|6.0622|  3|222.0|18.7|394.63| 2.94|33.4|
|0.06905| 0.0|2.18|  0|0.458|7.147| 54.2|6.0622|  3|222.0|18.7| 396.9| 5.33|36.2|
|0.02985| 0.0|2.18|  0|0.458| 6.43| 58.7|6.0622|  3|222.0|18.7|394.12| 5.21|28.7|
|0.08829|12.5|7.87|  0|0.524|6.012| 66.6|5.5605|  5|311.0|15.2| 395.6|12.43|22.9|
|0.14455|12.5|7.87|  0|0.524|6.172| 96.1|5.9505|  5|311.0|15.2| 396.9|19.15|27.1|
|0.21124|12.5|7.87|  0|0.524|5.631|100.0|6.0821|  5|311.0|15.2|386.63|29.93|16.5|
|0.17004|12.5|7.

In [4]:
# Rename the auto-generated _c0.._c13 columns to descriptive names.
# toDF() renames by position, so re-running this cell is harmless (a
# selectExpr on the _cN names would fail once they have been renamed).
# NOTE(review): "indux" looks like a typo for "indus", but the same name is
# used when the feature vector is assembled, so it is kept unchanged here.
df = df.toDF(
    "crim", "zn", "indux", "chas", "nox", "rm", "age",
    "dis", "rad", "tax", "ptratio", "b", "lstat", "price",
)
df.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indux: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: double (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- price: double (nullable = true)



In [5]:
# Pack the 13 predictor columns into a single 'features' vector column,
# which is the input layout used by the regression cells
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(
    inputCols=[
        'crim', 'zn', 'indux', 'chas', 'nox', 'rm', 'age',
        'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
    ],
    outputCol='features',
)
# Keep only the assembled feature vector and the label column
mldf = vectorAssembler.transform(df).select('features', 'price')
mldf.show(5)

+--------------------+-----+
|            features|price|
+--------------------+-----+
|[0.00632,18.0,2.3...| 24.0|
|[0.02731,0.0,7.07...| 21.6|
|[0.02729,0.0,7.07...| 34.7|
|[0.03237,0.0,2.18...| 33.4|
|[0.06905,0.0,2.18...| 36.2|
+--------------------+-----+
only showing top 5 rows



In [6]:
# Seeded 80/20 random split into training and hold-out test DataFrames
train_df, test_df = mldf.randomSplit(weights=[0.8, 0.2], seed=5)

In [7]:
# Fit a linear regression on the training split (maxIter capped at 1)
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    maxIter=1,
    labelCol='price',
    featuresCol='features',
)
lr_model = lr.fit(train_df)

In [8]:
# Score the hold-out test set and show predicted vs. actual price
predictions = lr_model.transform(test_df)
predictions.select(["prediction", "price", "features"]).show()

+------------------+-----+--------------------+
|        prediction|price|            features|
+------------------+-----+--------------------+
|27.642081520013004| 24.5|[0.01501,80.0,2.0...|
| 37.38343171546305| 44.0|[0.01538,90.0,3.7...|
| 30.88036863886255| 32.9|[0.01778,95.0,1.4...|
|24.996936690822892| 28.7|[0.02985,0.0,2.18...|
|28.607146504811347| 31.2|[0.03049,55.0,3.7...|
|28.657301643210953| 33.4|[0.03237,0.0,2.18...|
|21.930980863804976| 20.6|[0.03306,0.0,5.19...|
| 19.92105697438567| 19.5|[0.03427,0.0,5.19...|
| 28.61431265464769| 24.1|[0.03445,82.5,2.0...|
| 41.86804404177727| 48.5|[0.0351,95.0,2.68...|
|21.953281009049668| 20.6|[0.04527,0.0,11.9...|
|  19.9581230578766| 17.1|[0.05023,35.0,6.0...|
|22.224641784694743| 22.2|[0.05083,0.0,5.19...|
| 21.92355620842548| 22.5|[0.05188,0.0,4.49...|
| 27.65879461624145| 25.0|[0.0536,21.0,5.64...|
|27.415326668145696| 27.1|[0.05372,0.0,13.9...|
| 35.96042318109285| 50.0|[0.05602,0.0,2.46...|
| 27.45995140184931| 26.6|[0.05735,0.0,4

In [9]:
# Report R^2 on the training data, taken from the fitted model's summary
print("Training R Squared: %f" % (lr_model.summary.r2,))

Training R Squared: 0.749762


In [10]:
# Compute R^2 of the predictions against the actual prices on the test set
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="price",
    predictionCol="prediction",
)
print("Test R Squared: %g" % (evaluator.evaluate(predictions),))

Test R Squared: 0.694189


In [11]:
# Experiment: same unregularized linear regression, but with maxIter=50
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(featuresCol='features',
                      labelCol='price', maxIter=50)
lr_model = lr.fit(train_df)

# Score the hold-out test set. No select() here: a select() whose result is
# neither shown nor assigned displays nothing.
predictions = lr_model.transform(test_df)

print("Training R Squared: %f" % lr_model.summary.r2)

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="price", metricName="r2")
print("Test R Squared: %g" % evaluator.evaluate(predictions))

Training R Squared: 0.749762
Test R Squared: 0.694189


In [12]:
# Experiment: regularized linear regression with regParam=1 and
# elasticNetParam=1 (in Spark ML, elasticNetParam=1 means a pure L1 penalty)
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(featuresCol='features',
                      labelCol='price', maxIter=100, regParam=1, elasticNetParam=1)
lr_model = lr.fit(train_df)

# Score the hold-out test set. No select() here: a select() whose result is
# neither shown nor assigned displays nothing.
predictions = lr_model.transform(test_df)

print("Training R Squared: %f" % lr_model.summary.r2)

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="price", metricName="r2")
print("Test R Squared: %g" % evaluator.evaluate(predictions))

Training R Squared: 0.681938
Test R Squared: 0.605846


In [13]:
# Experiment: elastic-net linear regression (regParam=0.2,
# elasticNetParam=0.85) with maxIter=50
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(featuresCol='features',
                      labelCol='price', maxIter=50, regParam=0.2, elasticNetParam=0.85)
lr_model = lr.fit(train_df)

# Score the hold-out test set. No select() here: a select() whose result is
# neither shown nor assigned displays nothing.
predictions = lr_model.transform(test_df)

print("Training R Squared: %f" % lr_model.summary.r2)

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="price", metricName="r2")
print("Test R Squared: %g" % evaluator.evaluate(predictions))

Training R Squared: 0.735784
Test R Squared: 0.655029


In [14]:
# Experiment: elastic-net linear regression (regParam=0.2,
# elasticNetParam=0.85) with maxIter raised to 100
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(featuresCol='features',
                      labelCol='price', maxIter=100, regParam=0.2, elasticNetParam=0.85)
lr_model = lr.fit(train_df)

# Score the hold-out test set. No select() here: a select() whose result is
# neither shown nor assigned displays nothing.
predictions = lr_model.transform(test_df)

print("Training R Squared: %f" % lr_model.summary.r2)

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="price", metricName="r2")
print("Test R Squared: %g" % evaluator.evaluate(predictions))

Training R Squared: 0.735784
Test R Squared: 0.655029


In [15]:
# Experiment: redo the 80/20 split with a different seed (20), then refit the
# elastic-net model. This overwrites train_df / test_df from the seed=5 split.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

train_df, test_df = mldf.randomSplit([0.8, 0.2], seed=20)

lr = LinearRegression(featuresCol='features',
                      labelCol='price', maxIter=100, regParam=0.2, elasticNetParam=0.85)
lr_model = lr.fit(train_df)

# Score the hold-out test set. No select() here: a select() whose result is
# neither shown nor assigned displays nothing.
predictions = lr_model.transform(test_df)

print("Training R Squared: %f" % lr_model.summary.r2)

evaluator = RegressionEvaluator(predictionCol="prediction",
                                labelCol="price", metricName="r2")
print("Test R Squared: %g" % evaluator.evaluate(predictions))

Training R Squared: 0.724882
Test R Squared: 0.691233
