In [0]:
from sklearn.datasets import fetch_california_housing
from sklearn import datasets
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Build the Spark session used by the rest of the notebook; getOrCreate()
# returns an already-running session when one exists.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Load the California housing data; later cells read its .data,
# .feature_names and .target attributes.
# Tip: print(dataset.DESCR) displays the dataset description.
dataset = fetch_california_housing()

In [0]:
# pandas frame with one column per feature plus the regression target as the
# last column (the later feature/label split relies on 'target' being last).
pdf = pd.DataFrame(
    dataset.data,
    columns=dataset.feature_names,
).assign(target=dataset.target)

In [0]:
# Convert the pandas frame into a Spark DataFrame through the SparkSession
# created above. The previous `sqlContext` name is never defined in this
# notebook, so it raised NameError on a fresh kernel unless the hosting
# environment happened to inject it.
df = spark.createDataFrame(pdf)

In [0]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

def transData(data):
    """Reshape a Spark DataFrame into the two-column layout used for modelling.

    Every row is mapped to a pair: a dense vector built from all values except
    the last one, and the last value itself. The result is a DataFrame with
    the columns 'features' and 'label'.

    Parameters
    ----------
    data : Spark DataFrame whose last column holds the label.

    Returns
    -------
    DataFrame with columns ['features', 'label'].
    """
    def _split_row(row):
        # Everything but the final field becomes the feature vector.
        feature_vector = Vectors.dense(row[:-1])
        label_value = row[-1]
        return [feature_vector, label_value]

    return data.rdd.map(_split_row).toDF(['features', 'label'])


transformed = transData(df)
transformed.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[8.3252,41.0,6.98...|4.526|
|[8.3014,21.0,6.23...|3.585|
|[7.2574,52.0,8.28...|3.521|
|[5.6431,52.0,5.81...|3.413|
|[3.8462,52.0,6.28...|3.422|
+--------------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Configure the indexer first, then fit it: it reads the "features" vector
# and writes "indexedFeatures", treating any feature with at most 4 distinct
# values as categorical.
vector_indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = vector_indexer.fit(transformed)

# Preview the indexed column next to the raw features.
data = featureIndexer.transform(transformed)
data.show(5, truncate=True)

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|[8.3252,41.0,6.98...|4.526|[8.3252,41.0,6.98...|
|[8.3014,21.0,6.23...|3.585|[8.3014,21.0,6.23...|
|[7.2574,52.0,8.28...|3.521|[7.2574,52.0,8.28...|
|[5.6431,52.0,5.81...|3.413|[5.6431,52.0,5.81...|
|[3.8462,52.0,6.28...|3.422|[3.8462,52.0,6.28...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [0]:
# Split roughly 80/20 into training and held-out test rows. The explicit seed
# makes the split (and therefore the fitted model and the reported
# predictions) repeatable across runs; without it every re-run drew a
# different split.
(trainingData, testData) = transformed.randomSplit([0.8, 0.2], seed=42)
trainingData.show(5)
testData.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.4999,46.0,1.71...|0.675|
|[0.536,46.0,3.142...|0.875|
|[0.6991,26.0,2.66...|0.895|
|[0.716,39.0,4.730...|1.042|
|[0.7286,46.0,3.37...|0.952|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.8012,28.0,5.28...|0.818|
|[0.9393,17.0,4.51...|0.889|
|[0.949,52.0,2.524...|  3.5|
|[0.9637,29.0,3.24...|  1.6|
|[0.977,40.0,2.315...|1.188|
+--------------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml.regression import LinearRegression

# The indexer stage writes its result to "indexedFeatures", so the regressor
# must read that column. With the default featuresCol ("features") the
# indexer stage ran but its output was never used by the model.
lr = LinearRegression(featuresCol="indexedFeatures", labelCol="label")

# Chain the already-fitted indexer and the regressor, then fit on the
# training split only.
pipeline = Pipeline(stages=[featureIndexer, lr])
model = pipeline.fit(trainingData)

In [0]:
# Run the fitted pipeline on the held-out rows and show the true label next
# to the model's prediction.
predictions = model.transform(testData)
predictions.select(["features", "label", "prediction"]).show()

+--------------------+-------+--------------------+
|            features|  label|          prediction|
+--------------------+-------+--------------------+
|[0.8012,28.0,5.28...|  0.818|  1.0388410661333012|
|[0.9393,17.0,4.51...|  0.889|0.030030460171872164|
|[0.949,52.0,2.524...|    3.5|   1.446748197178806|
|[0.9637,29.0,3.24...|    1.6|  1.2773765176670082|
|[0.977,40.0,2.315...|  1.188|  1.5523724329125557|
|[1.0472,15.0,5.08...|  0.578|  0.3952319402333231|
|[1.0481,19.0,4.18...|  0.485|  0.4890638334718105|
|[1.0521,40.0,4.45...|  0.691|  1.3037322783087433|
|[1.1019,13.0,4.45...|  1.202| 0.06706445850292653|
|[1.1667,52.0,3.75...|  1.125|  1.4826288559942356|
|[1.1696,52.0,2.43...|5.00001|  1.5368470649690806|
|[1.175,52.0,4.958...|  0.561|  0.7794754084512263|
|[1.2185,22.0,2.94...|    1.7|  1.3073066754939546|
|[1.2434,52.0,2.92...|5.00001|  1.5468473688149231|
|[1.2852,51.0,3.75...|    1.5|  1.6671190385516041|
|[1.3221,30.0,5.38...|  0.612|  0.6248784652940031|
|[1.3281,40.