In [0]:
from sklearn.datasets import fetch_california_housing
from sklearn import datasets
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Create the Spark session used by the rest of the notebook, or reuse one that
# is already running.
spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Fetch the California housing dataset through scikit-learn.
dataset = fetch_california_housing()
# print(dataset.DESCR)  # uncomment to read the dataset description

In [0]:
# Sampling parameters: keep SAMPLE_COUNT rows, one every SAMPLE_STEP rows,
# i.e. positions 0, 100, ..., 20500 of the full table.
SAMPLE_STEP = 100
SAMPLE_COUNT = 206

# Full dataset as a pandas frame: one column per feature plus the target.
pdf = pd.DataFrame(data=dataset.data, columns=dataset.feature_names)
pdf['target'] = dataset.target

# A strided positional slice replaces the row-by-row append loop and the
# round trip through a NumPy array. The target column is renamed to the
# dataset's own target name and the index is reset so that pdf2 keeps the
# column labels and 0..N-1 index the loop-based version produced.
# NOTE: unlike the loop, a slice does not raise if the table is shorter than
# SAMPLE_STEP * SAMPLE_COUNT rows; it simply returns fewer rows.
pdf2 = (
    pdf.iloc[:SAMPLE_STEP * SAMPLE_COUNT:SAMPLE_STEP]
    .rename(columns={'target': dataset.target_names[0]})
    .reset_index(drop=True)
)

In [0]:
# Columns fed to the regression: three predictors followed by the target.
# The target has to stay last because transData() treats the final column of
# each row as the label and everything before it as features.
MODEL_COLUMNS = ['MedInc', 'HouseAge', 'AveRooms', 'MedHouseVal']

# Direct column selection raises KeyError on a misspelt name, whereas
# pd.DataFrame(pdf2, columns=[...]) would silently yield an all-NaN column.
# .copy() keeps f independent of pdf2.
f = pdf2[MODEL_COLUMNS].copy()

In [0]:
# Convert the pandas frame to a Spark DataFrame through the SparkSession
# created at the top of the notebook. `sqlContext` is not defined in any of the
# cells shown, so relying on it fails with NameError on a fresh kernel unless
# the runtime happens to inject it.
df = spark.createDataFrame(f)

In [0]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

def transData(data):
    """Reshape a Spark DataFrame into the two-column layout used for modelling.

    Every column of a row except the last is packed into a dense vector
    (``features``); the last column is kept as ``label``.

    Args:
        data: Spark DataFrame whose final column is the value to predict.

    Returns:
        A Spark DataFrame with columns ``features`` and ``label``.
    """
    def to_features_and_label(row):
        # Split one row into (all-but-last as a dense vector, last value).
        return [Vectors.dense(row[:-1]), row[-1]]

    return data.rdd.map(to_features_and_label).toDF(['features', 'label'])

# Build the (features, label) frame and preview it; 20 rows with truncated
# cells are the defaults of show(), spelled out here.
transformed = transData(df)
transformed.show(n=20, truncate=True)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[8.3252,41.0,6.98...|4.526|
|[2.4912,29.0,3.7248]|2.578|
|[3.0257,52.0,4.04...|0.808|
|[1.9479,50.0,4.82...|0.895|
|[4.2614,52.0,5.52...|2.613|
|[1.5817,52.0,5.43...|1.536|
|[3.0,32.0,4.89913...|2.232|
|[6.5,20.0,6.45161...|  3.4|
|[2.4808,27.0,4.01...|1.576|
|[4.75,35.0,5.5133...| 1.97|
|[3.3841,29.0,4.84...|1.844|
|[2.7143,16.0,6.38...|0.953|
|     [1.91,46.0,5.0]|0.639|
|[2.905,37.0,5.044...|0.973|
|[4.7386,36.0,5.98...|1.864|
|[5.6292,24.0,6.45...|2.314|
|[4.125,30.0,4.717...|2.571|
|[6.0666,10.0,6.52...|2.724|
|[1.125,36.0,4.440...|0.938|
|[1.9727,21.0,5.36...|1.141|
+--------------------+-----+
only showing top 20 rows



In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Fit a VectorIndexer on the feature vectors: any feature with at most
# maxCategories distinct values is treated as categorical and index-encoded,
# and the result is written to "indexedFeatures".
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(transformed)

# Apply the fitted indexer and preview the first rows of its output.
data = featureIndexer.transform(transformed)
data.show(n=5, truncate=True)

+--------------------+-----+--------------------+
|            features|label|     indexedFeatures|
+--------------------+-----+--------------------+
|[8.3252,41.0,6.98...|4.526|[8.3252,41.0,6.98...|
|[2.4912,29.0,3.7248]|2.578|[2.4912,29.0,3.7248]|
|[3.0257,52.0,4.04...|0.808|[3.0257,52.0,4.04...|
|[1.9479,50.0,4.82...|0.895|[1.9479,50.0,4.82...|
|[4.2614,52.0,5.52...|2.613|[4.2614,52.0,5.52...|
+--------------------+-----+--------------------+
only showing top 5 rows



In [0]:
# Fixed seed so the 80/20 split, and every number derived from it further
# down, can be reproduced when the notebook is re-run.
SPLIT_SEED = 42

(trainingData, testData) = transformed.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
trainingData.show(5)
testData.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.125,36.0,4.440...|0.938|
|[1.5817,52.0,5.43...|1.536|
|     [1.91,46.0,5.0]|0.639|
|[1.9479,50.0,4.82...|0.895|
|[2.2202,13.0,5.49...|  1.0|
+--------------------+-----+
only showing top 5 rows

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.6548,35.0,4.87...|0.494|
|[1.9727,21.0,5.36...|1.141|
|[2.7143,16.0,6.38...|0.953|
|[6.7181,15.0,7.19...|1.603|
|  [4.0167,5.0,4.875]|0.875|
+--------------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml.regression import LinearRegression

# Point the regressor at the indexer's output column. With the default
# featuresCol ("features") the VectorIndexer stage of the pipeline would run
# but its "indexedFeatures" column would never be used by the model.
lr = LinearRegression(featuresCol="indexedFeatures", labelCol="label")

# Two-stage pipeline: the already-fitted feature indexer, then the regression.
pipeline = Pipeline(stages=[featureIndexer, lr])

# Fit on the training split only.
model = pipeline.fit(trainingData)

In [0]:
# Score the held-out split with the fitted pipeline and peek at one prediction
# next to its true label.
predictions = model.transform(testData)
(
    predictions
    .select("features", "label", "prediction")
    .show(n=1)
)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[1.6548,35.0,4.87...|0.494|1.0930895319995515|
+--------------------+-----+------------------+
only showing top 1 row

