In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create the Spark session used by every cell below (an existing one is reused).
spark = (
    SparkSession.builder
    .appName("LinearRegression")
    .getOrCreate()
)

23/12/19 11:31:12 WARN Utils: Your hostname, perezs-zeenbook resolves to a loopback address: 127.0.1.1; using 192.168.7.202 instead (on interface wlo1)
23/12/19 11:31:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/19 11:31:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# The input file has no header row, so the two comma-separated columns are
# named explicitly right after reading; column types are inferred by Spark.
column_names = ["x", "y"]
df = (
    spark.read
    .csv("../data/regression.txt", sep=",", inferSchema=True)
    .toDF(*column_names)
)

# Preview the first rows.
df.show(5)

+-----+-----+
|    x|    y|
+-----+-----+
|-1.74| 1.66|
| 1.24|-1.18|
| 0.29| -0.4|
|-0.13| 0.09|
|-0.39| 0.38|
+-----+-----+
only showing top 5 rows



In [3]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


# --- Hold-out split ---------------------------------------------------------
# The target column is renamed to "label" before an 80% / 20% random split.
df_ = df.withColumnRenamed("y", "label")
train, test = df_.randomSplit(weights=[0.8, 0.2], seed=42)

# --- Model pipeline ---------------------------------------------------------
# The regressor should be fed a 'features' column containing a vector with all
# the features and a 'label' column, so "x" is first packed into a vector and
# then scaled into 'features'.
vector_assembler = VectorAssembler(outputCol="x_vectorized", inputCols=["x"])
std_scaler = StandardScaler(outputCol="features", inputCol="x_vectorized")
lin_reg = LinearRegression()
pipe = Pipeline().setStages([vector_assembler, std_scaler, lin_reg])

# --- Hyper-parameter grid for the regressor ---------------------------------
# Entries are added in the same order as before, so the generated param maps
# (3 x 2 x 3 = 18 combinations) come out in the same order as well.
grid_builder = ParamGridBuilder()
for grid_param, candidates in (
    (lin_reg.regParam, [0.01, 0.1, 1]),
    (lin_reg.fitIntercept, [False, True]),
    (lin_reg.elasticNetParam, [0.0, 0.5, 1.0]),
):
    grid_builder.addGrid(grid_param, candidates)
paramGrid = grid_builder.build()

# --- Cross-validation -------------------------------------------------------
rmse_evaluator = RegressionEvaluator(metricName="rmse")
cv_settings = dict(
    estimator=pipe,
    estimatorParamMaps=paramGrid,
    evaluator=rmse_evaluator,
    numFolds=3,  # use 3+ folds in practice
    seed=42,
)
crossval = CrossValidator(**cv_settings)

# Fit every grid point on the training split only; the test split stays unseen.
cvModel = crossval.fit(train)

In [4]:
import numpy as np

# Report the winning hyper-parameter combination: the grid point whose
# cross-validation metric (RMSE, so lower is better) is the smallest.
cv_metrics = cvModel.avgMetrics
best_index = np.argmin(cv_metrics)
best_param_map = cvModel.getEstimatorParamMaps()[best_index]

print("Best params: ")
for param, value in best_param_map.items():
    print(param.name, value)

# Compare the cross-validation error with the error on the training split itself.
cv_error = min(cv_metrics)
print("Cross Validation Error: ", round(cv_error, 5))

train_predictions = cvModel.transform(train)
training_error = rmse_evaluator.evaluate(train_predictions)
print("Training Error: ", round(training_error, 5))

Best params: 
regParam 0.01
fitIntercept False
elasticNetParam 0.0
Cross Validation Error:  0.10226
Training Error:  0.10199


In [5]:
# Score the cross-validated model on the held-out test split and report its RMSE.
test_predictions = cvModel.transform(test)
test_error = rmse_evaluator.evaluate(test_predictions)
print("Test Error: ", round(test_error, 5))

Test Error:  0.09352


In [6]:
# Fit model to the whole dataset
#
# The hyper-parameters are read back from the cross-validation results instead
# of being copied by hand from the printed output, so this cell cannot drift
# out of sync when the data, the grid, the seed or the fold count changes.
# The grid only tunes regressor parameters (regParam, fitIntercept,
# elasticNetParam), so every entry can be passed to LinearRegression by name.
best_param_map = cvModel.getEstimatorParamMaps()[int(np.argmin(cvModel.avgMetrics))]
best_params = {param.name: value for param, value in best_param_map.items()}

# final model: fresh, unfitted stages configured with the selected parameters
vector_assembler = VectorAssembler(inputCols=["x"], outputCol="x_vectorized")
std_scaler = StandardScaler(inputCol="x_vectorized", outputCol="features")
lin_reg = LinearRegression(**best_params)
pipe = Pipeline(stages=[vector_assembler, std_scaler, lin_reg])

# fit and predict on the full dataset (train + test)
df_ = df.withColumnRenamed("y", "label")
df_pred = pipe.fit(df_).transform(df_).select(["x", "label", "prediction"])
df_pred.show(5)

+-----+-----+-------------------+
|    x|label|         prediction|
+-----+-----+-------------------+
|-1.74| 1.66| 1.7140814699495912|
| 1.24|-1.18|-1.2215293234123523|
| 0.29| -0.4|-0.2856802449915985|
|-0.13| 0.09|0.12806355809968212|
|-0.39| 0.38|0.38419067429904635|
+-----+-----+-------------------+
only showing top 5 rows



In [7]:
# Shut down the Spark session created in the first cell now that the analysis is done.
spark.stop()