# Linear Regression

In [1]:
import sys

import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

In [2]:
import pyspark.sql.functions as fn

In [3]:
import pyspark.pandas as ps

In [4]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [None]:
# Local mode: build (or reuse) the SparkSession used by the rest of the notebook.
# No master is set explicitly, so the environment's default master applies.
# Run EITHER this cell OR the yarn-mode cell below, not both: getOrCreate()
# hands back an already-running session instead of building a new one.
spark = (
    SparkSession.builder
    .appName("lr")
    .getOrCreate()
)

In [None]:
# yarn mode: build the SparkSession against a YARN cluster.
# Run this cell INSTEAD of the local-mode cell above, not after it:
# getOrCreate() hands back an already-running session instead of building a new one.
# NOTE(review): 99 executors x 4G is a large hard-coded request — confirm it
# matches the target cluster before running.
spark = (
    SparkSession.builder
    .master("yarn")
    .config('spark.executor.instances', '99')
    .config('spark.executor.memory', '4G')
    .appName("lr")  # same app name as the local-mode cell
    .getOrCreate()
)

In [5]:
# Display the application name of the active Spark session
# (shows which session the notebook is actually attached to).
spark.sparkContext.appName

'PySparkShell'

In [6]:
# Turn on the Arrow setting for PySpark (spark.sql.execution.arrow.pyspark.enabled).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [7]:
# Use the "distributed" default index type for pandas-on-Spark DataFrames.
ps.set_option("compute.default_index_type", "distributed")

In [8]:
# Record the runtime versions this notebook was executed with.
# Python interpreter version:
sys.version

'3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]'

In [9]:
# Spark version of the active session
spark.version

'3.2.0'

### Exploring Data

In [10]:
# Load lr_dataset.csv into a pandas-on-Spark DataFrame.
# Alternative using the Spark-native reader:
#   spark.read.csv(<path to lr_dataset.csv>, header=True, inferSchema=True)
psdf = ps.read_csv('data/lr_dataset.csv')

In [11]:
# Validate the size of the data as (rows, columns)
psdf.shape

(1232, 6)

In [12]:
# First 5 rows of the regression dataset
psdf.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,output
0,734,688,81,0.328,0.259,0.418
1,700,600,94,0.32,0.247,0.389
2,712,705,93,0.311,0.247,0.417
3,734,806,69,0.315,0.26,0.415
4,613,759,61,0.302,0.24,0.378


In [13]:
# Data type of each column
psdf.dtypes

var_1       int32
var_2       int32
var_3       int32
var_4     float64
var_5     float64
output    float64
dtype: object

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
psdf.describe()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,output
count,1232.0,1232.0,1232.0,1232.0,1232.0,1232.0
mean,715.081981,715.081981,80.904221,0.326331,0.259273,0.397342
std,91.534294,93.079933,11.458139,0.015013,0.012907,0.033267
min,463.0,472.0,40.0,0.277,0.214,0.301
25%,652.0,649.0,73.0,0.317,0.251,0.375
50%,711.0,709.0,81.0,0.326,0.26,0.396
75%,775.0,774.0,89.0,0.337,0.268,0.421
max,1009.0,1103.0,116.0,0.373,0.294,0.491


In [15]:
# Pairwise correlation matrix across all columns, including the 'output' label
psdf.corr()

  Unsupported type in conversion to Arrow: MatrixUDT
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.


Unnamed: 0,var_1,var_2,var_3,var_4,var_5,output
var_1,1.0,0.380139,0.511745,0.900492,0.827,0.91874
var_2,0.380139,1.0,-0.532394,0.32636,0.330764,0.436527
var_3,0.511745,-0.532394,1.0,0.481836,0.408716,0.401496
var_4,0.900492,0.32636,0.481836,1.0,0.851958,0.79091
var_5,0.827,0.330764,0.408716,0.851958,1.0,0.790481
output,0.91874,0.436527,0.401496,0.79091,0.790481,1.0


### Feature Engineering

In [16]:
# Convert the pandas-on-Spark frame to a Spark DataFrame for the pyspark.ml steps below.
df = psdf.to_spark()

In [17]:
# Cross-check with Spark SQL: correlation between var_1 and the 'output' label
df.select(fn.corr('var_1','output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



In [18]:
# Display all column names of the Spark DataFrame
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [19]:
# Assemble every predictor column into a single vector column named 'features'.
# The label is excluded by name rather than by position (previously
# df.columns[:-1]), so the cell still selects the right predictors if the
# column order of the input file changes.
feature_cols = [name for name in df.columns if name != 'output']
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
features_df = assembler.transform(df)

In [20]:
# Confirm that the assembled 'features' vector column was added to the schema
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [21]:
# Show the first 5 assembled feature vectors; the second argument (False)
# disables truncation of the displayed column.
features_df.select('features').show(5,False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [22]:
# Keep just the assembled feature vector and the label column for modelling.
model_df = features_df.select('features', 'output')

In [23]:
# Ready for machine learning: preview 10 rows of (features, output) without truncation
model_df.show(10,False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
|[748.0,676.0,85.0,0.318,0.255]|0.422 |
|[669.0,588.0,97.0,0.315,0.251]|0.411 |
|[667.0,845.0,68.0,0.324,0.251]|0.381 |
|[758.0,890.0,64.0,0.33,0.274] |0.436 |
|[726.0,670.0,88.0,0.335,0.268]|0.422 |
+------------------------------+------+
only showing top 10 rows



In [24]:
# Size of the modelling DataFrame as (row count, column count)
model_df.count(), len(model_df.columns)

(1232, 2)

### Split Data - Train & Test sets

In [25]:
# Randomly split the modelling data into ~70% training and ~30% test rows.
# The fixed seed keeps the split repeatable across runs.
train_df, test_df = model_df.randomSplit(weights=[0.70, 0.30], seed=42)

In [26]:
# Size of the training split as (row count, column count)
train_df.count(), len(train_df.columns)

(913, 2)

In [27]:
# Size of the test split as (row count, column count)
test_df.count(), len(test_df.columns)

(319, 2)

### Build Linear Regression Model 

In [28]:
# Regularization parameter; passed as regParam when the LinearRegression estimator is built
reg = 0.01

In [29]:
# Configure the linear regression estimator: 'features' (the default input
# column, spelled out here) predicts 'output', regularized by `reg`.
lin_Reg = LinearRegression(
    featuresCol='features',
    labelCol='output',
    regParam=reg,
)

In [30]:
# Fit the linear regression estimator on the training split.
lr_model = lin_Reg.fit(train_df)

In [31]:
# Intercept of the fitted model
lr_model.intercept

0.011465069326171773

In [32]:
# Coefficients of the fitted model (one per input feature)
lr_model.coefficients

DenseVector([0.0001, 0.0001, 0.0004, 0.2526, 0.4707])

In [33]:
# Evaluate the fitted model on the training split. Despite its name, this
# variable holds an evaluation summary (metrics such as meanSquaredError and
# r2 are read from it below).
training_predictions = lr_model.evaluate(train_df)

In [34]:
# Mean squared error on the training split
training_predictions.meanSquaredError

0.00019144601401315477

In [35]:
# Coefficient of determination (R^2) on the training split
training_predictions.r2

0.8216224494227173

### Evaluate Model

In [36]:
# Evaluate the fitted model on the held-out test split; the result is an
# evaluation summary whose residuals and metrics are inspected below.
test_results = lr_model.evaluate(test_df)

In [37]:
# View the first 10 residuals from the test-set evaluation, without truncation
test_results.residuals.show(10,False)

+----------------------+
|residuals             |
+----------------------+
|0.002382055197662103  |
|-0.014547863851062492 |
|-0.012039623917925624 |
|-0.011109420628882039 |
|-0.002616531852948356 |
|-0.009854073751058456 |
|-0.008927654496298232 |
|-2.0828312883258704E-4|
|-0.013569479790375094 |
|-0.001270181110074986 |
+----------------------+
only showing top 10 rows





In [38]:
# Coefficient of determination (R^2) on the test split
test_results.r2

0.8143179975922307

In [39]:
# Root mean squared error (RMSE) on the test split
test_results.rootMeanSquaredError

0.014908637721399215

In [40]:
# Mean squared error (MSE) on the test split
test_results.meanSquaredError

0.0002222674787079276