# PySpark Simple Linear Regression

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse an already-running) Spark session under a fixed application name.
spark = (
    SparkSession.builder
    .appName('simple Regressor')
    .getOrCreate()
)

In [3]:
# Load the student CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
student = spark.read.csv(
    'Student_Grades_Data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the inferred column names and types to confirm the CSV parsed as expected.
student.printSchema()

root
 |-- Time_to_Study: integer (nullable = true)
 |-- Grades: double (nullable = true)



In [8]:


# Preview the first five rows of the raw data.
student.show(5)

+-------------+------+
|Time_to_Study|Grades|
+-------------+------+
|            1|   1.5|
|            5|   2.7|
|            7|   3.1|
|            3|   2.1|
|            2|   1.8|
+-------------+------+
only showing top 5 rows



## Feature selection: X variable

In [12]:
from pyspark.ml.feature import VectorAssembler

In [15]:
# Pack the single predictor column into a vector column named 'X_features',
# which is the column the regression below reads its features from.
va = VectorAssembler(
    inputCols=['Time_to_Study'],
    outputCol='X_features',
)

In [16]:
# Append the assembled 'X_features' vector column to the student rows;
# the original columns are kept alongside it.
data_features = va.transform(student)

In [17]:
# With no argument, show() prints up to the first 20 rows.
data_features.show()

+-------------+------+----------+
|Time_to_Study|Grades|X_features|
+-------------+------+----------+
|            1|   1.5|     [1.0]|
|            5|   2.7|     [5.0]|
|            7|   3.1|     [7.0]|
|            3|   2.1|     [3.0]|
|            2|   1.8|     [2.0]|
|            9|   3.9|     [9.0]|
|            6|   2.9|     [6.0]|
|           12|   4.5|    [12.0]|
|           11|   4.3|    [11.0]|
|            2|   1.8|     [2.0]|
|            4|   2.4|     [4.0]|
|            8|   3.5|     [8.0]|
|           13|   4.8|    [13.0]|
|            9|   3.9|     [9.0]|
|           14|   5.0|    [14.0]|
|           10|   4.1|    [10.0]|
|            6|   2.9|     [6.0]|
|           12|   4.5|    [12.0]|
|            1|   1.5|     [1.0]|
|            4|   2.4|     [4.0]|
+-------------+------+----------+
only showing top 20 rows



## Split into train_data and test_data

In [18]:
# Randomly split the rows into ~70% training and ~30% test data. The weights are
# sampling proportions, so the resulting row counts are approximate, not exact.
# A fixed seed keeps the split (and every metric computed from it) repeatable
# across "Restart & Run All" for the same input data; without it each run
# trains and evaluates on different rows.
train_data, test_data = data_features.randomSplit([0.70, 0.30], seed=42)

In [19]:
# Summary statistics (count, mean, stddev, min, max) for the training split.
train_data.describe().show()

+-------+-----------------+------------------+
|summary|    Time_to_Study|            Grades|
+-------+-----------------+------------------+
|  count|               39|                39|
|   mean|7.076923076923077|3.2153846153846146|
| stddev|4.093541867410533|1.1172189300454147|
|    min|                1|               1.5|
|    max|               14|               5.0|
+-------+-----------------+------------------+



In [20]:
# Same summary statistics for the test split, to compare its distribution
# against the training split.
test_data.describe().show()

+-------+------------------+------------------+
|summary|     Time_to_Study|            Grades|
+-------+------------------+------------------+
|  count|                11|                11|
|   mean|7.2727272727272725|3.2454545454545456|
| stddev| 4.076540422733696|1.1120824037486037|
|    min|                 1|               1.5|
|    max|                14|               5.0|
+-------+------------------+------------------+



In [22]:
# Preview the first five training rows, including the assembled feature vector.
train_data.show(5)

+-------------+------+----------+
|Time_to_Study|Grades|X_features|
+-------------+------+----------+
|            1|   1.5|     [1.0]|
|            1|   1.5|     [1.0]|
|            1|   1.5|     [1.0]|
|            2|   1.8|     [2.0]|
|            2|   1.8|     [2.0]|
+-------------+------+----------+
only showing top 5 rows



## Linear Regressor

In [21]:
from pyspark.ml.regression import LinearRegression

In [25]:
# Naming note: `model` is the *unfitted* estimator configuration, while `lr`
# is the *fitted* model returned by fit(); later cells read the learned
# parameters and predictions from `lr`.
model = LinearRegression(
    featuresCol='X_features',
    labelCol='Grades',
)
lr = model.fit(train_data)

### Predict

In [26]:
# Score the held-out test split with the fitted model.
# NOTE(review): despite its name, `yhat` is not the predictions themselves; it is
# the evaluation result object, and the scored rows are read from `yhat.predictions`.
yhat = lr.evaluate(test_data)

In [28]:
# Show the test rows alongside the model's `prediction` column.
yhat.predictions.show()

+-------------+------+----------+------------------+
|Time_to_Study|Grades|X_features|        prediction|
+-------------+------+----------+------------------+
|            1|   1.5|     [1.0]|1.5600507368929701|
|            2|   1.8|     [2.0]|1.8324474510751396|
|            4|   2.4|     [4.0]|2.3772408794394786|
|            6|   2.9|     [6.0]|2.9220343078038176|
|            7|   3.1|     [7.0]| 3.194431021985987|
|            7|   3.1|     [7.0]| 3.194431021985987|
|            8|   3.5|     [8.0]|3.4668277361681565|
|            8|   3.5|     [8.0]|3.4668277361681565|
|           10|   4.1|    [10.0]| 4.011621164532496|
|           13|   4.8|    [13.0]| 4.828811307079004|
|           14|   5.0|    [14.0]| 5.101208021261174|
+-------------+------+----------+------------------+



In [29]:
# Fitted coefficient vector; there is one entry because 'X_features' holds a single feature.
lr.coefficients

DenseVector([0.2724])

In [30]:
# Fitted intercept term of the regression line.
lr.intercept

1.2876540227108006

### Evaluate Model Using Metrics

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator

In [32]:
# Evaluator that compares the 'prediction' column against the true 'Grades' label.
# The metric to compute is supplied per call in the cells below.
evaluation = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='Grades',
)

In [33]:
# Root mean squared error on the test predictions. The metric is chosen through
# the per-call param map passed as the second argument.
rmse = evaluation.evaluate(
    yhat.predictions,
    {evaluation.metricName: 'rmse'},
)
rmse

0.06366219365856986

In [34]:
# Mean squared error on the test predictions.
mse = evaluation.evaluate(
    yhat.predictions,
    {evaluation.metricName: 'mse'},
)
mse

0.0040528749014212525

In [35]:
# Mean absolute error on the test predictions.
mae = evaluation.evaluate(
    yhat.predictions,
    {evaluation.metricName: 'mae'},
)
mae

0.05553603197961739

In [36]:
# R^2 (coefficient of determination) on the test predictions.
r2 = evaluation.evaluate(
    yhat.predictions,
    {evaluation.metricName: 'r2'},
)
r2

0.9963951935969423

In [None]:
# Release the Spark session's resources now that the analysis is finished.
# Re-running the session-creation cell at the top starts a fresh session.
spark.stop()