In [2]:
from pyspark.sql import SparkSession
import os

In [3]:
Spark = SparkSession.builder.appName('project').getOrCreate()

In [60]:
pwd = os.getcwd()
file_path= os.path.join(pwd, 'sample-1.csv')
file_path

'c:\\Users\\Admin\\Videos\\work\\Portfolio\\10.pyspark\\Pyspark_liner_regression\\sample-1.csv'

In [61]:
training = Spark.read.csv(file_path, header= True, inferSchema=True)
training.show(10)

+--------------+---+----------+------+
|          Name|Age|Experience|Salary|
+--------------+---+----------+------+
|       Laraine| 32|       5.0| 90000|
|           Eli| 28|       3.0| 65000|
|         Arlin| 45|      15.0|150000|
|        Talbot| 36|       7.0| 60000|
|Sheila-kathryn| 52|      20.0|200000|
|          Curr| 29|       2.0| 55000|
|          Fina| 42|      12.0|120000|
|           Rod| 31|       4.0| 80000|
|          Mala| 26|       1.0| 45000|
|      Fiorenze| 38|      10.0|110000|
+--------------+---+----------+------+
only showing top 10 rows



In [62]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: double (nullable = true)
 |-- Salary: integer (nullable = true)



In [63]:
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [64]:
from pyspark.ml.feature import VectorAssembler
feature_assembler = VectorAssembler(inputCols=['Experience',  'Salary'], outputCol='Indipendent features')

In [65]:
feature_assembler

VectorAssembler_179d81e794be

In [66]:
output = feature_assembler.transform(training)

In [67]:
output.show(10)

+--------------+---+----------+------+--------------------+
|          Name|Age|Experience|Salary|Indipendent features|
+--------------+---+----------+------+--------------------+
|       Laraine| 32|       5.0| 90000|       [5.0,90000.0]|
|           Eli| 28|       3.0| 65000|       [3.0,65000.0]|
|         Arlin| 45|      15.0|150000|     [15.0,150000.0]|
|        Talbot| 36|       7.0| 60000|       [7.0,60000.0]|
|Sheila-kathryn| 52|      20.0|200000|     [20.0,200000.0]|
|          Curr| 29|       2.0| 55000|       [2.0,55000.0]|
|          Fina| 42|      12.0|120000|     [12.0,120000.0]|
|           Rod| 31|       4.0| 80000|       [4.0,80000.0]|
|          Mala| 26|       1.0| 45000|       [1.0,45000.0]|
|      Fiorenze| 38|      10.0|110000|     [10.0,110000.0]|
+--------------+---+----------+------+--------------------+
only showing top 10 rows



In [68]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Indipendent features']

In [69]:
finelized_data = output.select( 'Indipendent features','Salary')

In [70]:
finelized_data.show(5)

+--------------------+------+
|Indipendent features|Salary|
+--------------------+------+
|       [5.0,90000.0]| 90000|
|       [3.0,65000.0]| 65000|
|     [15.0,150000.0]|150000|
|       [7.0,60000.0]| 60000|
|     [20.0,200000.0]|200000|
+--------------------+------+
only showing top 5 rows



# linear regression

In [71]:
from pyspark.ml.regression import LinearRegression

In [72]:
train_data, test_data = finelized_data.randomSplit([0.8, 0.2])

In [73]:
regressor = LinearRegression(featuresCol= 'Indipendent features', labelCol='Salary' )
regressor= regressor.fit(train_data)

In [74]:
regressor.coefficients

DenseVector([0.0, 1.0])

In [75]:
regressor.intercept

1.9010943340238607e-11

In [76]:
pred_result = regressor.evaluate(test_data)

In [77]:
pred_result.predictions.show(10)

+--------------------+------+-----------------+
|Indipendent features|Salary|       prediction|
+--------------------+------+-----------------+
|       [1.5,35000.0]| 35000|35000.00000000001|
|       [2.0,40000.0]| 40000|40000.00000000001|
|       [2.0,40000.0]| 40000|40000.00000000001|
|       [2.0,40000.0]| 40000|40000.00000000001|
|       [2.0,50000.0]| 50000|          50000.0|
|       [3.0,45000.0]| 45000|45000.00000000001|
|       [3.0,50000.0]| 50000|50000.00000000001|
|       [3.0,50000.0]| 50000|50000.00000000001|
|       [3.0,55000.0]| 55000|55000.00000000001|
|       [3.0,75000.0]| 75000|74999.99999999999|
+--------------------+------+-----------------+
only showing top 10 rows



In [78]:
pred_result.meanAbsoluteError, pred_result.meanSquaredError

(1.3497428617615631e-11, 3.1610287524345266e-22)