Reference video: https://www.youtube.com/watch?v=_C8kWso4ne4&t=1046s

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("Pyspark ML")
    .getOrCreate()
)

In [3]:
# Load the sample data: the first row supplies the column names and the
# column types are inferred from the values.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('ML_test_data.csv')
)
training.show()

+------+---+----------+------+
|  name|age|experience|salary|
+------+---+----------+------+
| BLAKE| 24|         2| 20000|
| CLARK| 24|         2| 21000|
| SCOTT| 25|         4| 22000|
|  WARD| 26|         5| 23000|
|  KING| 26|         5| 25000|
| ALLEN| 27|         6| 30000|
|TURNER| 27|         7| 32000|
|MARTIN| 28|         7| 32000|
| SMITH| 31|        10| 50000|
| JONES| 33|        12| 70000|
+------+---+----------+------+



In [4]:
# Check the column types that inferSchema picked when the CSV was read.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
# Pack the two numeric inputs (age, experience) into a single vector column
# named 'independent feature'; salary is left out because it is the label.
from pyspark.ml.feature import VectorAssembler

feature_assembler = VectorAssembler(
    outputCol='independent feature',
    inputCols=['age', 'experience'],
)

In [9]:
# Apply the assembler: returns `training` with the 'independent feature'
# vector column appended; the original columns are kept.
output = feature_assembler.transform(training)

In [10]:
# Inspect the assembled frame to confirm each vector is [age, experience].
output.show()

+------+---+----------+------+-------------------+
|  name|age|experience|salary|independent feature|
+------+---+----------+------+-------------------+
| BLAKE| 24|         2| 20000|         [24.0,2.0]|
| CLARK| 24|         2| 21000|         [24.0,2.0]|
| SCOTT| 25|         4| 22000|         [25.0,4.0]|
|  WARD| 26|         5| 23000|         [26.0,5.0]|
|  KING| 26|         5| 25000|         [26.0,5.0]|
| ALLEN| 27|         6| 30000|         [27.0,6.0]|
|TURNER| 27|         7| 32000|         [27.0,7.0]|
|MARTIN| 28|         7| 32000|         [28.0,7.0]|
| SMITH| 31|        10| 50000|        [31.0,10.0]|
| JONES| 33|        12| 70000|        [33.0,12.0]|
+------+---+----------+------+-------------------+



In [11]:
# Keep only what the model needs: the assembled feature vector and the label.
finalised_data = output.select(
    ['independent feature', 'salary']
)

In [12]:
# Confirm the modelling frame has just the feature vector and the salary label.
finalised_data.show()

+-------------------+------+
|independent feature|salary|
+-------------------+------+
|         [24.0,2.0]| 20000|
|         [24.0,2.0]| 21000|
|         [25.0,4.0]| 22000|
|         [26.0,5.0]| 23000|
|         [26.0,5.0]| 25000|
|         [27.0,6.0]| 30000|
|         [27.0,7.0]| 32000|
|         [28.0,7.0]| 32000|
|        [31.0,10.0]| 50000|
|        [33.0,12.0]| 70000|
+-------------------+------+



# ML Packages

In [31]:
from pyspark.ml.regression import LinearRegression

# Train/test split: hold out roughly 25% of the rows for evaluation.
# randomSplit samples rows at random, so the 0.75/0.25 weights are only
# approximate on a dataset this small. The fixed seed makes the split (and
# therefore the fitted model and predictions) repeatable on Restart & Run All.
# NOTE(review): outputs recorded from an earlier unseeded run may differ.
train_data, test_data = finalised_data.randomSplit([0.75, 0.25], seed=42)

In [32]:
# Rows that the regression will be fitted on.
train_data.show()

+-------------------+------+
|independent feature|salary|
+-------------------+------+
|         [24.0,2.0]| 20000|
|         [24.0,2.0]| 21000|
|         [25.0,4.0]| 22000|
|         [26.0,5.0]| 23000|
|         [26.0,5.0]| 25000|
|         [27.0,6.0]| 30000|
|         [27.0,7.0]| 32000|
+-------------------+------+



In [33]:
# Held-out rows, used only for evaluation.
test_data.show()

+-------------------+------+
|independent feature|salary|
+-------------------+------+
|         [28.0,7.0]| 32000|
|        [31.0,10.0]| 50000|
|        [33.0,12.0]| 70000|
+-------------------+------+



In [34]:
# Configure the estimator and fit it in one expression, so `regressor` is only
# ever bound to the fitted model that the following cells inspect.
regressor = LinearRegression(
    labelCol='salary',
    featuresCol='independent feature',
).fit(train_data)

In [36]:
# Fitted coefficients, one per entry of the feature vector, in the order the
# assembler was given its inputs: [age, experience].
regressor.coefficients

DenseVector([2083.3333, 833.3333])

In [37]:
# Fitted intercept (constant term) of the linear model.
regressor.intercept

-32249.999999990352

In [38]:
# Score the fitted model on the held-out rows. evaluate() returns a summary
# object; its `.predictions` DataFrame is displayed in the next cell.
pred_results = regressor.evaluate(test_data)

In [40]:
# Compare actual salary with the model's `prediction` for each held-out row.
pred_results.predictions.show()

+-------------------+------+------------------+
|independent feature|salary|        prediction|
+-------------------+------+------------------+
|         [28.0,7.0]| 32000|31916.666666666322|
|        [31.0,10.0]| 50000| 40666.66666666586|
|        [33.0,12.0]| 70000|46499.999999998865|
+-------------------+------+------------------+

