### Examples of PySpark ML

In [2]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

24/04/04 10:02:41 WARN Utils: Your hostname, Phoenxs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.102.52 instead (on interface en0)
24/04/04 10:02:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/04 10:02:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the sample CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('table6.csv')
)
training.show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Alice| 30|         5| 60000|
|  Bob| 28|         3| 50000|
|Carol| 35|         8| 75000|
|David| 32|         8| 65000|
|Emily| 27|         2| 48000|
+-----+---+----------+------+



In [6]:
# Print the column types Spark inferred while reading the CSV (inferSchema=True).
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# List the column names; Age and Experience are used as predictors below,
# and Salary as the label.
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [9]:
# Combine the predictor columns (Age, Experience) into a single vector column
# named 'Independent Features', which the regression cell uses as its features input.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol='Independent Features',
)

In [12]:
# Apply the assembler: the result keeps the original columns and appends the
# 'Independent Features' vector column built from Age and Experience.
output = assembler.transform(training)
output.show()

+-----+---+----------+------+--------------------+
| Name|Age|Experience|Salary|Independent Features|
+-----+---+----------+------+--------------------+
|Alice| 30|         5| 60000|          [30.0,5.0]|
|  Bob| 28|         3| 50000|          [28.0,3.0]|
|Carol| 35|         8| 75000|          [35.0,8.0]|
|David| 32|         8| 65000|          [32.0,8.0]|
|Emily| 27|         2| 48000|          [27.0,2.0]|
+-----+---+----------+------+--------------------+



In [14]:
# Keep only the two columns the regression needs: the assembled feature
# vector and the Salary label.
finalized_data = output.select(
    'Independent Features',
    'Salary',
)
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [30.0,5.0]| 60000|
|          [28.0,3.0]| 50000|
|          [35.0,8.0]| 75000|
|          [32.0,8.0]| 65000|
|          [27.0,2.0]| 48000|
+--------------------+------+



In [19]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly 25% of the rows for evaluation. The weights are only
# approximate targets, so on a dataset this small the partition sizes can vary.
# Fixing the seed makes the split repeatable for the same input data, so a
# Restart & Run All reproduces the same fit and the same metrics.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Give the unfitted estimator its own name so that `regressor` refers only to
# the fitted model inspected by the following cells.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')
regressor = lr_estimator.fit(train_data)

24/04/04 11:04:56 WARN Instrumentation: [0c26df4e] regParam is zero, which might cause numerical instability and overfitting.


In [21]:
# Fitted weights, one per assembled input in inputCols order (Age, Experience).
regressor.coefficients

DenseVector([1500.0, 2500.0])

In [22]:
# Fitted intercept (bias term) of the linear model.
regressor.intercept

2499.9999999711763

In [26]:
# Score the held-out rows; the returned summary exposes a `predictions`
# DataFrame that adds a `prediction` column next to the features and label.
pred_results = regressor.evaluate(test_data)
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [28.0,3.0]| 50000|51999.999999999985|
|          [32.0,8.0]| 65000| 70499.99999999681|
+--------------------+------+------------------+



In [27]:
# Error metrics on the held-out rows, displayed as a (MAE, MSE) tuple.
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(3749.9999999983993, 17124999.999982443)