**Example of PySpark ML**

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell reaches through `spark`.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [None]:
# Load the sample sheet: the first line is the header and column types are inferred.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/sheetforspark - Sheet1.csv')
)

In [None]:
# Print the loaded rows as a text table to eyeball the data.
training.show()

+------+---+-----------+------+
|  name|age|experience |salary|
+------+---+-----------+------+
| nakib| 31|         10| 30000|
| sunny| 30|          8| 25000|
|  paul| 39|          4| 20000|
|raihan| 24|          3| 15000|
| tamim| 23|          1| 20000|
|  naim| 42|          2| 18000|
+------+---+-----------+------+



In [None]:
# Print each column's name, inferred type and nullability.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience : integer (nullable = true)
 |-- salary: integer (nullable = true)



In [None]:
# List the column names. Note that the experience header carries a trailing
# space ('experience '), so later cells must refer to it by that exact name.
training.columns

['name', 'age', 'experience ', 'salary']

Group the independent features into a single vector column using `VectorAssembler`. \
['age', 'experience '] --> one assembled vector column --> independent features (note that the experience column name carries a trailing space in this dataset).

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack the two numeric predictors into one vector column.
# 'experience ' keeps its trailing space on purpose: it must match the
# column name exactly as it was read from the CSV header.
featureassembler = VectorAssembler(
    inputCols=['age', 'experience '],
    outputCol="Independent Features",
)

In [None]:
# Apply the assembler to `training`; `output` has the original columns plus
# the assembled "Independent Features" vector column.
output=featureassembler.transform(training)

In [None]:
# Print the rows with the assembled feature vector next to the source columns.
output.show()

+------+---+-----------+------+--------------------+
|  name|age|experience |salary|Independent Features|
+------+---+-----------+------+--------------------+
| nakib| 31|         10| 30000|         [31.0,10.0]|
| sunny| 30|          8| 25000|          [30.0,8.0]|
|  paul| 39|          4| 20000|          [39.0,4.0]|
|raihan| 24|          3| 15000|          [24.0,3.0]|
| tamim| 23|          1| 20000|          [23.0,1.0]|
|  naim| 42|          2| 18000|          [42.0,2.0]|
+------+---+-----------+------+--------------------+



In [None]:
# Confirm that "Independent Features" was appended to the column list.
output.columns

['name', 'age', 'experience ', 'salary', 'Independent Features']

In [None]:
# Keep only what the regression needs: the feature vector and the salary label.
finalized_data = output.select(
    "Independent Features",
    "salary",
)

In [None]:
# Print the two-column modelling table (features, label).
finalized_data.show()

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [39.0,4.0]| 20000|
|          [24.0,3.0]| 15000|
|          [23.0,1.0]| 20000|
|          [42.0,2.0]| 18000|
+--------------------+------+



In [None]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly a quarter of the rows for evaluation. The seed pins the
# random split so the fitted coefficients and the error metrics are the same
# after every Restart & Run All; without it each run trains on different rows.
# NOTE(review): the sample has only six rows, so the test split can end up
# with a single row (or none) depending on the seed - check the split sizes
# before trusting the metrics.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator and the fitted model under separate names.
# `regressor` stays bound to the fitted model because later cells read
# `regressor.coefficients`, `regressor.intercept` and `regressor.evaluate`.
linear_regression = LinearRegression(featuresCol="Independent Features", labelCol="salary")
regressor = linear_regression.fit(train_data)

In [None]:
# Fitted coefficients: one weight per entry of the feature vector
# (age first, then experience).
regressor.coefficients

DenseVector([-19.5763, 1345.1915])

In [None]:
# Intercept (constant term) of the fitted linear model.
regressor.intercept

15580.548137008445

In [None]:
# Evaluate the fitted model on the held-out test split. The returned summary
# object exposes the per-row predictions and the error metrics used below.
pred_result= regressor.evaluate(test_data)

In [None]:
# Print the test rows with the model's prediction alongside the true salary.
pred_result.predictions.show()

+--------------------+------+------------------+
|Independent Features|salary|        prediction|
+--------------------+------+------------------+
|          [42.0,2.0]| 18000|17448.725594420048|
+--------------------+------+------------------+



In [None]:
# Mean absolute error and mean squared error on the test split, shown as a tuple.
(
    pred_result.meanAbsoluteError,
    pred_result.meanSquaredError,
)

(551.2744055799521, 303903.47024752956)