# using salary and age predict the salary
# Example 0f Pyspark ML

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Missing').getOrCreate()

In [3]:
## in the Spark ML there are two techniques 
## 1. RDD technique,2.DataFrame API

In [4]:
import csv
import random

with open('employee_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['name', 'salary', 'age', 'experience'])
    for i in range(20):
        name = f'Employee{i+1}'
        salary = random.randint(40000, 100000)
        age = random.randint(25, 50)
        experience = random.randint(1, 11)
        writer.writerow([name, salary, age, experience])


In [5]:
# read the dataset 
training = spark.read.csv('employee_data.csv',header=True,inferSchema=True)

In [6]:
training.show()

+----------+------+---+----------+
|      name|salary|age|experience|
+----------+------+---+----------+
| Employee1| 53099| 31|         4|
| Employee2| 56674| 48|         4|
| Employee3| 52554| 34|         1|
| Employee4| 40455| 28|         5|
| Employee5| 67486| 28|         5|
| Employee6| 84934| 36|        10|
| Employee7| 94830| 37|         8|
| Employee8| 73575| 26|        11|
| Employee9| 65166| 41|         2|
|Employee10| 98356| 43|        10|
|Employee11| 93948| 49|         1|
|Employee12| 56675| 33|         8|
|Employee13| 93428| 43|         4|
|Employee14| 72495| 30|         5|
|Employee15| 74599| 27|         4|
|Employee16| 89258| 50|         4|
|Employee17| 41857| 29|         2|
|Employee18| 80670| 25|         6|
|Employee19| 97576| 50|         7|
|Employee20| 50685| 31|         8|
+----------+------+---+----------+



In [7]:
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [8]:
training.columns

['name', 'salary', 'age', 'experience']

In [9]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['age','salary'],outputCol='Independent Features')

In [10]:
output=featureassembler.transform(training)

In [11]:
output.show()

+----------+------+---+----------+--------------------+
|      name|salary|age|experience|Independent Features|
+----------+------+---+----------+--------------------+
| Employee1| 53099| 31|         4|      [31.0,53099.0]|
| Employee2| 56674| 48|         4|      [48.0,56674.0]|
| Employee3| 52554| 34|         1|      [34.0,52554.0]|
| Employee4| 40455| 28|         5|      [28.0,40455.0]|
| Employee5| 67486| 28|         5|      [28.0,67486.0]|
| Employee6| 84934| 36|        10|      [36.0,84934.0]|
| Employee7| 94830| 37|         8|      [37.0,94830.0]|
| Employee8| 73575| 26|        11|      [26.0,73575.0]|
| Employee9| 65166| 41|         2|      [41.0,65166.0]|
|Employee10| 98356| 43|        10|      [43.0,98356.0]|
|Employee11| 93948| 49|         1|      [49.0,93948.0]|
|Employee12| 56675| 33|         8|      [33.0,56675.0]|
|Employee13| 93428| 43|         4|      [43.0,93428.0]|
|Employee14| 72495| 30|         5|      [30.0,72495.0]|
|Employee15| 74599| 27|         4|      [27.0,74

In [12]:
output.columns

['name', 'salary', 'age', 'experience', 'Independent Features']

In [13]:
finalized_data=output.select('Independent features','salary')

In [14]:
finalized_data.show()

+--------------------+------+
|Independent features|salary|
+--------------------+------+
|      [31.0,53099.0]| 53099|
|      [48.0,56674.0]| 56674|
|      [34.0,52554.0]| 52554|
|      [28.0,40455.0]| 40455|
|      [28.0,67486.0]| 67486|
|      [36.0,84934.0]| 84934|
|      [37.0,94830.0]| 94830|
|      [26.0,73575.0]| 73575|
|      [41.0,65166.0]| 65166|
|      [43.0,98356.0]| 98356|
|      [49.0,93948.0]| 93948|
|      [33.0,56675.0]| 56675|
|      [43.0,93428.0]| 93428|
|      [30.0,72495.0]| 72495|
|      [27.0,74599.0]| 74599|
|      [50.0,89258.0]| 89258|
|      [29.0,41857.0]| 41857|
|      [25.0,80670.0]| 80670|
|      [50.0,97576.0]| 97576|
|      [31.0,50685.0]| 50685|
+--------------------+------+



In [15]:
from pyspark.ml.regression import LinearRegression
# train test data
train_data,test_data=finalized_data.randomSplit([0.75,0.25])
regressor = LinearRegression(featuresCol='Independent features', labelCol ='salary')
regressor = regressor.fit(train_data)

In [16]:
## coefficients
regressor.coefficients

DenseVector([-0.0, 1.0])

In [17]:
## Intercepts
regressor.intercept

-6.104876229065598e-12

In [18]:
## prediction
pred_results = regressor.evaluate(test_data)

In [19]:
pred_results.predictions.show()

+--------------------+------+-----------------+
|Independent features|salary|       prediction|
+--------------------+------+-----------------+
|      [27.0,74599.0]| 74599|74599.00000000001|
|      [29.0,41857.0]| 41857|41856.99999999999|
|      [36.0,84934.0]| 84934|84934.00000000001|
|      [49.0,93948.0]| 93948|93948.00000000001|
+--------------------+------+-----------------+



In [20]:
pred_results.meanAbsoluteError

1.2732925824820995e-11

In [21]:
pred_results.meanSquaredError

1.7205356741102973e-22