### PySpark ML

In [27]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [20]:
spark = SparkSession \
        .builder \
        .appName('ML_exercise') \
        .getOrCreate()

In [21]:
# read the dateset
training_data = spark \
                .read \
                .csv('test_dataset_03.csv', header=True, inferSchema=True)

training_data.show()
training_data.printSchema()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Tarek| 23|         5| 30000|
|  Forid| 24|         8| 25000|
|  Ridoy| 24|        10| 20000|
|  Imran| 25|         7| 20000|
| Saiful| 27|         4| 15000|
|   Mitu| 23|         2| 18000|
|Khadiza| 27|         9| 48000|
|   Ritu| 24|        10| 38000|
|   Riya| 27|         7| 29000|
+-------+---+----------+------+

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [22]:
# columns list
training_data.columns

['Name', 'Age', 'Experience', 'Salary']

In [24]:
feature_assembler = VectorAssembler(inputCols=['Age', 'Experience'], outputCol="Independent Features")

In [25]:
output = feature_assembler.transform(training_data)
output.show()

+-------+---+----------+------+--------------------+
|   Name|Age|Experience|Salary|Independent Features|
+-------+---+----------+------+--------------------+
|  Tarek| 23|         5| 30000|          [23.0,5.0]|
|  Forid| 24|         8| 25000|          [24.0,8.0]|
|  Ridoy| 24|        10| 20000|         [24.0,10.0]|
|  Imran| 25|         7| 20000|          [25.0,7.0]|
| Saiful| 27|         4| 15000|          [27.0,4.0]|
|   Mitu| 23|         2| 18000|          [23.0,2.0]|
|Khadiza| 27|         9| 48000|          [27.0,9.0]|
|   Ritu| 24|        10| 38000|         [24.0,10.0]|
|   Riya| 27|         7| 29000|          [27.0,7.0]|
+-------+---+----------+------+--------------------+



In [26]:
finalized_data = output.select("Independent Features", "Salary")
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [23.0,5.0]| 30000|
|          [24.0,8.0]| 25000|
|         [24.0,10.0]| 20000|
|          [25.0,7.0]| 20000|
|          [27.0,4.0]| 15000|
|          [23.0,2.0]| 18000|
|          [27.0,9.0]| 48000|
|         [24.0,10.0]| 38000|
|          [27.0,7.0]| 29000|
+--------------------+------+



In [30]:
# train & test dataset
train_data, test_data = finalized_data.randomSplit([0.75, 0.25])

regressor = LinearRegression(featuresCol='Independent Features',
                            labelCol='Salary')
regressor = regressor.fit(train_data)

In [31]:
# coefficients
regressor.coefficients

DenseVector([1078.9474, -70.1754])

In [33]:
# intercepts
regressor.intercept

-2131.5789473686973

In [34]:
# prediction
pred_result = regressor.evaluate(test_data)

In [37]:
# show prediction result
pred_result.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|         [24.0,10.0]| 38000|23061.403508771917|
|          [27.0,4.0]| 15000|26719.298245614085|
|          [27.0,9.0]| 48000|26368.421052631606|
+--------------------+------+------------------+

