# Examples Of Pyspark ML

## 1. Dependencies

In [1]:
from pyspark.sql import SparkSession

## 2. Create a session

In [2]:
spark = SparkSession.builder.appName('ml_exmple').getOrCreate()

## 3. Read dataset

In [3]:
df = spark.read.csv('test1.csv', header=True, inferSchema=True)

In [4]:
df.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [5]:
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



## Select Features or combine

In [6]:
from pyspark.ml.feature import VectorAssembler

In [7]:
feature_assembler = VectorAssembler(inputCols=['age', 'Experience'], outputCol='X_features')

In [8]:
ouput_features = feature_assembler.transform(df)

In [9]:
ouput_features.show()

+---------+---+----------+------+-----------+
|     Name|age|Experience|Salary| X_features|
+---------+---+----------+------+-----------+
|    Krish| 31|        10| 30000|[31.0,10.0]|
|Sudhanshu| 30|         8| 25000| [30.0,8.0]|
|    Sunny| 29|         4| 20000| [29.0,4.0]|
|     Paul| 24|         3| 20000| [24.0,3.0]|
|   Harsha| 21|         1| 15000| [21.0,1.0]|
|  Shubham| 23|         2| 18000| [23.0,2.0]|
+---------+---+----------+------+-----------+



In [10]:
ouput_features.columns

['Name', 'age', 'Experience', 'Salary', 'X_features']

In [11]:
finaled_data = ouput_features.select('X_features', 'Salary')

In [12]:
finaled_data.show()

+-----------+------+
| X_features|Salary|
+-----------+------+
|[31.0,10.0]| 30000|
| [30.0,8.0]| 25000|
| [29.0,4.0]| 20000|
| [24.0,3.0]| 20000|
| [21.0,1.0]| 15000|
| [23.0,2.0]| 18000|
+-----------+------+



## Split into train and test set

In [28]:
train_data, test_data = finaled_data.randomSplit([0.70, 0.30])

In [29]:
train_data.describe().show()

+-------+-----------------+
|summary|           Salary|
+-------+-----------------+
|  count|                5|
|   mean|          22600.0|
| stddev|4878.524367060188|
|    min|            18000|
|    max|            30000|
+-------+-----------------+



In [30]:
test_data.describe().show()

+-------+-------+
|summary| Salary|
+-------+-------+
|  count|      1|
|   mean|15000.0|
| stddev|    NaN|
|    min|  15000|
|    max|  15000|
+-------+-------+



In [36]:
test_data.show()

+----------+------+
|X_features|Salary|
+----------+------+
|[21.0,1.0]| 15000|
+----------+------+



## Build a model

In [13]:
from pyspark.ml.regression import LinearRegression

In [31]:
lr = LinearRegression(featuresCol='X_features',labelCol='Salary')
lr = lr.fit(train_data)

In [32]:
lr.intercept

22295.299605312008

In [33]:
lr.coefficients

DenseVector([-323.2867, 1696.8066])

In [34]:
yhat = lr.evaluate(test_data)

In [35]:
yhat.predictions.show()

+----------+------+------------------+
|X_features|Salary|        prediction|
+----------+------+------------------+
|[21.0,1.0]| 15000|17203.085755292603|
+----------+------+------------------+



In [38]:
yhat.meanAbsoluteError, yhat.meanAbsoluteError

(2203.0857552926027, 2203.0857552926027)