# Simple linear regression

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session named 'training' that the rest of the notebook reads through.
spark = (
    SparkSession.builder
    .appName('training')
    .getOrCreate()
)

In [8]:
# Load test.csv, taking column names from the first row and letting Spark infer column types.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('test.csv')
)

In [9]:
# Print the loaded DataFrame's rows as a text table.
training.show()

+------+---+---------+------+
|  Name|age|Expreince|salary|
+------+---+---------+------+
|Mayank| 20|        2| 10000|
| vikki| 18|        1| 14000|
| shyam| 22|        3| 12000|
|   ram| 17|        8| 20000|
|radhey| 24|        5| 25000|
|  ramu| 18|        7| 20000|
| karan| 30|       12| 32000|
|  aman| 32|       14| 15000|
+------+---+---------+------+



In [12]:
# Print each column's name and the type inferred while reading the CSV.
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Expreince: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [13]:
# List the column names; 'age' and 'Expreince' are the ones assembled into features below.
training.columns

['Name', 'age', 'Expreince', 'salary']

In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
output=VectorAssembler(inputCols=['age','Expreince'],outputCol='independent feature')

In [16]:
output=output.transform(training)

In [17]:
# Show the DataFrame with the assembled 'independent feature' column appended.
output.show()

+------+---+---------+------+-------------------+
|  Name|age|Expreince|salary|independent feature|
+------+---+---------+------+-------------------+
|Mayank| 20|        2| 10000|         [20.0,2.0]|
| vikki| 18|        1| 14000|         [18.0,1.0]|
| shyam| 22|        3| 12000|         [22.0,3.0]|
|   ram| 17|        8| 20000|         [17.0,8.0]|
|radhey| 24|        5| 25000|         [24.0,5.0]|
|  ramu| 18|        7| 20000|         [18.0,7.0]|
| karan| 30|       12| 32000|        [30.0,12.0]|
|  aman| 32|       14| 15000|        [32.0,14.0]|
+------+---+---------+------+-------------------+



In [18]:
# Keep only the assembled feature vector and the 'salary' column for modelling.
finalized_data = output.select(['independent feature', 'salary'])

In [19]:
# Show the two columns that remain after the select.
finalized_data.show()

+-------------------+------+
|independent feature|salary|
+-------------------+------+
|         [20.0,2.0]| 10000|
|         [18.0,1.0]| 14000|
|         [22.0,3.0]| 12000|
|         [17.0,8.0]| 20000|
|         [24.0,5.0]| 25000|
|         [18.0,7.0]| 20000|
|        [30.0,12.0]| 32000|
|        [32.0,14.0]| 15000|
+-------------------+------+



In [22]:
# Split roughly 75% / 25% into training and test sets.
# A fixed seed makes the split, and every metric computed from it below,
# reproducible across kernel restarts; without it each run samples different rows.
# NOTE(review): randomSplit samples rows randomly, so the split sizes are only
# approximate. With a dataset this small the test split can be tiny or empty.
train_data,test_data=finalized_data.randomSplit([0.75,0.25],seed=42)

In [23]:
from pyspark.ml.regression import LinearRegression

In [24]:
regressor=LinearRegression(featuresCol='independent feature',labelCol='salary')

In [25]:
regressor=regressor.fit(train_data)

In [26]:
# Evaluate the fitted model on the test split; the returned summary carries the
# predictions and error metrics read in the cells below.
pred_result=regressor.evaluate(test_data)

In [30]:
# Show each test row's features and actual salary next to the model's prediction.
pred_result.predictions.show()


+-------------------+------+------------------+
|independent feature|salary|        prediction|
+-------------------+------+------------------+
|         [18.0,1.0]| 14000|19034.547004768367|
|         [20.0,2.0]| 10000| 18561.18356648877|
|        [30.0,12.0]| 32000|16748.255281523354|
+-------------------+------+------------------+



In [31]:
# Mean absolute error of the predictions on test_data.
pred_result.meanAbsoluteError

9615.825096577928

In [32]:
# Mean squared error of the predictions on test_data.
pred_result.meanSquaredError

110418748.18663992

In [33]:
# Fitted coefficients, in the order the features were assembled: ['age', 'Expreince'].
regressor.coefficients

DenseVector([-292.0706, 110.7778])

In [34]:
# Fitted intercept term of the linear model.
regressor.intercept

24181.040199576848