In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Mlb").getOrCreate()

In [2]:
df = spark.read.csv(r"C:\Users\pavan\OneDrive\Desktop\prep\datasets\test3.csv", header = True, inferSchema = True)

In [3]:
df.show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|   john| 31|        10| 10000|
|Stephen| 30|         8|  5000|
|  Stacy| 29|         4|  4000|
|   paul| 24|         3|  3000|
|  Krish| 21|         1| 20000|
|    Joe| 23|         2| 10000|
|Stephen| 22|         2|  5000|
|   jack| 19|         1| 10000|
|  Krish| 20|         2|  8000|
+-------+---+----------+------+



In [4]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[Name: string, Age: int, Experience: int, Salary: int]>

In [5]:
df.columns

['Name', 'Age', 'Experience', 'Salary']

[Age,Experience] ---> new feature --> independent feature

In [6]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols = ['Age', 'Experience'], outputCol="Independent Feature")

In [8]:
output = featureassembler.transform(df)

In [9]:
output.show()

+-------+---+----------+------+-------------------+
|   Name|Age|Experience|Salary|Independent Feature|
+-------+---+----------+------+-------------------+
|   john| 31|        10| 10000|        [31.0,10.0]|
|Stephen| 30|         8|  5000|         [30.0,8.0]|
|  Stacy| 29|         4|  4000|         [29.0,4.0]|
|   paul| 24|         3|  3000|         [24.0,3.0]|
|  Krish| 21|         1| 20000|         [21.0,1.0]|
|    Joe| 23|         2| 10000|         [23.0,2.0]|
|Stephen| 22|         2|  5000|         [22.0,2.0]|
|   jack| 19|         1| 10000|         [19.0,1.0]|
|  Krish| 20|         2|  8000|         [20.0,2.0]|
+-------+---+----------+------+-------------------+



In [10]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Feature']

In [13]:
finalised_data = output.select(['Independent feature', 'Salary'])

In [16]:
finalised_data.show()

+-------------------+------+
|Independent feature|Salary|
+-------------------+------+
|        [31.0,10.0]| 10000|
|         [30.0,8.0]|  5000|
|         [29.0,4.0]|  4000|
|         [24.0,3.0]|  3000|
|         [21.0,1.0]| 20000|
|         [23.0,2.0]| 10000|
|         [22.0,2.0]|  5000|
|         [19.0,1.0]| 10000|
|         [20.0,2.0]|  8000|
+-------------------+------+



In [19]:
from pyspark.ml.regression import LinearRegression

#train_test_split
train_data,test_data = finalised_data.randomSplit([0.75,0.25])
regressor = LinearRegression(featuresCol = "Independent feature", labelCol = 'Salary')
regressor = regressor.fit(train_data)

In [23]:
#coefficeints
regressor.coefficients

DenseVector([2214.2857, -3597.1897])

In [22]:
regressor.intercept

-27158.07962528605

In [25]:
##Prediction
pred_results = regressor.evaluate(test_data)

In [26]:
pred_results.predictions.show()

+-------------------+------+------------------+
|Independent feature|Salary|        prediction|
+-------------------+------+------------------+
|         [22.0,2.0]|  5000|14361.826697891902|
|         [23.0,2.0]| 10000|16576.112412177255|
|         [24.0,3.0]|  3000|15193.208430912742|
|         [29.0,4.0]|  4000| 22667.44730678963|
+-------------------+------+------------------+



In [None]:
pred_results