In [2]:
from pyspark.sql import SparkSession


In [4]:
spark = SparkSession.builder.appName('mllib').getOrCreate() 

In [5]:
spark


In [11]:
df_pd = spark.read.csv('../data/friends.csv',header=True,inferSchema=True)
df_pd.show()



+---------+---+------+----------+------+
|     name|age|gender|experience|salary|
+---------+---+------+----------+------+
|   aditya| 26|     m|         4| 50000|
|  nainesh| 25|     m|         3| 40000|
|    nitin| 25|     m|         2| 45000|
|   snehal| 24|     f|         1| 25000|
|samruddhi| 25|     f|         2| 45000|
|  prajwal| 27|     m|         4| 35000|
|   sanket| 23|     m|         1| 20000|
|    viraj| 25|     m|         2| 37000|
|     amol| 22|     m|         2| 25000|
|   ashish| 26|     m|         4| 23000|
+---------+---+------+----------+------+



In [13]:
df_pd.columns

['name', 'age', 'gender', 'experience', 'salary']

In [17]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols=['age','experience'],outputCol='Independent features')

In [19]:
output = featureassembler.transform(df_pd)

In [21]:
output.columns

['name', 'age', 'gender', 'experience', 'salary', 'Independent features']

In [22]:
output.show()

+---------+---+------+----------+------+--------------------+
|     name|age|gender|experience|salary|Independent features|
+---------+---+------+----------+------+--------------------+
|   aditya| 26|     m|         4| 50000|          [26.0,4.0]|
|  nainesh| 25|     m|         3| 40000|          [25.0,3.0]|
|    nitin| 25|     m|         2| 45000|          [25.0,2.0]|
|   snehal| 24|     f|         1| 25000|          [24.0,1.0]|
|samruddhi| 25|     f|         2| 45000|          [25.0,2.0]|
|  prajwal| 27|     m|         4| 35000|          [27.0,4.0]|
|   sanket| 23|     m|         1| 20000|          [23.0,1.0]|
|    viraj| 25|     m|         2| 37000|          [25.0,2.0]|
|     amol| 22|     m|         2| 25000|          [22.0,2.0]|
|   ashish| 26|     m|         4| 23000|          [26.0,4.0]|
+---------+---+------+----------+------+--------------------+



In [24]:
finalized_data = output.select('Independent features','salary')

In [28]:
finalized_data.show()

+--------------------+------+
|Independent features|salary|
+--------------------+------+
|          [26.0,4.0]| 50000|
|          [25.0,3.0]| 40000|
|          [25.0,2.0]| 45000|
|          [24.0,1.0]| 25000|
|          [25.0,2.0]| 45000|
|          [27.0,4.0]| 35000|
|          [23.0,1.0]| 20000|
|          [25.0,2.0]| 37000|
|          [22.0,2.0]| 25000|
|          [26.0,4.0]| 23000|
+--------------------+------+



In [30]:
from pyspark.ml.regression import LinearRegression
# splitttingg data into train and test data
train,test  = finalized_data.randomSplit([0.75,0.25])
regressor = LinearRegression(featuresCol='Independent features',labelCol='salary')
regressor = regressor.fit(train)

In [35]:
# coefficients
regressor.coefficients

DenseVector([4415.2542, 220.339])

In [36]:
# intercepts
regressor.intercept

-72703.38983050824

In [39]:
results = regressor.evaluate(test)

In [43]:
results.predictions.show()

+--------------------+------+-----------------+
|Independent features|salary|       prediction|
+--------------------+------+-----------------+
|          [24.0,1.0]| 25000|33483.05084745771|
|          [26.0,4.0]| 23000|42974.57627118645|
+--------------------+------+-----------------+



In [45]:
results.meanAbsoluteError, results.meanSquaredError

(14228.81355932208, 235472924.44699886)