In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [3]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Introduction to PySpark Mlib')
    .getOrCreate()
)
# Last expression of the cell: display the underlying SparkContext.
spark.sparkContext

In [5]:
# Training data: read the CSV with its first row as the header and let Spark
# infer the column types.
# NOTE(review): the training frame is loaded from a file named 'test3.csv' --
# confirm this is the intended path.
csv_reader = spark.read.options(header=True, inferSchema=True)
train = csv_reader.csv('test3.csv')
train.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# Print the inferred schema (column name, type, nullability) as a tree.
train.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



(None, ['Name', 'Age', 'Experience', 'Salary'])

In [10]:
# Last expression of the cell: display the DataFrame's column names as a list.
train.columns

['Name', 'Age', 'Experience', 'Salary']

In [14]:
# Combine the predictor columns into a single vector column, which is the
# input format the regression stage works with.
FeatureAssembler = VectorAssembler(
    outputCol='IndependentFeature',
    inputCols=['Age', 'Experience'],
)

In [15]:
# Apply the assembler: the result keeps every original column and appends the
# assembled 'IndependentFeature' vector column.
output = FeatureAssembler.transform(dataset=train)
output.show()

+---------+---+----------+------+------------------+
|     Name|Age|Experience|Salary|IndependentFeature|
+---------+---+----------+------+------------------+
|    Krish| 31|        10| 30000|       [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|        [30.0,8.0]|
|    Sunny| 29|         4| 20000|        [29.0,4.0]|
|     Paul| 24|         3| 20000|        [24.0,3.0]|
|   Harsha| 21|         1| 15000|        [21.0,1.0]|
|  Shubham| 23|         2| 18000|        [23.0,2.0]|
+---------+---+----------+------+------------------+



In [16]:
# Keep only what the model needs: the target column and the feature vector.
FinalizedData = output.select('Salary', 'IndependentFeature')
FinalizedData.show()

+------+------------------+
|Salary|IndependentFeature|
+------+------------------+
| 30000|       [31.0,10.0]|
| 25000|        [30.0,8.0]|
| 20000|        [29.0,4.0]|
| 20000|        [24.0,3.0]|
| 15000|        [21.0,1.0]|
| 18000|        [23.0,2.0]|
+------+------------------+



In [20]:
# Split into roughly 75% training / 25% test rows.
# A fixed seed makes the split repeatable across "Restart & Run All"; without
# it every run yields a different partition and downstream metrics drift.
# NOTE(review): randomSplit samples row by row, so the weights are only
# approximate -- with a dataset this small, one side may end up with very few
# (possibly zero) rows. Check the counts before fitting.
SPLIT_SEED = 42
train_data, test_data = FinalizedData.randomSplit([0.75, 0.25], seed=SPLIT_SEED)