In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [9]:
import csv
import random

# Fixed seed so that "Restart & Run All" regenerates exactly the same dataset.
SEED = 42
N_ROWS = 100
# Private generator: reproducible without touching the global `random` state.
rng = random.Random(SEED)

# Generate example data: one [name, age, experience, salary] row per person.
# Each value is drawn independently of the others.
data = []
for idx in range(N_ROWS):
    name = f"Person{idx}"
    age = rng.randint(20, 60)             # randint bounds are inclusive
    experience = rng.randint(0, 10)
    salary = rng.randint(10000, 90000)
    data.append([name, age, experience, salary])

# Write the header plus all rows to a CSV file. newline="" lets the csv module
# control line endings itself.
with open("data.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["name", "age", "experience", "salary"])
    writer.writerows(data)

print("CSV file created: data.csv")

CSV file created: data.csv


In [10]:
# Load the generated CSV into a DataFrame: the first line is treated as the
# header and column types are inferred from the values.
training = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data.csv')
)

In [11]:
# Preview the loaded DataFrame; with no arguments, show() prints the top 20 rows.
training.show()

+--------+---+----------+------+
|    name|age|experience|salary|
+--------+---+----------+------+
| Person0| 49|         0| 58422|
| Person1| 48|         4| 37664|
| Person2| 28|         4| 71499|
| Person3| 46|         7| 63238|
| Person4| 26|         6| 47200|
| Person5| 59|        10| 20241|
| Person6| 43|         0| 27733|
| Person7| 56|         0| 62898|
| Person8| 41|         6| 78750|
| Person9| 31|         7| 72162|
|Person10| 43|         0| 11687|
|Person11| 50|         9| 27353|
|Person12| 53|         3| 52929|
|Person13| 44|         4| 65838|
|Person14| 37|         5| 32714|
|Person15| 28|         5| 41130|
|Person16| 51|         3| 30307|
|Person17| 54|         3| 23044|
|Person18| 21|         7| 30662|
|Person19| 43|         0| 14178|
+--------+---+----------+------+
only showing top 20 rows



In [12]:
# Print the schema to check the column types that were inferred when reading the CSV.
training.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [13]:
# List the column names of the loaded DataFrame.
training.columns

['name', 'age', 'experience', 'salary']

Combine the `age` and `experience` columns into a single vector column; this new column is the independent-feature input for the model.

In [18]:
from pyspark.ml.feature import VectorAssembler

# Transformer that packs the two numeric predictor columns into one vector
# column named "Independent Features".
featureassembler = VectorAssembler(
    outputCol="Independent Features",
    inputCols=["age", "experience"],
)

In [19]:
# Apply the assembler: `output` is `training` with the vector column appended.
output=featureassembler.transform(training)

In [21]:
# Display the assembled DataFrame, including the new vector column.
output.show()

+--------+---+----------+------+--------------------+
|    name|age|experience|salary|Independent Features|
+--------+---+----------+------+--------------------+
| Person0| 49|         0| 58422|          [49.0,0.0]|
| Person1| 48|         4| 37664|          [48.0,4.0]|
| Person2| 28|         4| 71499|          [28.0,4.0]|
| Person3| 46|         7| 63238|          [46.0,7.0]|
| Person4| 26|         6| 47200|          [26.0,6.0]|
| Person5| 59|        10| 20241|         [59.0,10.0]|
| Person6| 43|         0| 27733|          [43.0,0.0]|
| Person7| 56|         0| 62898|          [56.0,0.0]|
| Person8| 41|         6| 78750|          [41.0,6.0]|
| Person9| 31|         7| 72162|          [31.0,7.0]|
|Person10| 43|         0| 11687|          [43.0,0.0]|
|Person11| 50|         9| 27353|          [50.0,9.0]|
|Person12| 53|         3| 52929|          [53.0,3.0]|
|Person13| 44|         4| 65838|          [44.0,4.0]|
|Person14| 37|         5| 32714|          [37.0,5.0]|
|Person15| 28|         5| 41

In [22]:
# List the column names to confirm the vector column was appended.
output.columns

['name', 'age', 'experience', 'salary', 'Independent Features']

In [23]:
# Keep only the feature vector and the label column for modelling.
# NOTE(review): the source column is named `salary`; it is selected here as
# "Salary" and the resulting column carries that spelling (see the output
# below). This presumably relies on Spark's case-insensitive column
# resolution — confirm the session does not enable spark.sql.caseSensitive.
finalized_data=output.select("Independent Features","Salary")

In [24]:
# Preview the two-column modelling DataFrame (features vector and label).
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|          [49.0,0.0]| 58422|
|          [48.0,4.0]| 37664|
|          [28.0,4.0]| 71499|
|          [46.0,7.0]| 63238|
|          [26.0,6.0]| 47200|
|         [59.0,10.0]| 20241|
|          [43.0,0.0]| 27733|
|          [56.0,0.0]| 62898|
|          [41.0,6.0]| 78750|
|          [31.0,7.0]| 72162|
|          [43.0,0.0]| 11687|
|          [50.0,9.0]| 27353|
|          [53.0,3.0]| 52929|
|          [44.0,4.0]| 65838|
|          [37.0,5.0]| 32714|
|          [28.0,5.0]| 41130|
|          [51.0,3.0]| 30307|
|          [54.0,3.0]| 23044|
|          [21.0,7.0]| 30662|
|          [43.0,0.0]| 14178|
+--------------------+------+
only showing top 20 rows



In [25]:
from pyspark.ml.regression import LinearRegression

# Train/test split (75% / 25%). A fixed seed keeps the split repeatable across
# reruns for the same input data, so the coefficients and metrics reported by
# later cells can be reproduced.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to one kind of object.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='Salary')

# `regressor` is the fitted model; later cells read its coefficients and
# intercept and call evaluate() on it.
regressor = lr_estimator.fit(train_data)

In [26]:
# Coefficients of the fitted model, one per entry of the feature vector
# (assembled from "age" and "experience", in that order).
regressor.coefficients

DenseVector([122.4927, 1606.1927])

In [27]:
# Intercept term of the fitted model.
regressor.intercept

33979.01546240077

In [28]:
# Evaluate the fitted model on the held-out test split. The returned summary
# object is used by the following cells for its predictions and error metrics.
pred_results=regressor.evaluate(test_data)

In [29]:
# Show the test rows alongside the model's `prediction` column.
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [21.0,8.0]| 49214|49400.904455854245|
|          [23.0,1.0]| 80693| 38402.54065576129|
|          [26.0,1.0]| 31235|38770.018800916456|
|         [27.0,10.0]| 80369|  53348.2462405064|
|          [31.0,7.0]| 34620| 49019.63885920055|
|         [32.0,10.0]| 27537| 53960.70981576502|
|          [33.0,0.0]| 75323|  38021.2750591076|
|          [34.0,3.0]| 41717| 42962.34601567206|
|          [34.0,6.0]| 44768|  47780.9242571848|
|          [36.0,9.0]| 36426| 52844.48792880099|
|          [37.0,3.0]| 44890| 43329.82416082723|
|          [39.0,6.0]| 56515| 48393.38783244341|
|          [42.0,8.0]| 81710|51973.251471940406|
|          [44.0,4.0]| 65838|45793.465913360196|
|         [45.0,10.0]| 47788|  55553.1151114374|
|          [46.0,2.0]| 79301| 42826.06584912181|
|          [46.0,7.0]| 56626| 50857.02958497638|
|          [47.0,3.0

In [30]:
# Mean absolute error and mean squared error on the test split, displayed as a tuple.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(17845.925089951776, 499353818.58304757)