In [1]:
# Silence every warning emitted while the notebook runs.
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments and returns None."""
    return None


# Swap the stdlib entry point for the no-op so direct warnings.warn(...) calls are dropped,
# and additionally install a blanket "ignore" filter.
warnings.warn = warn
warnings.filterwarnings(action='ignore')

# findspark locates the local Spark installation and makes pyspark importable from Python.
# init() is called here, ahead of the pyspark imports that follow.
import findspark
findspark.init()

import pyspark

# SparkSession is the entry point used below to read data into DataFrames
from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession (or reuse one that is already running).
# 'Getting Started with Spark' is the application name; it is now actually set on the
# builder, which the original code described in its comment but never did.
# Any warnings logged by Spark while the session starts can be ignored.

spark = SparkSession.builder.appName("Getting Started with Spark").getOrCreate()

In [3]:
# Load the job placement dataset into a DataFrame with spark.read.csv.
# header=True      -> the first row of the CSV file holds the column names.
# inferSchema=True -> Spark works out the data type of each column automatically.
placements_data = spark.read.csv("job_placement_dataset.csv", header=True, inferSchema=True)
print(placements_data)

# Count once and reuse the value: each count() call runs a separate Spark job.
total_records = placements_data.count()

# Preview the first 20 rows instead of printing the entire dataset.
placements_data.show(20, truncate=False)
print("Total records:", total_records)

DataFrame[id: int, name: string, gender: string, age: int, degree: string, stream: string, college_name: string, placement_status: string, salary: int, gpa: double, years_of_experience: int, date: string]
+---+-------------------+------+---+----------+-----------------------------+-----------------------------------------+----------------+------+---+-------------------+----------+
|id |name               |gender|age|degree    |stream                       |college_name                             |placement_status|salary|gpa|years_of_experience|date      |
+---+-------------------+------+---+----------+-----------------------------+-----------------------------------------+----------------+------+---+-------------------+----------+
|1  |John Doe           |Male  |25 |Bachelor's|Computer Science             |Harvard University                       |Placed          |60000 |3.7|2                  |04-03-2023|
|2  |Jane Smith         |Female|24 |Bachelor's|Electrical Engineering       |Ma

In [4]:
# Print the inferred column names, types and nullability as a tree.
placements_data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- degree: string (nullable = true)
 |-- stream: string (nullable = true)
 |-- college_name: string (nullable = true)
 |-- placement_status: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- gpa: double (nullable = true)
 |-- years_of_experience: integer (nullable = true)
 |-- date: string (nullable = true)



In [5]:
# Fetch the first record as a Row object to inspect one full example.
placements_data.head()

Row(id=1, name='John Doe', gender='Male', age=25, degree="Bachelor's", stream='Computer Science', college_name='Harvard University', placement_status='Placed', salary=60000, gpa=3.7, years_of_experience=2, date='04-03-2023')

In [6]:
#import functions/Classes for sparkml
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression

# import functions/Classes for metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [7]:
# Class balance of the target: number of rows per placement_status, smallest group first.
status_counts = placements_data.groupBy('placement_status').count()
status_counts.orderBy('count').show()

+----------------+-----+
|placement_status|count|
+----------------+-----+
|      Not Placed|  130|
|          Placed|  570|
+----------------+-----+



In [8]:
# Convert the gender, stream and placement_status columns from strings to numeric indices.
# StringIndexer assigns indices by descending frequency by default, so the most common
# category gets 0.0 (for the target: "Placed" -> 0.0, "Not Placed" -> 1.0).
# The "Gender"/"Stream" output names differ from their inputs only by case; as the recorded
# output shows, the indexed column takes the place of the original string column.
# NOTE(review): this cell reassigns placements_data, so re-running it on an already-indexed
# frame is expected to fail; re-run from the load cell instead.
for source_col, indexed_col in [("gender", "Gender"), ("stream", "Stream"), ("placement_status", "label")]:
    indexer = StringIndexer(inputCol=source_col, outputCol=indexed_col)
    placements_data = indexer.fit(placements_data).transform(placements_data)

# Preview the first 20 rows rather than dumping every record (which also cost an extra count() job).
placements_data.show(20, truncate=False)

+---+-------------------+------+---+----------+------+-----------------------------------------+----------------+------+---+-------------------+----------+-----+
|id |name               |Gender|age|degree    |Stream|college_name                             |placement_status|salary|gpa|years_of_experience|date      |label|
+---+-------------------+------+---+----------+------+-----------------------------------------+----------------+------+---+-------------------+----------+-----+
|1  |John Doe           |1.0   |25 |Bachelor's|0.0   |Harvard University                       |Placed          |60000 |3.7|2                  |04-03-2023|0.0  |
|2  |Jane Smith         |0.0   |24 |Bachelor's|2.0   |Massachusetts Institute of Technology    |Placed          |65000 |3.6|1                  |23-06-2023|0.0  |
|3  |Michael Johnson    |1.0   |26 |Bachelor's|4.0   |Stanford University                      |Placed          |58000 |3.8|3                  |25-02-2023|0.0  |
|4  |Emily Davis        |0.0

In [9]:
# Pack the predictor columns into the single "features" vector column used by the model.
# NOTE(review): "Stream" is a category index used here as a plain number; consider
# one-hot encoding it if the implied ordering is not intended.
feature_columns = ["Gender", "Stream", "gpa", "years_of_experience"]
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
placements_transformed_data = assembler.transform(placements_data)

In [10]:
# Sanity-check the assembled feature vectors next to their labels (default preview of 20 rows).
placements_transformed_data.select("features","label").show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[1.0,0.0,3.7,2.0]|  0.0|
|[0.0,2.0,3.6,1.0]|  0.0|
|[1.0,4.0,3.8,3.0]|  0.0|
|[0.0,1.0,3.5,2.0]|  1.0|
|[1.0,0.0,3.9,2.0]|  0.0|
|[0.0,3.0,3.7,1.0]|  0.0|
|[1.0,1.0,3.8,3.0]|  0.0|
|[0.0,0.0,3.6,2.0]|  1.0|
|[1.0,2.0,3.7,2.0]|  0.0|
|[0.0,4.0,3.5,1.0]|  0.0|
|[1.0,0.0,3.9,2.0]|  0.0|
|[0.0,3.0,3.8,3.0]|  1.0|
|[1.0,1.0,3.7,2.0]|  0.0|
|[0.0,0.0,3.6,1.0]|  0.0|
|[1.0,2.0,3.8,3.0]|  0.0|
|[0.0,4.0,3.7,2.0]|  1.0|
|[1.0,0.0,3.9,2.0]|  0.0|
|[0.0,3.0,3.8,1.0]|  0.0|
|[1.0,1.0,3.7,2.0]|  0.0|
|[0.0,0.0,3.6,1.0]|  0.0|
+-----------------+-----+
only showing top 20 rows



In [11]:
# Randomly split the rows into training (weight 0.7) and testing (weight 0.3) sets;
# the fixed seed keeps the split repeatable.
split_weights = [0.7, 0.3]
training_data, testing_data = placements_transformed_data.randomSplit(split_weights, seed=42)

In [12]:
# Fit a logistic regression classifier on the training split.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
model = lr.fit(training_data)

In [13]:
# Make predictions on testing data: transform() returns the test rows with the model's
# output columns added, including "prediction", which the evaluation cells read.
predictions = model.transform(testing_data)

In [14]:
# Evaluate model performance: accuracy on the held-out test predictions.
# NOTE(review): the classes are imbalanced (570 "Placed" vs 130 "Not Placed" in the full
# data), so compare this figure against a majority-class baseline before trusting it.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy =", accuracy)

Accuracy = 0.782608695652174


In [15]:
# Precision on the test predictions. The metric is "weightedPrecision", i.e. a
# class-weighted average rather than the precision of a single class.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision",
    labelCol="label",
    predictionCol="prediction",
)
precision = evaluator.evaluate(predictions)
print("Precision =", precision)

Precision = 0.7558841087782139


In [16]:
# Recall on the test predictions. The metric is "weightedRecall", i.e. a
# class-weighted average rather than the recall of a single class.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall",
    labelCol="label",
    predictionCol="prediction",
)
recall = evaluator.evaluate(predictions)
print("Recall =", recall)

Recall = 0.782608695652174


In [17]:
# F1 score on the test predictions ("f1" metric of the multiclass evaluator).
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_score = evaluator.evaluate(predictions)
print("F1 score = ", f1_score)

F1 score =  0.7679518731200438
