In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the whole notebook.
spark = SparkSession.builder.appName("CollegePlacementAnalysis").getOrCreate()

# Read the CSV with its first row as the header and let Spark infer column types.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
csv_path = r"file:///C:/Users/Navya/Downloads/CollegePlacement.csv"
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df = csv_reader.csv(csv_path)

# First look at the data: sample rows, schema, then the frame's dimensions.
df.show(5)
df.printSchema()
row_total = df.count()
column_total = len(df.columns)
print("Total Rows:", row_total)
print("Total Columns:", column_total)

+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|   CLG0030|107|           6.61|6.28|                   8|                   No|                     8|                   8|                 4|       No|
|   CLG0061| 97|           5.52|5.37|                   8|                   No|                     7|                   8|                 0|       No|
|   CLG0036|109|           5.36|5.83|                   9|                   No|                     3|                   1|                 1|       No|
|   CLG0055|122|           5.47|5.75|                   6|                  

In [5]:
# Null Value & Missing Data Analysis
# `col` and `when` are imported here (not only in a later cell) so this cell
# also works on a fresh kernel with Restart & Run All.
from pyspark.sql.functions import col, sum as spark_sum, when

# For every column, flag NULL entries as 1 (else 0) and sum the flags,
# producing a single row with the NULL count of each column.
df.select([spark_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns]).show()

+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|College_ID| IQ|Prev_Sem_Result|CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|         0|  0|              0|   0|                   0|                    0|                     0|                   0|                 0|        0|
+----------+---+---------------+----+--------------------+---------------------+----------------------+--------------------+------------------+---------+



In [6]:
# Summary Statistics
# Build the descriptive-statistics table for every column of df, then print it.
summary_stats = df.describe()
summary_stats.show()

+-------+----------+------------------+------------------+------------------+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|summary|College_ID|                IQ|   Prev_Sem_Result|              CGPA|Academic_Performance|Internship_Experience|Extra_Curricular_Score|Communication_Skills|Projects_Completed|Placement|
+-------+----------+------------------+------------------+------------------+--------------------+---------------------+----------------------+--------------------+------------------+---------+
|  count|     10000|             10000|             10000|             10000|               10000|                10000|                 10000|               10000|             10000|    10000|
|   mean|      NULL|           99.4718|7.5356730000000445|          7.532379|              5.5464|                 NULL|                4.9709|              5.5618|            2.5134|     NULL|
| stddev|      NULL|15.0531014

In [9]:
# Placement Rate Calculation
# NOTE: importing `round` here shadows Python's built-in round() for the rest of the notebook.
from pyspark.sql.functions import avg, round, when, col

# Encode 'Placement' as a 0/1 indicator: "Placed" or "Yes" -> 1, every other value -> 0.
is_placed = col("Placement").isin("Placed", "Yes")
df = df.withColumn("Placed", when(is_placed, 1).otherwise(0))

# Overall placement rate: mean of the indicator expressed as a percentage, 2 decimals.
placement_pct = round(avg("Placed") * 100, 2).alias("Placement_Rate")
placement_rate = df.agg(placement_pct)
placement_rate.show()


+--------------+
|Placement_Rate|
+--------------+
|         16.59|
+--------------+



In [13]:
# Group-wise comparisons of student attributes.
# All Spark SQL functions used in this cell are imported here so it runs on a
# fresh kernel; `count` is needed for the Student_Count aggregate below.
# NOTE: `round` shadows Python's built-in round() at notebook scope.
from pyspark.sql.functions import corr, count, mean, round

# Average CGPA of Placed vs Not Placed
df.groupBy("Placement").agg(round(mean("CGPA"), 2).alias("Average_CGPA")).show()

# Average IQ of Placed vs Not Placed
df.groupBy("Placement").agg(round(mean("IQ"), 2).alias("Average_IQ")).show()

# Profile of students with vs without Internship Experience:
# average CGPA, average number of projects, and number of students in each group.
# (No placement metric is computed here.)
df.groupBy("Internship_Experience").agg(round(mean("CGPA"), 2).alias("Avg_CGPA"),
                                        round(mean("Projects_Completed"), 2).alias("Avg_Projects"),
                                        count("*").alias("Student_Count")).show()

# Correlation between Academic Performance and CGPA
df.select(corr("Academic_Performance", "CGPA").alias("Perf_CGPA_Corr")).show()

# CGPA vs Projects Completed (ordered by number of projects)
df.groupBy("Projects_Completed").agg(round(mean("CGPA"), 2).alias("Avg_CGPA")).orderBy("Projects_Completed").show()

# Communication Skills vs Placement
df.groupBy("Placement").agg(round(mean("Communication_Skills"), 2).alias("Avg_Comm_Score")).show()

# Extra Curricular vs Placement
df.groupBy("Placement").agg(round(mean("Extra_Curricular_Score"), 2).alias("Avg_ExtraCurricular")).show()


+---------+------------+
|Placement|Average_CGPA|
+---------+------------+
|       No|        7.32|
|      Yes|        8.59|
+---------+------------+

+---------+----------+
|Placement|Average_IQ|
+---------+----------+
|       No|     97.55|
|      Yes|    109.12|
+---------+----------+

+---------------------+--------+------------+-------------+
|Internship_Experience|Avg_CGPA|Avg_Projects|Student_Count|
+---------------------+--------+------------+-------------+
|                   No|    7.55|        2.53|         6036|
|                  Yes|    7.51|        2.49|         3964|
+---------------------+--------+------------+-------------+

+--------------------+
|      Perf_CGPA_Corr|
+--------------------+
|-0.00295759985157...|
+--------------------+

+------------------+--------+
|Projects_Completed|Avg_CGPA|
+------------------+--------+
|                 0|    7.49|
|                 1|    7.49|
|                 2|    7.49|
|                 3|     7.6|
|                 4|   