In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Step 1: Initialize Spark
# Build (or reuse, if one is already active) a session named "StudentScores".
spark = (
    SparkSession.builder
    .appName("StudentScores")
    .getOrCreate()
)


In [2]:
# Sample student score data: one entry per student in each list, so the
# lists line up row-wise (position 0 is Alice in every column).
# NOTE: pandas is imported as `pd` in the notebook's imports cell.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
    "Math": [90, 76, 89, 65, 92],
    "Science": [85, 92, 79, 70, 88],
    "English": [88, 81, 95, 60, 91]
}

# Create DataFrame (pandas, used only to materialise the sample file)
df = pd.DataFrame(data)

# Save to CSV in the current working directory; index=False keeps the
# pandas row index out of the file so it holds just the four data columns.
df.to_csv("students.csv", index=False)

print(" students.csv created successfully!")
print(df)

 students.csv created successfully!
      Name  Math  Science  English
0    Alice    90       85       88
1      Bob    76       92       81
2  Charlie    89       79       95
3    David    65       70       60
4      Eva    92       88       91


In [3]:
# Everything that uses the session runs inside try/finally so the session is
# stopped even when loading or displaying fails. Because the session is always
# stopped at the end, re-run the session-initialization cell before running
# this cell again.
try:
    # Step 2: Load CSV
    # An explicit DDL schema pins the column types and avoids the extra pass
    # over the file that inferSchema=True requires. header=True still makes
    # Spark skip the first line instead of treating it as a data row.
    df = spark.read.csv(
        "students.csv",
        header=True,
        schema="Name STRING, Math INT, Science INT, English INT",
    )

    # Step 3: Calculate average score per subject
    # A select() made only of aggregate expressions yields a single-row result.
    avg_scores = df.select(
        avg(col("Math")).alias("Avg_Math"),
        avg(col("Science")).alias("Avg_Science"),
        avg(col("English")).alias("Avg_English")
    )

    # Step 4: Filter students who scored more than 85 in Math
    # (strictly greater than: a score of exactly 85 is excluded)
    high_math = df.filter(col("Math") > 85)

    # Step 5: Show results
    # show() is what actually triggers the computations defined above.
    print("=== Average Scores Per Subject ===")
    avg_scores.show()

    print("=== Students with Math > 85 ===")
    high_math.show()
finally:
    # Stop Spark session
    spark.stop()

=== Average Scores Per Subject ===
+--------+-----------+-----------+
|Avg_Math|Avg_Science|Avg_English|
+--------+-----------+-----------+
|    82.4|       82.8|       83.0|
+--------+-----------+-----------+

=== Students with Math > 85 ===
+-------+----+-------+-------+
|   Name|Math|Science|English|
+-------+----+-------+-------+
|  Alice|  90|     85|     88|
|Charlie|  89|     79|     95|
|    Eva|  92|     88|     91|
+-------+----+-------+-------+

