In [0]:
# Sample dataset: a header row followed by ten student records
# (student_id, name, subject, score, grade).
csv_rows = [
    "student_id,name,subject,score,grade",
    "1,Ankit,Math,85,A",
    "2,Divya,Science,92,A",
    "3,Rahul,English,78,B",
    "4,Sneha,Math,65,C",
    "5,Aryan,Science,55,D",
    "6,Isha,English,88,A",
    "7,Tanvi,Math,91,A",
    "8,Kunal,Science,72,B",
    "9,Megha,English,60,C",
    "10,Rohan,Math,40,F",
]

# Every row, including the last one, is terminated by a newline.
csv_data = "".join(row + "\n" for row in csv_rows)

# Persist the sample to disk so it can be read back as a CSV file.
with open("/tmp/student_scores.csv", "w") as csv_file:
    csv_file.write(csv_data)

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable
# Create (or reuse) the Spark session, registering Delta Lake's SQL
# extension and its catalog implementation as the session catalog.
spark = (
    SparkSession.builder
    .appName("Student Scores with Delta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)


In [0]:

# Sanity check: display the database the session is currently using.
current_db_df = spark.sql("SELECT current_database()")
current_db_df.show()


+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



In [0]:
# Step 1: Read CSV into DataFrame
# The first line is treated as the header and column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/tmp/student_scores.csv")
)
df.show()

# Step 2: Write to Delta
# A plain overwrite replaces the data but keeps whatever schema already exists
# at this path. If an earlier run evolved the table (e.g. added a column), the
# stale column would survive as NULLs. overwriteSchema resets the table to
# exactly the CSV's schema so the notebook is reproducible on re-run.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("/tmp/delta/student_scores")
)

# Step 3: Register Delta Table
# Drop first so the metastore entry always points at the freshly written path.
spark.sql("DROP TABLE IF EXISTS student_scores")
spark.sql("CREATE TABLE student_scores USING DELTA LOCATION '/tmp/delta/student_scores'")


In [0]:
# Tasks 1-4: read-only SQL queries against the registered student_scores table.
# Each query is executed and displayed in order.
basic_queries = [
    # Task 1: every student's name and score
    "SELECT name, score FROM student_scores",
    # Task 2: number of students per subject
    "SELECT subject, COUNT(*) AS student_count FROM student_scores GROUP BY subject",
    # Task 3: average score per subject
    "SELECT subject, AVG(score) AS avg_score FROM student_scores GROUP BY subject",
    # Task 4: students scoring above 80
    "SELECT name, subject, score FROM student_scores WHERE score > 80",
]

for query in basic_queries:
    spark.sql(query).show()


In [0]:


# Shared setup for Task 5 and Task 9: rank students within each subject,
# highest score first. rank() gives tied scores the same rank.
df = spark.table("student_scores")
windowSpec = Window.partitionBy("subject").orderBy(F.col("score").desc())
ranked_df = df.withColumn("rank", F.rank().over(windowSpec))

# Task 5: Highest score per subject (all rows holding rank 1 in their subject)
top_per_subject = ranked_df.filter("rank = 1").select("name", "subject", "score")
top_per_subject.show()

# Tasks 6-8: plain SQL queries, shown in order.
followup_queries = [
    # Task 6: number of students per grade
    "SELECT grade, COUNT(*) AS grade_count FROM student_scores GROUP BY grade",
    # Task 7: students with grade F
    "SELECT name, subject, score FROM student_scores WHERE grade = 'F'",
    # Task 8: students scoring between 60 and 90 (inclusive)
    "SELECT name, subject, score FROM student_scores WHERE score BETWEEN 60 AND 90",
]
for query in followup_queries:
    spark.sql(query).show()

# Task 9: Ranking within subjects
subject_ranking = ranked_df.select("name", "subject", "score", "rank")
subject_ranking.show()



In [0]:


# Tasks 10-12 modify the Delta table stored at /tmp/delta/student_scores.
delta_table = DeltaTable.forPath(spark, "/tmp/delta/student_scores")

# Task 10: Increase score of English students by 5
# The `set` values are SQL expression strings evaluated per matching row.
delta_table.update(
    condition="subject = 'English'",
    set={"score": "score + 5"}
)

# Task 11: Delete where score < 50
delta_table.delete("score < 50")

# Task 12: Add pass_status column
# PASS when score >= 50, otherwise FAIL.
df = delta_table.toDF()
df = df.withColumn("pass_status", F.when(df["score"] >= 50, "PASS").otherwise("FAIL"))

# The DataFrame now has a column the stored table does not. Delta's schema
# enforcement rejects such a write unless schema evolution is requested, so
# mergeSchema is required for this overwrite to succeed.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("/tmp/delta/student_scores")
)



In [0]:
# Task 13: Create temp view and run SQL
# Load the current contents of the Delta table and expose them to SQL
# under the session-scoped name temp_scores.
df = spark.read.format("delta").load("/tmp/delta/student_scores")
df.createOrReplaceTempView("temp_scores")
avg_by_subject = spark.sql(
    "SELECT subject, AVG(score) AS avg_score FROM temp_scores GROUP BY subject"
)
avg_by_subject.show()

# Task 14: Save as student_scores_v2 Delta table
# Write the data to its own path, then (re)register a table over that path.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .save("/tmp/delta/student_scores_v2")
)
spark.sql("DROP TABLE IF EXISTS student_scores_v2")
spark.sql("CREATE TABLE student_scores_v2 USING DELTA LOCATION '/tmp/delta/student_scores_v2'")

# Task 15: Write to Parquet
(
    df.write
    .mode("overwrite")
    .parquet("/tmp/parquet/student_scores")
)