In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Cumulative Score Calculation") \
    .getOrCreate()

# List of years for the CSV files
years = [2019, 2020, 2021, 2022, 2023]  # Adjust this list based on your actual files and years

# Read the first file to create the initial DataFrame
df = spark.read.csv(f"{years[0]}.csv", header=True, inferSchema=True)

# Union all other DataFrames
for year in years[1:]:
    temp_df = spark.read.csv(f"output_{year}.csv", header=True, inferSchema=True)
    df = df.union(temp_df)

# Ensure that the 'Year' and 'Score' columns are correctly typed if not already
df = df.withColumn("Year", col("Year").cast("int"))
df = df.withColumn("Total Score", col("Total Score").cast("float"))
df.show()

+-----+----+-------+-----------+-----------+----------------------+
|Class|Year|Quarter|Total Score|Paper Count|Cumulative Total Score|
+-----+----+-------+-----------+-----------+----------------------+
| AGRI|2019|      4|     18.983|         20|                18.983|
| ARTS|2019|      4|     0.7959|          2|                0.7959|
| BIOC|2019|      4|    26.7993|         37|               26.7993|
| BUSI|2019|      4|     2.4173|          5|                2.4173|
| CENG|2019|      4|    25.8145|         19|               25.8145|
| CHEM|2019|      4|    22.5635|         20|               22.5635|
| COMP|2019|      4|    23.1798|         33|               23.1798|
| DECI|2019|      4|     2.3417|          4|                2.3417|
| DENT|2019|      4|     1.5907|          2|                1.5907|
| EART|2019|      4|    11.0173|          6|               11.0173|
| ECON|2019|      4|     5.8369|          8|                5.8369|
| ENER|2019|      4|    17.9768|         11|    

In [13]:
from pyspark.sql.window import Window

# Define a window specification ordered by Year for cumulative sum
windowSpec = Window.orderBy("Year").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Calculate cumulative total score
df = df.withColumn("Cumulative Score", spark_sum("Total Score").over(windowSpec))
df.show()

+-----+----+-------+-----------+-----------+----------------------+------------------+
|Class|Year|Quarter|Total Score|Paper Count|Cumulative Total Score|  Cumulative Score|
+-----+----+-------+-----------+-----------+----------------------+------------------+
| AGRI|2019|      4|     18.983|         20|                18.983|18.982999801635742|
| ARTS|2019|      4|     0.7959|          2|                0.7959|19.778899788856506|
| BIOC|2019|      4|    26.7993|         37|               26.7993| 46.57819902896881|
| BUSI|2019|      4|     2.4173|          5|                2.4173| 48.99549901485443|
| CENG|2019|      4|    25.8145|         19|               25.8145| 74.80999982357025|
| CHEM|2019|      4|    22.5635|         20|               22.5635| 97.37349927425385|
| COMP|2019|      4|    23.1798|         33|               23.1798|120.55329930782318|
| DECI|2019|      4|     2.3417|          4|                2.3417|122.89499938488007|
| DENT|2019|      4|     1.5907|          2

24/05/07 22:13:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:29 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


In [14]:
# Show the results
df.show()

# Optionally, save the result to a single CSV file
df.coalesce(1).write.csv("cumulative_scores.csv", mode="overwrite", header=True)

24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 2

+-----+----+-------+-----------+-----------+----------------------+------------------+
|Class|Year|Quarter|Total Score|Paper Count|Cumulative Total Score|  Cumulative Score|
+-----+----+-------+-----------+-----------+----------------------+------------------+
| AGRI|2019|      4|     18.983|         20|                18.983|18.982999801635742|
| ARTS|2019|      4|     0.7959|          2|                0.7959|19.778899788856506|
| BIOC|2019|      4|    26.7993|         37|               26.7993| 46.57819902896881|
| BUSI|2019|      4|     2.4173|          5|                2.4173| 48.99549901485443|
| CENG|2019|      4|    25.8145|         19|               25.8145| 74.80999982357025|
| CHEM|2019|      4|    22.5635|         20|               22.5635| 97.37349927425385|
| COMP|2019|      4|    23.1798|         33|               23.1798|120.55329930782318|
| DECI|2019|      4|     2.3417|          4|                2.3417|122.89499938488007|
| DENT|2019|      4|     1.5907|          2

24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/05/07 22:13:31 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
