In [1]:
import pandas as pd

# Read the concatenated scores export
df = pd.read_csv('output_concatenated.csv')

# Preview the raw columns before reshaping
print(df.head())

# Build a combined "<Year>.<Quarter>" text label from the two source columns
year_text = df['Year'].astype(str)
quarter_text = df['Quarter'].astype(str)
df['Year.Quarter'] = year_text + '.' + quarter_text

# Year and Quarter are now carried by the combined label, so remove them
df = df.drop(columns=['Year', 'Quarter'])

# Write the reshaped table without the index and confirm on stdout
df.to_csv('output.csv', index=False)
print("Modified DataFrame has been saved.")

# Preview the reshaped table
print(df.head())


  Class  Year  Quarter  Total Score  Paper Count
0  MATH  2020        1       2.3369           14
1  SOCI  2020        1       4.5885           33
2  PHYS  2020        2      24.4616           80
3  ARTS  2020        3       1.7870            7
4  BUSI  2020        1       4.3641           20
Modified DataFrame has been saved.
  Class  Total Score  Paper Count Year.Quarter
0  MATH       2.3369           14       2020.1
1  SOCI       4.5885           33       2020.1
2  PHYS      24.4616           80       2020.2
3  ARTS       1.7870            7       2020.3
4  BUSI       4.3641           20       2020.1


In [3]:
import pandas as pd

# Read the table that carries the combined Year.Quarter column
data = pd.read_csv('output.csv')


def _sortable_key(year_quarter):
    """Turn a numeric year.quarter value such as 2020.1 into the integer 20201."""
    return int(year_quarter * 10)


# Attach an integer ordering key derived from Year.Quarter
data['Year_Quarter_Sortable'] = data['Year.Quarter'].map(_sortable_key)

# Order rows by class, then by the integer key within each class
data_sorted = data.sort_values(by=['Class', 'Year_Quarter_Sortable'])

# Running total of 'Total Score' within each class, following the sorted order
running_total = data_sorted.groupby('Class')['Total Score'].cumsum()
data_sorted['Cumulative_Total_Score'] = running_total

# Write the result without the index
data_sorted.to_csv('output_cumulative.csv', index=False)

data_sorted.head(10)


Unnamed: 0,Class,Total Score,Paper Count,Year.Quarter,Year_Quarter_Sortable,Cumulative_Total_Score
574,AGRI,10.9842,37,2018.1,20181,10.9842
582,AGRI,10.9573,39,2018.2,20182,21.9415
618,AGRI,16.3176,48,2018.3,20183,38.2591
569,AGRI,16.9695,53,2018.4,20184,55.2286
493,AGRI,7.5469,35,2019.1,20191,62.7755
504,AGRI,4.8535,37,2019.2,20192,67.629
529,AGRI,4.6837,33,2019.3,20193,72.3127
437,AGRI,7.2548,51,2019.4,20194,79.5675
27,AGRI,8.0296,40,2020.1,20201,87.5971
73,AGRI,12.2055,65,2020.2,20202,99.8026


In [None]:

import pandas as pd

def load_and_combine_data(path_existing, path_new):
    """Merge two per-quarter score CSVs into one table aggregated per class and quarter.

    Both files must provide the columns 'Class', 'Year.Quarter', 'Total Score'
    and 'Paper Count'. Rows sharing the same ('Class', 'Year.Quarter') pair are
    collapsed by summing 'Total Score' and 'Paper Count'.

    'Cumulative_Total_Score' is recomputed as the per-class running sum of the
    combined 'Total Score' rather than summed from the inputs: adding two
    running totals is only correct when both files contain every quarter for
    every class, and silently under-counts otherwise.

    Args:
        path_existing: Path of the CSV holding the existing data.
        path_new: Path of the CSV holding the newly scraped data.

    Returns:
        A DataFrame with the columns 'Class', 'Year.Quarter', 'Total Score',
        'Paper Count' and 'Cumulative_Total_Score', sorted by class and then
        by 'Year.Quarter'.
    """
    existing_data = pd.read_csv(path_existing)

    # The helper sort column is not needed here; tolerate files that never had it.
    new_data = pd.read_csv(path_new).drop(
        columns=['Year_Quarter_Sortable'], errors='ignore'
    )

    combined_data = pd.concat([existing_data, new_data], ignore_index=True)

    # groupby sorts its keys, so rows come out ordered by class, then quarter.
    grouped_data = combined_data.groupby(['Class', 'Year.Quarter']).agg({
        'Total Score': 'sum',
        'Paper Count': 'sum',
    }).reset_index()

    # Rebuild the running total from the combined per-quarter scores.
    grouped_data['Cumulative_Total_Score'] = (
        grouped_data.groupby('Class')['Total Score'].cumsum()
    )

    return grouped_data

# Input files, resolved relative to the working directory so they match where
# the other cells of this notebook write them (no environment-specific
# absolute prefix).
# NOTE: 'output_cumulative_filled.csv' and 'output_cumulative.csv' are both
# produced by other cells in this notebook; those cells must have run first.
file_path_existing = 'output_cumulative_filled.csv'
file_path_new = 'output_cumulative.csv'


# Merge the two files into one table aggregated per class and quarter
combined_data = load_and_combine_data(file_path_existing, file_path_new)

# Write the combined table without the index; the yearly aggregation cell
# reads this file back by the same name.
combined_data.to_csv('path_to_save_combined_data.csv', index=False)



In [None]:
import pandas as pd

# Read the raw table
df = pd.read_csv('data.csv')

# Preview the raw columns before reshaping
print(df.head())

# Join the year and quarter, both rendered as text, with a '.' separator
quarter_as_text = df['Quarter'].astype(str)
df['Year.Quarter'] = df['Year'].astype(str).str.cat(quarter_as_text, sep='.')

# Year and Quarter are now carried by the combined label, so remove them
df = df.drop(['Year', 'Quarter'], axis='columns')

# Write the reshaped table without the index and confirm on stdout
df.to_csv('data_mod.csv', index=False)
print("Modified DataFrame has been saved.")

# Preview the reshaped table
print(df.head())


In [None]:
import pandas as pd
import numpy as np

# Read the table and cast Year.Quarter to text in one step, so the column has
# a single, predictable type for the string-keyed merge performed below
data = pd.read_csv('data_mod.csv').astype({'Year.Quarter': str})

def generate_year_quarters(start, end):
    """List every "<year>.<quarter>" label from `start` through `end`, inclusive.

    Args:
        start: First label, formatted as "<year>.<quarter>" (e.g. '2018.1').
        end: Last label, in the same format.

    Returns:
        A list of label strings in chronological order. The first year begins
        at the start quarter, the last year stops at the end quarter, and every
        year in between contributes quarters 1 through 4. The list is empty
        when `start` falls after `end`.
    """
    first_year, first_quarter = (int(part) for part in start.split('.'))
    last_year, last_quarter = (int(part) for part in end.split('.'))

    def quarter_span(year):
        # Only the boundary years are clipped; all others span the full 1-4.
        low = first_quarter if year == first_year else 1
        high = last_quarter if year == last_year else 4
        return range(low, high + 1)

    return [
        f"{year}.{quarter}"
        for year in range(first_year, last_year + 1)
        for quarter in quarter_span(year)
    ]

# Build the full Class x quarter grid for the fixed 2018.1 - 2023.4 window
all_quarters = generate_year_quarters('2018.1', '2023.4')
all_classes = data['Class'].unique()
expanded_data = pd.MultiIndex.from_product(
    [all_classes, all_quarters], names=["Class", "Year.Quarter"]
).to_frame(index=False)

# Keep the merge key as text on this side too, matching the loaded data
expanded_data['Year.Quarter'] = expanded_data['Year.Quarter'].astype(str)

# Left-join the observed rows onto the grid: grid cells with no observation get
# NaN in every non-key column, and observed rows whose quarter lies outside the
# window above are not carried over.
data = expanded_data.merge(data, on=["Class", "Year.Quarter"], how="left")

# The generated "YYYY.Q" labels sort chronologically as plain text
data = data.sort_values(by=['Class', 'Year.Quarter'])

# Forward-fill 'Total Score' within each class. GroupBy.ffill() replaces the
# deprecated groupby(...).fillna(method='ffill') spelling and fills the same
# cells. Quarters before a class's first observation stay NaN, and only this
# column is filled.
# NOTE(review): because the fill happens before the cumsum, a quarter with no
# observation adds the previous quarter's score to the running total -
# confirm this is intended rather than counting such quarters as 0.
data['Total Score'] = data.groupby('Class')['Total Score'].ffill()

# Running total of 'Total Score' within each class, in chronological order
data['Cumulative_Total_Score'] = data.groupby('Class')['Total Score'].cumsum()

# Write the result without the index
data.to_csv('output_cumulative_filled.csv', index=False)

data.head()


In [None]:
import pandas as pd

# Read the table that carries the combined Year.Quarter column
data = pd.read_csv('output.csv')

# Integer ordering key: the numeric year.quarter value times ten, truncated
# to an int (e.g. 2020.1 becomes 20201)
data['Year_Quarter_Sortable'] = data['Year.Quarter'].map(
    lambda year_quarter: int(year_quarter * 10)
)

# Order rows by class, then by the integer key within each class
sort_keys = ['Class', 'Year_Quarter_Sortable']
data_sorted = data.sort_values(by=sort_keys)

# Running total of 'Total Score' within each class, following the sorted order
scores_by_class = data_sorted.groupby('Class')['Total Score']
data_sorted['Cumulative_Total_Score'] = scores_by_class.cumsum()

# Write the result without the index
data_sorted.to_csv('output_cumulative.csv', index=False)

data_sorted.head(10)


In [None]:
import pandas as pd

# Read the combined per-quarter table
data = pd.read_csv('path_to_save_combined_data.csv')

# int() truncates each Year.Quarter value to its whole-number part, i.e. the
# year (assumes the column was read back as a number such as 2020.1)
data['Year'] = data['Year.Quarter'].map(int)

# One row per (Class, Year) holding the summed 'Total Score'
yearly_data = data.groupby(['Class', 'Year'], as_index=False)['Total Score'].sum()

# Write the yearly totals without the index
yearly_data.to_csv('yearly_scores.csv', index=False)


In [None]:
import pandas as pd

# Read the table that carries the combined Year.Quarter column
data = pd.read_csv('output.csv')

# int() truncates each Year.Quarter value to its whole-number part, i.e. the
# year (assumes the column was read back as a number such as 2020.1)
data['Year'] = data['Year.Quarter'].map(int)

# One row per (Class, Year) holding the summed 'Total Score'
score_totals = data.groupby(['Class', 'Year'])['Total Score'].sum()
yearly_data = score_totals.reset_index()

# Running total of the yearly sums within each class, in ascending year order
# (the groupby above returns its keys sorted)
scores_by_class = yearly_data.groupby('Class')['Total Score']
yearly_data['Cumulative Score'] = scores_by_class.cumsum()

# Write the yearly totals and running totals without the index
yearly_data.to_csv('cumulative_yearly_scores.csv', index=False)
