In [1]:
import pandas as pd
import glob

# Collect every yearly wages workbook in the working directory. glob returns
# matches in arbitrary, filesystem-dependent order, so sort them to get a
# reproducible row order in the combined dataframe.
excel_files = sorted(glob.glob("Wages*.xlsx"))
if not excel_files:
    # Fail early with an actionable message; otherwise pd.concat below would
    # raise an opaque "No objects to concatenate" error.
    raise FileNotFoundError(
        "No files matching 'Wages*.xlsx' were found in the current directory"
    )

# One dataframe per workbook, each tagged with the year taken from its name
dfs = []

for file in excel_files:
    df = pd.read_excel(file)
    # The year is the first token after the "Wages" prefix and before the
    # first dot, so "Wages 2020.xlsx", "Wages2020.xlsx" and "Wages_2020.xlsx"
    # all yield "2020" (indexing the result of split(" ") would raise
    # IndexError on names that contain no space).
    year_tokens = file[len("Wages"):].split(".")[0].strip(" _-").split()
    if not year_tokens:
        raise ValueError(f"Cannot extract a year from filename {file!r}")
    year = year_tokens[0]
    df["Year"] = year
    # Convert 'amount' to numeric; values that cannot be parsed become NaN
    df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
    dfs.append(df)

# Stack all yearly dataframes into one, renumbering the rows from 0
combined_df = pd.concat(dfs, ignore_index=True)

# Every distinct package and year label present in the combined data
all_packages = combined_df["package"].unique()
all_years = combined_df["Year"].unique()

# Sum 'amount' per (package, year) in a single grouped pass instead of
# re-filtering the whole dataframe once for every pair. sort=False means the
# grouping never has to order the package labels.
pair_totals = (
    combined_df.groupby(["package", "Year"], sort=False)["amount"]
    .sum()
    .to_dict()
)

# Nested mapping {package: {year: total}}. A pair with no matching rows gets
# 0, which is what summing an empty selection yields.
package_totals = {
    package: {year: pair_totals.get((package, year), 0) for year in all_years}
    for package in all_packages
}

# Column order for the report: the year labels in ascending order
years = sorted(combined_df["Year"].unique())

# One record per package: its name plus one entry per year
final_data = [
    {"Package": package, **year_totals}
    for package, year_totals in package_totals.items()
]

# Naming the columns up front fixes their order ('Package' first, then the
# years) and keeps the frame well-formed when there are no packages at all;
# a frame built from an empty list would otherwise have no "Package" column
# and the total calculation would raise KeyError.
final_df = pd.DataFrame(final_data, columns=["Package"] + years)

# Grand total per package across all years; added last so it ends up as the
# right-most column
final_df["Total"] = final_df[years].sum(axis=1)

# Save the final dataframe to a new Excel file, without the row index
final_df.to_excel("Package_Totals_by_Year.xlsx", index=False)