# In [22]:
import pandas as pd
import re

# Workbook paths, one per award year (AY16 through AY23), in chronological order.
# The files differ only in the two-digit year embedded in the file name.
yearly_datasets = [
    '/workspaces/codespaces-jupyter/TSA_DataScience_23-24/Programs_Dataset/'
    f'AY{two_digit_year}Disbursements-by-Program.xlsx'
    for two_digit_year in range(16, 24)
]

# Lookup table from the two-digit CIP family (kept as a zero-padded string so
# that '01' and '1' are not confused) to the career subfield label written to
# the output workbooks.
cip_to_subfield = {
    '01': 'Agricultural/Animal/Plant/Veterinary Science and Related Fields',
    '03': 'Natural Resources and Conservation',
    '04': 'Architecture and Related Services',
    '05': 'Area, Ethnic, Cultural, Gender, and Group Studies',
    '09': 'Communication, Journalism, and Related Programs',
    # Spelling corrected from 'Techologies'; this label is emitted verbatim.
    '10': 'Communications Technologies/Technicians and Support Services',
    '11': 'Computer and Information Sciences and Support Services',
    '12': 'Culinary, Entertainment, and Personal Services',
    '13': 'Education',
    '14': 'Engineering',
    '15': 'Engineering/Engineering-related Technologies/Technicians',
    '16': 'Foreign Languages, Literatures, and Linguistics',
    '19': 'Family and Consumer Sciences/Human Sciences',
    '21': 'Reserved',
    '22': 'Legal Professions and Studies',
    '23': 'English Language and Literature/Letters',
    '24': 'Liberal Arts and Sciences, General Studies, and Humanities',
    '25': 'Library Science',
    '26': 'Biological and Biomedical Sciences',
    '27': 'Mathematics and Statistics',
    '28': 'Military Science, Leadership, and Operational Art',
    '29': 'Military Technologies and Applied Sciences',
    '30': 'Multi/Interdisciplinary Studies',
    '31': 'Parks, Recreation, Leisure, Fitness, and Kinesiology',
    '32': 'Basic Skills and Developmental/Remedial Education',
    '33': 'Citizenship Activities',
    '34': 'Health-Related Knowledge and Skills',
    '35': 'Interpersonal and Social Skills',
    '36': 'Leisure and Recreational Activities',
    '37': 'Personal Awareness and Self-Improvement',
    '38': 'Philosophy and Religious Studies',
    '39': 'Theology and Religious Vocations',
    '40': 'Physical Sciences',
    '41': 'Science Technologies/Technicians',
    '42': 'Psychology',
    '43': 'Homeland Security, Law Enforcement, Firefighting, and Related Protective Services',
    '44': 'Public Administration and Social Service Professions',
    '45': 'Social Sciences',
    '46': 'Construction Trades',
    '47': 'Mechanic and Repair Technologies/Technicians',
    '48': 'Precision Production',
    '49': 'Transportation and Materials Moving',
    '50': 'Visual and Performing Arts',
    '51': 'Health Professions and Related Programs',
    '52': 'Business, Management, Marketing, and Related Support Services',
    '53': 'High School/Secondary Diplomas and Certificates',
    '54': 'History',
    '55': 'Reserved',
    '60': 'Health Professions Residency/Fellowship Programs',
    '61': 'Medical Residency/Fellowship Programs'
    # Add more mappings as needed
}

# Column order of every output workbook (the same for each year, so defined once).
output_columns = ['CIP', 'Career Subfield', 'Program Title', 'Recipients', 'Disbursements', 'Average Disbursements per Recipient']

# For each yearly workbook: derive the two-digit CIP family, attach its career
# subfield, total recipients and disbursements per program, and write the result
# to 'output_combined_dataset_<year>.xlsx'.
# The first entry of yearly_datasets is the 2016 workbook; later entries are
# treated as the consecutive following years.
for year, dataset_path in enumerate(yearly_datasets, start=2016):
    df_source = pd.read_excel(dataset_path)

    # Two-digit CIP family: the text before the decimal point, cut to two
    # characters and left-padded with zeros (e.g. '1.0101' -> '01').
    df_source['CIP'] = (
        df_source['CIP Code'].astype(str).str.split('.').str[0].str[:2].str.zfill(2)
    )

    # A CIP family missing from cip_to_subfield maps to NaN, and groupby drops
    # NaN keys by default, which silently removed those rows from the totals.
    # Label them explicitly so their recipients and disbursements are kept.
    df_source['Career Subfield'] = df_source['CIP'].map(cip_to_subfield).fillna('Unknown')

    # One output row per (CIP family, subfield, program title) with summed totals.
    df_aggregated = df_source.groupby(['CIP', 'Career Subfield', 'Program Title']).agg({
        'Recipients': 'sum',
        'Disbursements': 'sum'
    }).reset_index()

    # Average per recipient, defined as 0 when a program has no recipients.
    # Computed column-wise so that it also works when the frame has no rows.
    recipients = df_aggregated['Recipients']
    df_aggregated['Average Disbursements per Recipient'] = (
        df_aggregated['Disbursements'].div(recipients).where(recipients != 0, 0)
    )

    # Fix the column order and replace any remaining NaN with 0. Assigning the
    # result avoids an in-place fill on a frame produced by column selection.
    df_aggregated = df_aggregated[output_columns].fillna(0)

    # Save the individual DataFrame for each year to an Excel file
    df_aggregated.to_excel(f'output_combined_dataset_{year}.xlsx', index=False)
