In [4]:
import pandas as pd

### 2011 source file spec

In [5]:
# Show floats in fixed-point notation with five decimals instead of scientific notation
pd.options.display.float_format = '{:.5f}'.format

# Years used further down to label the value columns of each sheet
previous_year, start_year, target_year = 2010, 2011, 2012

# Location of the raw 2011 workbook
# NOTE(review): absolute, user-specific path; the notebook only runs on this machine as-is
file_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\raw\kdkf_2011_raw.xls'

# Open the workbook once so its sheet names can be inspected
xls = pd.ExcelFile(file_path)

### Master Loop

In [6]:
# Year-qualified names for the value columns. pandas de-duplicates repeated
# header names by appending ".1", hence 'Rechnung.1' / 'Budget.1'.
rename_dict = {
    'Rechnung': f'Rechnung {previous_year}',
    'Budget': f'Budget {start_year}',
    'Rechnung.1': f'Rechnung {start_year}',
    'Budget.1': f'Budget {target_year}'
}

# Collect one cleaned DataFrame per sheet and concatenate once after the loop;
# calling pd.concat inside the loop would re-copy all earlier rows every iteration.
sheet_frames = []

# Loop over each sheet in the Excel file
for sheet_name in xls.sheet_names:
    # Only process sheets whose name ends with "HRM2"
    if not sheet_name.endswith("HRM2"):
        continue

    # Read from the already-open workbook instead of re-opening the file per
    # sheet. No rows are skipped: the sheet's first row becomes the header.
    df = pd.read_excel(xls, sheet_name=sheet_name)

    # Discard the first data row (it only carries header labels for the two
    # leading columns) and remove the second column. Both are done by position,
    # so NaN or duplicate column labels cannot affect which column is dropped.
    keep_positions = [0] + list(range(2, df.shape[1]))
    df = df.iloc[1:, keep_positions].reset_index(drop=True)

    # Build the final header as a plain list and assign it in one step rather
    # than mutating df.columns.values in place (an Index is meant to be immutable).
    new_columns = [rename_dict.get(label, label) for label in df.columns]
    new_columns[0] = 'HRM2-ID'
    new_columns[1] = 'HRM2-Bezeichnung'
    df.columns = new_columns

    # Record provenance: the sheet name, the part of the sheet name before
    # " HRM2" as the canton, and the reporting year of this file
    df = df.assign(
        Source=sheet_name,
        Kanton=sheet_name.split(' HRM2')[0],
        Year=start_year,
    )

    sheet_frames.append(df)

# Combine all sheets; fall back to an empty DataFrame if no sheet matched,
# because pd.concat raises on an empty list
master_2011 = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()

# Display the first few rows of the master DataFrame
master_2011.head()

Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2010,Budget 2011,Rechnung 2011,Budget 2012,Unnamed: 7,Unnamed: 8,Source,Kanton,Year
0,ERFOLGSRECHNUNG,,,,,,,,ZH HRM2,ZH,2011
1,30,Personalaufwand,4605152.0,4736464.0,7369400.9,4663153.945,,,ZH HRM2,ZH,2011
2,31,Sach- und übriger Betriebsaufwand,2608960.0,2627790.0,2693614.8,2515883.068,,,ZH HRM2,ZH,2011
3,33,Abschreibungen Verwaltungsvermögen,545973.0,439101.0,501069.37,462796.042,,,ZH HRM2,ZH,2011
4,35,Einlagen in Fonds und Spezialfinanzierungen,85972.0,25385.0,71126.1,20483.2,,,ZH HRM2,ZH,2011


### Translating FR columns to DE
Not needed, because there are no French-speaking cantons with HRM2 reporting.

### Before saving the data, we drop rows with a non-numeric "HRM2-ID", because they would cause issues with subsequent automated analysis.

In [7]:
# Drop the columns pandas auto-named 'Unnamed: 7' and 'Unnamed: 8'.
# errors='ignore' keeps this cell re-runnable: without it, a second run raises
# KeyError because the columns are already gone.
master_2011 = master_2011.drop(columns=['Unnamed: 7', 'Unnamed: 8'], errors='ignore')

# Keep only rows whose 'HRM2-ID' parses as a number; values that cannot be
# parsed (and missing values) become NaN under errors='coerce' and are dropped
hrm2_id_is_numeric = pd.to_numeric(master_2011['HRM2-ID'], errors='coerce').notna()
master_2011 = master_2011.loc[hrm2_id_is_numeric]

# Show the number of remaining rows
len(master_2011)

504

In [8]:
# Destination of the cleaned 2011 dataset
# NOTE(review): absolute, user-specific path; adjust when running on another machine
output_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\ready\kdkf_2011_data.csv'

# Write master_2011 to CSV, leaving out the row index
master_2011.to_csv(path_or_buf=output_path, index=False)
