In [19]:
import pandas as pd

### 2013 source file spec

In [20]:
# Years used to build the year-specific column names further down
previous_year, start_year, target_year = 2012, 2013, 2014

# Location of the raw workbook
file_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\raw\kdkf_2013_raw.xlsx'

# Open the workbook once; its sheet names are iterated in the master loop
xls = pd.ExcelFile(file_path)

# Render floats in fixed-point notation with five decimals when displaying frames
pd.set_option('display.float_format', '{:.5f}'.format)

### Master Loop

In [21]:
# Header labels as read from the sheets, mapped to year-specific names.
# The ".1" keys are presumably the suffix pandas appends to a repeated header
# label. Built once, because it does not depend on the sheet.
rename_dict = {
    'Rechnung': f'Rechnung {previous_year}',
    'Budget': f'Budget {start_year}',
    'Rechnung.1': f'Rechnung {start_year}',
    'Budget.1': f'Budget {target_year}'
}

# One tidy frame per sheet; they are concatenated once after the loop instead
# of growing the master frame with pd.concat on every iteration.
sheet_frames = []

for sheet_name in xls.sheet_names:
    # Only sheets whose name ends with "HRM2" are processed
    if not sheet_name.endswith("HRM2"):
        continue

    # Parse from the already-open workbook rather than re-opening the file for
    # every sheet. No rows are skipped: the first row becomes the header.
    raw = pd.read_excel(xls, sheet_name=sheet_name)

    # Drop the first data row and the second column. The original used that
    # row only to label the first two columns, and those labels were then
    # overwritten or dropped. Selecting by position keeps the column drop safe
    # even if two columns end up with the same label.
    keep_positions = [0] + list(range(2, raw.shape[1]))
    df = raw.iloc[1:, keep_positions].reset_index(drop=True)

    # Build the final labels in a plain list and assign them in one step,
    # instead of mutating the Index's underlying array in place.
    labels = [rename_dict.get(label, label) for label in df.columns]
    labels[0] = 'HRM2-ID'
    labels[1] = 'HRM2-Bezeichnung'
    df.columns = labels

    # Provenance columns: sheet name, the part of it before " HRM2", and the year
    df['Source'] = sheet_name
    df['Kanton'] = sheet_name.split(' HRM2')[0]
    df['Year'] = start_year

    sheet_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# sheet matched (the result the original loop produced in that case).
master_2013 = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()

# Display the first few rows of the master DataFrame
master_2013.head()

Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2012,Budget 2013,Rechnung 2013,Budget 2014,Source,Kanton,Year,Compte,Compte.1
0,ERFOLGSRECHNUNG,,,def.,,,ZH HRM2,ZH,2013,,
1,30,Personalaufwand,4782409.41,4827007.19000,4727960.2232,4944121.854,ZH HRM2,ZH,2013,,
2,31,Sach- und übriger Betriebsaufwand,2742025.9,2633519.28000,2856763.26085,2972686.192,ZH HRM2,ZH,2013,,
3,davon 314,baulicher und betrieblicher Unterhalt,194592.14,202593.45000,207716.983,213736.65,ZH HRM2,ZH,2013,,
4,davon 3180,Wertberichtigungen auf Forderungen,-15285.0,3557.70000,3809.96198,916.6,ZH HRM2,ZH,2013,,


### Mapping the French (FR) value columns to their German (DE) equivalents

In [22]:
# Preview the rows whose 'Kanton' value is 'FR'
filtered_rows = master_2013.loc[master_2013['Kanton'].eq('FR')]

filtered_rows.head()

Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2012,Budget 2013,Rechnung 2013,Budget 2014,Source,Kanton,Year,Compte,Compte.1
1288,Compte de résultats,,,def.,,def.,FR HRM2,FR,2013,,
1289,30,Charges de personnel,,1173958.40000,,1174614,FR HRM2,FR,2013,1158331.9,1179831.0
1290,31,Charges de biens et services et autres charges...,,323472,,323144,FR HRM2,FR,2013,318554.0,319170.0
1291,de cela 314,Gros entretien et entretien courant,,,,43868,FR HRM2,FR,2013,49697.8,45853.0
1292,de cela 3180,Réévaluations sur créances,,,,0,FR HRM2,FR,2013,13764.4,2331.0


In [23]:
def _first_present(row, primary, fallback):
    """Return row[primary], or row[fallback] when only the primary value is missing."""
    value = row[primary]
    if pd.isna(value) and not pd.isna(row[fallback]):
        return row[fallback]
    return value


# Fill each 'Rechnung' column from its 'Compte' counterpart, row by row:
#   'Rechnung 2012' <- 'Compte'
#   'Rechnung 2013' <- 'Compte.1'
for primary, fallback in (('Rechnung 2012', 'Compte'), ('Rechnung 2013', 'Compte.1')):
    master_2013[primary] = master_2013.apply(
        _first_present, axis=1, args=(primary, fallback)
    )

# Show the head of the updated frame
master_2013.head()


Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2012,Budget 2013,Rechnung 2013,Budget 2014,Source,Kanton,Year,Compte,Compte.1
0,ERFOLGSRECHNUNG,,,def.,,,ZH HRM2,ZH,2013,,
1,30,Personalaufwand,4782409.41,4827007.19000,4727960.2232,4944121.854,ZH HRM2,ZH,2013,,
2,31,Sach- und übriger Betriebsaufwand,2742025.9,2633519.28000,2856763.26085,2972686.192,ZH HRM2,ZH,2013,,
3,davon 314,baulicher und betrieblicher Unterhalt,194592.14,202593.45000,207716.983,213736.65,ZH HRM2,ZH,2013,,
4,davon 3180,Wertberichtigungen auf Forderungen,-15285.0,3557.70000,3809.96198,916.6,ZH HRM2,ZH,2013,,


In [27]:
# Re-inspect the rows whose 'Kanton' value is 'FR'
is_fr = master_2013['Kanton'] == 'FR'
filtered_rows = master_2013[is_fr]

filtered_rows.head()

Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2012,Budget 2013,Rechnung 2013,Budget 2014,Source,Kanton,Year,Compte,Compte.1
1289,30,Charges de personnel,1158331.9,1173958.4,1179831.0,1174614.0,FR HRM2,FR,2013,1158331.9,1179831.0
1290,31,Charges de biens et services et autres charges...,318554.0,323472.0,319170.0,323144.0,FR HRM2,FR,2013,318554.0,319170.0
1293,330,Immobilisations corporelles du PA,61027.6,66129.0,64408.0,78570.0,FR HRM2,FR,2013,61027.6,64408.0
1294,332,Amortissements des immobilisations incorporelles,,,,,FR HRM2,FR,2013,,
1295,339,Remboursement du découvert du bilan,0.0,,,,FR HRM2,FR,2013,0.0,


### Before saving the data, we drop rows with a non-numeric "HRM2-ID", because they would cause issues in the subsequent automated analysis.

In [25]:
# Coerce 'HRM2-ID' to numbers; values that cannot be parsed become NaN
hrm2_id_numeric = pd.to_numeric(master_2013['HRM2-ID'], errors='coerce')

# Keep only the rows whose 'HRM2-ID' parsed as a number
master_2013 = master_2013[hrm2_id_numeric.notna()]

# Number of rows remaining after the filter
len(master_2013)

1664

In [26]:
# Destination of the cleaned data set
output_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\ready\kdkf_2013_data.csv'

# Write the master frame as CSV without the row index
master_2013.to_csv(path_or_buf=output_path, index=False)
