In [18]:
import pandas as pd

### 2018 source file spec

In [19]:
# Location of the raw 2018 workbook
file_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\raw\kdkf_2018_raw.xlsx'

# Open the workbook once; its sheet names are iterated further down
xls = pd.ExcelFile(file_path)

# Years used to label the value columns (previous / current / following year)
previous_year, start_year, target_year = 2017, 2018, 2019

# Render floats in fixed-point notation with five decimals instead of scientific notation
pd.set_option('display.float_format', '{:.5f}'.format)

### Master Loop

In [20]:
# Year-qualified names for the value columns as they come out of the sheet header.
# pandas suffixes a repeated header with ".1", hence 'Rechnung.1' / 'Budget.1'.
rename_dict = {
    'Rechnung': f'Rechnung {previous_year}',
    'Budget': f'Budget {start_year}',
    'Rechnung.1': f'Rechnung {start_year}',
    'Budget.1': f'Budget {target_year}'
}

# Tidied sheets are collected here and concatenated once after the loop;
# calling pd.concat inside the loop would re-copy all earlier rows on every pass.
sheet_frames = []

# Loop over each sheet in the Excel file
for sheet_name in xls.sheet_names:
    # Only sheets whose name ends with "_HRM2" are processed
    if not sheet_name.endswith("_HRM2"):
        continue

    # Read the sheet from the workbook that is already open (no rows are skipped,
    # so the first sheet row becomes the header)
    df = pd.read_excel(xls, sheet_name=sheet_name)

    # The first row below the header is not kept as data
    df = df.iloc[1:].reset_index(drop=True)

    # Delete the second column. This is done while the labels are still the unique
    # ones produced by read_excel, so dropping by label removes exactly that column.
    df = df.drop(columns=df.columns[1])

    # Attach the years to the value columns
    df = df.rename(columns=rename_dict)

    # Name the two leading columns by position. A fresh label list is assigned
    # instead of writing into df.columns.values, because an Index is immutable and
    # mutating its backing array is not supported.
    column_names = df.columns.tolist()
    column_names[0] = 'HRM2-ID'
    column_names[1] = 'HRM2-Bezeichnung'
    df.columns = column_names

    # Provenance columns: sheet name, sheet name up to the '_HRM2' suffix, reporting year
    df['Source'] = sheet_name
    df['Kanton'] = sheet_name.split('_HRM2')[0]
    df['Year'] = start_year

    sheet_frames.append(df)

# Build the master DataFrame in one step; fall back to an empty frame when no sheet matched
master_2018 = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()

# Display the first few rows of the master DataFrame
master_2018.head()

Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2017,Budget 2018,Rechnung 2018,Budget 2019,Source,Kanton,Year,Compte,Compte.1
0,ERFOLGSRECHNUNG,,,,,,ZH_HRM2,ZH,2018,,
1,30,Personalaufwand,5240375.77332,5315346.187,5358724.42478,5464826.83265,ZH_HRM2,ZH,2018,,
2,31,Sach- und übriger Betriebsaufwand,3035765.80313,3119151.583,3085488.27122,3011912.733,ZH_HRM2,ZH,2018,,
3,davon 314,baulicher und betrieblicher Unterhalt,210278.89938,195508.1,205381.71041,191377.3,ZH_HRM2,ZH,2018,,
4,davon 3180,Wertberichtigungen auf Forderungen,8031.84811,3405.4,29712.56065,1576.1,ZH_HRM2,ZH,2018,,


### Translating FR columns to DE

In [21]:
# # Display rows where the column 'Kanton' is 'VD'
# filtered_rows = master_2018[master_2018['Kanton'] == 'VD']

# # Display the filtered rows
# filtered_rows.head()

In [22]:
# Some sheets (VD, for example) deliver their actuals under the headers
# 'Compte' / 'Compte.1' rather than 'Rechnung' / 'Rechnung.1', so after the concat those
# rows have empty 'Rechnung ...' cells. Copy the values across, column by column.
compte_to_rechnung = {
    'Compte': f'Rechnung {previous_year}',
    'Compte.1': f'Rechnung {start_year}',
}

for compte_col, rechnung_col in compte_to_rechnung.items():
    # No sheet contributed this column: nothing to copy
    if compte_col not in master_2018.columns:
        continue

    # Rows where the 'Rechnung' cell is missing while the 'Compte' cell has a value
    needs_fill = master_2018[rechnung_col].isna() & master_2018[compte_col].notna()

    # Vectorised fill; existing 'Rechnung' values are never overwritten
    master_2018.loc[needs_fill, rechnung_col] = master_2018.loc[needs_fill, compte_col]

# Display the first few rows of the updated DataFrame
master_2018.head()


Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2017,Budget 2018,Rechnung 2018,Budget 2019,Source,Kanton,Year,Compte,Compte.1
0,ERFOLGSRECHNUNG,,,,,,ZH_HRM2,ZH,2018,,
1,30,Personalaufwand,5240375.77332,5315346.187,5358724.42478,5464826.83265,ZH_HRM2,ZH,2018,,
2,31,Sach- und übriger Betriebsaufwand,3035765.80313,3119151.583,3085488.27122,3011912.733,ZH_HRM2,ZH,2018,,
3,davon 314,baulicher und betrieblicher Unterhalt,210278.89938,195508.1,205381.71041,191377.3,ZH_HRM2,ZH,2018,,
4,davon 3180,Wertberichtigungen auf Forderungen,8031.84811,3405.4,29712.56065,1576.1,ZH_HRM2,ZH,2018,,


In [26]:
# Select the rows whose 'Kanton' value is 'VD'
filtered_rows = master_2018.loc[master_2018['Kanton'].eq('VD')]

# Preview the selection
filtered_rows.head()

Unnamed: 0,HRM2-ID,HRM2-Bezeichnung,Rechnung 2017,Budget 2018,Rechnung 2018,Budget 2019,Source,Kanton,Year,Compte,Compte.1
3868,30,Charges de personnel,2394031.35592,2447668.6,2430731.7,2485444.5,VD_HRM2,VD,2018,2394031.35592,2430731.7
3869,31,Charges de biens et services et autres charges...,761729.46881,717674.1,742236.2,719558.7,VD_HRM2,VD,2018,761729.46881,742236.2
3872,330,Immobilisations corporelles du PA,239285.74942,173298.8,331533.5,139099.9,VD_HRM2,VD,2018,239285.74942,331533.5
3873,332,Amortissements des immobilisations incorporelles,23050.48773,23287.8,19923.2,16013.0,VD_HRM2,VD,2018,23050.48773,19923.2
3874,339,Remboursement du découvert du bilan,0.0,0.0,0.0,0.0,VD_HRM2,VD,2018,0.0,0.0


### Drop rows with non-numeric HRM2-IDs before saving, because they would cause issues with the subsequent automated analysis

In [24]:
# Keep only rows whose 'HRM2-ID' parses as a number: with errors='coerce' every
# unparseable ID turns into NaN, and those rows are filtered out
master_2018 = master_2018.loc[~pd.to_numeric(master_2018['HRM2-ID'], errors='coerce').isna()]

# Number of rows that remain
master_2018.shape[0]

2704

In [25]:
# Destination of the cleaned 2018 data set
output_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\ready\kdkf_2018_data.csv'

# Write the DataFrame as CSV; the row index is left out of the file
master_2018.to_csv(path_or_buf=output_path, index=False)
