In [2]:
import pandas as pd

### 2020 source file spec

In [3]:
# Render floats in fixed-point notation with five decimals (no scientific notation).
pd.set_option('display.float_format', '{:.5f}'.format)

# Reporting years covered by the 2020 source file: the year before it,
# the file's own year, and the year after it.
previous_year, start_year, target_year = 2019, 2020, 2021

# Location of the raw 2020 workbook.
file_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\raw\kdkf_2020_raw.xlsx'

# Open the workbook once so that its sheet names can be inspected.
xls = pd.ExcelFile(file_path)

### Master Loop

In [4]:
# Build the 2020 master table from every sheet of the workbook whose name
# starts with "HRM2_KT_".

# Prefix that identifies the sheets to load; the remainder of the sheet name
# is stored in the 'Kanton' column.
sheet_prefix = "HRM2_KT_"

# Column labels (after the header fix-up below) mapped to year-specific names.
# The mapping is identical for every sheet, so it is built once, outside the loop.
# Keys that do not occur in a sheet are simply ignored by DataFrame.rename.
rename_dict = {
    'HRM 2': 'HRM2-ID',
    'Unnamed: 2': 'HRM2-Bezeichnung',
    'Rechnung': f'Rechnung {previous_year}',
    'Budget': f'Budget {start_year}',
    'Anpassung Budget**': f'Anpassung Budget {start_year}',
    'Budgetplus***': f'Budgetplus {start_year}',
    'Rechnung.1': f'Rechnung {start_year}',
    'Budget.1': f'Budget {target_year}'
}

# Collect the per-sheet frames in a list and concatenate once at the end.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# every iteration.
sheet_frames = []

for sheet_name in xls.sheet_names:
    if not sheet_name.startswith(sheet_prefix):
        continue

    # Parse from the already-open workbook instead of re-opening the file by
    # path for each sheet. The first three rows are skipped.
    df = pd.read_excel(xls, sheet_name=sheet_name, skiprows=3)

    # The labels of the first two columns sit one row below the rest of the
    # header: take them from the first data row, then drop that row.
    df.columns = df.iloc[0, :2].tolist() + df.columns[2:].tolist()
    df = df.iloc[1:].reset_index(drop=True)

    df = df.rename(columns=rename_dict)

    # Keep only rows whose 'Referenz-ID' contains 'HRM2'; rows with a missing
    # ID are dropped (na=False). The explicit .copy() makes the result an
    # independent frame, so the column assignments below do not trigger
    # pandas' SettingWithCopyWarning.
    df = df[df['Referenz-ID'].str.contains('HRM2', na=False)].copy()

    # Provenance columns: originating sheet, the part of the sheet name after
    # the prefix, and the year of the source file (start_year, i.e. 2020).
    df['Source'] = sheet_name
    df['Kanton'] = sheet_name.split(sheet_prefix)[1]
    df['Year'] = start_year

    sheet_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# sheet matched the prefix.
master_2020 = pd.concat(sheet_frames, ignore_index=True) if sheet_frames else pd.DataFrame()

# Display the first few rows of the master DataFrame
master_2020.head()

Unnamed: 0,Referenz-ID,HRM2-ID,HRM2-Bezeichnung,Rechnung 2019,Budget 2020,Rechnung 2020,Budget 2021,Source,Kanton,Year,Compte,Compte.1
0,HRM2_ER0030,30,Personalaufwand,25078006.42584,25365050.64956,25538338.81869,25939361.82914,HRM2_KT_alle,alle,2020,,
1,HRM2_ER0031,31,Sach- und übriger Betriebsaufwand,9695994.15067,10070494.85185,9648999.92794,10237462.65679,HRM2_KT_alle,alle,2020,,
2,HRM2_ER0314,davon 314,baulicher und betrieblicher Unterhalt,1093416.79827,1130573.045,1100302.72593,1116254.812,HRM2_KT_alle,alle,2020,,
3,HRM2_ER0318,davon 3180,Wertberichtigungen auf Forderungen,124425.54987,48634.503,55150.77083,41547.87,HRM2_KT_alle,alle,2020,,
4,HRM2_neu_ER0033,33,Abschreibungen VV,2834876.47063,2894989.36082,2874363.86922,2882024.76714,HRM2_KT_alle,alle,2020,,


### Mapping FR columns ('Compte') to their DE equivalents ('Rechnung')

In [5]:
# # Display rows where the column 'Kanton' is 'VD'
# filtered_rows = master_2020[master_2020['Kanton'] == 'VD']

# # Display the filtered rows
# filtered_rows.head()

Unnamed: 0,Referenz-ID,HRM2-ID,HRM2-Bezeichnung,Rechnung 2019,Budget 2020,Rechnung 2020,Budget 2021,Source,Kanton,Year,Compte,Compte.1
4531,HRM2_ER0030,30,Charges de personnel,,2533035.4,,2597460.6,HRM2_KT_VD,VD,2020,2477989.4,2521763.3
4532,HRM2_ER0031,31,Charges de biens et services et autres charges...,,735805.8,,739927.4,HRM2_KT_VD,VD,2020,794592.5,716740.1
4533,HRM2_ER0314,de cela 314,Gros entretien et entretien courant,,77389.1,,72261.9,HRM2_KT_VD,VD,2020,91221.6,77201.4
4534,HRM2_ER0318,de cela 3180,Réévaluations sur créances,,0.0,,0.0,HRM2_KT_VD,VD,2020,40976.4,-7308.1
4535,HRM2_neu_ER0033,33,Amortissements du patrimoine administratif,,157700.8,,160915.0,HRM2_KT_VD,VD,2020,153493.6,136033.0


In [6]:
# Fill gaps in the 'Rechnung' columns from their 'Compte' counterparts.
# Existing 'Rechnung' values are never overwritten, and rows where both columns
# are missing stay missing.
#
# The 'Rechnung' column names are derived from the same year constants that
# were used when the columns were created, so the two cells cannot drift apart.
fallback_columns = {
    f'Rechnung {previous_year}': 'Compte',    # 1. 'Rechnung 2019' <- 'Compte'
    f'Rechnung {start_year}': 'Compte.1',     # 2. 'Rechnung 2020' <- 'Compte.1'
}

for rechnung_col, compte_col in fallback_columns.items():
    # The 'Compte' columns only exist if at least one loaded sheet contained
    # them; without them there is nothing to fill in.
    if compte_col in master_2020.columns:
        # Vectorized replacement for the former row-wise DataFrame.apply:
        # fillna with a Series fills each missing value from the same row.
        master_2020[rechnung_col] = master_2020[rechnung_col].fillna(master_2020[compte_col])

# Display the first few rows of the updated DataFrame
master_2020.head()


Unnamed: 0,Referenz-ID,HRM2-ID,HRM2-Bezeichnung,Rechnung 2019,Budget 2020,Rechnung 2020,Budget 2021,Source,Kanton,Year,Compte,Compte.1
0,HRM2_ER0030,30,Personalaufwand,25078006.42584,25365050.64956,25538338.81869,25939361.82914,HRM2_KT_alle,alle,2020,,
1,HRM2_ER0031,31,Sach- und übriger Betriebsaufwand,9695994.15067,10070494.85185,9648999.92794,10237462.65679,HRM2_KT_alle,alle,2020,,
2,HRM2_ER0314,davon 314,baulicher und betrieblicher Unterhalt,1093416.79827,1130573.045,1100302.72593,1116254.812,HRM2_KT_alle,alle,2020,,
3,HRM2_ER0318,davon 3180,Wertberichtigungen auf Forderungen,124425.54987,48634.503,55150.77083,41547.87,HRM2_KT_alle,alle,2020,,
4,HRM2_neu_ER0033,33,Abschreibungen VV,2834876.47063,2894989.36082,2874363.86922,2882024.76714,HRM2_KT_alle,alle,2020,,


In [7]:
# # Display rows where the column 'Kanton' is 'VD'
# filtered_rows = master_2020[master_2020['Kanton'] == 'VD']

# # Display the filtered rows
# filtered_rows.head()

Unnamed: 0,Referenz-ID,HRM2-ID,HRM2-Bezeichnung,Rechnung 2019,Budget 2020,Rechnung 2020,Budget 2021,Source,Kanton,Year,Compte,Compte.1
4531,HRM2_ER0030,30,Charges de personnel,2477989.4,2533035.4,2521763.3,2597460.6,HRM2_KT_VD,VD,2020,2477989.4,2521763.3
4532,HRM2_ER0031,31,Charges de biens et services et autres charges...,794592.5,735805.8,716740.1,739927.4,HRM2_KT_VD,VD,2020,794592.5,716740.1
4533,HRM2_ER0314,de cela 314,Gros entretien et entretien courant,91221.6,77389.1,77201.4,72261.9,HRM2_KT_VD,VD,2020,91221.6,77201.4
4534,HRM2_ER0318,de cela 3180,Réévaluations sur créances,40976.4,0.0,-7308.1,0.0,HRM2_KT_VD,VD,2020,40976.4,-7308.1
4535,HRM2_neu_ER0033,33,Amortissements du patrimoine administratif,153493.6,157700.8,136033.0,160915.0,HRM2_KT_VD,VD,2020,153493.6,136033.0


In [8]:
# Number of rows in the combined master table.
master_2020.shape[0]

5319

### Save df to .csv

In [9]:
# Destination of the prepared 2020 dataset.
output_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\ready\kdkf_2020_data.csv'

# Write the master table as CSV; the row index is left out of the file.
master_2020.to_csv(path_or_buf=output_path, index=False)
