In [84]:
import os
import pandas as pd
import numpy as np

# Import Files

### View the various tabs of the files

In [85]:
def list_tabs_in_excel_files(folder_path):
    # List to store the names of Excel files and their tabs
    excel_files_and_tabs = []

    # Iterate over the files in the specified folder
    for file in os.listdir(folder_path):
        # Check for both .xlsx and .xls files
        if file.endswith(('.xlsx', '.xls')):
            file_path = os.path.join(folder_path, file)
            try:
                # Load the Excel file
                xls = pd.ExcelFile(file_path)
                # Store the file name and its tabs
                excel_files_and_tabs.append((file, xls.sheet_names))
            except Exception as e:
                print(f"Error processing file {file}: {e}")

    return excel_files_and_tabs

In [86]:
# Specify folder location with .xls and .xlsx files
folder_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\raw'  # Use a raw string or properly escaped path
tabs_in_files = list_tabs_in_excel_files(folder_path)

# Print the list of files and their tabs
for file_name, tabs in tabs_in_files:
    print(f"File: {file_name}, Tabs: {tabs}")


File: kdkf_2001-2002_raw.xls, Tabs: ['ZH', 'BE', 'LU', 'UR', 'SZ', 'OW', 'NW', 'GL', 'ZG', 'FR', 'SO', 'BS', 'BL', 'SH', 'AR', 'AI', 'SG', 'GR', 'AG', 'TG', 'TI', 'VD', 'VS', 'NE', 'GE', 'JU', 'CHF', 'CHD', 'Abschlusszahlen Budget 2001', 'Abschlusszahlen Rechnung 2001', 'Abschlusszahlen Budgets 2002', 'Abschlusszahlen Rechnung 2002', 'Übersicht Saldo L. R. ', 'Finanzierungsfehlbetrag', 'Selbstfinanzierungsgrad', 'Erläuterung Kennzahlen']
File: kdkf_2003-2004_raw.xls, Tabs: ['ZH', 'BE', 'LU', 'UR', 'SZ', 'OW', 'NW', 'GL', 'ZG', 'FR', 'SO', 'BS', 'BL', 'SH', 'AR', 'AI', 'SG', 'GR', 'AG', 'TG', 'TI', 'VD', 'VS', 'NE', 'GE', 'JU', 'CHF', 'CHD', 'AbsschlusszahlenBudget 2003', 'Abschlusszahlen Rechnung 03', 'Abschlusszahlen Budgets 2004', 'Abschlusszahlen Rechnung 2004', 'Übersicht Saldo L. R. ', 'Finanzierungsfehlbetrag', 'Selbstfinanzierungsgrad', 'Erläuterung Kennzahlen']
File: kdkf_2005-2006_raw.xls, Tabs: ['ZH', 'BE', 'LU', 'UR', 'SZ', 'OW', 'NW', 'GL', 'ZG', 'FR', 'SO', 'BS', 'BL', 'SH

### Load Testfile and extract data

In [87]:
# Define the file path and the sheet name
file_path = r'C:\Users\domin\Documents\GitHub\xai_budgeting\data\raw\kdkf_2022_raw.xlsx'  # Use a raw string for the file path
sheet_name = 'HRM2_KT_AG'

# Load the specified sheet into a DataFrame
hrm2_kt_ag = pd.read_excel(file_path, sheet_name=sheet_name, skiprows=3) # Skip the first three rows as these are solely headers

# Display the first few rows of the DataFrame
hrm2_kt_ag.head()


Unnamed: 0.1,Unnamed: 0,Aargau,Unnamed: 2,Rechnung,Budget*,Anpassung Budget**,Budgetplus***,Rechnung.1,Budget
0,Referenz-ID,HRM 2,in 1 000 Franken,2021.0,2022.0,2022.0,2022.0,2022.0,2023.0
1,ER,ERFOLGSRECHNUNG,,,,,,,
2,HRM2_ER0030,30,Personalaufwand,1710857.49536,1824800.673,47003.566,1871804.239,1769407.06614,1913008.922
3,HRM2_ER0031,31,Sach- und übriger Betriebsaufwand,465105.56175,486337.94,66409.523,552747.463,471235.72459,540895.2865
4,HRM2_ER0314,davon 314,baulicher und betrieblicher Unterhalt,40709.52698,38381.362,1000.0,39381.362,36143.20582,38863.247


In [88]:
# Set the pandas option to display floating point numbers using the decimal format
pd.options.display.float_format = '{:.5f}'.format

In [90]:
# Merge the first row with the column headers in a dataframe
hrm2_kt_ag.columns = hrm2_kt_ag.iloc[0, :2].tolist() + (hrm2_kt_ag.columns[2:]).tolist()
hrm2_kt_ag = hrm2_kt_ag.iloc[1:].reset_index(drop=True) 
hrm2_kt_ag.head()


Unnamed: 0,ER,ERFOLGSRECHNUNG,Unnamed: 2,Rechnung,Budget*,Anpassung Budget**,Budgetplus***,Rechnung.1,Budget
0,HRM2_ER0030,30,Personalaufwand,1710857.49536,1824800.673,47003.566,1871804.239,1769407.06614,1913008.922
1,HRM2_ER0031,31,Sach- und übriger Betriebsaufwand,465105.56175,486337.94,66409.523,552747.463,471235.72459,540895.2865
2,HRM2_ER0314,davon 314,baulicher und betrieblicher Unterhalt,40709.52698,38381.362,1000.0,39381.362,36143.20582,38863.247
3,HRM2_ER0318,davon 3180,Wertberichtigungen auf Forderungen,1524.78485,1550.0,0.0,1550.0,-1948.07897,1483.5
4,HRM2_neu_ER0033,33,Abschreibungen VV,264389.83,201998.18638,0.0,201998.18638,176835.45829,162266.45182


In [None]:
# Update Headers of the df


