In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
file_path = "../data/raw/kdkf_2016_raw.xlsx"

In [3]:
df = pd.read_excel(file_path, sheet_name="SH_HRM1", header=1)

In [4]:
df.head()

Unnamed: 0,0,0.1,2015,in %,2016,in %.1,2016.1,in %.2,2017
0,0,L A U F E N D E R E C H N U N G,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30,Personalaufwand,170690.58737,0.021355,174335.6,-0.003825,173668.702,0.003445,174267.0
2,31,Sachaufwand,59086.52689,0.006911,59494.9,-0.011354,58819.409,0.002793,58983.7
3,davon 314,Baulicher Unterhalt,7668.53859,-0.16327,6416.5,0.020455,6547.749,0.005979,6586.9
4,32,Passivzinsen,1925.31005,0.486877,2862.7,-0.39881,1721.028,0.24199,2137.5


In [5]:
def rename_columns_based_on_year(
        df: pd.DataFrame, base_year: str
    ) -> pd.DataFrame:
        # Define the base renaming dictionary
        renaming_dict = {
            "0": "Acc-ID",
            "0.1": "Name",
            f"{base_year}": "Budget y",
            f"{base_year}.1": "Realized",
            str(int(base_year) + 1): "Budget y+1",
        }

        df.columns = df.columns.map(str)

        # Filter out columns not in the renaming dictionary
        df = df[df.columns.intersection(renaming_dict.keys())]

        # Rename the columns
        df = df.rename(columns=renaming_dict)

        return df

In [6]:
df = df.drop(df.index[0])
df = rename_columns_based_on_year(df, 2016)

In [7]:
df

Unnamed: 0,Acc-ID,Name,Budget y,Realized,Budget y+1
1,30,Personalaufwand,174335.6,173668.702,174267.0
2,31,Sachaufwand,59494.9,58819.409,58983.7
3,davon 314,Baulicher Unterhalt,6416.5,6547.749,6586.9
4,32,Passivzinsen,2862.7,1721.028,2137.5
5,330,Abschreibungen Finanzvermögen,3615.0,3574.686,3665.0
6,331 - 333,Abschreibungen Verwaltungsvermögen,18425.7,22119.987,19325.9
7,34 - 37,"Anteile, Entschädigungen, Beiträge",388946.8,384609.47,394565.0
8,davon 363,Beiträge an eigene Anstalten,61524.7,62965.85,63322.3
9,davon 364,Beiträge an gemischtwirtschaftliche Unternehmu...,2450.0,2394.172,2502.0
10,davon 365,Beiträge an private Institutionen,133187.1,129856.778,135662.7


In [8]:
hrm1_to_hrm2_dict = {
    "1": (["1"], [1.0]),
    "2": (["2"], [1.0]),
    "3": (["3"], [1.0]),
    "4": (["4"], [1.0]),
    "5": (["5"], [1.0]),
    "6": (["6"], [1.0]),
    "30": (["30"], [1.0]),
    "31": (["31"], [1.0]),
    "33": (["33"], [1.0]),
    "38": (["35"], [1.0]),
    "34 - 37": (["34", "35", "36", "37"], [0.25, 0.25, 0.25, 0.25]),
    "39": (["39"], [1.0]),
    "40": (["40"], [1.0]),
    "41 / 43": (["41", "43"], [0.5, 0.5]),
    "48": (["45"], [1.0]),
    "44 - 47": (["44", "45", "46", "47"], [0.25, 0.25, 0.25, 0.25]),
    "49": (["49"], [1.0]),
    "32": (["34"], [1.0]),
    "50": (["50"], [1.0]),
    "52": (["54"], [1.0]),
    "56 - 58": (["56", "57", "58"], [1/3, 1/3, 1/3]),
    "60 - 61": (["60", "61"], [0.5, 0.5]),
    "62 - 67": (["62", "63", "64", "65", "66", "67"], [1/6, 1/6, 1/6, 1/6, 1/6, 1/6])
}


In [9]:
new_rows = []

for index, row in df.iterrows():
    old_acc_id = row['Acc-ID']
    
    # Check if the Acc-ID is a range and needs splitting
    if old_acc_id in hrm1_to_hrm2_dict:
        new_acc_ids, ratios = hrm1_to_hrm2_dict[old_acc_id]
        splits = len(new_acc_ids)

        for new_acc_id, ratio in zip(new_acc_ids, ratios):
            new_row = row.copy()
            new_row['Acc-ID'] = new_acc_id
            new_row['Budget y'] *= ratio
            new_row['Realized'] *= ratio
            new_row['Budget y+1'] *= ratio
            new_rows.append(new_row.to_dict())
    else:
        # Handle cases where Acc-ID is not in the mapping
        new_rows.append(row)

In [10]:
new_df = pd.DataFrame(new_rows, index=None)

In [13]:
new_df

Unnamed: 0,Acc-ID,Name,Budget y,Realized,Budget y+1
0,30,Personalaufwand,174335.6,173668.702,174267.0
1,31,Sachaufwand,59494.9,58819.409,58983.7
2,davon 314,Baulicher Unterhalt,6416.5,6547.749,6586.9
3,34,Passivzinsen,2862.7,1721.028,2137.5
4,330,Abschreibungen Finanzvermögen,3615.0,3574.686,3665.0
5,331 - 333,Abschreibungen Verwaltungsvermögen,18425.7,22119.987,19325.9
6,34,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
7,35,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
8,36,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
9,37,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25


In [16]:
new_df["Acc-ID"] = new_df["Acc-ID"].astype(str)

# Keep only rows with numeric 'Acc-ID'
new_df = new_df[new_df["Acc-ID"].str.isnumeric()]
new_df = new_df[new_df['Acc-ID'] != '0']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["Acc-ID"] = new_df["Acc-ID"].astype(str)


In [17]:
new_df

Unnamed: 0,Acc-ID,Name,Budget y,Realized,Budget y+1
0,30,Personalaufwand,174335.6,173668.702,174267.0
1,31,Sachaufwand,59494.9,58819.409,58983.7
3,34,Passivzinsen,2862.7,1721.028,2137.5
4,330,Abschreibungen Finanzvermögen,3615.0,3574.686,3665.0
6,34,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
7,35,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
8,36,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
9,37,"Anteile, Entschädigungen, Beiträge",97236.7,96152.3675,98641.25
16,35,Einlagen in Spezialfinanzierungen/Fonds,14949.0,22073.59,16501.0
17,389,Einlagen in das Eigenkapital,0.0,33100.0,0.0
