In [61]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml

from src.feature_engineering import (
    apply_feature_engineering,
    drop_all_zero_entries,
    choose_acc_ids,
)

In [62]:
# Load the account configuration. It is later indexed by CATEGORY; presumably it maps
# category names to account IDs -- TODO confirm against acc_config.yaml.
acc_config_path = Path("../config/acc_config.yaml")
# Pin the encoding: without it the file is decoded with the platform's locale default,
# which can mangle non-ASCII text on systems whose default is not UTF-8.
with acc_config_path.open("r", encoding="utf-8") as yaml_file:
    acc_config = yaml.safe_load(yaml_file)

## Choose the account category to analyse

In [63]:
# Category analysed in this notebook: used as the lookup key into acc_config
# and interpolated into plot titles and output file names.
CATEGORY = "Investitionsausgaben"

In [64]:
# Fail fast on an unknown category: `acc_config.get(CATEGORY)` would silently hand
# `None` to `choose_acc_ids` if the key were misspelled or missing.
if CATEGORY not in acc_config:
    raise KeyError(
        f"Category {CATEGORY!r} not found in {acc_config_path}; "
        f"available categories: {list(acc_config)}"
    )
acc_ids = acc_config[CATEGORY]

# Load the merged dataset (path is relative to the notebook's working directory).
file_path = Path("../data/final/merged_complete.csv")
df = pd.read_csv(file_path, index_col=None, header=0)

# Convert 'Year' to a relative year (currently disabled)
# df["Year"] = df["Year"] - df["Year"].min()

# Sort chronologically, run the project's feature-engineering helpers, then restrict
# the frame to the accounts configured for CATEGORY.
df = df.sort_values(by="Year")
df = apply_feature_engineering(df)
df = drop_all_zero_entries(df)
df = choose_acc_ids(df, acc_ids)

# Keep only the first row for each (Year, Region, Acc-ID) combination.
df = df.drop_duplicates(subset=['Year', 'Region', 'Acc-ID'], keep='first')


In [65]:
# Relative deviation of the budget from the realized value, in percent:
#   (Budget y - Realized) / Realized * 100
# Infinite results (from dividing by a zero 'Realized') are replaced by NaN.
budget_minus_realized = df['Budget y'] - df['Realized']
relative_deviation = budget_minus_realized / df['Realized']
df['Percentage Difference'] = relative_deviation.replace([np.inf, -np.inf], np.nan) * 100

In [66]:
# Trim outliers: keep only rows whose percentage difference lies within the
# [1%, 99%] quantile range (bounds inclusive). Rows outside the range -- and rows
# where the value is NaN -- are dropped, not clipped.
# NOTE: the quantiles are computed from the current `df`, so re-running this cell
# trims the already-trimmed frame again.
low_quantile = df['Percentage Difference'].quantile(0.01)
high_quantile = df['Percentage Difference'].quantile(0.99)

within_quantiles = df['Percentage Difference'].between(low_quantile, high_quantile)
df = df[within_quantiles]

In [67]:
# Preview the last rows of the prepared frame.
df.tail()

In [68]:
# Summary statistics of the prepared frame (pandas `describe` defaults).
df.describe()

In [69]:
# The ten rows with the largest percentage difference, largest first.
ranked_by_deviation = df.sort_values(by='Percentage Difference', ascending=False)
highest_dev = ranked_by_deviation.iloc[:10]
highest_dev

## Aggregated Relative Difference

In [70]:
# Mean percentage difference per year across all regions.
aggregated_data = df.groupby('Year')['Percentage Difference'].mean()

# Draw the yearly mean as a line chart using the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    aggregated_data.index,
    aggregated_data,
    marker='o',
    linestyle='-',
    color='red',
    label='Aggregated Percentage Difference',
)
ax.set_title(f'Agg. Percentage Difference (Budget y to Realized) - All Regions - Category {CATEGORY}')
ax.set_xlabel('Year')
ax.set_ylabel('Aggregated Percentage Difference')
# Dashed reference line at 0 % (budget equals realized value).
ax.axhline(0, color='grey', lw=0.8, ls='--')
ax.legend()
plt.show()


## Relative Difference per Region

In [75]:
# Per-region time series of the mean percentage difference: one figure per region,
# shown inline and saved as a PNG under `output_dir`.
# (seaborn / os / pathlib / matplotlib are imported in the notebook's import cell.)
output_dir = Path('time_series_additional_plots')
output_dir.mkdir(parents=True, exist_ok=True)

plt.style.use('default')

sns.set(style="whitegrid")  # Use a similar style to the other plots

# Skip missing regions: `== NaN` never matches, so a NaN entry would only produce
# an empty figure saved under a "..._nan_..." file name.
regions = df['Region'].dropna().unique()

for region in regions:
    # Mean percentage difference per year for this region.
    region_data = (
        df[df['Region'] == region]
        .groupby('Year')['Percentage Difference']
        .mean()
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(10, 4))
    sns.lineplot(
        data=region_data,
        x='Year',
        y='Percentage Difference',
        marker='o',
        linestyle='-',
        label=f'Percentage Difference - {region}',
        ax=ax,
    )
    ax.set_title(
        f'Percentage Difference (Budget y to Realized) - {region} - Category {CATEGORY}',
        fontsize=16,
        color='black',
    )
    ax.set_xlabel('Year', fontsize=12, color='black')
    ax.set_ylabel('Percentage Difference', fontsize=12, color='black')
    ax.axhline(0, color='grey', lw=0.8, ls='--')  # Reference line at 0% difference
    ax.legend(title='Region', fontsize=10, title_fontsize=11, loc='best')
    ax.tick_params(axis='y', labelsize=10, labelcolor='black')

    # Show one tick per year present in the data, formatted as integers.
    years = region_data['Year'].unique()
    ax.set_xticks(years)
    ax.set_xticklabels([int(year) for year in years], fontsize=10, color='black')

    fig.tight_layout()

    # Save before showing, then close the figure so figures do not accumulate
    # in memory across loop iterations.
    fig.savefig(output_dir / f"percentage_difference_{region}_{CATEGORY}.png", bbox_inches='tight')
    plt.show()
    plt.close(fig)
