In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Load the preprocessed data set and plot every column for canton ZH against the year.
output_directory = "../data/final"
file_path = os.path.join(output_directory, "merged_complete_preprocessed.csv")
df = pd.read_csv(file_path)

# Build the ZH frame without in-place mutation: `df[...]` returns a filtered
# slice, and calling set_index/drop with inplace=True on such a slice triggers
# SettingWithCopyWarning. The non-inplace calls return new frames instead.
df_zh = (
    df[df['Region'] == 'ZH']
    .set_index('Year')
    .drop(columns='Region')
)

# One figure per remaining column, with the 'Year' index on the x axis.
for column in df_zh.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_zh.index, df_zh[column], marker='o', linestyle='-', label=column)
    # NOTE(review): the year range in the title is hard-coded and is not derived
    # from the data; confirm it still matches the years present in the file.
    ax.set_title(f'{column} over Time for Canton ZH (2013-2021)')
    ax.set_xlabel('Year')
    ax.set_ylabel(column)
    ax.grid(True)
    ax.legend()
    plt.show()
    # Release the figure so that looping over many columns does not keep every
    # figure alive in memory.
    plt.close(fig)


In [2]:
# Quick sanity check: which regions and years does the double-digit file contain?
output_directory = "../data/final"
file_path = os.path.join(output_directory, "merged_double_digit.csv")
df = pd.read_csv(file_path)

# Print the distinct values of each key column, Region first, then Year
for key_column in ('Region', 'Year'):
    print(df[key_column].unique())

In [3]:
import pandas as pd
import os

# Locations of the two inputs and of the merged output
output_directory_final = "../data/final"
output_directory_merged = "../data/merged_canton"
file_path_double_digit = os.path.join(output_directory_final, "merged_double_digit.csv")
file_path_kantonsdaten_merged = os.path.join(output_directory_merged, "Kantonsdaten_2013_to_2021_merged.csv")

df_double_digit = pd.read_csv(file_path_double_digit)
df_kantonsdaten = pd.read_csv(file_path_kantonsdaten_merged)

# Outer-join on (Region, Year) == (Canton, Year), discard rows whose Region is
# the literal value 0, and remove the 'Canton' key column carried over from the
# right-hand side.
# (A blanket fillna(0) on the merged frame was tried here earlier and is disabled.)
merged_df = (
    df_double_digit
    .merge(
        df_kantonsdaten,
        how='outer',
        left_on=['Region', 'Year'],
        right_on=['Canton', 'Year'],
    )
    .loc[lambda frame: frame['Region'] != 0]
    .drop(columns='Canton')
)

# Persist the merged table without the pandas index
merged_output_path = os.path.join(output_directory_final, "merged_complete.csv")
merged_df.to_csv(merged_output_path, index=False)


In [4]:
# Print a concise summary of merged_df (columns, non-null counts, dtypes).
# merged_df is not created in this cell; it must already exist in the kernel.
merged_df.info()

In [5]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer
from scipy import stats

# Read the merged data set
df = pd.read_csv('../data/final/merged_complete.csv')

# Keep a row as soon as at least one of the four budget figures differs from 0,
# i.e. discard the rows in which Realized, Budget y, Budget y+1 and Slack are all 0.
budget_columns = ['Realized', 'Budget y', 'Budget y+1', 'Slack']
has_non_zero_figure = (df[budget_columns] != 0).any(axis=1)
df = df[has_non_zero_figure]

# --- Disabled preprocessing pipeline (kept for reference, not executed) ---
# # Exclude Year from numeric columns for preprocessing
# exclude_columns = ['Year', 'Acc-ID', 'Realized', 'Budget y', 'Budget y+1', 'Slack']
#
# numeric_columns = [col for col in df.select_dtypes(include=[np.number]).columns if col not in exclude_columns]
#
#
# # Apply median imputation to the selected numeric columns
# imputer = SimpleImputer(strategy='median')
# df[numeric_columns] = imputer.fit_transform(df[numeric_columns])
#
# # Detect outliers based on the selected numeric columns (don't remove yet)
# z_scores = np.abs(stats.zscore(df[numeric_columns]))
# outliers = (z_scores < 3).all(axis=1)
#
# # Keep a copy of the DataFrame with outliers removed
# df_no_outliers = df[outliers].copy()
#
# # Apply scaling
# scaler = StandardScaler()
# df_no_outliers[numeric_columns] = scaler.fit_transform(df_no_outliers[numeric_columns])
#
# # Apply transformation to make data more Gaussian-like
# transformer = PowerTransformer()
# df_no_outliers[numeric_columns] = transformer.fit_transform(df_no_outliers[numeric_columns])

# Write the filtered frame (numeric and categorical columns alike) back to disk
df.to_csv('../data/final/merged_complete_preprocessed.csv', index=False)


In [None]:
import pandas as pd

# Fill the missing values of the edge years (2011, 2012, 2022) from the
# neighbouring years of the same region, then save the result.
df = pd.read_csv('../data/final/merged_complete.csv')

# Sort so that, within each region, rows run in chronological order; the
# forward/backward fill below relies on this ordering.
df = df.sort_values(by=['Region', 'Year'])

# Years whose gaps are to be filled
years_of_interest = [2011, 2012, 2022]

# Identifier and budget columns are never filled
columns_not_to_interpolate = ['Year', 'Region', 'Acc-ID', 'Realized', 'Budget y', 'Budget y+1', 'Slack']
fill_columns = [col for col in df.columns if col not in columns_not_to_interpolate]

# Forward fill followed by backward fill, computed over ALL rows of a region
# and then written back only to the rows of the target years.
# The previous version restricted the fill to the rows of a single
# (region, year) slice, so it could never pick up a value from another year,
# and it also filled the columns listed in columns_not_to_interpolate.
# groupby skips rows whose Region is NaN, so those rows are left untouched.
for _, region_rows in df.groupby('Region'):
    target_index = region_rows.index[region_rows['Year'].isin(years_of_interest)]
    if len(target_index) == 0:
        continue
    filled = region_rows[fill_columns].ffill().bfill()
    df.loc[target_index, fill_columns] = filled.loc[target_index]

# Check if any missing values remain for the specified years
missing_values_check = df[df['Year'].isin(years_of_interest)].isnull().sum()
print(missing_values_check)

df.to_csv('../data/final/merged_complete_preprocessed.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Impute missing feature values per region with a linear trend over the years.
df = pd.read_csv('../data/final/merged_complete.csv')
df = df.sort_values(by=['Region', 'Year'])

# Every column except the identifiers and the budget figures is a candidate for filling
columns_to_fill = [col for col in df.columns if col not in ['Year', 'Region', 'Acc-ID', 'Realized', 'Budget y', 'Budget y+1', 'Slack']]

# Treat 0 as "missing". The result is assigned back to the frame:
# `df[column].replace(..., inplace=True)` operates on a temporary Series and
# does not update `df` when pandas Copy-on-Write is active.
df[columns_to_fill] = df[columns_to_fill].replace(0, np.nan)

for region in df['Region'].unique():
    region_df = df[df['Region'] == region]

    for column in columns_to_fill:
        not_null_region_df = region_df.dropna(subset=[column])
        # A trend line needs at least two observed points
        if len(not_null_region_df) <= 1:
            continue

        missing_df = region_df[region_df[column].isnull()]
        # Nothing to impute for this column: skip the (otherwise unused) fit
        if missing_df.empty:
            continue

        # Fit value ~ Year on the observed rows and predict the missing ones
        model = LinearRegression()
        model.fit(not_null_region_df[['Year']], not_null_region_df[column])
        df.loc[missing_df.index, column] = model.predict(missing_df[['Year']])

# Rows without a region cannot be attributed to any canton
df = df.dropna(subset=['Region'])

# A column is dropped as soon as it is completely empty for at least one region
columns_to_drop = set()
for region in df['Region'].unique():
    region_df = df[df['Region'] == region]
    for column in columns_to_fill:
        if region_df[column].isnull().all():
            columns_to_drop.add(column)

df = df.drop(columns=columns_to_drop)

# Keep the list of fill columns in sync with the columns that still exist
columns_to_fill = [col for col in columns_to_fill if col not in columns_to_drop]

print(f"Dropped columns: {columns_to_drop}")

output_path = '../data/final/merged_complete_preprocessed.csv'
df.to_csv(output_path, index=False)

print(f"DataFrame with filled values for multiple columns, excluding dropped columns, saved to {output_path}.")


In [None]:
import pandas as pd
import numpy as np

# Read the merged data and order it by region, then year
df = pd.read_csv('../data/final/merged_complete.csv')
df = df.sort_values(by=['Region', 'Year'])

# Everything except the identifiers and the budget figures gets filled
protected_columns = ['Year', 'Region', 'Acc-ID', 'Realized', 'Budget y', 'Budget y+1', 'Slack']
columns_to_fill = [col for col in df.columns if col not in protected_columns]

# Zeros first, then +inf / -inf, are turned into NaN in the fill columns
df[columns_to_fill] = (
    df[columns_to_fill]
    .replace(0, np.nan)
    .replace([np.inf, -np.inf], np.nan)
)

# Region by region: forward fill, backward fill, then linear interpolation.
# NOTE: once ffill and bfill have run, a column that had at least one value in
# the region has no NaN left, so the interpolation step cannot change anything.
for region in df['Region'].unique():
    in_region = df['Region'] == region
    region_values = (
        df.loc[in_region, columns_to_fill]
        .ffill()
        .bfill()
        .interpolate(method='linear', limit_direction='both')
    )
    # Write the filled block back positionally
    df.loc[in_region, columns_to_fill] = region_values.values

# Rows without a region are discarded
df = df.dropna(subset=['Region'])

# Collect every column that is entirely NaN for at least one region
columns_to_drop = {
    column
    for region in df['Region'].unique()
    for column in columns_to_fill
    if df.loc[df['Region'] == region, column].isnull().all()
}

# Remove those columns and keep the fill list in sync
df = df.drop(columns=columns_to_drop)
columns_to_fill = [col for col in columns_to_fill if col not in columns_to_drop]

print(f"Dropped columns: {columns_to_drop}")

# Write the result to disk
output_path = '../data/final/merged_complete_preprocessed.csv'
df.to_csv(output_path, index=False)

print(f"DataFrame with filled values for multiple columns, excluding dropped columns, saved to {output_path}.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor  # Import Random Forest Regressor

# Impute missing feature values per region with a random forest fitted on Year.
df = pd.read_csv('../data/final/merged_complete.csv')
df = df.sort_values(by=['Region', 'Year'])

# Every column except the identifiers and the budget figures is a candidate for filling
columns_to_fill = [col for col in df.columns if col not in ['Year', 'Region', 'Acc-ID', 'Realized', 'Budget y', 'Budget y+1', 'Slack']]

# Treat 0 as "missing". The result is assigned back to the frame:
# `df[column].replace(..., inplace=True)` operates on a temporary Series and
# does not update `df` when pandas Copy-on-Write is active.
df[columns_to_fill] = df[columns_to_fill].replace(0, np.nan)

for region in df['Region'].unique():
    print(f"Processing region: {region}")
    region_df = df[df['Region'] == region]

    for column in columns_to_fill:
        not_null_region_df = region_df.dropna(subset=[column])

        # At least two observed points are required before a model is fitted
        if len(not_null_region_df) <= 1:
            print(f"Not enough data to model column '{column}' in region '{region}'.")
            continue

        missing_df = region_df[region_df[column].isnull()]
        # Nothing to impute: report it and skip fitting a forest that would be discarded
        if missing_df.empty:
            print(f"No missing values to update for column '{column}' in region '{region}'.")
            continue

        # Fixed random_state keeps the imputed values reproducible between runs
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(not_null_region_df[['Year']], not_null_region_df[column])

        predicted_values = model.predict(missing_df[['Year']])
        df.loc[missing_df.index, column] = predicted_values
        print(f"Updated column '{column}' for region '{region}' with {len(predicted_values)} predicted values.")

# Rows without a region cannot be attributed to any canton
df = df.dropna(subset=['Region'])

# A column is dropped as soon as it is completely empty for at least one region
columns_to_drop = set()
for region in df['Region'].unique():
    region_df = df[df['Region'] == region]
    for column in columns_to_fill:
        if region_df[column].isnull().all():
            columns_to_drop.add(column)

df = df.drop(columns=columns_to_drop)

# Keep the list of fill columns in sync with the columns that still exist
columns_to_fill = [col for col in columns_to_fill if col not in columns_to_drop]

print(f"Dropped columns: {columns_to_drop}")

# Save the modified DataFrame
output_path = '../data/final/merged_complete_preprocessed.csv'
df.to_csv(output_path, index=False)

print(f"DataFrame with filled values for multiple columns, excluding dropped columns, saved to {output_path}.")


In [None]:
import pandas as pd
# List every feature whose importance is below 0.01, formatted as quoted,
# comma-terminated lines that can be pasted into a Python list.
df = pd.read_csv('../feature_importances/feature_importances.csv')
is_low_importance = df['Importance'] < 0.01
df = df.loc[is_low_importance, 'Feature']
for feature in df:
    print(''.join(('"', feature, '",')))

In [None]:
import pandas as pd

# Read the double-digit data, order it by region and year, and discard rows
# that lack either key.
df = (
    pd.read_csv('../data/final/merged_double_digit.csv')
    .sort_values(by=['Region', 'Year'])
    .dropna(subset=['Region', 'Year'])
)

# Year window: the `yearGeneration` years that end just before `yearMax`
# (range() excludes its upper bound).
yearMax = 2022
yearGeneration = 11
years = range(yearMax - yearGeneration, yearMax)
yearMin = min(years)

# Distinct (Region, Acc-ID) pairs found in the data
unique_regions_acc_ids = df[['Region', 'Acc-ID']].drop_duplicates()
region_values = unique_regions_acc_ids['Region'].unique()
acc_id_values = unique_regions_acc_ids['Acc-ID'].unique()

# Full grid: every region x every account id x every year of the window
all_combinations = pd.MultiIndex.from_product(
    [region_values, acc_id_values, years],
    names=['Region', 'Acc-ID', 'Year'],
)
full_df = pd.DataFrame(index=all_combinations).reset_index()

# Attach the observed figures; grid cells without data stay NaN
full_df = full_df.merge(df, how='left', on=['Region', 'Acc-ID', 'Year'])

# Write the grid to disk
output_path = '../data/final/checking.csv'
full_df.to_csv(output_path, index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Impute missing figures per (Region, Acc-ID) series with a polynomial trend
# over the years, then recompute Slack and save the result.
df = pd.read_csv('../data/final/checking.csv')
df = df.sort_values(by=['Region', 'Year', 'Acc-ID'])

# Every column except the identifiers and Slack is a candidate for filling
columns_to_fill = [col for col in df.columns if col not in ['Year', 'Region', 'Acc-ID', 'Slack']]

# Treat 0 as "missing". The result is assigned back to the frame:
# `df[column].replace(..., inplace=True)` operates on a temporary Series and
# does not update `df` when pandas Copy-on-Write is active.
df[columns_to_fill] = df[columns_to_fill].replace(0, np.nan)

# Degree of the fitted polynomial; adjust to the data if needed.
# (An earlier RandomForestRegressor variant of this loop was replaced by the
# polynomial fit below.)
degree = 2

for region in df['Region'].unique():
    for acc_id in df[df['Region'] == region]['Acc-ID'].unique():
        sub_df = df[(df['Region'] == region) & (df['Acc-ID'] == acc_id)]

        for column in columns_to_fill:
            not_null_sub_df = sub_df.dropna(subset=[column])

            # At least two observed points are required before a model is fitted
            if len(not_null_sub_df) <= 1:
                print(f"Not enough data to model column '{column}' in region '{region}', Acc-ID '{acc_id}'.")
                continue

            missing_df = sub_df[sub_df[column].isnull()]
            # Nothing to impute: skip the fit, its result would be discarded
            if missing_df.empty:
                continue

            X = not_null_sub_df[['Year']].values
            y = not_null_sub_df[column].values

            # Pipeline: polynomial expansion of Year followed by least squares
            model = make_pipeline(PolynomialFeatures(degree), LinearRegression())
            model.fit(X, y)

            predicted_values = model.predict(missing_df[['Year']].values)
            df.loc[missing_df.index, column] = predicted_values
            print(f"Updated column '{column}' for region '{region}', Acc-ID '{acc_id}' with {len(predicted_values)} predicted values.")

# Rows without a region cannot be attributed to any canton
df = df.dropna(subset=['Region'])

# A column is dropped as soon as it is completely empty for at least one region
columns_to_drop = set()
for region in df['Region'].unique():
    region_df = df[df['Region'] == region]
    for column in columns_to_fill:
        if region_df[column].isnull().all():
            columns_to_drop.add(column)

df = df.drop(columns=columns_to_drop)

# Keep the list of fill columns in sync with the columns that still exist
columns_to_fill = [col for col in columns_to_fill if col not in columns_to_drop]

print(f"Dropped columns: {columns_to_drop}")

# Save the imputed DataFrame (Slack not yet recomputed)
output_path = '../data/final/checking2.csv'
df.to_csv(output_path, index=False)

print(f"DataFrame with filled values for multiple columns, excluding dropped columns, saved to {output_path}.")

# Recompute Slack as Budget y - Realized. The old column is dropped first, so
# the recomputed Slack ends up as the last column.
df = df.drop('Slack', axis=1)
df['Slack'] = df['Budget y'] - df['Realized']

# Keep only the rows that are complete after imputation
df = df.dropna()

# Save the final output.
# NOTE(review): yearMin and yearMax are not defined in this cell; they must
# already exist in the kernel (the cell that builds checking.csv sets them).
# NOTE(review): the prefix already ends in '.csv', so the file written is named
# 'checking.csv<yearMin>to<yearMax>.csv'. Left unchanged because other code may
# read that exact name -- confirm and rename if unintended.
output_path = '../data/final/checking.csv'

df.to_csv(output_path + str(yearMin) + 'to' + str(yearMax) + '.csv', index=False)

In [None]:
# Write one CSV per start year, each containing the rows from that year onward.
# `df` is not created in this cell; it must already exist in the kernel.
max_year = df['Year'].max()

# Start years: max_year - 10, max_year - 15, ..., max_year - 100 (steps of 5)
years = [max_year - i for i in range(10, 101, 5)]

# Common path prefix of all subset files (loop-invariant, so defined once)
output_path = '../data/final/subsets'

for year in years:
    # Keep the rows whose Year is at or after the start year
    subset = df[df['Year'] >= year]
    subset_path = output_path + str(year) + 'to' + str(max_year) + '.csv'
    subset.to_csv(subset_path, index=False)
    # Report the file that was actually written, not just the shared prefix
    print(f"Saved subset for year {year} to {subset_path}")


NEW APPROACH


In [7]:
!pip install scikit-learn

In [19]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
import os

# Extrapolate missing 2022 canton figures with a per-canton linear trend,
# normalise canton names and column labels, and save the result as Excel.
output_directory_merged = "../data/merged_canton"
file_path_kantonsdaten_merged = os.path.join(output_directory_merged, "Kantonsdaten_merged_full.xlsx")

df = pd.read_excel(file_path_kantonsdaten_merged)

# Columns whose missing 2022 value is extrapolated
columns_to_fill = [
    'Transferzahlungen von Kantonen an Bund',
    'Transferzahlungen von Kantonen an andere Kantone (und Konkordate)',
    'Transferzahlungen von Kantonen an Gemeinden (und Gemeindezweckverbände)',
    'Transferzahlungen von Gemeinden an Kantone (und Konkordate)',
    'Öffentliche Bildugnsausgaben',
    'Kulturausgaben Kanton',
    'Kulturausgaben Gemeinden',
    'Kulturausgaben Kantone und Gemeinde',
    'BWS Landwirtschaft, Forstwirtschaft und Fischerei',
    'BWS Bergbau und Gewinnung von Steinen und Erden, Herstellung von Waren, Bau',
    'BWS Energieversorgung, Wasserversorgung, Sammlung, Behandlung und Beseitigung von Abfällen, Erziehung und Unterricht, Gesundheitswesen',
    'BWS Handel und Reparatur von Fahrzeugen, Transport, Informationsdienstleistungen und Telekommunikation, Beherbergung und Gastronomie',
    'BWS Erbringung von Finanzdienstleistungen und Versicherungen',
    'BWS Grundstücks- und Wohnungswesen, sonstige freiberufliche, wissenschaftliche und technische Tätigkeiten, wirtschaftlichen Dienstleistungen, Kunst, Unterhaltung und Erholung, sonstige Dienstleistungen',
    'BWS Öffentliche Verwaltung',
    'BWS Private Haushalte als Hersteller',
    'BIP'
]

kantons = df['Kanton'].unique()

for k in kantons:
    df_kanton = df[df['Kanton'] == k]

    # The year vector and the two year masks depend only on the canton, so they
    # are computed once per canton instead of once per column.
    years = df_kanton['Jahr'].values.reshape(-1, 1)
    is_training_year = years.flatten() <= 2021
    is_target_year = years.flatten() == 2022

    for col in columns_to_fill:
        values = df_kanton[col].values
        is_missing = np.isnan(values)

        # Train on the observed values up to and including 2021
        train_mask = is_training_year & ~is_missing
        x_train = years[train_mask].reshape(-1, 1)
        y_train = values[train_mask]

        # Fit only when a trend can be estimated (two or more points) and the
        # canton actually has a missing 2022 value for this column.
        if len(x_train) > 1 and np.any(is_target_year & is_missing):
            model = LinearRegression().fit(x_train, y_train)
            df.loc[(df['Kanton'] == k) & (df['Jahr'] == 2022), col] = model.predict(np.array([[2022]]))[0]

# Replace the canton names by their two-letter abbreviations
canton_abbreviations = {
    'Waadt': 'VD', 'Wallis': 'VS', 'Genf': 'GE', 'Bern': 'BE', 'Freiburg': 'FR',
    'Solothurn': 'SO', 'Neuenburg': 'NE', 'Jura': 'JU', 'Basel-Stadt': 'BS',
    'Basel-Landschaft': 'BL', 'Aargau': 'AG', 'Zürich': 'ZH', 'Glarus': 'GL',
    'Schaffhausen': 'SH', 'Appenzell A. Rh.': 'AR', 'Appenzell I. Rh.': 'AI',
    'St. Gallen': 'SG', 'Graubünden': 'GR', 'Thurgau': 'TG', 'Luzern': 'LU',
    'Uri': 'UR', 'Schwyz': 'SZ', 'Obwalden': 'OW', 'Nidwalden': 'NW',
    'Zug': 'ZG', 'Tessin': 'TI',
}
df['Kanton'] = df['Kanton'].replace(canton_abbreviations)

# Shorten / disambiguate column labels.
# The mapping used to list the key 'Wanderungssaldo.1' twice; in a dict literal
# the later entry wins, so the first one ('Wanderungssaldo.1' -> 'Wanderungssaldo')
# never took effect and has been removed. The effective renaming is unchanged.
df.rename(columns={
    'Wanderungssaldo': 'Wanderungssaldo Ein-und Auswanderung',
    'Wanderungssaldo.1': 'Wanderungssaldo Zu-und Wegzüge',
    'Betreibungshandlungen  Pfändungsvollzüge': 'Betreibungshandlungen Pfändungsvollzüge',
    'Transferzahlungen von Kantonen an andere Kantone (und Konkordate)': 'Transferzahlungen von Kantonen an andere Kantone',
    'Transferzahlungen von Kantonen an Gemeinden (und Gemeindezweckverbände)': 'Transferzahlungen von Kantonen an Gemeinden',
    'Transferzahlungen von Gemeinden an Kantone (und Konkordate)': 'Transferzahlungen von Gemeinden an Kantone',
    'BWS Bergbau und Gewinnung von Steinen und Erden, Herstellung von Waren, Bau': 'BWS Bergbau und Gewinnung von Steinen und Erden',
    'BWS Energieversorgung, Wasserversorgung, Sammlung, Behandlung und Beseitigung von Abfällen, Erziehung und Unterricht, Gesundheitswesen': 'BWS Versorgung, Sammlung, Entsorgung',
    'BWS Handel und Reparatur von Fahrzeugen, Transport, Informationsdienstleistungen und Telekommunikation, Beherbergung und Gastronomie': 'BWS Transport, IT-Dienstleistung',
    'BWS Erbringung von Finanzdienstleistungen und Versicherungen': 'BWS Finanzdienstleistungen und Versicherungen',
    'BWS Grundstücks- und Wohnungswesen, sonstige freiberufliche, wissenschaftliche und technische Tätigkeiten, wirtschaftlichen Dienstleistungen, Kunst, Unterhaltung und Erholung, sonstige Dienstleistungen': 'BWS Wissenschaft und Kunst sowie sonstige Dienstleistungen',
}, inplace=True)

# Force this column to numeric; values that cannot be parsed become NaN
df['Betreibungshandlungen Verwertungen'] = pd.to_numeric(df['Betreibungshandlungen Verwertungen'], errors='coerce')

output_path = os.path.join(output_directory_merged, "Kantonsdaten_merged_filled.xlsx")
df.to_excel(output_path, index=False)

In [20]:
# Input: the double-digit budget data. The canton data is taken from `df`,
# which is not created in this cell and must already exist in the kernel.
output_directory_final = "../data/final"
file_path_double_digit = os.path.join(output_directory_final, "merged_double_digit.csv")

df_double_digit = pd.read_csv(file_path_double_digit)

# Outer-join on (Region, Year) == (Kanton, Jahr), discard rows whose Region is
# the literal value 0, remove the two right-hand key columns, and finally drop
# every row that has no 'Year'.
merged_df = (
    df_double_digit
    .merge(
        df,
        how='outer',
        left_on=['Region', 'Year'],
        right_on=['Kanton', 'Jahr'],
    )
    .loc[lambda frame: frame['Region'] != 0]
    .drop(columns=['Kanton', 'Jahr'])
    .dropna(subset=['Year'])
)

# Persist the merged table without the pandas index
merged_output_path = os.path.join(output_directory_final, "merged_complete.csv")
merged_df.to_csv(merged_output_path, index=False)


In [21]:
# Print a concise summary of merged_df (columns, non-null counts, dtypes).
# merged_df is not created in this cell; it must already exist in the kernel.
merged_df.info()
