In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.preprocessing import StandardScaler

In [2]:
#Load the dataset
df = pd.read_csv(r"C:\Users\manah\Downloads\monatszahlen2505_verkehrsunfaelle_06_06_25.csv")
df.head()

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT,VORJAHRESWERT,VERAEND_VORMONAT_PROZENT,VERAEND_VORJAHRESMONAT_PROZENT,ZWOELF_MONATE_MITTELWERT
0,Alkoholunfälle,insgesamt,2025,202501,,,,,
1,Alkoholunfälle,insgesamt,2025,202502,,,,,
2,Alkoholunfälle,insgesamt,2025,202503,,,,,
3,Alkoholunfälle,insgesamt,2025,202504,,,,,
4,Alkoholunfälle,insgesamt,2025,202505,,,,,


In [3]:
# Preprocessing the dataset 

df.dropna(axis=1, how='all', inplace=True)
print(df.columns)
df = df[['MONATSZAHL', 'AUSPRAEGUNG', 'JAHR', 'MONAT', 'WERT']]

#Filtering dataset for enteries above 2020, category as alkoholunfalle and type as insgesamt

df_cleaned = df[(df['JAHR']<=2020) & (df['MONATSZAHL']=='Alkoholunfälle') & (df['AUSPRAEGUNG']=='insgesamt')]

# Ensure MONAT is string type

df_cleaned['MONAT'] = df['MONAT'].astype(str)

# Remove rows where MONAT is 'Summe'
# map it as month number such as 01, 02, 03,...
df_cleaned = df_cleaned[df_cleaned['MONAT'].str.lower() != 'summe']

df_cleaned['MONAT'] = df_cleaned['MONAT'].astype(str).str[-2:]

print(df_cleaned)


Index(['MONATSZAHL', 'AUSPRAEGUNG', 'JAHR', 'MONAT', 'WERT', 'VORJAHRESWERT',
       'VERAEND_VORMONAT_PROZENT', 'VERAEND_VORJAHRESMONAT_PROZENT',
       'ZWOELF_MONATE_MITTELWERT'],
      dtype='object')
         MONATSZAHL AUSPRAEGUNG  JAHR MONAT  WERT
64   Alkoholunfälle   insgesamt  2020    01  28.0
65   Alkoholunfälle   insgesamt  2020    02  40.0
66   Alkoholunfälle   insgesamt  2020    03  27.0
67   Alkoholunfälle   insgesamt  2020    04  26.0
68   Alkoholunfälle   insgesamt  2020    05  40.0
..              ...         ...   ...   ...   ...
330  Alkoholunfälle   insgesamt  2000    08  77.0
331  Alkoholunfälle   insgesamt  2000    09  84.0
332  Alkoholunfälle   insgesamt  2000    10  83.0
333  Alkoholunfälle   insgesamt  2000    11  71.0
334  Alkoholunfälle   insgesamt  2000    12  85.0

[252 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['MONAT'] = df['MONAT'].astype(str)


In [4]:
#categorical features for one-hot encoding
encoder = OneHotEncoder(sparse_output=False)

cat_features = encoder.fit_transform(df_cleaned[['MONATSZAHL', 'AUSPRAEGUNG']])


# Numeric features as floats
numeric_features = df_cleaned[['JAHR', 'MONAT']].values.astype(float)



In [5]:
#Extracting Inputs and Outputs for the training Process
X = np.hstack([cat_features, numeric_features])
Y = df_cleaned[['WERT']].values

In [6]:
# Split dataset into train and test (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=42, test_size=0.2
)
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
rf = RandomForestRegressor(n_estimators=700, random_state=42)
rf.fit(X_train_scaled, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Test MSE : {mse:.2f}")

  return fit_method(estimator, *args, **kwargs)


Test MSE : 66.33


In [7]:
# Prepare input for prediction:
# Category: 'Alkoholunfälle'
# Type: 'insgesamt'
# Year: 2021
# Month: 1

# Encode categorical features for prediction input
cat_input = encoder.transform(
    [["Alkoholunfälle", "insgesamt"]]
)  # shape (1, cat_feature_count)

# Numeric features for Jan 2021
num_input = np.array([[2021, 1]])  # Year=2021, Month=1

# Combine features
X_pred = np.hstack([cat_input, num_input])

# Predict
prediction = rf.predict(X_pred)

print(f"Predicted number of accidents for January 2021: {prediction[0]:.2f}")

Predicted number of accidents for January 2021: 29.14


