# Modellaufbau und Training

---

Autor: mn086

---

## Setup

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Daten-Import

**Pfade:**

In [2]:
# Directory holding the processed data sets, relative to the notebook's working directory.
root_processed = os.path.join(os.pardir, 'data', 'processed')

**Import in DataFrames**

In [3]:
# Load the prepared regression data set from the processed-data directory.
regression_csv = os.path.join(root_processed, 'regression_data.csv')
df_regr = pd.read_csv(regression_csv)

## Daten-Struktur

In [4]:
# Show the last three rows as a quick sanity check of the loaded frame.
df_regr.tail(n=3)

Unnamed: 0,anzahl_personen,vee,anzahl_kfz_je_person,unfaelle_je_10k_kfz,elektro,pih,euro2,euro3,euro4,euro6,euro6dt
396,,25966,,50.5,0.246801,0.123908,5.708917,6.968312,27.723949,23.002234,4.853748
397,,26021,,51.7,0.279993,0.168433,6.172961,7.648391,27.57708,22.237534,5.418294
398,,25954,,63.3,0.344202,0.192435,4.607541,6.389057,24.128833,26.149405,6.292839


In [5]:
# Print column names, non-null counts and dtypes to spot missing values.
df_regr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399 entries, 0 to 398
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   anzahl_personen       216 non-null    float64
 1   vee                   399 non-null    int64  
 2   anzahl_kfz_je_person  216 non-null    float64
 3   unfaelle_je_10k_kfz   399 non-null    float64
 4   elektro               399 non-null    float64
 5   pih                   399 non-null    float64
 6   euro2                 399 non-null    float64
 7   euro3                 399 non-null    float64
 8   euro4                 399 non-null    float64
 9   euro6                 399 non-null    float64
 10  euro6dt               399 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 34.4 KB


## Variablenlisten

In [6]:
# Target column to be predicted by the regression.
y_label = "euro4"

# Predictor columns fed into the model.
features = [
    "vee",
    "elektro",
    "pih",
    "euro2",
    "euro3",
    "euro6",
]

# Design matrix (DataFrame) and target vector (Series).
X = df_regr.loc[:, features]
y = df_regr.loc[:, y_label]

## Daten aufteilen, Train-Test-Split

In [7]:
# Hold out 20% of the rows as test data; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Modell

### Auswahl des Modells

In [8]:
# Linear regression estimator created with the library's default settings.
regr = LinearRegression()

### Modell mit den Daten trainieren

In [9]:
# Fit the model on the training split only; the test split stays unseen.
regr.fit(X=X_train, y=y_train)

In [10]:
# Coefficient of determination (R-squared) on the training and the test split.
r2_train = regr.score(X=X_train, y=y_train)
r2_test = regr.score(X=X_test, y=y_test)

# Report both values side by side to compare fit and generalisation.
print(f'R² Training: {r2_train:.4f}')
print(f'R² Test: {r2_test:.4f}')

R² Training: 0.8118
R² Test: 0.7764


In [11]:
# Adjusted coefficient of determination for the training and the test split.
def adjusted_r2(r2, n_obs, n_features):
    """Return the adjusted R² for a given R², sample size and feature count.

    Computes ``1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n_obs : int
        Number of observations the R² was computed on.
    n_features : int
        Number of predictor columns (the intercept is covered by the extra -1).

    Raises
    ------
    ValueError
        If ``n_obs - n_features - 1`` is not positive, i.e. there are too few
        observations for the correction to be defined.
    """
    dof = n_obs - n_features - 1
    if dof <= 0:
        raise ValueError(
            f"adjusted R² is undefined for n_obs={n_obs} and n_features={n_features}"
        )
    return 1 - (1 - r2) * (n_obs - 1) / dof


# Number of observations in each split
n_train = X_train.shape[0]
n_test = X_test.shape[0]

# Number of features
p = X_train.shape[1]

# Same correction applied to both splits through the shared helper
adj_r2_train = adjusted_r2(r2_train, n_train, p)
adj_r2_test = adjusted_r2(r2_test, n_test, p)

print(f'Angepasstes R² Training: {adj_r2_train:.4f}')
print(f'Angepasstes R² Test: {adj_r2_test:.4f}')

Angepasstes R² Training: 0.8081
Angepasstes R² Test: 0.7580
