# Datenanalyse und Exploration

---

Autor: mn086

---

## Setup

In [1]:
import os
import pandas as pd
#import numpy as np

## Daten-Import

**Pfade:**

In [2]:
# File name of the prepared (combined) data set
data = 'kfz_kombiniert.csv'

# Folder with the processed data, addressed relative to the parent of the working directory
root_processed = os.path.join('..', 'data', 'processed')

**Import in Dataframes**

In [3]:
# Load the prepared data set; the first CSV column becomes the row index
df_data = pd.read_csv(
    os.path.join(root_processed, data),
    index_col=0,
)

## Daten-Struktur

In [4]:
# Show the last three rows to inspect the column layout and typical values
df_data.tail(3)

Unnamed: 0,landkreis_id,landkreis,benzin_euro1,benzin_euro2,benzin_euro3,benzin_euro4,benzin_euro5,benzin_euro6,benzin_euro6d,benzin_euro6dt,...,euro3,euro4,euro5,euro6,euro6d,euro6dt,sonstigeemissionsgruppen,vee,anzahl_personen,unfaelle_je_10k_kfz
396,9778,"Unterallgäu, Landkreis",971,4362,3534,20334,14385,13035,30,2889,...,6861,27297,27984,22648,64,4779,2064,25966,,50.5
397,9779,"Donau-Ries, Landkreis",853,4334,3530,18599,12914,11386,43,2865,...,6993,25214,25475,20332,144,4954,1671,26021,,51.7
398,9780,"Oberallgäu, Landkreis",914,3252,2994,18687,15545,15712,74,4123,...,6441,24325,29009,26362,148,6344,2438,25954,,63.3


In [5]:
# Print column names, non-null counts and dtypes of the loaded frame
df_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399 entries, 0 to 398
Data columns (total 82 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   landkreis_id                               399 non-null    int64  
 1   landkreis                                  399 non-null    object 
 2   benzin_euro1                               399 non-null    int64  
 3   benzin_euro2                               399 non-null    int64  
 4   benzin_euro3                               399 non-null    int64  
 5   benzin_euro4                               399 non-null    int64  
 6   benzin_euro5                               399 non-null    int64  
 7   benzin_euro6                               399 non-null    int64  
 8   benzin_euro6d                              399 non-null    int64  
 9   benzin_euro6dt                             399 non-null    int64  
 10  benzin_sonstigeemissionsgruppen

## Variablenlisten

In [6]:
# Meta information identifying each district
list_meta = ['landkreis_id', 'landkreis']

# All numeric columns except the meta columns
list_num = [
    col for col in df_data.columns
    if col not in list_meta and pd.api.types.is_numeric_dtype(df_data[col])
]

# Aggregated columns per drive type
list_kfz_aggr_antriebe = ["benzin", "diesel", "elektro", "gas", "hybrid", "pih", "sonstigeantriebe"]

# Aggregated columns per emission group
list_kfz_aggr_eg = [
    "euro1", "euro2", "euro3", "euro4", "euro5", "euro6", "euro6dt", "euro6d",
    "sonstigeemissionsgruppen",
]

# All aggregated columns: drive types first, then emission groups
list_kfz_aggr = list_kfz_aggr_antriebe + list_kfz_aggr_eg

# Non-aggregated columns (combination of drive type and emission group).
# Totals and district-level indicators are excluded explicitly; this includes
# 'anzahl_kfz', which is a numeric column of df_data but not such a combination.
list_kfz_num = [
    col for col in list_num
    if col not in list_kfz_aggr
    and col not in ['gesamt', 'vee', 'anzahl_personen', 'anzahl_kfz', 'unfaelle_je_10k_kfz']
]

In [7]:
# Dataframe der aggregierten Antriebe
df_antriebe = df_data[['landkreis'] + list_kfz_aggr_antriebe]
# Normierten DataFrame in Prozent erstellen
df_antriebe_prozent = df_antriebe.copy()
df_antriebe_prozent[list_kfz_aggr_antriebe] = df_antriebe[list_kfz_aggr_antriebe].apply(lambda x: x / x.sum() * 100, axis=1)

# Dataframe der aggregierten Emissionsgruppen
df_eg = df_data[['landkreis'] + list_kfz_aggr_eg]
# Normierten DataFrame in Prozent erstellen
df_eg_prozent = df_eg.copy()
df_eg_prozent[list_kfz_aggr_eg] = df_eg[list_kfz_aggr_eg].apply(lambda x: x / x.sum() * 100, axis=1)

# Dataframe für die Korrelation erstellen
df_corr = pd.DataFrame(pd.concat([df_data[['anzahl_personen', 'vee', 'anzahl_kfz', 'unfaelle_je_10k_kfz']],
                                  df_antriebe_prozent[list_kfz_aggr_antriebe],
                                  df_eg_prozent[list_kfz_aggr_eg]], axis=1))

In [8]:
import altair as alt

def create_stacked_bar_chart(df, id_vars, var_name, value_name, x_axis_title, chart_title):
    """
    Build a stacked bar chart from a wide-format DataFrame.

    The frame is reshaped to long format with ``DataFrame.melt``: every column
    that is not listed in ``id_vars`` becomes one category whose values are
    stacked within the bar of the corresponding x-axis entry.

    Parameters:
    df (pd.DataFrame): The DataFrame holding the data
    id_vars (list or str): Identifier column(s) kept when melting. A single
        column name may be passed as a plain string. Only the first
        identifier is used for the x-axis and the tooltip.
    var_name (str): Name of the column that will hold the categories
    value_name (str): Name of the column that will hold the values
    x_axis_title (str): Title of the x-axis
    chart_title (str): Title of the chart

    Returns:
    alt.Chart: The stacked bar chart
    """

    # A bare string would otherwise be indexed character-wise below
    # (id_vars[0] -> first letter), so wrap it into a one-element list.
    if isinstance(id_vars, str):
        id_vars = [id_vars]
    x_field = id_vars[0]

    # Reshape to long format: one row per (identifier, category) pair
    df_melted = df.melt(
        id_vars=id_vars,
        var_name=var_name,
        value_name=value_name
    )

    # Build the stacked bar chart
    chart = alt.Chart(df_melted).mark_bar().encode(
        # Axis labels are hidden because of the large number of x-axis entries
        x=alt.X(f'{x_field}:N', axis=alt.Axis(labels=False, title=x_axis_title)),
        y=alt.Y(f'{value_name}:Q', stack='zero'),
        color=f'{var_name}:N',
        tooltip=[x_field, var_name, value_name]
    ).properties(
        width=800,
        height=400,
        title=chart_title
    )

    return chart

In [9]:
# Beispielaufruf für df_antriebe
chart_antriebe = create_stacked_bar_chart(
    df=df_antriebe,
    id_vars=['landkreis'],
    var_name='antriebsart',
    value_name='Anzahl',
    x_axis_title='Landkreis',
    chart_title='Fahrzeugbestand nach Antriebsart und Landkreis'
)
chart_antriebe

In [10]:
# Beispielaufruf für df_eg
chart_eg = create_stacked_bar_chart(
    df=df_eg,
    id_vars=['landkreis'],
    var_name='Emissiongruppe',
    value_name='Anzahl',
    x_axis_title='Landkreis',
    chart_title='Fahrzeugbestand nach Emissionsgruppe und Landkreis'
)
chart_eg

In [11]:
# Beispielaufruf für df_antriebe (normiert)
chart_antriebe_prozent = create_stacked_bar_chart(
    df=df_antriebe_prozent,
    id_vars=['landkreis'],
    var_name='antriebsart',
    value_name='Anteil',
    x_axis_title='Landkreis',
    chart_title='Fahrzeugbestand nach Antriebsart und Landkreis'
)
chart_antriebe_prozent

In [12]:
# Beispielaufruf für df_eg (normiert)
chart_eg_prozent = create_stacked_bar_chart(
    df=df_eg_prozent,
    id_vars=['landkreis'],
    var_name='Emissiongruppe',
    value_name='Anteil',
    x_axis_title='Landkreis',
    chart_title='Fahrzeugbestand nach Emissionsgruppe und Landkreis'
)
chart_eg_prozent

In [13]:
# Pairwise Pearson correlation between all columns of df_corr
corr_pearson = df_corr.corr(method='pearson')
# Shade the matrix on a fixed color scale spanning the full correlation range [-1, 1]
corr_pearson.style.background_gradient(cmap='Blues', vmin=-1, vmax=1)

Unnamed: 0,anzahl_personen,vee,anzahl_kfz,unfaelle_je_10k_kfz,benzin,diesel,elektro,gas,hybrid,pih,sonstigeantriebe,euro1,euro2,euro3,euro4,euro5,euro6,euro6dt,euro6d,sonstigeemissionsgruppen
anzahl_personen,1.0,0.121965,0.965517,0.35544,0.079715,-0.128231,0.042753,0.120597,0.431665,0.343089,-0.086953,0.205404,0.022354,-0.023924,-0.117147,-0.310338,0.185445,0.111739,0.081272,0.113525
vee,0.121965,1.0,0.167683,-0.211131,-0.413467,0.398062,0.2025,-0.296352,0.15771,0.468067,-0.362347,0.198057,-0.201852,-0.285683,-0.489984,-0.062319,0.284904,0.222108,0.166346,0.400239
anzahl_kfz,0.965517,0.167683,1.0,0.096966,0.116455,-0.165661,0.072621,0.160156,0.266505,0.346364,-0.071154,0.151893,0.006467,-0.05139,-0.143814,-0.361497,0.121017,0.207255,0.144651,0.179446
unfaelle_je_10k_kfz,0.35544,-0.211131,0.096966,1.0,0.103197,-0.149932,0.107309,0.047725,0.336793,0.303561,0.016363,0.17553,0.065859,0.102061,-0.080715,-0.376244,0.142324,0.11131,0.265352,0.107928
benzin,0.079715,-0.413467,0.116455,0.103197,1.0,-0.988023,-0.168074,0.085943,0.082539,-0.133376,0.42161,0.132928,0.148099,0.063919,0.177141,-0.237332,0.031851,-0.032992,0.05503,-0.13002
diesel,-0.128231,0.398062,-0.165661,-0.149932,-0.988023,1.0,0.082675,-0.15937,-0.189884,0.04291,-0.395599,-0.158931,-0.147705,-0.055829,-0.131443,0.329603,-0.042283,-0.03117,-0.092554,0.037009
elektro,0.042753,0.2025,0.072621,0.107309,-0.168074,0.082675,1.0,-0.100433,0.226828,0.377805,-0.065894,0.040182,-0.164512,-0.24224,-0.369774,-0.27425,0.105916,0.338259,0.160465,0.636694
gas,0.120597,-0.296352,0.160156,0.047725,0.085943,-0.15937,-0.100433,1.0,-0.093041,-0.113367,-0.140275,0.052879,0.322696,0.37622,0.360839,-0.180554,-0.278566,-0.127124,-0.10698,0.007509
hybrid,0.431665,0.15771,0.266505,0.336793,0.082539,-0.189884,0.226828,-0.093041,1.0,0.591446,0.005818,0.181579,-0.14243,-0.207199,-0.43439,-0.532474,0.234234,0.458878,0.310443,0.429607
pih,0.343089,0.468067,0.346364,0.303561,-0.133376,0.04291,0.377805,-0.113367,0.591446,1.0,-0.251453,0.204872,-0.207372,-0.350217,-0.636757,-0.637485,0.395184,0.582669,0.458266,0.533935
