In [None]:
#Import relevant packages
import pandas as pd
import matplotlib.pyplot as plts
import plotly.express as px

# Load both source files; they are semicolon-separated CSVs.
# The first path feeds df_pres (transport performance), the second df_mob.
csv_paths = ('data/transport_performance.csv', 'data/mobiliteit.csv')
df_pres, df_mob = (pd.read_csv(path, sep=';') for path in csv_paths)

# Preview the first rows of the transport performance table
df_pres.head()

Unnamed: 0,Modes of travel,Population,Margins,Region characteristics,Periods,Transport performance (billion passenger kilometres )
0,Total,Population 6 years or older,Value,The Netherlands,2018,218.1
1,Total,Population 6 years or older,Value,The Netherlands,2019,218.8
2,Total,Population 6 years or older,Value,The Netherlands,2020,152.0
3,Total,Population 6 years or older,Value,The Netherlands,2021,168.4
4,Total,Population 6 years or older,Value,The Netherlands,2022,186.9


In [2]:
# Initialise the transport performance data.
#
# This cell is written so it can be re-run safely on an already-prepared frame:
#   * the frame is rebuilt by reassignment instead of inplace edits,
#   * rename() leaves the frame untouched when the raw column is already renamed,
#   * drop(errors='ignore') skips columns that were already removed.
raw_value_col = 'Transport performance (billion passenger kilometres )'

df_pres = (
    df_pres
    # Give the measurement column a short name
    .rename(columns={raw_value_col: 'Total travel'})
    # Drop unnecessary columns
    .drop(columns=['Margins', 'Population'], errors='ignore')
)

# Coerce to numeric; entries that cannot be parsed become NaN instead of raising
df_pres['Periods'] = pd.to_numeric(df_pres['Periods'], errors='coerce')
df_pres['Total travel'] = pd.to_numeric(df_pres['Total travel'], errors='coerce')

df_pres.head()

Unnamed: 0,Modes of travel,Region characteristics,Periods,Total travel
0,Total,The Netherlands,2018,218.1
1,Total,The Netherlands,2019,218.8
2,Total,The Netherlands,2020,152.0
3,Total,The Netherlands,2021,168.4
4,Total,The Netherlands,2022,186.9


In [None]:
# Recovery index by degree of urbanisation.
# Builds `df` (one row per urbanisation level and year, all travel modes
# combined) with two index columns, then plots the reference-year index.
ref_year = 2018
levels = [
    "Extremely urbanised", "Strongly urbanised", "Moderately urbanised",
    "Hardly urbanised", "Not urbanised"
]

# Subset: the "Total" travel mode for the five urbanisation levels.
# .copy() so the new columns below are not written onto a view of df_pres.
df = df_pres.loc[
    (df_pres["Modes of travel"] == "Total") &
    (df_pres["Region characteristics"].isin(levels))
].copy()

# Chronological order, so that 'first' below means "earliest period"
df = df.sort_values(['Modes of travel', 'Periods'])

# Baseline per (mode, urbanisation level): the earliest non-missing value of
# each series. Grouping by mode alone would divide every region by the value
# of whichever region happens to come first, because the subset holds a
# single mode.
base = (
    df.groupby(['Modes of travel', 'Region characteristics'])['Total travel']
      .transform('first')
)
df['Travel (km) index'] = df['Total travel'] / base * 100

# Baseline per urbanisation level for the reference year
ref_rows = df.loc[df["Periods"] == ref_year]
if ref_rows["Region characteristics"].duplicated().any():
    # A non-unique baseline cannot be mapped back onto the rows; fail with a
    # clear message instead of pandas' generic reindexing error.
    raise ValueError(
        f"Expected one {ref_year} row per urbanisation level, found duplicates"
    )
base_per_level = ref_rows.set_index("Region characteristics")["Total travel"]

# Index = current value / baseline * 100 (per region characteristic).
# Levels without a reference-year row get NaN.
df["Index"] = df["Total travel"] / df["Region characteristics"].map(base_per_level) * 100

# Plot: recovery index by degree of urbanisation.
# category_orders pins the legend to the urbanisation order of `levels`
# rather than the order in which the levels appear in the data.
fig1 = px.line(
    df, x="Periods", y="Index", color="Region characteristics", markers=True,
    title=f"Recovery index ({ref_year} = 100) by degree of urbanisation - Netherlands",
    labels={"Periods": "Year", "Index": "Index"},
    category_orders={"Region characteristics": levels},
)
fig1.add_hline(y=100, line_dash="dot")  # reference line at the baseline level
fig1.show()

In [4]:
# Urban vs Rural recovery index — builds on `df` and `ref_year` from the
# urbanisation cell; aggregates the five levels into two groups.

# Collapse the five urbanisation levels into two groups
urban_levels = ("Extremely urbanised", "Strongly urbanised", "Moderately urbanised")
rural_levels = ("Hardly urbanised", "Not urbanised")
group_map = {
    **{level: "Urban" for level in urban_levels},
    **{level: "Rural" for level in rural_levels},
}

# Label every row with its group, then total the travel per year and group
uvr = (
    df[["Periods", "Region characteristics", "Total travel"]]
    .assign(Group=lambda frame: frame["Region characteristics"].map(group_map))
    .groupby(["Periods", "Group"], as_index=False)["Total travel"]
    .sum()
    .sort_values(["Group", "Periods"])
)

# Reference-year total for each group, keyed by group name
base_grp = uvr[uvr["Periods"].eq(ref_year)].set_index("Group")["Total travel"]

# Index = group total relative to its reference-year total, scaled to 100
uvr["Index"] = uvr["Total travel"].div(uvr["Group"].map(base_grp)).mul(100)

# Line chart of the two groups with a dotted reference line at 100
fig2 = px.line(
    uvr,
    x="Periods",
    y="Index",
    color="Group",
    markers=True,
    title=f"Recovery index ({ref_year} = 100): Urban vs Rural - Netherlands",
    labels={"Periods": "Year", "Index": "Index "},
)
fig2.add_hline(y=100, line_dash="dot")
fig2.show()
