In [127]:
import requests
import json
import pandas as pd
from functools import reduce
import plotly.express as px
import plotly
import statistics
from statistics import stdev
import plotly.io as pio

In [128]:
def api_to_df(url):
    """
    accesses api as json, creates dict, and then converts dict to pandas dataframe

    args:
        - url, string url for api access
    
    returns:
        - pandas dataframe with api data
    """
    response = requests.get(url).text
    doc = json.loads(response)


    values = doc["value"]
    dim_ids = doc["id"]
    dim_sizes = doc["size"]
    dimensions = doc["dimension"]

    dim_labels = {}
    for dim in dim_ids:
        dim_info = dimensions[dim]
        label_dict = dim_info["category"]["label"]
        index_dict = dim_info["category"]["index"]
        ordered_labels = [label_dict[key] for key, _ in sorted(index_dict.items(), key=lambda x: x[1])]
        dim_labels[dim] = ordered_labels

    def flat_index_to_coords(index, sizes):
        coords = []
        for size in reversed(sizes):
            coords.insert(0, index % size)
            index = index // size
        return coords

    records = []
    for flat_key, value in values.items():
        flat_index = int(flat_key)
        coord_indices = flat_index_to_coords(flat_index, dim_sizes)

        record = {}
        for i, dim in enumerate(dim_ids):
            record[dim] = dim_labels[dim][coord_indices[i]]
        record["value"] = value
        records.append(record)

    return pd.DataFrame(records)
 
    

In [129]:
def clean_df(df, indic, value, metadata_cols, unit = None, target_col = None):
    """
    takes in df as arg, converts indic to columns, adds unit to indic columns, makes value column part of indic columns,
      removes freq column, returns cleaned df

      args:
      - df: pandas df to be cleaned, df
      - indic: indicator column to be converted into multiple columns, string
      - unit: unit column to be combined into indic columns, string
      - value: value column to be turned into values in indic cols, string
      - metadata_cols: other cols to be accepted, list of strings
      - target_col: main string to be indic col, string
    """
        # Strip whitespace from column names
    df.columns = df.columns.str.strip()

    # Combine indicator and unit to form unique column names
    if unit:
        df["indic_clean"] = df[indic] + " [" + df[unit] + "]"
    else: 
        df["indic_clean"] = df[indic]

    # Pivot to wide format
    df_wide = df.pivot_table(
        index=metadata_cols,
        columns="indic_clean",
        values=value
    ).reset_index()

    # Flatten columns
    df_wide.columns.name = None

    # If a target_col is specified, return only that column + metadata
    if target_col:
    
        if target_col not in df_wide.columns:
            raise ValueError(
                f"Target column '{target_col}' not found. Available columns:\n{df_wide.columns.tolist()}"
            )
        mean = df_wide[target_col].mean()
        std = df_wide[target_col].std()
        df_wide[target_col] = (df_wide[target_col] - mean) / std
        return df_wide[metadata_cols + [target_col]]
    
    # If no target_col specified, return everything
    return df_wide

# Healthcare

In [130]:
healthcare = api_to_df("https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/hlth_hlye?format=JSON&time=2008&time=2009&time=2010&time=2011&time=2012&time=2013&time=2014&time=2015&time=2016&time=2017&time=2018&time=2019&time=2020&time=2021&time=2022&geo=BE&geo=BG&geo=CZ&geo=DK&geo=DE&geo=EE&geo=IE&geo=EL&geo=ES&geo=FR&geo=HR&geo=IT&geo=CY&geo=LV&geo=LT&geo=LU&geo=HU&geo=MT&geo=NL&geo=AT&geo=PL&geo=PT&geo=RO&geo=SI&geo=SK&geo=FI&geo=SE&unit=YR&unit=PC&sex=M&sex=F&indic_he=HLY_0&indic_he=HLY_PC_0&indic_he=LE_0&indic_he=HLY_50&indic_he=HLY_PC_50&indic_he=LE_50&indic_he=HLY_65&indic_he=HLY_PC_65&indic_he=LE_65&lang=en")

In [131]:
healthcare_clean = clean_df(healthcare, "indic_he", "value", ["geo","time"], "unit", "Healthy life years in absolute value at birth [Year]")
healthcare_clean

Unnamed: 0,geo,time,Healthy life years in absolute value at birth [Year]
0,Austria,2008,-0.543029
1,Austria,2009,-0.335181
2,Austria,2010,-0.346121
3,Austria,2011,-0.411757
4,Austria,2012,-0.072637
...,...,...,...
397,Sweden,2018,2.443410
398,Sweden,2019,2.530924
399,Sweden,2020,2.421531
400,Sweden,2021,1.469809


# Education

In [132]:
education = api_to_df("https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/edat_lfse_03?format=JSON&time=2008&time=2009&time=2010&time=2011&time=2012&time=2013&time=2014&time=2015&time=2016&time=2017&time=2018&time=2019&time=2020&time=2021&time=2022&geo=BE&geo=BG&geo=CZ&geo=DK&geo=DE&geo=EE&geo=IE&geo=EL&geo=ES&geo=FR&geo=HR&geo=IT&geo=CY&geo=LV&geo=LT&geo=LU&geo=HU&geo=MT&geo=NL&geo=AT&geo=PL&geo=PT&geo=RO&geo=SI&geo=SK&geo=FI&geo=SE&unit=PC&sex=M&sex=F&age=Y20-24&age=Y25-34&age=Y35-44&age=Y45-54&age=Y55-64&isced11=ED0-2&isced11=ED3_4&isced11=ED5-8&lang=en")

In [133]:
education_clean = clean_df(education, "isced11", "value",["geo", "time"], "unit", target_col="Tertiary education (levels 5-8) [Percentage]")
education_clean

Unnamed: 0,geo,time,Tertiary education (levels 5-8) [Percentage]
0,Austria,2008,-1.565091
1,Austria,2009,-1.442042
2,Austria,2010,-1.398975
3,Austria,2011,-1.391592
4,Austria,2012,-1.304227
...,...,...,...
400,Sweden,2018,1.281033
401,Sweden,2019,1.381933
402,Sweden,2020,1.426231
403,Sweden,2021,1.533284


# Safety

In [134]:
safety = api_to_df("https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/crim_off_cat?format=JSON&time=2008&time=2009&time=2010&time=2011&time=2012&time=2013&time=2014&time=2015&time=2016&time=2017&time=2018&time=2019&time=2020&time=2021&time=2022&geo=BE&geo=BG&geo=CZ&geo=DK&geo=DE&geo=EE&geo=IE&geo=EL&geo=ES&geo=FR&geo=HR&geo=IT&geo=CY&geo=LV&geo=LT&geo=LU&geo=HU&geo=MT&geo=NL&geo=AT&geo=PL&geo=PT&geo=RO&geo=SI&geo=SK&geo=FI&geo=SE&unit=NR&unit=P_HTHAB&iccs=ICCS0101&iccs=ICCS0102&iccs=ICCS020111&iccs=ICCS020221&iccs=ICCS0301&iccs=ICCS03011&iccs=ICCS03012&iccs=ICCS0302&iccs=ICCS030221&iccs=ICCS0401&iccs=ICCS0501&iccs=ICCS05012&iccs=ICCS0502&iccs=ICCS05021&iccs=ICCS0601&iccs=ICCS0701&iccs=ICCS0703&iccs=ICCS07031&iccs=ICCS07041&iccs=ICCS0903&iccs=ICCS09051&lang=en")

In [135]:
safety_clean = clean_df(safety, "iccs", "value",["geo","time"], "unit", target_col="Unlawful acts involving controlled drugs or precursors [Number]")
safety_clean

Unnamed: 0,geo,time,Unlawful acts involving controlled drugs or precursors [Number]
0,Austria,2008,-0.446624
1,Austria,2009,-0.444807
2,Austria,2010,-0.443769
3,Austria,2011,-0.441433
4,Austria,2012,-0.446471
...,...,...,...
400,Sweden,2018,1.182293
401,Sweden,2019,1.284619
402,Sweden,2020,1.450358
403,Sweden,2021,1.358581


# Environmental Consciousness

In [136]:
environment = api_to_df("https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/sdg_13_10?format=JSON&time=2008&time=2009&time=2010&time=2011&time=2012&time=2013&time=2014&time=2015&time=2016&time=2017&time=2018&time=2019&time=2020&time=2021&time=2022&geo=BE&geo=BG&geo=CZ&geo=DK&geo=DE&geo=EE&geo=IE&geo=EL&geo=ES&geo=FR&geo=HR&geo=IT&geo=CY&geo=LV&geo=LT&geo=LU&geo=HU&geo=MT&geo=NL&geo=AT&geo=PL&geo=PT&geo=RO&geo=SI&geo=SK&geo=FI&geo=SE&unit=T_HAB&unit=I90&src_crf=TOTXMEMO&src_crf=TOTX4_MEMO&lang=en")

In [137]:
env_clean = clean_df(environment, "src_crf", "value",["geo","time"], "unit", target_col="Total (excluding memo items) [Tonnes per capita]")
env_clean

Unnamed: 0,geo,time,Total (excluding memo items) [Tonnes per capita]
0,Austria,2008,0.285685
1,Austria,2009,0.103148
2,Austria,2010,0.155301
3,Austria,2011,0.103148
4,Austria,2012,0.077071
...,...,...,...
400,Sweden,2018,-1.852607
401,Sweden,2019,-1.878684
402,Sweden,2020,-1.982991
403,Sweden,2021,-1.852607


# Public Infrastructure

In [138]:

infrastructure = api_to_df('https://ec.europa.eu/eurostat/api/dissemination/statistics/1.0/data/rail_pa_total?format=JSON&time=2008&time=2009&time=2010&time=2011&time=2012&time=2013&time=2014&time=2015&time=2016&time=2017&time=2018&time=2019&time=2020&time=2021&time=2022&geo=EU27_2020&geo=BE&geo=BG&geo=CZ&geo=DK&geo=DE&geo=EE&geo=IE&geo=EL&geo=ES&geo=FR&geo=HR&geo=IT&geo=LV&geo=LT&geo=LU&geo=HU&geo=NL&geo=AT&geo=PL&geo=PT&geo=RO&geo=SI&geo=SK&geo=FI&geo=SE&unit=MIO_PKM&unit=THS_PAS&lang=en')

In [139]:
inf_clean = clean_df(infrastructure, "unit", "value",["geo","time"], target_col="Thousand passengers")
inf_clean

Unnamed: 0,geo,time,Thousand passengers
0,Austria,2008,-0.093254
1,Austria,2009,-0.091695
2,Austria,2010,-0.087537
3,Austria,2011,-0.084751
4,Austria,2012,-0.050660
...,...,...,...
337,Sweden,2018,-0.075841
338,Sweden,2019,-0.043327
339,Sweden,2020,-0.214645
340,Sweden,2021,-0.223033


# Quality of Life Index

In [140]:

eu_countries = {
    "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia", "Denmark",
    "Estonia", "Finland", "France", "Germany", "Greece", "Hungary", "Ireland",
    "Italy", "Latvia", "Lithuania", "Luxembourg", "Malta", "Netherlands",
    "Poland", "Portugal", "Romania", "Slovakia", "Slovenia", "Spain", "Sweden"
}

df = pd.read_csv("qol_data.csv", encoding="ISO-8859-1")

df = df.dropna(axis=1, how='all')
df.columns = df.columns.str.strip()
df_filtered = df[["Year", "Country name", "Ladder score"]]

qol_clean = df_filtered[df_filtered["Country name"].isin(eu_countries)]

qol_clean = qol_clean.reset_index(drop=True)

mean = qol_clean["Ladder score"].mean()
std = qol_clean["Ladder score"].std()
qol_clean["Ladder score"] = (qol_clean["Ladder score"] - mean) / std

qol_clean


Unnamed: 0,Year,Country name,Ladder score
0,2024.0,Finland,1.743748
1,2024.0,Denmark,1.461323
2,2024.0,Sweden,1.230130
3,2024.0,Netherlands,1.178899
4,2023.0,Austria,0.652145
...,...,...,...
346,2016.0,Sweden,1.150000
347,2015.0,Sweden,1.159195
348,2014.0,Sweden,1.255088
349,2012.0,Sweden,1.407466


# Combining Dataframes

In [141]:
# standardizing columns
def prep_df(df, time_col="time", country_col="geo", new_cols={}):
    df = df.rename(columns={time_col: "year", country_col: "country", **new_cols})
    df["year"] = df["year"].astype(int)
    return df

healthcare = prep_df(healthcare_clean, new_cols={"Healthy life years in absolute value at birth [Year]": "healthcare"})
education = prep_df(education_clean, new_cols={"Tertiary education (levels 5-8) [Percentage]": "education"})
safety = prep_df(safety_clean, new_cols={"Unlawful acts involving controlled drugs or precursors [Number]": "safety"})
environment = prep_df(env_clean, new_cols={"Total (excluding memo items) [Tonnes per capita]": "environment"})
infrastructure = prep_df(inf_clean, new_cols={"Thousand passengers": "infrastructure"})
qol = prep_df(qol_clean, time_col="Year", country_col="Country name", new_cols={"Ladder score": "qol"})

dfs = [healthcare, education, safety, environment, qol]
combined = reduce(lambda left, right: pd.merge(left, right, on=["country", "year"], how="outer"), dfs)

combined = combined[combined['year'] != 2008]
combined = combined[combined['year'] != 2009]
combined = combined[combined['year'] != 2010]
combined = combined[combined['year'] != 2023]
combined = combined[combined['year'] != 2024]

In [142]:
def avg_miss_vals(df):
    """
    Replaces missing values with an average from the previous and following rows

    args:
        - df: dataframe w missing vals to avg
    
    returns: 
        - df_filled: df w no missing vals
    """
    df_filled = df.copy()
    
    for col in df.select_dtypes(include='number').columns:
        df_filled[col] = df_filled[col].interpolate(method='linear', limit_direction='both')
    
    return df_filled

In [143]:
combined = avg_miss_vals(combined)
combined.to_csv("alldata_clean.csv")

In [144]:
# Averaged df
avgs = combined.drop(columns=['year']).groupby('country').mean(numeric_only=True).reset_index()
avgs


Unnamed: 0,country,healthcare,education,safety,environment,qol
0,Austria,-0.617781,0.079357,-0.007253,0.109667,1.041026
1,Belgium,0.461567,1.130913,0.233309,0.498645,0.66758
2,Bulgaria,0.688558,-0.507279,-0.41065,-0.305388,-2.25885
3,Croatia,-0.622339,-0.83582,-0.354125,-0.885595,-0.982851
4,Cyprus,0.570049,1.385215,-0.462002,0.316108,-0.497639
5,Czechia,0.216344,-0.883707,-0.403385,0.861546,0.334856
6,Denmark,-0.488332,0.578218,-0.056346,0.209628,1.584474
7,Estonia,-1.234941,0.705882,-0.409738,1.500426,-0.823522
8,Finland,-0.853432,0.977615,-0.078352,0.27482,1.579383
9,France,0.503501,0.867281,1.854649,-0.524867,0.279684


# Plotting
Here, I want to plot quality of life against all other factors to see if there is any direct relationship between them that is immediately visible.

In [145]:
fig = px.scatter(
    combined,
    x='healthcare',
    y='qol',
    color='year',
    opacity=0.5,
    labels={'healthcare': 'Life Expectancy', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Life Expectancy and Quality of Life in Europe',
    hover_data=['country', 'year']
)

fig

In [146]:
fig = px.scatter(
    combined,
    x='education',
    y='qol',
    color='year',
    opacity=0.5,
    labels={'education': 'Tertiary Education', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Tertiary Education and Quality of Life in Europe',
    hover_data=['country', 'year']
)

fig

In [147]:
fig = px.scatter(
    combined,
    x='safety',
    y='qol',
    color='year',
    opacity=0.5,
    labels={'safety': 'Controlled Drug Crime', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Controlled Drug Crime and Quality of Life in Europe',
    hover_data=['country', 'year']
)

fig

In [148]:
fig = px.scatter(
    combined,
    x='environment',
    y='qol',
    color='year',
    opacity=0.5,
    labels={'environment': 'Emissions [Tonnes per capita]', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Emissions and Quality of Life in Europe',
    hover_data=['country', 'year']
)

fig

Now, I am plotting each factor over time.

In [149]:
fig = px.line(
    combined,
    x='year',
    y='qol',
    color='country',
    labels={'year': 'Year', 'qol': 'Quality of Life'},
    title='Quality of Life over Time in Europe',
    hover_data=['country', 'year']
)

#plotly.offline.plot(fig, filename='birth_death_EU.html')

fig

In [150]:
fig = px.line(
    combined,
    x='year',
    y='healthcare',
    color='country',
    labels={'year': 'Year', 'healthcare': 'Life Expectancy'},
    title='Life Expectancy over Time in Europe',
    hover_data=['country', 'year']
)
fig

In [151]:
fig = px.line(
    combined,
    x='year',
    y='education',
    color='country',
    labels={'year': 'Year', 'qol': 'Tertiary Education'},
    title='Tertiary Education over Time in Europe',
    hover_data=['country', 'year']
)
fig

In [152]:
fig = px.line(
    combined,
    x='year',
    y='safety',
    color='country',
    labels={'year': 'Year', 'safety': 'Controlled Drug Crime'},
    title= 'Controlled Drug Crime over Time in Europe',
    hover_data=['country', 'year']
)
fig

In [153]:
fig = px.line(
    combined,
    x='year',
    y='environment',
    color='country',
    labels={'year': 'Year', 'environment': 'Emissions'},
    title='Emissions over Time in Europe',
    hover_data=['country', 'year']
)
fig

Now, I'm plotting the averages of each feature over the years against quality of life.

In [154]:
fig = px.scatter(
    avgs,
    x='healthcare',
    y='qol',
    color='country',
    opacity=1,
    labels={'healthcare': 'Life Expectancy', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Life Expectancy and Quality of Life in Europe',
    hover_data=['country']
)

fig

In [155]:
fig = px.scatter(
    avgs,
    x='education',
    y='qol',
    color='country',
    opacity=1,
    labels={'education': 'Tertiary Education', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Tertiary Education and Quality of Life in Europe',
    hover_data=['country']
)

fig

In [156]:
fig = px.scatter(
    combined,
    x='safety',
    y='qol',
    color='country',
    opacity=1,
    labels={'safety': 'Controlled Drug Crime', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Controlled Drug Crime and Quality of Life in Europe',
    hover_data=['country']
)

fig

In [157]:
fig = px.scatter(
    avgs,
    x='environment',
    y='qol',
    color='country',
    opacity=1,
    labels={'environment': 'Emissions [Tonnes per capita]', 'qol': 'Quality of Life'},
    title='Comparing Relationship of Emissions and Quality of Life in Europe',
    hover_data=['country']
)

fig