In [1]:
import zipfile
import glob
from pathlib import Path
from functools import reduce

import pandas as pd
import numpy as np

# Relative path of the directory holding the input data
# (presumably the Eurostat .zip downloads — confirm against the loading cell).
data_path = "data"

In [7]:
# cols_to_drop: Remove certain *constant* columns - TODO: Unhardcode

def _clean_eurostat_value(raw):
    """Normalise one raw ``Value`` cell ahead of the float conversion.

    Strings lose every "," and "." character plus surrounding whitespace, and
    the ":" placeholder is mapped to NaN. Anything that is not a string
    (already-parsed numbers, NaN from empty cells) is returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    # NOTE(review): "." is removed as well as ",", so a decimal such as "12.5"
    # would be read as 125. Kept as-is to preserve existing results — confirm
    # which separator the exported files actually use.
    text = raw.replace(",", "").replace(".", "").strip()
    return np.nan if text == ":" else text


def eurostat_reader(file_path, cols_to_drop={"UNIT", "C_RESID", "AGE", "ISCED11"}):
    """Read the data CSV inside a zip archive into a frame keyed by TIME and GEO.

    Steps, in order:
      1. Read the archive member whose name contains 'Data' (the last such
         member wins if there are several) as an ISO-8859-1 encoded CSV.
      2. Drop every column listed in ``cols_to_drop`` that is present.
      3. Clean the ``Value`` column and convert it to float32.
      4. If exactly one column besides TIME/GEO/Value remains, pivot on it so
         each of its distinct values becomes a column. For two known datasets
         the new columns get an 'Amount_of_' / 'Educated_' prefix.
         If no such column remains, rename ``Value`` after the archive's file
         stem with "by NUTS 2 regions" removed.
      5. Drop rows and columns that are entirely NaN.

    Parameters
    ----------
    file_path : str or path-like
        Location of the zip archive.
    cols_to_drop : iterable of str
        Column names to remove before reshaping. The default is never mutated.

    Returns
    -------
    pandas.DataFrame
        Frame with TIME and GEO columns plus the value column(s).

    Raises
    ------
    ValueError
        If the archive has no member containing 'Data', or if more than one
        candidate column is left to pivot on.
    """
    file_path = str(file_path)          # substring checks below need a str
    cols_to_drop = set(cols_to_drop)    # accept any iterable, not only sets

    df = None
    with zipfile.ZipFile(file_path, 'r') as zip_file:
        for member in zip_file.namelist():
            if 'Data' in member:
                with zip_file.open(member) as data_file:
                    df = pd.read_csv(data_file, encoding="ISO-8859-1")

    if df is None:
        raise ValueError(f"No member containing 'Data' found in archive '{file_path}'.")

    # Drop desired columns (an ordered list; dropping nothing is a no-op)
    present = [col for col in df.columns if col in cols_to_drop]
    df = df.drop(columns=present)

    # Clean Value column and convert to float
    df["Value"] = df["Value"].apply(_clean_eurostat_value).astype(np.float32)

    # Find the column containing the relevant values to `spread` on
    value_col = [col for col in df.columns if col not in {"TIME", "GEO", "Value"}]

    if len(value_col) > 1:
        raise ValueError(f"Too many columns available to spread on for file '{file_path}', "
                         f"namely {value_col}. Check the data and add columns to remove to cols_to_drop.")

    if len(value_col) == 1:
        df = df.pivot_table(index=['TIME', 'GEO'], columns=value_col[0], values='Value').reset_index()

        # Dataset-specific prefixes, selected by a fragment of the file path.
        prefixes = (
            ("Population on 1 January by age, sex and NUTS 2 region", 'Amount_of_'),
            ("Pupils and students enrolled by education level, sex and NUTS2 regions", 'Educated_'),
        )
        for marker, prefix in prefixes:
            if marker in file_path:
                df.columns = [col if col in {'TIME', 'GEO'} else prefix + col for col in df.columns]
                break
    else:
        # NOTE(review): when the stem ends in "by NUTS 2 regions" the new name
        # keeps a trailing space; left untouched in case later cells rely on it.
        df = df.rename(columns={'Value': Path(file_path).stem.replace("by NUTS 2 regions", "")})

    df = df.dropna(how='all').dropna(axis=1, how='all')

    # Clear the columns-axis label left behind by pivot_table. Assigning None
    # is used because `del df.columns.name` fails where Index.name is a property.
    df.columns.name = None

    return df

In [8]:
# Load every zip archive in the data directory into its own DataFrame.
dfs = []

# sorted() fixes the load order (and therefore the merged column order);
# glob itself returns matches in arbitrary, OS-dependent order.
for file in sorted(glob.glob(f"{data_path}/*.zip")):
    # Match on the file name only so the skip works with both '/' and '\'
    # path separators.
    if Path(file).name == 'Railway transport - national and international railway passengers transport by loading_unloading NUTS 2 region.zip':
        continue

    df = eurostat_reader(file)
    dfs.append(df)

In [13]:
# Fold the per-file frames into one table, outer-joining each new frame onto
# the running result by the shared TIME / GEO keys.
df_merged = reduce(
    lambda left, right: left.merge(right, how='outer', on=['TIME', 'GEO']),
    dfs,
)
print("Shape:", df_merged.shape)
df_merged.head()

Shape: (881, 39)


Unnamed: 0,TIME,GEO,Passengers carried,Passengers carried (arrival),Passengers carried (departures),Burglary of private residential premises,Intentional homicide,Robbery,Theft of a motorized land vehicle,Gross domestic product (GDP) at current market prices,...,Educated_Females,Educated_Males,Educated_Total,Electrified railway lines,Motorways,Navigable canals,Navigable rivers,Other roads,Railway lines with double and more tracks,Total railway lines
0,1993,Calabria,1.0,,1.0,,,,,,...,,,,403.0,279.0,,,9262.0,253.0,855.0
1,1993,Campania,738.0,366.0,372.0,,,,,,...,,,,658.0,445.0,,,9380.0,487.0,960.0
2,1993,Centro (IT),11155.0,5596.0,5559.0,,,,,,...,,,,,1156.0,,,31432.0,,
3,1993,Emilia-Romagna,836.0,421.0,414.0,,,,,,...,,,,,630.0,,,10062.0,,
4,1993,Friuli-Venezia Giulia,49.0,23.0,26.0,,,,,,...,,,,381.0,203.0,,,3322.0,246.0,494.0


In [10]:
# Display the column labels of the merged frame.
df_merged.columns

Index(['TIME', 'GEO', 'Passengers carried', 'Passengers carried (arrival)',
       'Passengers carried (departures)',
       'Burglary of private residential premises', 'Intentional homicide',
       'Robbery', 'Theft of a motorized land vehicle',
       'Gross domestic product (GDP) at current market prices ',
       'Available beds in hospitals (HP.1)',
       'Curative care beds in hospitals (HP.1)',
       'Long-term care beds in hospitals (HP.1)',
       'Other beds in hospitals (HP.1)',
       'Psychiatric care beds in hospitals (HP.1)',
       'Rehabilitative care beds in hospitals (HP.1)',
       'Households with broadband access',
       'Internet use: interaction with public authorities (last 12 months)',
       'Passengers disembarked', 'Passengers embarked',
       'Passengers embarked and disembarked',
       'Hotels; holiday and other short-stay accommodation; camping grounds, recreational vehicle parks and trailer parks',
       'People at risk of poverty or social exclu