In [1]:
import zipfile
import glob
from pathlib import Path
from functools import reduce

import pandas as pd
import numpy as np

# Name of the data folder; the cells below read zip archives from and write Excel files to "data/".
data_path = "data"

In [2]:
# cols_to_drop: Remove certain *constant* columns - TODO: Unhardcode?

def eurostat_reader(file_path, cols_to_drop={"UNIT", "C_RESID", "AGE", "ISCED11"}):
    """Read one Eurostat zip export and return it as a wide table keyed by TIME and GEO.

    The archive member whose name contains 'Data' is parsed as an ISO-8859-1 CSV.
    Columns listed in ``cols_to_drop`` are removed, ``Value`` is converted to
    float32, and the single remaining descriptor column (if any) is spread into
    one output column per distinct value.

    Parameters
    ----------
    file_path : str or path-like
        Location of the zip archive. Its name also decides the column prefix
        ('Amount_of_' / 'Educated_') or, when there is nothing to spread on,
        the name given to the ``Value`` column.
    cols_to_drop : iterable of str
        Column names to remove before reshaping. Names absent from the file
        are ignored. The default is only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Columns TIME and GEO plus the value column(s); rows and columns that
        are entirely missing are removed and the columns axis carries no name.

    Raises
    ------
    ValueError
        If the archive has no member containing 'Data', or if more than one
        descriptor column is left to spread on.
    """
    cols_to_drop = set(cols_to_drop)   # accept any iterable; also copies the shared default
    path_text = str(file_path)         # the substring checks below need text, not a Path

    with zipfile.ZipFile(file_path, 'r') as zip_file:
        data_members = [member for member in zip_file.namelist() if 'Data' in member]
        if not data_members:
            # Previously this surfaced later as an unbound-variable error on `df`.
            raise ValueError(f"No member containing 'Data' found in archive '{file_path}'.")
        # When several members match, the last one is used (same outcome as before).
        with zip_file.open(data_members[-1]) as data_file:
            df = pd.read_csv(data_file, encoding="ISO-8859-1")

    # Drop desired columns (only those actually present)
    present = [col for col in df.columns if col in cols_to_drop]
    if present:
        df = df.drop(columns=present)

    # Clean Value column and convert to float.
    # A column that read_csv already parsed as numeric needs no text cleaning.
    values = df["Value"]
    if not pd.api.types.is_numeric_dtype(values):
        # NOTE(review): '.' is stripped together with ',', exactly as before, i.e. it is
        # treated as a grouping character - confirm no input file carries decimal values.
        values = values.map(
            lambda x: x.replace(",", "").replace(".", "") if isinstance(x, str) else x)
        # ':' marks a missing observation
        values = values.mask(values == ":")
    df["Value"] = values.astype(np.float32)

    # Find the column containing the relevant values to `spread` on
    value_col = [col for col in df.columns if col not in {"TIME", "GEO", "Value"}]

    if len(value_col) > 1:
        raise ValueError(f"Too many columns available to spread on for file '{file_path}', "
                         f"namely {value_col}. Check the data and add columns to remove to cols_to_drop.")

    if len(value_col) == 1:
        # pivot_table aggregates duplicate (TIME, GEO, descriptor) rows with its default (mean)
        df = df.pivot_table(index=['TIME', 'GEO'], columns=value_col[0], values='Value').reset_index()

        if "Population on 1 January by age, sex and NUTS 2 region" in path_text:
            df.columns = ['Amount_of_' + col if col not in {'TIME', 'GEO'} else col for col in df.columns]
        elif "Pupils and students enrolled by education level, sex and NUTS2 regions" in path_text:
            df.columns = ['Educated_' + col if col not in {'TIME', 'GEO'} else col for col in df.columns]

    else:
        # NOTE(review): removing "by NUTS 2 regions" leaves a trailing space in the column
        # name; kept as-is because exported column names may be relied on downstream.
        df = df.rename(columns={'Value': Path(file_path).stem.replace("by NUTS 2 regions", "")})

    df = df.dropna(how='all').dropna(axis=1, how='all')

    # `del df.columns.name` fails on pandas >= 1.0 (Index.name has no deleter);
    # assigning None clears the leftover pivot label on every version.
    df.columns.name = None

    return df

In [3]:
# The inter-region railway export has two region columns (C_LOAD / C_UNLOAD) and is
# processed separately further down, so it is left out of the generic TIME/GEO merge.
RAILWAY_ZIP_NAME = ("Railway transport - national and international railway passengers transport by "
                    "loading_unloading NUTS 2 region.zip")

dfs = []

# sorted(): glob order is filesystem-dependent; sorting keeps the merged column order stable.
for file in sorted(glob.glob(str(Path(data_path) / "*.zip"))):
    # Compare on the bare file name. The previous check used a 'data\...' literal, which
    # only matched Windows-style paths and contained the invalid escape sequence '\R'.
    if Path(file).name == RAILWAY_ZIP_NAME:
        continue

    dfs.append(eurostat_reader(file))

In [4]:
def _outer_join_on_keys(left, right):
    """Outer-join two frames on the shared TIME / GEO key columns."""
    return pd.merge(left, right, on=['TIME', 'GEO'], how='outer')


# Fold all per-file tables into one wide table, keeping every (TIME, GEO) pair seen anywhere.
df_merged = reduce(_outer_join_on_keys, dfs)
print("Shape:", df_merged.shape)
df_merged.head()

Shape: (881, 39)


Unnamed: 0,TIME,GEO,Passengers carried,Passengers carried (arrival),Passengers carried (departures),Burglary of private residential premises,Intentional homicide,Robbery,Theft of a motorized land vehicle,Gross domestic product (GDP) at current market prices,...,Educated_Females,Educated_Males,Educated_Total,Electrified railway lines,Motorways,Navigable canals,Navigable rivers,Other roads,Railway lines with double and more tracks,Total railway lines
0,1993,Calabria,1.0,,1.0,,,,,,...,,,,403.0,279.0,,,9262.0,253.0,855.0
1,1993,Campania,738.0,366.0,372.0,,,,,,...,,,,658.0,445.0,,,9380.0,487.0,960.0
2,1993,Centro (IT),11155.0,5596.0,5559.0,,,,,,...,,,,,1156.0,,,31432.0,,
3,1993,Emilia-Romagna,836.0,421.0,414.0,,,,,,...,,,,,630.0,,,10062.0,,
4,1993,Friuli-Venezia Giulia,49.0,23.0,26.0,,,,,,...,,,,381.0,203.0,,,3322.0,246.0,494.0


In [5]:
# List every column name of the merged table
df_merged.columns

Index(['TIME', 'GEO', 'Passengers carried', 'Passengers carried (arrival)',
       'Passengers carried (departures)',
       'Burglary of private residential premises', 'Intentional homicide',
       'Robbery', 'Theft of a motorized land vehicle',
       'Gross domestic product (GDP) at current market prices ',
       'Available beds in hospitals (HP.1)',
       'Curative care beds in hospitals (HP.1)',
       'Long-term care beds in hospitals (HP.1)',
       'Other beds in hospitals (HP.1)',
       'Psychiatric care beds in hospitals (HP.1)',
       'Rehabilitative care beds in hospitals (HP.1)',
       'Households with broadband access',
       'Internet use: interaction with public authorities (last 12 months)',
       'Passengers disembarked', 'Passengers embarked',
       'Passengers embarked and disembarked',
       'Hotels; holiday and other short-stay accommodation; camping grounds, recreational vehicle parks and trailer parks',
       'People at risk of poverty or social exclu

In [6]:
# Write the merged table to an Excel workbook, without the row index
df_merged.to_excel("data/merged_eurostat.xlsx", index=False)

# Inter-city railroad connections

In [7]:
file_path = ("data/Railway transport - national and international railway passengers transport by "
             "loading_unloading NUTS 2 region.zip")

# Read the archive member whose name contains 'Data' (the last one if several match)
with zipfile.ZipFile(file_path, 'r') as zip_file:
    data_members = [member for member in zip_file.namelist() if 'Data' in member]
    if not data_members:
        # Fail with a clear message instead of leaving `df` undefined
        raise ValueError(f"No member containing 'Data' found in archive '{file_path}'.")
    with zip_file.open(data_members[-1]) as data_file:
        df = pd.read_csv(data_file, encoding="ISO-8859-1")

# GEO is constant, simply "Italy"
df = df.drop(columns=['UNIT', 'GEO'])

# Clean Value column and convert to float.
# Only text needs cleaning; a column already parsed as numeric is converted directly.
raw_values = df["Value"]
if not pd.api.types.is_numeric_dtype(raw_values):
    # NOTE(review): '.' is removed together with ',', as before - confirm the file has no decimals.
    raw_values = raw_values.map(
        lambda x: x.replace(",", "").replace(".", "") if isinstance(x, str) else x)
    # ':' marks a missing observation
    raw_values = raw_values.mask(raw_values == ":")
df["Value"] = raw_values.astype(np.float32)

df.head()

Unnamed: 0,TIME,C_LOAD,C_UNLOAD,Value
0,2015,Piemonte,Piemonte,43897376.0
1,2015,Piemonte,Valle d'Aosta/Vallée d'Aoste,
2,2015,Piemonte,Liguria,138717.0
3,2015,Piemonte,Lombardia,1890324.0
4,2015,Piemonte,Provincia Autonoma di Bolzano/Bozen,


In [8]:
# Per year: is every Value missing? The result below shows this is the case for 2010 only.
df['Value'].isna().groupby(df['TIME'], sort=False).all().to_frame()

Unnamed: 0_level_0,Value
TIME,Unnamed: 1_level_1
2015,False
2010,True
2005,False


In [9]:
# One row per (TIME, loading region), one column per unloading region.
# pivot_table combines duplicate cells with its default aggregation (mean).
df_travel = df.pivot_table('Value', index=['TIME', 'C_LOAD'], columns='C_UNLOAD').reset_index()
# `del df_travel.columns.name` fails on pandas >= 1.0 (Index.name has no deleter);
# assigning None removes the leftover 'C_UNLOAD' label from the columns axis.
df_travel.columns.name = None
df_travel

Unnamed: 0,TIME,C_LOAD,Abruzzo,Basilicata,Calabria,Campania,Emilia-Romagna,Friuli-Venezia Giulia,Lazio,Liguria,...,Piemonte,Provincia Autonoma di Bolzano/Bozen,Provincia Autonoma di Trento,Puglia,Sardegna,Sicilia,Toscana,Umbria,Valle d'Aosta/Vallée d'Aoste,Veneto
0,2005,Abruzzo,4627401.0,794.0,3302.0,1060.0,,,,1993.0,...,30669.0,,,124790.0,,845.0,,,170.0,
1,2005,Basilicata,658.0,1646354.0,3815.0,16370.0,,,,948.0,...,5791.0,,,4922.0,,1153.0,,,56.0,
2,2005,Calabria,3671.0,5232.0,5656993.0,224269.0,,,,57462.0,...,166490.0,,,49355.0,,33851.0,,,750.0,
3,2005,Campania,1403.0,13866.0,238684.0,21613624.0,,,,99233.0,...,146922.0,,,101706.0,,108708.0,,,357.0,
4,2005,Liguria,2717.0,789.0,57734.0,96512.0,,,,33906636.0,...,175947.0,,,15585.0,,44230.0,,,49.0,
5,2005,Lombardia,135084.0,15527.0,263387.0,516190.0,,,,970218.0,...,774026.0,,,601418.0,8.0,167260.0,,,9.0,
6,2005,Molise,11165.0,105.0,828.0,162.0,,,,414.0,...,9640.0,,,19275.0,,233.0,,,23.0,
7,2005,Piemonte,30859.0,7109.0,169041.0,145978.0,,,,187057.0,...,42741720.0,,,187003.0,19.0,88188.0,,,,
8,2005,Puglia,124902.0,6639.0,46225.0,99942.0,,,,15804.0,...,181746.0,,,13656459.0,,10924.0,,,859.0,
9,2005,Sardegna,,,,1.0,,,,6.0,...,33.0,,,,3991017.0,,,,1.0,


In [10]:
# Write the region-to-region travel table to an Excel workbook, without the row index
df_travel.to_excel("data/interregion_railroad_travel.xlsx", index=False)