In [1]:
import zipfile
import glob
from pathlib import Path
from functools import reduce

import pandas as pd
import numpy as np

data_path = "data"

In [2]:
# cols_to_drop: Remove certain *constant* columns - TODO: Unhardcode?

def eurostat_reader(file_path, cols_to_drop={"UNIT", "C_RESID", "AGE", "ISCED11"}):

    with zipfile.ZipFile(file_path, 'r') as zip_file:
        for file in zip_file.namelist():
            if 'Data' in file:
                with zip_file.open(file) as data_file:
                    df = pd.read_csv(data_file, encoding="ISO-8859-1")

    # Drop desired columns
    if len(cols_to_drop.intersection(df.columns)) > 0:
        df = df.drop(columns=cols_to_drop.intersection(df.columns))
        
    # Clean Value column and convert to float
    df["Value"] = df["Value"].apply(
        lambda x: x.replace(",", "").replace(".", "")).replace({":": None}, regex=False).astype(np.float32)

    # Find the column containing the relevant values to `spread` on
    value_col = [col for col in df.columns if col not in {"TIME", "GEO", "Value"}]

    if len(value_col) > 1:
        raise ValueError(f"Too many columns available to spread on for file '{file_path}', "
                         f"namely {value_col}. Check the data and add columns to remove to cols_to_drop.")  
        
    if len(value_col) == 1:
        df = df.pivot_table(index=['TIME','GEO'], columns=value_col[0], values='Value').reset_index()
        
        # These two files have non-descriptive pivot names
        if "Population on 1 January by age, sex and NUTS 2 region" in file_path:
            df.columns = ['Amount_of_' + col if col not in {'TIME','GEO'} else col for col in df.columns]
        elif "Pupils and students enrolled by education level, sex and NUTS2 regions" in file_path:
            df.columns = ['Educated_' + col if col not in {'TIME','GEO'} else col for col in df.columns]
            
        del df.columns.name
    
    # Check there is no data for certain years
    na_check = df.set_index('TIME')['Value'].isna().all(level=0)
    if len(na_check.index[na_check]) > 0:
        df = df.set_index('TIME').drop(list(na_check.index[na_check])).reset_index()
    
    # Drop fully NA rows and columns
    df = df.dropna(how='all').dropna(axis=1, how='all')
    
    if len(value_col) == 0:
        df = df.rename(columns={'Value': Path(file_path).stem.replace("by NUTS 2 regions", "")})
    
    return df

In [3]:
dfs = []

for file in glob.glob("data/*.zip"):
    if file == 'data\Railway transport - national and international railway passengers transport by loading_unloading NUTS 2 region.zip':
        continue
    
    df = eurostat_reader(file)
    dfs.append(df)

KeyError: 'Value'

In [None]:
df_merged = reduce(lambda x, y: pd.merge(x, y, on=['TIME', 'GEO'], how='outer'), dfs)
print("Shape:", df_merged.shape)
df_merged.head()

In [None]:
df_merged.columns

In [None]:
df_merged.to_csv("data/merged_eurostat.csv", index=False)

# Inter-city railroad connections

In [None]:
file_path = ("data/Railway transport - national and international railway passengers transport by "
             "loading_unloading NUTS 2 region.zip")

with zipfile.ZipFile(file_path, 'r') as zip_file:
        for file in zip_file.namelist():
            if 'Data' in file:
                with zip_file.open(file) as data_file:
                    df = pd.read_csv(data_file, encoding="ISO-8859-1")

# GEO is constant, simply "Italy"
df.drop(columns=['UNIT', 'GEO'], inplace=True)

# Clean Value column and convert to float
df["Value"] = df["Value"].apply(
    lambda x: x.replace(",", "").replace(".", "")).replace({":": None}, regex=False).astype(np.float32)

df.head()

In [None]:
# Check there is no data for certain years - indeed, for 2010
df.set_index('TIME')['Value'].isna().all(level=0)

In [None]:
na_check = df.set_index('TIME')['Value'].isna().all(level=0)
if len(na_check.index[na_check]) > 0:
    print(f"No data for: {list(na_check.index[na_check])}")
    df = df.set_index('TIME').drop(list(na_check.index[na_check])).reset_index()

In [None]:
df_travel = df.pivot_table('Value', index=['TIME', 'C_LOAD'], columns='C_UNLOAD').reset_index()
del df_travel.columns.name
df_travel

In [None]:
df_travel.to_csv("data/interregion_railroad_travel.csv", index=False)