In [1]:
import pandas as pd
import warnings

In [None]:
# Ignore UserWarnings raised from modules matching "openpyxl.styles.stylesheet"
# so that reading workbooks does not clutter the cell output.
# NOTE(review): presumably targets the "Workbook contains no default style"
# notice — confirm. This cell must be executed before any workbook is read.
warnings.filterwarnings("ignore", category=UserWarning, module="openpyxl.styles.stylesheet")

In [6]:
def process_eurostat_dataset(path: str) -> pd.DataFrame:
    """Load every data sheet of a Eurostat Excel export into one multi-indexed frame.

    The workbook's ``Summary`` sheet lists the data sheets (column ``Contents``)
    together with the metadata describing each of them. Every listed sheet is
    read, trimmed to its data table and indexed by ``TIME`` plus the metadata
    values of that sheet.

    Args:
        path: Location of the Excel workbook; handed to ``pd.ExcelFile``.

    Returns:
        The concatenation of all data sheets, sorted by index. The index levels
        are ``TIME`` followed by the remaining ``Summary`` columns.

    Raises:
        IndexError: If a listed sheet has no ``TIME`` cell in its first column.
        ValueError: If the ``Summary`` sheet lists no data sheets at all
            (raised by ``pd.concat``).
    """
    # the time frequency and unit of measurement information is not a relevant
    # index value since they are always the same
    ignore = ['Time frequency', 'Unit of measure']

    def _is_named(col) -> bool:
        # pandas labels header-less columns "Unnamed: N"; str() keeps the check
        # safe for non-string headers (e.g. numeric year columns)
        return not str(col).startswith('Unnamed')

    frames = []

    # open the workbook once instead of re-parsing the file for every read
    with pd.ExcelFile(path) as book:

        # retrieve sheets data and metadata from Summary sheet
        # (skiprows=14: the summary table header is expected on the 15th row)
        summary = pd.read_excel(
            book, sheet_name='Summary',
            usecols=lambda col: _is_named(col) and col not in ignore,
            index_col='Contents', skiprows=14
        )

        # iterate over data sheets and pre-process them
        for sheet_name, config in summary.iterrows():

            # the valuable information starts with the row containing "TIME"
            # indicating the start of the data table
            preview = pd.read_excel(book, sheet_name=sheet_name, usecols=[0])
            time_rows = preview.loc[preview.iloc[:, 0] == 'TIME'].index
            if len(time_rows) == 0:
                raise IndexError(
                    f"no 'TIME' row found in the first column of sheet {sheet_name!r}"
                )
            header_row = time_rows[0]

            # the preview consumed the first sheet row as its header, so preview
            # index i is sheet row i + 1; skipping header_row + 1 rows makes the
            # "TIME" row the header of the data table
            data = pd.read_excel(
                book, sheet_name=sheet_name,
                usecols=_is_named,
                na_values=':', skiprows=header_row + 1
            )

            # Remove the first row "Geo (Label)"
            data = data.iloc[1:].reset_index(drop=True)

            # some sheets contain additional meta-data at the end which are
            # separated by a NaN value row from the actual data;
            # if we find such a row we drop it and all rows after it
            blank_rows = data.isnull().all(axis=1)
            if blank_rows.any():
                # after reset_index the label returned by idxmax is the position
                data = data.iloc[:blank_rows.idxmax()]

            # create multi-index based on config: broadcast the sheet's metadata
            # to every data row (columns are kept even if the sheet has no rows)
            config_df = pd.DataFrame(dict(config), index=data.index)
            data = (
                pd.concat([config_df, data], axis=1)
                .set_index(['TIME'] + list(config_df.columns))
            )

            frames.append(data)

    # merge them into one multi-index data frame
    return pd.concat(frames).sort_index()

  warn("Workbook contains no default style, apply openpyxl's default")


{'Sheet 50':    Data extracted on 26/12/2024 01:08:28 from [ESTAT]  \
 0                                           Dataset:    
 1                                      Last updated:    
 2                                                 NaN   
 3                                      Time frequency   
 4                                            Products   
 ..                                                ...   
 58                                   Available flags:   
 59                                                  d   
 60                                                  e   
 61                                                  p   
 62                                                  u   
 
                                            Unnamed: 1  \
 0   Electricity prices for household consumers - b...   
 1                                    19/12/2024 23:00   
 2                                                 NaN   
 3                                                 NaN   
