In [None]:
import os
import pandas as pd
from functions import find_header_length, read_headers, infer_date

Set Papermill parameters

In [None]:
dataset: str = 'unem'   # ID for ONS time series dataset to process
url: str = 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peoplenotinwork/unemployment/datasets/claimantcountandvacanciesdataset/current/unem.csv'  # Source URL for the dataset CSV. NOTE(review): not referenced by any cell visible in this notebook - presumably used by a separate download step; confirm.

Derive filenames

In [None]:
# Input file: read by the header-detection, header-loading and data-loading cells.
RAW_CSV = f'../../data/raw/{dataset}.csv'

# Output files: the header metadata and the long-format data are written here.
METADATA_FILE = f'../../data/metadata/codes/{dataset}-codes.csv'
DATA_FILE = f'../../data/processed/{dataset}.csv'

Make sure the directories exist

In [None]:
# Create the parent directory of each output file. exist_ok=True means an
# already-existing directory is not an error, so the cell can be re-run safely.
for _output_path in (METADATA_FILE, DATA_FILE):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

Work out how long the header is

In [None]:
# find_header_length is a project helper imported from `functions`; its result
# is unpacked into two values used by later cells:
#   header_length - passed as `header_rows` when reading the headers, and used
#                   as the upper bound of the rows skipped when loading the data
#   cdid_row      - the one row in that range which is NOT skipped
# NOTE(review): both are presumably zero-based integer row numbers - confirm
# against the helper's implementation.
header_length, cdid_row = find_header_length(RAW_CSV)

Load the headers

In [None]:
# Load the header section of the raw CSV via the project helper, telling it how
# many rows the header occupies.
headers = read_headers(RAW_CSV, header_rows=header_length)
# Bare expression so the notebook renders the result as this cell's output.
headers

Save the headers to the metadata file

In [None]:
# Write the headers to the metadata file. No `index` argument is given, so
# (assuming `headers` is a pandas object - TODO confirm) the index is written
# as the first column of the CSV.
headers.to_csv(METADATA_FILE)

Load the data, setting the first column (the date) as the index, and skipping all the headers, apart from the 'CDID' row.

In [None]:
# Skip every row in the header range except `cdid_row`; with read_csv's default
# header inference the first row that is not skipped supplies the column names.
# The first column is used as the index.
data = pd.read_csv(
    RAW_CSV,
    index_col=[0],
    skiprows=[row_number for row_number in range(header_length) if row_number != cdid_row],
)

Merge the data with the inferred date and frequency, then melt the frame by date and frequency. Drop any null values, then sort by date and then variable.

In [None]:
# Build the long-format table:
#   1. merge the wide data with the result of infer_date applied to the index
#      labels, aligning on the index (infer_date is expected to supply the
#      'date' and 'freq' columns used below);
#   2. melt so that every other column becomes 'variable'/'value' rows;
#   3. drop rows containing any null;
#   4. order by date, then by variable.
long_data = (
    data
    .merge(
        infer_date(data.index.to_series()),
        left_index=True,
        right_index=True,
    )
    .melt(id_vars=['date', 'freq'])
    .dropna()
    .sort_values(['date', 'variable'])
)

Save the long-format data to the processed data file

In [None]:
# Write the long-format data to the processed file without the row index.
# `index` is documented as a bool; the previous `index=None` only suppressed
# the index because None is falsy, so pass False explicitly.
long_data.to_csv(DATA_FILE, index=False)