In [None]:
import os
import pandas as pd

Load and process the LMS file downloaded by the `get-lms.py` script

In [None]:
# Relative path of the raw LMS CSV that the following cells read.
lms_file = "../../data/raw/lms.csv"

Read the headers, which are the first 7 rows of the file. Transpose them so that each column of the file becomes a row, then index the rows by `CDID`.

In [None]:
# Read only the first 7 rows, with no header row and the first column as the
# index. Transposing turns each column of the file into a row, and those rows
# are then keyed by their `CDID` value.
headers = (
    pd.read_csv(lms_file, header=None, index_col=0, nrows=7)
    .transpose()
    .set_index('CDID')
)
headers

Save metadata file

In [None]:
# Destination for the header table built from the first rows of the LMS file.
METADATA_FILE = '../../data/metadata/codes/lms-codes.csv'

# Make sure the target directory exists; a no-op when it is already there.
metadata_dir = os.path.dirname(METADATA_FILE)
os.makedirs(metadata_dir, exist_ok=True)

headers.to_csv(METADATA_FILE)

Load data

In [None]:
# Rows 1 and 6 of the file are read as a two-level column header, with the
# first file column as the row index. Only the first header level (row 1,
# presumably the CDID codes - confirm against the metadata table) is kept as
# the column labels; the second level is dropped.
data = pd.read_csv(
    lms_file,
    index_col=0,
    header=[1, 6],
).droplevel(1, axis=1)

Process the date - the index is a mixture of Yearly (`YYYY`), Quarterly (`YYYY Qx`) and Monthly (`YYYY MMM`) labels, distinguished by their format. Parse each format in turn, coercing errors into a valid null date. Store the type of date detected based on non-null values. Finally, collapse these with a row-wise `ffill` to construct the `date` column.

In [None]:
# Start from the raw index labels so every Series built below shares data's index.
raw_date = data.index.to_series()

# Frequency tag per row: 'a', 'q' or 'm'. Starts out entirely null and is
# filled in as each label format is recognised.
freq = pd.Series(index=raw_date.index, dtype=str)

# Yearly labels (`YYYY`); anything that does not parse is coerced to NaT.
year = pd.to_datetime(raw_date, errors="coerce", format="%Y")
freq.loc[year.notna()] = 'a'

# Quarterly labels (`YYYY Qx`): split on whitespace followed by "Q". Labels
# without that separator come back as one-element lists and are nulled out.
quarter = raw_date.str.split(r'\sQ')
quarter.loc[quarter.str.len() == 1] = pd.NaT
quarter_parts = quarter.loc[quarter.notna()]
# Quarter x is mapped to month 3x - 2 (Q1 -> 1, Q2 -> 4, Q3 -> 7, Q4 -> 10).
quarter_starts = quarter_parts.map(lambda parts: f"{parts[0]}-{int(parts[1])*3 -2}")
quarter.loc[quarter.notna()] = pd.to_datetime(quarter_starts)
freq.loc[quarter.notna()] = 'q'

# Monthly labels (`YYYY MMM`); non-matching labels are coerced to NaT.
month = pd.to_datetime(raw_date, errors="coerce", format="%Y %b")
freq.loc[month.notna()] = 'm'

# Column order matters here: the row-wise forward fill lets the empty `date`
# column pick up the nearest non-null value to its left (month, else quarter,
# else year). `freq` sits to the right of `date` and is only filled from it
# when `freq` itself is null.
date_candidates = pd.DataFrame({
    'year': year,
    'quarter': quarter,
    'month': month,
    'date': None,
    'freq': freq,
})
date = date_candidates.ffill(axis=1)[['date', 'freq']]

Merge the `data` and `date` frames to create the lms data. Melt it into a long format, drop any nulls and sort by date and then variable

In [None]:
# Attach the parsed `date` / `freq` columns to the observations, matching on
# the index labels of both frames.
lms_wide = pd.merge(data, date, left_index=True, right_index=True)

# Unpivot every column other than `date` and `freq` into variable/value pairs.
lms_long = lms_wide.melt(id_vars=['date', 'freq'])

# Discard rows containing any null, then sort by date and then by variable.
lms = lms_long.dropna().sort_values(by=['date', 'variable'])

Save the data file

In [None]:
# Destination for the processed LMS table.
DATA_FILE = '../../data/processed/labour-market/lms.csv'

# Make sure the target directory exists; a no-op when it is already there.
os.makedirs(os.path.dirname(DATA_FILE), exist_ok=True)

# `index` is documented as a bool, so pass False explicitly instead of relying
# on None being falsy. The row index is not written to the file.
lms.to_csv(DATA_FILE, index=False)