In [2]:
import os
import pandas as pd

Load the latest CSV. This is imported using `dvc import-url`, and updated using `dvc update -R data/raw`.

In [3]:
# Read only the columns listed in `usecols`; the DATE column is parsed as dates.
data = pd.read_csv(
    '../../data/raw/LMS_by_local_authority.csv',
    usecols=[
        'DATE',
        'DATE_NAME',
        'GEOGRAPHY_CODE',
        'GEOGRAPHY_NAME',
        'GEOGRAPHY_TYPE',
        'VARIABLE_CODE',
        'VARIABLE_NAME',
        'MEASURES_NAME',
        'OBS_VALUE',
        'OBS_STATUS_NAME',
    ],
    parse_dates=['DATE'],
)

Convert the column names to lower case.

In [4]:
# Lower-case every column label (all labels read above are plain strings).
data.columns = data.columns.map(str.lower)

Tidy up the `variable_name` column by stripping leading and trailing whitespace.

In [5]:
# Remove leading/trailing whitespace from each variable name.
data['variable_name'] = data['variable_name'].str.strip()

Rename the `obs_value` column to `value` and the `obs_status_name` column to `notes`.

In [6]:
# obs_value -> value, obs_status_name -> notes; all other columns keep their names.
data = data.rename(columns={'obs_value': 'value', 'obs_status_name': 'notes'})

Keep only the rows for the **Variable** measure, then drop the now-redundant `measures_name` column. The dataset also provides other measures such as **Numerator**, **Denominator** and **Confidence**.

In [7]:
# Keep the rows whose measure is 'Variable', then drop the measures_name column.
data = (
    data[data['measures_name'].eq('Variable')]
    .drop(columns=['measures_name'])
)

Save the data for local authorities (district / unitary, as of April 2021) to a CSV file.

In [8]:
OUT_FILE = '../../data/processed/labour-market/latest_by_LA_2021.csv'

# Make sure the destination directory exists before writing.
os.makedirs(os.path.dirname(OUT_FILE), exist_ok=True)

# Select the April 2021 district / unitary local authority rows, drop the
# geography_type column, and write the result without the row index.
(
    data
    .loc[data['geography_type'].eq('local authorities: district / unitary (as of April 2021)')]
    .drop(columns='geography_type')
    .to_csv(OUT_FILE, index=False)
)

Save an index of the variable codes and their names as metadata.

In [9]:
VARIABLE_INDEX = '../../data/metadata/labour-market/variables_local_authority.csv'

# Make sure the destination directory exists before writing.
os.makedirs(os.path.dirname(VARIABLE_INDEX), exist_ok=True)

# One row per distinct (variable_code, variable_name) pair, indexed and
# sorted by variable_code, written with the code as the first column.
(
    data[['variable_code', 'variable_name']]
    .drop_duplicates()
    .set_index('variable_code')
    .sort_index()
    .to_csv(VARIABLE_INDEX)
)