In [1]:
import os
import sys

import pandas as pd

Add the `pipeline` directory to `sys.path` so we can import from there.

In [2]:
# Absolute path of the parent of the working directory. Register it on
# sys.path (once only, so re-running the cell adds no duplicate entry)
# to make modules stored there importable from later cells.
PIPELINE_DIR = os.path.abspath('..')
if sys.path.count(PIPELINE_DIR) == 0:
    sys.path.append(PIPELINE_DIR)

Import some helper functions to deal with ONS code

In [3]:
from ons_functions import read_headers, infer_date

Define the mm23 raw data filename. This is the file downloaded by the `./get.py` script in this directory.

In [4]:
# Relative path to the raw MM23 CSV; being relative, it is resolved against
# the working directory the notebook is run from.
mm23_file = '../../data/raw/mm23.csv'

Read the headers

In [5]:
# Read the header section of the raw file with the project helper.
# NOTE(review): header_rows=7 is presumably the number of metadata rows at the
# top of the CSV — confirm against read_headers in ons_functions.
headers = read_headers(mm23_file, header_rows=7)
# Bare last expression so the notebook renders the result below the cell.
headers

Unnamed: 0_level_0,Title,PreUnit,Unit,Release Date,Next release,Important Notes
CDID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A9ER,CPI wts: Non-energy industrial goods GOODS,,Parts per 1000,20-09-2023,18 October 2023,
A9ES,CPI wts: Durables GOODS,,Parts per 1000,20-09-2023,18 October 2023,
A9ET,CPI wts: Semi-durables GOODS,,Parts per 1000,20-09-2023,18 October 2023,
A9EU,CPI wts: Non-durables GOODS,,Parts per 1000,20-09-2023,18 October 2023,
A9EV,CPI wts: Non-seasonal food GOODS,,Parts per 1000,20-09-2023,18 October 2023,
...,...,...,...,...,...,...
WUOU,CPI: % points change over previous month (12 m...,,%,20-09-2023,18 October 2023,
WUOV,CPI: % points change over previous month (12 m...,,%,20-09-2023,18 October 2023,
WUOW,CPI: % points change over previous month (12 m...,,%,20-09-2023,18 October 2023,
ZMHO,Internal purchasing power of the pound (based ...,,Pence,20-09-2023,18 October 2023,


Save the headers to the metadata file

In [6]:
# Where the header metadata is written. The containing folder is created
# first (no error if it already exists) so the write cannot fail on a
# missing directory.
METADATA_FILE = '../../data/metadata/cpi/mm23-codes.csv'
metadata_dir = os.path.dirname(METADATA_FILE)
os.makedirs(metadata_dir, exist_ok=True)
headers.to_csv(METADATA_FILE)

Load the data

In [7]:
# Rows 1 and 6 of the file are used as a two-level column header and the
# first column becomes the index. The second column level is dropped
# immediately, leaving single-level column labels taken from row 1.
data = (
    pd.read_csv(mm23_file, header=[1, 6], index_col=0)
    .droplevel(1, axis=1)
)

Merge the data with the inferred date and frequency, then melt the frame into long form, keeping date and frequency as identifier columns. Drop any rows containing null values, then sort by date and then by variable.

In [8]:

# Run the project helper over the index labels and join its result back onto
# the wide frame by index; the 'date' and 'freq' columns used below come
# from this join.
period_info = infer_date(data.index.to_series())
mm23_wide = data.merge(period_info, left_index=True, right_index=True)

# Reshape to long form with 'date' and 'freq' as identifiers, discard rows
# containing nulls, and order the result by date and then by variable.
mm23 = (
    mm23_wide
    .melt(id_vars=['date', 'freq'])
    .dropna()
    .sort_values(by=['date', 'variable'])
)


Save the data file

In [9]:
# Destination for the processed data. The containing folder is created first
# (no error if it already exists) so the write cannot fail on a missing
# directory.
DATA_FILE = '../../data/processed/cpi/mm23.csv'
os.makedirs(os.path.dirname(DATA_FILE), exist_ok=True)
# index=False is the documented boolean for leaving the row index out of the
# CSV; the index carries no information after the melt and sort.
mm23.to_csv(DATA_FILE, index=False)