In [None]:

import os
import re

import pandas as pd

Load the CSV

In [None]:
# Relative path of the raw vacancies-by-sector export.
SOURCE_CSV = '../../data/raw/vacancies-by-sector.csv'

# header=6 selects row 6 (zero-based) as the column names; the preamble rows
# above it are not loaded as data.
data = pd.read_csv(filepath_or_buffer=SOURCE_CSV, header=6)

Also load the table title, which is stored in the preamble rows above the CSV's column header

In [None]:
# Read a single cell: skip the first line of the file, read one row without
# treating it as a header, and keep only its first column.
title = (
    pd.read_csv(SOURCE_CSV, header=None, skiprows=1, nrows=1, usecols=[0])
    .iat[0, 0]
)
title

Extract the date of the 'current quarter' from the title

In [None]:
def make_date(match):
    """Convert a matched quarter title into the ISO date of its first month.

    Parameters
    ----------
    match : re.Match
        Match object with exactly three groups: start month name, end month
        name and year, e.g. ``('August', 'October', '2021')``.

    Returns
    -------
    str
        The first day of the quarter's start month, formatted 'YYYY-MM-DD'.
    """
    start_month, end_month, year = match.groups()
    start = pd.to_datetime(f'{start_month} {year}')
    end = pd.to_datetime(f'{end_month} {year}')
    # The year is written after the end month, so a quarter that wraps over
    # New Year (e.g. 'November to January 2022') starts in the previous year.
    if start > end:
        start = start.replace(year=start.year - 1)
    return start.strftime('%Y-%m-%d')

# Parse the quarter out of the title. Fail loudly when the title is not in the
# expected '<month> to <month> <year>' form: re.sub would hand back the
# unmodified title, and that text would then be written to the 'date' column.
title_match = re.match(r'^(\w+)\s+to\s+(\w+)\s+(\d+).*', title)
if title_match is None:
    raise ValueError(f'Cannot extract the quarter date from title: {title!r}')
date = make_date(title_match)

Rename some columns to be more generic

In [None]:

# Replace the first three column labels, by position, with generic names.
data = data.rename(columns={
    data.columns[position]: generic_name
    for position, generic_name in enumerate([
        'Sector',
        'Growth since previous quarter (%)',
        'Growth since pre-coronavirus January to March 2020 (%)',
    ])
})
data

Convert to long format

In [None]:
# Unpivot every column except 'Sector' into 'variable' / 'value' pairs.
data = pd.melt(data, id_vars=['Sector'])

Flag whether each sector is a key or wanted youth sector.

In [None]:
# Sector names flagged as "key youth sectors"; compared by exact string match.
key_youth_sectors = [
  'Human health & social work activities',
  'Accommodation & food service activities',
]

# Sector names flagged as "wanted youth sectors"; compared by exact string match.
wanted_youth_sectors = [
  'Arts entertainment & recreation',
  'Financial & insurance activities',
  'Professional scientific & technical activities',
]

# Add one boolean flag column per list. assign() overwrites the flag columns
# when this cell is re-run, whereas pd.concat(axis=1) would append duplicate
# columns with the same names.
data = data.assign(
  key_youth_sectors=data.Sector.isin(key_youth_sectors),
  wanted_youth_sectors=data.Sector.isin(wanted_youth_sectors),
)

Add the date

In [None]:
# Stamp every row with the quarter date extracted from the title.
data = data.assign(date=date)

Reorder the columns

In [None]:
# Keep exactly these columns, in their final output order.
data = data[[
  'date', 'Sector',
  'key_youth_sectors', 'wanted_youth_sectors',
  'variable', 'value',
]]

Save to CSV file

In [None]:
# Destination of the processed long-format table.
OUTPUT_FILE = '../../data/processed/vacancies/vacancies-growth-by-sector.csv'

# Create the destination folder if needed, then write without the row index.
os.makedirs(os.path.split(OUTPUT_FILE)[0], exist_ok=True)
data.to_csv(path_or_buf=OUTPUT_FILE, index=False)