In [1]:
import pandas as pd
import openpyxl

In [2]:
# Relative path to the raw NEET workbook; every reader in this notebook opens this file.
NEET_LATEST = "../../data/raw/neet-latest.xlsx"

In [3]:
# List the sheets available in the workbook.
# Read-only mode avoids parsing every cell of every sheet just to get the
# names; read-only workbooks keep the file open, so close it explicitly.
workbook = openpyxl.load_workbook(NEET_LATEST, read_only=True)
try:
    sheet_names = workbook.sheetnames
finally:
    workbook.close()
sheet_names

['Note',
 'People - SA',
 'Men - SA',
 'Women - SA',
 'People - NSA',
 'Men - NSA',
 'Women - NSA',
 'People 18-20 - NSA',
 'People 21-22 - NSA',
 'People 23-24 - NSA']

In [7]:
# Sheets processed by this notebook: the three "- SA" sheets only.
# NOTE(review): "SA" presumably means seasonally adjusted — confirm against
# the workbook's 'Note' sheet.
sheets = [
    'People - SA',
    'Men - SA',
    'Women - SA',
]

In [151]:
def get_metadata(sheet):
    """Read the publication metadata held in the second row of a sheet.

    The row is expected to contain alternating label/value cells (a label
    such as ``'Date of publication:'`` followed by its value), separated by
    any number of empty cells.

    Parameters
    ----------
    sheet : str
        Name of the worksheet in ``NEET_LATEST`` to read.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per label. Labels are stripped of
        surrounding whitespace and colons; ``'Date of publication'`` and
        ``'Date of next publication'`` are renamed to ``'Release Date'`` and
        ``'Next release'``.

    Raises
    ------
    ValueError
        If the row has no non-empty cells, or an odd number of them, so the
        labels and values cannot be paired up.
    """
    raw_row = pd.read_excel(NEET_LATEST, sheet_name=sheet, header=None, skiprows=1, nrows=1)

    # Flatten to a 1-D sequence and discard the empty spacer cells.
    cells = pd.Series(raw_row.values.flatten()).dropna().reset_index(drop=True)
    if cells.empty or len(cells) % 2:
        raise ValueError(
            f"Expected label/value pairs in the metadata row of sheet {sheet!r}, "
            f"found {len(cells)} non-empty cell(s)"
        )

    # Even positions hold the labels, odd positions the matching values.
    # Strip whitespace as well as the colon so that 'Label: ' and ' Label:'
    # normalise to the same name and are picked up by the rename below.
    labels = cells.iloc[0::2].str.strip().str.strip(':').str.strip()
    values = cells.iloc[1::2].values

    metadata = pd.DataFrame(values, index=labels).T
    return metadata.rename(
        columns={
            'Date of publication': 'Release Date',
            'Date of next publication': 'Next release',
        }
    )

def get_headers(sheet):
    """Build the column lookup table for one sheet.

    Reads the three header rows that follow the first four rows of the sheet
    and turns them into one record per spreadsheet column, then attaches the
    sheet's publication metadata to every record.

    Parameters
    ----------
    sheet : str
        Name of the worksheet in ``NEET_LATEST`` to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``('sheet', 'column')`` with an ``age`` column, a
        ``measure`` column (the second and third header rows joined with
        ``'_'``), and one column per field returned by ``get_metadata``.
    """
    raw_rows = pd.read_excel(
        NEET_LATEST,
        sheet_name=sheet,
        index_col=0,
        header=None,
        skiprows=4,
        nrows=3,
    )

    # Transpose so that each spreadsheet column becomes one row.
    lookup = raw_rows.T
    lookup.index.name = 'column'
    lookup.columns = pd.Index(['age', 'm1', 'm2'])

    # The first two header levels are carried forward over blank cells
    # (presumably because a heading spans several columns — confirm against
    # the workbook); a blank third level simply means "no suffix".
    for level in ('age', 'm1'):
        lookup.loc[:, level] = lookup.loc[:, level].ffill()
    lookup.loc[:, 'm2'] = lookup.loc[:, 'm2'].fillna('')

    lookup = lookup.reset_index()

    # Combine the two measure levels; strip the separator left dangling when
    # one side is empty.
    measure_parts = lookup.loc[:, ['m1', 'm2']].astype('str')
    lookup['measure'] = measure_parts.apply('_'.join, axis=1).str.strip('_')
    lookup = lookup.drop(columns=['m1', 'm2'])

    lookup['sheet'] = sheet

    # Repeat the sheet-level metadata on every row of the lookup.
    metadata = get_metadata(sheet)
    lookup.loc[:, metadata.columns] = metadata.iloc[0].values

    return lookup.set_index(['sheet', 'column'])

# Stack the per-sheet lookups into a single table keyed by (sheet, column).
header_frames = [get_headers(sheet_name) for sheet_name in sheets]
headers = pd.concat(header_frames)


In [152]:
def read_data(sheet):
    """Load the data table of one sheet and reshape it to long format.

    The first nine rows of the sheet are skipped, the first spreadsheet
    column becomes the ``date`` index, and cells containing ``'..'`` or
    ``'*'`` are read as missing.

    Parameters
    ----------
    sheet : str
        Name of the worksheet in ``NEET_LATEST`` to read.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``('sheet', 'column')`` with the columns ``date`` and
        ``value``: one row per original cell.
    """
    table = pd.read_excel(
        NEET_LATEST,
        sheet_name=sheet,
        index_col=0,
        header=None,
        skiprows=9,
        na_values=['..', '*'],
    )

    # Rows with nothing in the first data column are treated as trailing
    # non-data rows and dropped.
    has_first_value = table.iloc[:, 0].notna()
    table = table.loc[has_first_value].astype(float)
    table.index.name = 'date'

    # Every fifth column is a rate and is left untouched; all other columns
    # are divided by 1000 and rounded to whole numbers.
    is_not_rate = [position % 5 != 4 for position in range(table.columns.size)]
    non_rate_columns = table.columns[is_not_rate]
    table.loc[:, non_rate_columns] = table.loc[:, non_rate_columns].div(1000).round()

    # Wide -> long, keeping the date index on every melted row.
    long_table = table.melt(var_name='column', ignore_index=False)
    long_table['sheet'] = sheet

    return long_table.reset_index(drop=False).set_index(['sheet', 'column'])

# Stack the long-format tables of all selected sheets.
sheet_frames = [read_data(sheet_name) for sheet_name in sheets]
data = pd.concat(sheet_frames)

In [153]:
# Label every value with its age group and measure by joining on the shared
# (sheet, column) index, then re-key the result by date/sheet/age/measure.
neet = (
    data
    .merge(headers.loc[:, ['age', 'measure']], left_index=True, right_index=True)
    .reset_index()
    .drop(columns=['column'])
    .set_index(['date', 'sheet', 'age', 'measure'])
)

In [154]:
# Write the column lookup (including the release metadata) and the combined
# long-format dataset to CSV.
headers.to_csv(path_or_buf='../../data/metadata/codes/neet-codes.csv')
neet.to_csv(path_or_buf='../../data/processed/neet.csv')