# Eurostat
Code to collect and process Eurostat data to create the following indicators:

* Private sector R&D workforce
* Business Enterprise R&D (BERD)
* Share of high growth firms

Raw data collected using the Eurostat API via the `EuroStat API Client` python package (https://pypi.org/project/eurostatapiclient/).

## Preamble

In [None]:
%load_ext autoreload
%autoreload 2

from eurostatapiclient import EurostatAPIClient

import numpy as np
import pandas as pd

from beis_indicators.geo.nuts import auto_nuts2_uk
import beis_indicators.geo.nuts as geo_nuts

In [None]:
# Settings handed positionally to EurostatAPIClient when the client is built.
VERSION = "v2.1"    # API version string
FORMAT = "json"     # response format
LANGUAGE = "en"     # language code

In [None]:
# Single shared API client; every `get_dataset` call in this notebook uses it.
client = EurostatAPIClient(
    VERSION,
    FORMAT,
    LANGUAGE,
)

### Mappings

In [None]:
# Eurostat unit codes (keys) mapped to their human-readable labels (values).
vars_map = dict(
    EUR_HAB='Euro per inhabitant',
    MIO_EUR='Million euro',
    FTE='Full-time equivalent (FTE)',
    HC='Head count',
    PC_ACT_FTE='Percentage of active population - numerator in full-time equivalent (FTE)',
    PC_ACT_HC='Percentage of active population - numerator in head count (HC)',
    PPS_HAB='Purchasing power standard (PPS) per inhabitant',
)

In [None]:
from beis_indicators import project_dir
import ast

In [None]:
# Read the auxiliary query file kept under the project's data directory.
# The file content is a Python literal (presumably a list of Eurostat query
# strings - confirm against the file); `ast.literal_eval` parses literals only
# and never executes code.
# The stray bare `project_dir` expression was dropped: it was not the last
# statement of the cell, so it neither displayed nor did anything.
with open(f'{project_dir}/data/aux/eurostat_data_queries.txt', 'r') as f:
    mylist = ast.literal_eval(f.read())

## Data Collection, Processing & Transformation

This section is divided into subsections, one for each indicator. Each subsection is broken down into the following steps:

* Use the Python package to pull down flattened data with a query and put it into a dataframe
* Collect the subset for UK NUTS2 regions
* Replace the codes with their associated labels
* Data is transformed into a pivot table to output the desired format

### Private sector R&D workforce data

#### Head Count

In [None]:
#pull in data
data_priv_nuts2 = client.get_dataset('rd_p_persreg?sinceTimePeriod=2007&geoLevel=nuts2&precision=1&sex=T&sectperf=BES&prof_pos=TOTAL&unit=HC')

print(data_priv_nuts2.label)

dataframe_priv_nuts2 = data_priv_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
dataframe_priv_nuts2_uk = dataframe_priv_nuts2[dataframe_priv_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
dataframe_priv_nuts2_uk['time'] = dataframe_priv_nuts2_uk['time'].astype(int)

In [None]:
#pivot table
d_priv = dataframe_priv_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_priv['HC'] = pd.to_numeric(d_priv['HC'], downcast='integer')

In [None]:
d_priv.columns

In [None]:
d_priv.reset_index(inplace=True)
d_priv.columns = ['nuts_id', 'year', 'eurostat_private_rd_headcount_workforce_data']
d_priv = auto_nuts2_uk(d_priv)

In [None]:
d_priv = d_priv[['year','nuts_id', 'nuts_year_spec', 'eurostat_private_rd_headcount_workforce_data']]

In [None]:
#save data
d_priv.to_csv('../../data/processed/eurostat/eurostat_private_rd_headcount_workforce_data.nuts2.csv',index=False)

#### Full Time Equivalent (FTE)

In [None]:
#pull in data
data_priv_nuts2 = client.get_dataset('rd_p_persreg?sinceTimePeriod=2007&geoLevel=nuts2&precision=6&sex=T&sectperf=BES&prof_pos=TOTAL&unit=FTE')

print(data_priv_nuts2.label)

dataframe_priv_nuts2 = data_priv_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
dataframe_priv_nuts2_uk = dataframe_priv_nuts2[dataframe_priv_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
dataframe_priv_nuts2_uk['time'] = dataframe_priv_nuts2_uk['time'].astype(int)

In [None]:
#pivot table
d_priv = dataframe_priv_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_priv.columns

In [None]:
d_priv.reset_index(inplace=True)
d_priv.columns = ['nuts_id', 'year', 'eurostat_private_rd_fte_workforce_data']
d_priv = auto_nuts2_uk(d_priv)

In [None]:
d_priv = d_priv[['year','nuts_id', 'nuts_year_spec', 'eurostat_private_rd_fte_workforce_data']]

In [None]:
#save data
d_priv.to_csv('../../data/processed/eurostat/eurostat_private_rd_fte_workforce_data.nuts2.csv', index=False)

### Business Enterprise R&D (BERD) data

In [None]:
#pull in data
data_berd_nuts2 = client.get_dataset('rd_e_gerdreg?sinceTimePeriod=2007&geoLevel=nuts2&precision=6&sectperf=BES&unit=MIO_EUR')

print(data_berd_nuts2.label)

dataframe_berd_nuts2 = data_berd_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
dataframe_berd_nuts2_uk = dataframe_berd_nuts2[dataframe_berd_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
dataframe_berd_nuts2_uk['time'] = dataframe_berd_nuts2_uk['time'].astype(int)

In [None]:
#pivot table
d_berd = dataframe_berd_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_berd['euros'] = d_berd['MIO_EUR'] * 1000000.00
d_berd.drop(columns=['MIO_EUR'], inplace=True)

In [None]:
d_berd.columns

In [None]:
d_berd.reset_index(inplace=True)
d_berd.columns = ['nuts_id', 'year', 'eurostat_berd_data']
d_berd = auto_nuts2_uk(d_berd)

In [None]:
d_berd = d_berd[['year','nuts_id', 'nuts_year_spec', 'eurostat_berd_data']]

In [None]:
#save data
d_berd.to_csv('../../data/processed/eurostat/eurostat_berd_data.nuts2.csv',index=False)

### Private non-profit (PNPERD) data

In [None]:
#pull in data
data_pnpberd_nuts2 = client.get_dataset('rd_e_gerdreg?sinceTimePeriod=2007&geoLevel=nuts2&precision=6&sectperf=PNP&unit=MIO_EUR')

print(data_pnpberd_nuts2.label)

df_pnpberd_nuts2 = data_pnpberd_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
df_pnpberd_nuts2_uk = df_pnpberd_nuts2[df_pnpberd_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
df_pnpberd_nuts2_uk['time'] = df_pnpberd_nuts2_uk['time'].astype(int)

In [None]:
d_pnpberd = df_pnpberd_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_pnpberd['euros'] = d_pnpberd['MIO_EUR'] * 1000000.00
d_pnpberd.drop(columns=['MIO_EUR'], inplace=True)

In [None]:
d_pnpberd.columns

In [None]:
d_pnpberd.reset_index(inplace=True)
d_pnpberd.columns = ['nuts_id', 'year', 'eurostat_private_non_profit_rd_workforce_data']
d_pnpberd = auto_nuts2_uk(d_pnpberd)

In [None]:
d_pnpberd = d_pnpberd[['year','nuts_id', 'nuts_year_spec', 'eurostat_private_non_profit_rd_workforce_data']]

In [None]:
d_pnpberd.to_csv('../../data/processed/eurostat/eurostat_private_non_profit_rd_workforce_data.nuts2.csv', index=False)

### Higher Education Performed R&D expenditure (HERD)

In [None]:
#pull in data
data_herd_nuts2 = client.get_dataset('rd_e_gerdreg?sinceTimePeriod=2007&geoLevel=nuts2&precision=6&sectperf=HES&unit=MIO_EUR')

print(data_herd_nuts2.label)

df_herd_nuts2 = data_herd_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
df_herd_nuts2_uk = df_herd_nuts2[df_herd_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
df_herd_nuts2_uk['time'] = df_herd_nuts2_uk['time'].astype(int)

In [None]:
d_herd = df_herd_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_herd['euros'] = d_herd['MIO_EUR'] * 1000000.00
d_herd.drop(columns=['MIO_EUR'], inplace=True)

In [None]:
d_herd.columns

In [None]:
d_herd.reset_index(inplace=True)
d_herd.columns = ['nuts_id', 'year', 'eurostat_higher_ed_rd_workforce_data']
d_herd = auto_nuts2_uk(d_herd)

In [None]:
d_herd = d_herd[['year','nuts_id', 'nuts_year_spec', 'eurostat_higher_ed_rd_workforce_data']]

In [None]:
d_herd.to_csv('../../data/processed/eurostat/eurostat_higher_ed_rd_workforce_data.nuts2.csv',index=False)

### Government Performed R&D Expenditure (GovERD)

In [None]:
#pull in data
data_goverd_nuts2 = client.get_dataset('rd_e_gerdreg?sinceTimePeriod=2007&geoLevel=nuts2&precision=2&sectperf=GOV&unit=MIO_EUR')

print(data_goverd_nuts2.label)

df_goverd_nuts2 = data_goverd_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
df_goverd_nuts2_uk = df_goverd_nuts2[df_goverd_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
df_goverd_nuts2_uk['time'] = df_goverd_nuts2_uk['time'].astype(int)

In [None]:
d_goverd = df_goverd_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_goverd['euros'] = d_goverd['MIO_EUR'] * 1000000.00
d_goverd.drop(columns=['MIO_EUR'], inplace=True)

In [None]:
d_goverd.columns

In [None]:
d_goverd.reset_index(inplace=True)
d_goverd.columns = ['nuts_id', 'year', 'eurostat_gov_rd_workforce_data']
d_goverd = auto_nuts2_uk(d_goverd)

In [None]:
d_goverd = d_goverd[['year','nuts_id', 'nuts_year_spec', 'eurostat_gov_rd_workforce_data']]

In [None]:
d_goverd.to_csv('../../data/processed/eurostat/eurostat_gov_rd_workforce_data.nuts2.csv', index=False)

### Private Household Income

In [None]:
data_house_nuts2 = client.get_dataset('tgs00036?sinceTimePeriod=2007&precision=2&unit=PPS_HAB')

print(data_house_nuts2.label)

data_house_nuts2 = data_house_nuts2.to_dataframe()

In [None]:
# data_house_nuts2

In [None]:
#UK NUTS2 regions subset
data_house_nuts2_uk = data_house_nuts2[data_house_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
data_house_nuts2_uk['time'] = data_house_nuts2_uk['time'].astype(int)

In [None]:
# data_house_nuts2_uk

In [None]:
d_house = data_house_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_house['PPS_HAB'] = d_house['PPS_HAB'].round(2)

In [None]:
d_house.columns

In [None]:
d_house.reset_index(inplace=True)
d_house.columns = ['nuts_id', 'year', 'eurostat_private_households_income']
d_house = auto_nuts2_uk(d_house)

In [None]:
d_house = d_house[['year','nuts_id', 'nuts_year_spec', 'eurostat_private_households_income']]

In [None]:
d_house.to_csv('../../data/processed/eurostat/eurostat_private_households_income.nuts2.csv', index=False)

### Share of high growth firms

In [None]:
#pull in data
data_share_nuts2 = client.get_dataset('bd_hgnace2_r3?sinceTimePeriod=2007&geoLevel=nuts2&precision=1&indic_sb=V97460&nace_r2=B-E&nace_r2=B-S_X_K642&nace_r2=F&nace_r2=G&nace_r2=H&nace_r2=I&nace_r2=J&nace_r2=K_L_X_K642&nace_r2=M_N&nace_r2=P_Q&nace_r2=R_S')

print(data_share_nuts2.label)

dataframe_share_nuts2 = data_share_nuts2.to_dataframe()

In [None]:
dataframe_share_nuts2[dataframe_share_nuts2['geo'] == 'UKK1']

In [None]:
#UK NUTS2 regions subset

dataframe_share_nuts2_uk = dataframe_share_nuts2[dataframe_share_nuts2['geo'].str.contains('UK')]

In [None]:
dataframe_share_nuts2_uk

In [None]:
dataframe_share_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'indic_sb',
               values = 'values')

Note: There do not seem to be any UK NUTS2 values for this dataset.

### GDP per capita

In [None]:
#pull in data
data_gdp_nuts2 = client.get_dataset('tgs00004')
# TGS00004
print(data_gdp_nuts2.label)

dataframe_gdp_nuts2 = data_gdp_nuts2.to_dataframe()

In [None]:
#UK NUTS2 regions subset
dataframe_gdp_nuts2_uk = dataframe_gdp_nuts2[dataframe_gdp_nuts2['geo'].str.contains('UK')]

In [None]:
#mappings
dataframe_gdp_nuts2_uk['time'] = dataframe_gdp_nuts2_uk['time'].astype(int)

In [None]:
dataframe_gdp_nuts2_uk

In [None]:
d_gdp = dataframe_gdp_nuts2_uk.pivot_table(index=['geo','time'],
               columns = 'unit',
               values = 'values').reset_index().set_index('geo')

In [None]:
d_gdp['euros'] = (d_gdp['MIO_PPS'] *1e6)

In [None]:
del d_gdp['MIO_PPS']

In [None]:
d_gdp.columns

In [None]:
d_gdp.reset_index(inplace=True)
d_gdp.columns = ['nuts_id', 'year', 'eurostat_gdp_per_capita']
d_gdp = auto_nuts2_uk(d_gdp)

In [None]:
d_gdp = d_gdp[['year','nuts_id', 'nuts_year_spec', 'eurostat_gdp_per_capita']]

In [None]:
d_gdp.loc[503]['eurostat_gdp_per_capita']

In [None]:
d_gdp.to_csv('../../data/processed/eurostat/eurostat_gdp_per_capita.nuts2.csv', index=False)