# ASHE places

We collect data about median salaries in a NUTS2 area. This is an indicator in its own right, and we will also use it to calculate the House Affordability index.

Our strategy will be to collect the data from [Nomis](https://www.nomisweb.co.uk/query/construct/apilinks.asp?menuopt=201) for LEPS.

Unfortunately the data is not available at the NUTS2 so we will have to use an alternative source



## Preamble

In [1]:
%run ../notebook_preamble.ipy

In [2]:
from io import BytesIO
from zipfile import ZipFile

In [3]:
def make_dirs(name,dirs = ['raw','processed']):
    '''
    Utility that creates directories to save the data
    
    '''
    
    for d in dirs:
        if name not in os.listdir(f'../../data/{d}'):
            os.mkdir(f'../../data/{d}/{name}')
            
def flat_freq(a_list):
    '''
    Return value counts for categories in a nested list
    
    '''
    return(pd.Series([x for el in a_list for x in el]).value_counts())

        

def flatten_list(a_list):
    
    return([x for el in a_list for x in el])

        

In [4]:
def save_data(df,name,path,today=today_str):
    '''
    Utility to save processed data quicker
    
    Arguments:
        df (df) is the dataframe we want to save
        name (str) is the name of the file
        path (str) is the path where we want to save the file
        today (str) is the day when the data is saved
    
    '''
    
    df.to_csv(f'{path}/{today_str}_{name}.csv')
    

In [5]:
def get_process_ashe_place(api_link,var_name):
    '''
    This function collects and processes ashe place data
    
    Arguments:
        api_link (str) is the endpoint we get the data from
        var_name (str) is the name for the observed value variable
    
    
    '''
    
    #Get the data
    nomis_table = pd.read_csv(api_link)
    
    #tidy variable names
    nomis_table.columns = [x.lower() for x in nomis_table.columns]
    
    #Some subseting of rows (ie we only keep the values)
    nomis_values = nomis_table.loc[nomis_table['measures_name']=='Value']
    
    #Some subsetting of columns
    nomis_filtered = nomis_values[['date_name','geography_name','geography_code','obs_value']]
    
    #Observed value
    nomis_filtered.rename(columns={'obs_value':'var_name'})
    
    return(nomis_filtered)
    
    

In [6]:
def parse_ashe_dump_data(path,file,occupation_list):
    '''
    This function collects and parses data from an ASHE occupation salary dump
    
    Arguments:
        path (str) is the path where we have stored the excel files
        file (str) is the name of the file
        occupation_list (list) is the list of occupations that we will focus on    
    
    '''
    #Extract the year from the file name
    year = file.split(' ')[-1][:-4]
    
    print(year)
    
    
    #Read the file. We are focusin on Full-Time to keep the indicator comparable with the LEPS. 
    #We are also subsetting to remove some information at the top / bottom / sides
    
    table = pd.read_excel(path+'/'+file,
                    sheet_name='Full-Time',skiprows=4,na_values='x').iloc[:-5,:4]
    
    #Extract NUTS and occupations from the 'Descriptionc' field
    
    #We will use the fact that occupations are all Uppercase
    
    place_names = []
    occ_names = []
    
    #We go through every description and if a word is all uppercase we put it in an occupation container,
    #otherwise in a place container
    
    for category in table['Descriptionc']:
        
        split = category.split(' ')
        
        place =[]
        occ = []
        
        for word in split:
            if word.isupper()==False:
                place.append(word)
                
            else:
                occ.append(word)
                
        place_names.append(' '.join(place))
        occ_names.append(' '.join(occ))
        
    #Assign the words we identified as places to NUTS2 removing a trailing comma
    table['nuts_2'] = [x[:-1] for x in place_names]
    
    #Assign occupations
    table['occupation'] = occ_names
    
    #Assign years
    table['year']=year
    
    #Focus on occupations of interest
    table_filter = table.loc[[x in occupation_list for x in table['occupation']]]
    
    #Clean the occupation name
    table_filter['occupation'] =[x.lower() for x in table_filter['occupation']]
    
    #Rename the median variable
    table_filter.rename(columns={'Median':'gross_annual_salary_median'},inplace=True)
    
    return(table_filter[['year','nuts_2','occupation','gross_annual_salary_median']])
    

In [39]:
def make_indicator(table,target_path,var_lookup,year_var,nuts_var='nuts_code',nuts_spec=2018,decimals=3):
    '''
    We use this function to create and save indicators using our standardised format.
    
    Args:
        table (df) is a df with relevant information
        target_path (str) is the location of the directory where we want to save the data (includes interim and processed)
        var_lookup (dict) is a lookup to rename the variable into our standardised name
        year (str) is the name of the year variable
        nuts_var (str) is the name of the NUTS code variable. We assume it is nuts_code
        nuts_spec (y) is the value of the NUTS specification. We assume we are working with 2018 NUTS
    
    '''
    #Copy
    t = table.reset_index(drop=False)
    
    #Reset index (we assume that the index is the nuts code, var name and year - this might need to be changed)
    
    
    #Process the interim data into an indicator
    
    #This is the variable name and code
    var_name = list(var_lookup.keys())[0]
    
    var_code = list(var_lookup.values())[0]
    
    #Focus on those
    t = t[[year_var,nuts_var,var_name]]
    
    #Add the nuts specification
    t['nuts_year_spec'] = nuts_spec
    
    #Rename variables
    t.rename(columns={var_name:var_code,year_var:'year',nuts_var:'nuts_id'},inplace=True)

    #Round variables
    if decimals>0:
        t[var_code] = [np.round(x,decimals) for x in t[var_code]]
    #If we have zero decimals, cast as int
    #else:
    #    t[var_code] = t[var_code].astype(int)
    
    #Reorder variables
    t = t[['year','nuts_id','nuts_year_spec',var_code]]
    
    print(t.head())
    
    #Save in the processed folder
    t.to_csv(f'../../data/processed/{target_path}/{var_code}.csv',index=False)

In [41]:
#dirs

if 'ashe_place' not in os.listdir('../../data/raw'):
    os.makedirs('../../data/raw/ashe_place')

if 'ashe_place' not in os.listdir('../../data/interim/'):
    os.makedirs('../../data/interim/ashe_place')
    
if 'ashe_place' not in os.listdir('../../data/processed/'):
    os.makedirs('../../data/processed/ashe_place')

#Path to save data:

int_path ='../../data/interim/ashe_place'

## 1. Collect data

We collect the data from NOMIS.

Note that we are collecting **annual gross salary** for full-time workers

### LEPS

The LEP case will be easy as the information is already available at the lep level

In [8]:
api_lep_link = 'https://www.nomisweb.co.uk/api/v01/dataset/NM_30_1.data.csv?geography=1925185537,1925185575,1925185538...1925185543,1925185572,1925185544,1925185570,1925185545,1925185577,1925185553,1925185547...1925185549,1925185571,1925185569,1925185551,1925185552,1925185554,1925185558,1925185555...1925185557,1925185559,1925185560,1925185550,1925185576,1925185562,1925185573,1925185563...1925185568&date=latestMINUS4-latest&sex=8&item=2&pay=7&measures=20100,20701'

In [9]:
ashe_lep = get_process_ashe_place(api_lep_link,'gross_annual_salary_median')

In [10]:
ashe_lep.head()

Unnamed: 0,date_name,geography_name,geography_code,obs_value
0,2015,Black Country,E37000001,24174.0
2,2015,Buckinghamshire Thames Valley,E37000002,32343.0
4,2015,Cheshire and Warrington,E37000003,28191.0
6,2015,Coast to Capital,E37000004,30000.0
8,2015,Cornwall and Isles of Scilly,E37000005,23354.0


### NUTS2

ASHE data are not available at the NUTS2 level and it is not trivial to convert LAD data into NUTS as we have done in other places (eg House Affordability) because the information is only available as median salaries. We could have used number of jobs & average salaries to calculate wage bills and recalculate salaries at the NUTS2 level but this would mean reporting averages rather than medians. 

For all these reasons, we end using a ASHE data dump at the ONS level available [here](https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/earningsandworkinghours/adhocs/009571annualsurveyofhoursandearningsasheestimatesofannualandhourlyearningsforindustryandoccupationbynuts2andnuts3uk2011to2017)

Note that there are some concerns about the reliability of these indicators given small sample sizes etc. so any indicators built using this data should be treated with caution.

In [11]:
#Download and extract the ASHE data
data_link = 'https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/adhocs/009571annualsurveyofhoursandearningsasheestimatesofannualandhourlyearningsforindustryandoccupationbynuts2andnuts3uk2011to2017/k42forpublishing.zip'

ashe_req = requests.get(data_link)
ashe_zip = ZipFile(BytesIO(ashe_req.content))
ashe_zip.extractall(path='../../data/raw/ashe_place/download')

## 2. Processing

The extracted data are a bunch of excel files with median data by occupation and industry between 2011 and 2017.

We will focus on Science, Engineering and technology occupations.

In [12]:
my_dir = os.listdir('../../data/raw/ashe_place/download/K42a - NUTS2 by occupation')

#Files we want to consider
my_files = [x for x in my_dir if ('Annual pay' in x) & (' CV' not in x)]

my_files

['Ad hoc K42a_11 Ad Hoc K42a7a   Annual pay - Gross 2011.xls',
 'Ad hoc K42a_12_13 Ad Hoc K42a7a   Annual pay - Gross 2012.xls',
 'Ad hoc K42a_12_13 Ad Hoc K42a7a   Annual pay - Gross 2013.xls',
 'Ad hoc K42a_14_16 Ad Hoc K42a7a   Annual pay - Gross 2014.xls',
 'Ad hoc K42a_14_16 Ad Hoc K42a7a   Annual pay - Gross 2015.xls',
 'Ad hoc K42a_14_16 Ad Hoc K42a7a   Annual pay - Gross 2016.xls',
 'Ad hoc K42a_17 Ad Hoc K42a7a   Annual pay - Gross 2017.xls']

In [13]:
path = '../../data/raw/ashe_place/download/K42a - NUTS2 by occupation'
occ_list = ['SCIENCE, RESEARCH, ENGINEERING AND TECHNOLOGY PROFESSIONALS']


In [14]:
sci_median_salaries = pd.concat([parse_ashe_dump_data(path,file,occ_list) for file in my_files]).reset_index(drop=True)

2011


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table_filter['occupation'] =[x.lower() for x in table_filter['occupation']]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


2012
2013
2014
2015
2016
2017


In [15]:
sci_median_salaries

Unnamed: 0,year,nuts_2,occupation,gross_annual_salary_median
0,2011,Tees Valley and Durham,"science, research, engineering and technology ...",32961.0
1,2011,Northumberland and Tyne and Wear,"science, research, engineering and technology ...",35185.0
2,2011,Cumbria,"science, research, engineering and technology ...",40351.0
3,2011,Cheshire,"science, research, engineering and technology ...",37419.0
4,2011,Greater Manchester,"science, research, engineering and technology ...",36333.0
...,...,...,...,...
267,2017,Highlands and Islands,"science, research, engineering and technology ...",36710.0
268,2017,Eastern Scotland,"science, research, engineering and technology ...",38952.0
269,2017,West Central Scotland,"science, research, engineering and technology ...",37511.0
270,2017,Southern Scotland,"science, research, engineering and technology ...",40704.0


In [16]:
#We fix a typo in one of the geographies (they switched the order of Bristol and Bath)
sci_median_salaries['nuts_2'] = ['Gloucestershire, Wiltshire and Bath/Bristol area' 
                                 if x=='Gloucestershire, Wiltshire and Bristol/Bath area' else x for x in sci_median_salaries['nuts_2']]

### Final processing

Add NUTS2 codes to the table



In [17]:
nuts_codes_url = 'https://opendata.arcgis.com/datasets/ded3b436114440e5a1561c1e53400803_0.geojson'

nuts_codes_names = requests.get(nuts_codes_url).json()['features']

In [18]:
#Add NUTS2 codes
nuts_names_to_codes = {x['properties']['NUTS218NM']:x['properties']['NUTS218CD'] for x in nuts_codes_names}

#Label the table with 2018 NUTS codes. 
sci_median_salaries['nuts_2_codes'] = [nuts_names_to_codes[x] if x in nuts_names_to_codes.keys() else np.nan for x in sci_median_salaries['nuts_2']]


In [19]:
set(sci_median_salaries.loc[sci_median_salaries['nuts_2_codes'].isna()]['nuts_2'])

{'Gloucestershire, Wiltshire and North Somerset',
 'Inner London',
 'Outer London',
 'South Western Scotland',
 'West Wales'}

There is a small number of mismatched areas due to changes in NUTS, plus aggregate non-NUTS london codes. We need to decide what to do about these.



## Save data

In [21]:
save_data(ashe_lep,'ashe_lep_all_occupations',int_path)

save_data(sci_median_salaries,'ashe_nuts_2_sci_tech',int_path)

### Create ASHE place indicator

In [35]:
sci_median_salaries.loc[sci_median_salaries['gross_annual_salary_median'].isna()]

Unnamed: 0,year,nuts_2,occupation,gross_annual_salary_median,nuts_2_codes
35,2011,Highlands and Islands,"science, research, engineering and technology ...",,UKM6
66,2012,Devon,"science, research, engineering and technology ...",,UKK4
126,2014,Shropshire and Staffordshire,"science, research, engineering and technology ...",,UKG2
150,2014,Northern Ireland,"science, research, engineering and technology ...",,UKN0
170,2015,Essex,"science, research, engineering and technology ...",,UKH3
188,2015,North Eastern Scotland,"science, research, engineering and technology ...",,UKM5
192,2016,Northumberland and Tyne and Wear,"science, research, engineering and technology ...",,UKC2


In [45]:
make_indicator(sci_median_salaries,'ashe_place',{'gross_annual_salary_median':'gbp_gross_median_salary_s_t'},
               nuts_var='nuts_2_codes',decimals=0,year_var='year')


   year nuts_id  nuts_year_spec  gbp_gross_median_salary_s_t
0  2011    UKC1            2018                      32961.0
1  2011    UKC2            2018                      35185.0
2  2011    UKD1            2018                      40351.0
3  2011    UKD6            2018                      37419.0
4  2011    UKD3            2018                      36333.0
