# HESA 

Some code to collect HESA data and merge it with university metadata in order to create indicators about skills supply in the UK.

We are interested in the following indicators:

* Research staff: https://www.hesa.ac.uk/data-and-analysis/staff/working-in-he
* Research spaces: https://www.hesa.ac.uk/data-and-analysis/estates/table-1
* STEM graduates produced: https://www.hesa.ac.uk/data-and-analysis/students/what-study#
* PhD graduates produced: this is also in https://www.hesa.ac.uk/data-and-analysis/students/what-study#


See [this table](https://docs.google.com/spreadsheets/d/1V2fAQcvuLsoImwo6uLdyIK3x80pBNoX97CxsxkjvRP4/edit?usp=sharing) for more information.


## Preamble

In [1]:
%run ../notebook_preamble.ipy

In [2]:
import csv
import zipfile
import io
import os
import requests
from ast import literal_eval

In [3]:
import seaborn as sn
from nuts_finder import NutsFinder

In [4]:
from itertools import combinations

### Functions

#### Simple utilities

In [5]:
def make_dirs(name,dirs = ('raw','processed')):
    '''
    Utility that creates directories to save the data.

    For every entry `d` in `dirs` it makes sure that `../../data/{d}/{name}`
    exists (the path is relative to the current working directory).

    Args:
        name (str) is the name of the sub-directory to create (eg 'hesa')
        dirs (iterable of str) are the data stages under `../../data` where the
            sub-directory is created. Defaults to raw and processed.

    '''

    for d in dirs:
        # makedirs with exist_ok also creates a missing stage directory (eg
        # 'interim') and does nothing when the target is already there, so
        # the function can be re-run safely.
        os.makedirs(f'../../data/{d}/{name}',exist_ok=True)

def tidy_cols(my_csv):
    '''
    Returns the column names of a table in a tidy form: lower case, with
    every space swapped for an underscore.

    Args:
        my_csv (df) is the table whose column names we want to tidy

    '''

    tidy_names = []

    for col in my_csv.columns:
        tidy_names.append(col.lower().replace(' ','_'))

    return tidy_names

In [6]:
def filter_data(data,var_val_pairs):
    '''
    Keeps the rows of a dataframe that match every variable - value pair we
    pass. Handier than chaining pandas subsetting by hand.

    Args:
        data (df) is the dataframe to filter (it is not modified)
        var_val_pairs (dict) maps a column name to the value that column must take

    Returns a new dataframe with a fresh 0..n-1 index.

    '''
    subset = data.copy()

    #Narrow the table down one condition at a time
    for column,wanted in var_val_pairs.items():
        keep = subset[column]==wanted
        subset = subset.loc[keep]

    subset = subset.reset_index(drop=True)

    return subset
    

In [7]:
def check_categories(data,columns):
    '''
    Prints the frequency table of each categorical variable followed by the
    crosstab of every pair of them. We use it to decide what variables to
    choose, and to avoid double counting.

    Args:
        data (df) is the data
        columns (list) are the categorical variables we want to check

    '''
    print('FREQUENCIES')
    print('===========')
    print('\n')

    #One frequency table per variable
    for column in columns:
        print(column)
        print('=====')
        print(data[column].value_counts())
        print('\n')

    print('CROSSTABS')
    print('===========')

    #One crosstab per pair of variables
    for first,second in combinations(columns,2):
        print(first+' x '+second)
        print('=====')
        print(pd.crosstab(data[first],data[second]))
        print('\n')
        
        
    


#### Data collection

In [8]:
def hesa_parser(url,out_name,skip=16,encoding='utf-8'):
    '''
    Function to obtain and parse data from the HESA website.

    The downloaded text is saved in ../../data/raw/hesa/{out_name}.txt and then
    read back from that file with pandas.

    Args:
        url (str) is the location of the csv file
        out_name (str) is the saved name of the file (without extension)
        skip (int) is the number of rows to skip at the top of the file (we could
            automate this by looking for rows at the top with lots of nans)
        encoding (str) is the encoding used to decode the downloaded bytes

    Returns a dataframe with tidied column names.

    Raises requests.HTTPError if the server answers with an error status.

    '''

    raw_path = f'../../data/raw/hesa/{out_name}.txt'

    #Request
    rs = requests.get(url)

    #Fail loudly on a 4xx / 5xx answer instead of saving and parsing an error page
    rs.raise_for_status()

    #Parse the file
    parsed = rs.content.decode(encoding)

    #Save it. The raw copy is always stored as utf-8 and without newline
    #translation, so that it does not depend on the platform default encoding
    #(the files contain non-ascii characters such as the pound sign)
    with open(raw_path,'w',encoding='utf-8',newline='') as outfile:
        outfile.write(parsed)

    #Read it with the same encoding we used to write it
    my_csv = pd.read_csv(raw_path,skiprows=skip,encoding='utf-8')

    #Clean column names
    my_csv.columns = tidy_cols(my_csv)

    return(my_csv)

    
    

#### Data processing

In [9]:
def gimme_nuts(lat,lon,level=2):
    '''
    Function to extract nuts information from a pair of coordinates

    Args:
        lat (float) is the latitude
        lon (float) is the longitude
        level (int) is the NUTS level we want

    Returns a list [nuts_id, nuts_name]. Both elements are nan (and a message
    is printed) when the lookup result has no entry for the requested level.

    NOTE(review): relies on a module level `nf` object exposing
    find(lat=...,lon=...), presumably a NutsFinder instance - confirm where it
    is created.

    '''

    info = nf.find(lat=lat,lon=lon)

    try:
        #First entry reported for the requested level
        match = [x for x in info if x['LEVL_CODE']==level][0]
        nuts_id = match['NUTS_ID']
        nuts_name = match['NUTS_NAME']

    #Only the failures a missing or malformed lookup result can produce are
    #handled here, so that things like KeyboardInterrupt are not swallowed
    except (IndexError,KeyError,TypeError):
        print(f'failed with {np.round(lat,2)},{np.round(lon,2)}')
        nuts_id = np.nan
        nuts_name=np.nan

    return([nuts_id,nuts_name])

In [10]:
def compare_data(df_1,df_2,id_1,id_2,name_1,name_2):
    '''
    Prints the names of the records whose id appears in one dataset but not in
    the other. We use it to check that the ids of two datasets we are about to
    merge are consistent.

    Args:
        df_1, df_2 (df) are the dfs we want to compare
        id_1, id_2 (str) are the id columns we want to check in each df
        name_1, name_2 (str) are the name columns we print to explore the mismatches

    '''

    def report_missing(title,left,right,left_id,right_id,left_name):
        #Names in `left` whose (non missing) id is absent from `right`
        print(title)
        print('==================')
        absent = set(left[left_id].dropna())-set(right[right_id])
        flags = [value in absent for value in left[left_id]]
        print(set(left.loc[flags][left_name]))

    report_missing('In 1 but not in 2',df_1,df_2,id_1,id_2,name_1)

    print('\n')

    report_missing('In 2 but not in 1',df_2,df_1,id_2,id_1,name_2)

    

#### Create NUTS aggregations

In [11]:
def make_nuts_estimate(data,nuts_lookup,counter,name,year_var=None):
    '''
    This function takes hesa data and creates a nuts estimate by summing a
    count variable over NUTS areas (and, optionally, years).

    Args:
        data (df) where we have already selected variables of interest eg mode of employment.
            It needs a ukprn column and is not modified
        nuts_lookup (dict) is the ukprn - nuts name and code lookup: each value is a dict
            with the keys nuts_name and nuts_code
        counter (str) is the variable with counts that we are interested in
        name (str) is the name given to the output series
        year_var (str) is the variable containing the years we want to group by. If None, then we are not grouping by year

    Returns a series called `name`, indexed by nuts_name, nuts_code and (if
    requested) year_var.

    '''

    d = data.copy()

    #Add the nuts names and codes. Providers that are not in the lookup get a nan
    for var in ['nuts_name','nuts_code']:
        d[var] = [nuts_lookup[ukprn][var] if ukprn in nuts_lookup else np.nan for ukprn in data['ukprn']]

    #We are focusing on numbers
    d[counter] = d[counter].astype(float)

    #Group results by year?
    group_vars = ['nuts_name','nuts_code']

    if year_var is not None:
        group_vars.append(year_var)

    out = d.groupby(group_vars)[counter].sum()

    out.name = name

    return(out)

In [12]:
def multiple_nuts_estimates(data,nuts_lookup,variables,select_var,value,year_var=None):
    '''
    Creates NUTS estimates for multiple variables and puts them side by side,
    one column per variable.

    Args:
        data (df) is the filtered dataframe
        nuts_lookup (dict) is the lookup between universities and nuts
        variables (iterable) contains the values of select_var for which we want to generate the analysis.
            Each of them becomes a column in the output
        select_var (str) is the variable we want to use to select values
        value (str) is the field that contains the numerical value we want to aggregate in the dataframe
        year_var (str) is the year_variable. If none, then we are not interested in years

    Returns a dataframe indexed by nuts name, nuts code and (if requested) year.

    '''

    #One estimate per variable. The year variable is passed through as given
    #(it used to be hard-coded to academic_year, which ignored the argument)
    estimates = [make_nuts_estimate(data.loc[data[select_var]==m],nuts_lookup,value,m,year_var=year_var)
                 for m in variables]

    #Concatenate over columns
    concat = pd.concat(estimates,axis=1)

    return(concat)
        
    

In [13]:
def _academic_year_to_int(label,position):
    '''
    Converts a single academic year label (eg 2014/15) into the calendar year (int) found at `position`.

    '''
    parts = label.split('/')

    if position==0:
        return(int(parts[0]))

    piece = parts[position]

    #Already a full year (eg 2014/2015)
    if len(piece)>2:
        return(int(piece))

    #The second half only has two digits: borrow the century from the first
    #year instead of assuming 20xx, and roll over for labels such as 1999/00
    start = int(parts[0])
    year = (start//100)*100+int(piece)

    if year<start:
        year += 100

    return(year)


def convert_academic_year(df,year_var = 'academic_year',position=0):
    '''
    This function converts an academic year variable from HESA into a year (int)

    Args:
        df (df or series) with the academic year we want to convert. The academic year has to be
            one of the index levels
        year_var (str) is the name of the year variable
        position (int) is the position of the year. We default to 0 (first year)

    Returns a dataframe (the input is not modified) where the year level has been
    converted and moved to the end of the index.

    '''

    #Make copy
    df_2 = df.copy()

    #Find the year level by name. We fall back on the third level, which is where
    #the nuts estimates keep it, if the index has no level with that name
    year_level = year_var if year_var in df_2.index.names else 2

    #Reset index so we can work with it easily
    df_2 = df_2.reset_index(level=year_level)

    #Create the new year variable by splitting the academic year variable on /
    df_2[year_var] = [_academic_year_to_int(x,position) for x in df_2[year_var]]

    #Reappend the year index
    df_2.set_index(year_var,append=True,inplace=True)

    return(df_2)
        
    

In [14]:
def make_indicator(table,target_path,var_lookup,year_var,nuts_var='nuts_code',nuts_spec=2018,decimals=3):
    '''
    We use this function to create and save indicators using our standardised format.

    The indicator is saved in ../../data/processed/{target_path}/{standardised name}.csv with
    the columns year, nuts_id, nuts_year_spec and the standardised variable name. The first
    rows are printed as a check. Nothing is returned.

    Args:
        table (df) is a df with relevant information. Its index levels are turned into columns, so
            the year and nuts variables can live in the index
        target_path (str) is the name of the directory inside ../../data/processed where we want to save the data
        var_lookup (dict) is a lookup to rename the variable into our standardised name. Only its
            first item (name in the table: standardised name) is used
        year_var (str) is the name of the year variable
        nuts_var (str) is the name of the NUTS code variable. We assume it is nuts_code
        nuts_spec (int) is the value of the NUTS specification. We assume we are working with 2018 NUTS
        decimals (int) is the number of decimals to keep. With 0 (or less) the values are rounded to
            the nearest whole number and saved as integers

    '''

    #This is the variable name and code
    var_name,var_code = list(var_lookup.items())[0]

    #Turn the index levels into columns and focus on the variables we need. We take a copy
    #so that the assignments below never write into a slice of another table
    t = table.reset_index(drop=False)[[year_var,nuts_var,var_name]].copy()

    #Add the nuts specification
    t['nuts_year_spec'] = nuts_spec

    #Rename variables
    t = t.rename(columns={var_name:var_code,year_var:'year',nuts_var:'nuts_id'})

    #Round variables
    if decimals>0:
        t[var_code] = t[var_code].round(decimals)

    else:
        #Nullable integers: values are rounded rather than truncated, and missing
        #values stay missing instead of making the integer conversion fail
        t[var_code] = t[var_code].round().astype('Int64')

    #Reorder variables
    t = t[['year','nuts_id','nuts_year_spec',var_code]]

    print(t.head())

    #Save in the processed folder (we create it if it is not there yet)
    out_dir = f'../../data/processed/{target_path}'

    os.makedirs(out_dir,exist_ok=True)

    t.to_csv(f'{out_dir}/{var_code}.csv',index=False)
    
    

#### Directories etc

In [15]:
# Create a hesa directory in raw, processed and interim

In [16]:
make_dirs('hesa',['raw','processed','interim'])

## Collect data

### University metadata

We have already reverse geocoded universities in the `0` notebook. We load a dict with university codes and NUTS codes from there

In [17]:
with open('../../data/metadata/uni_nuts.txt','r') as infile:
    
    uni_nuts = literal_eval(infile.read())
    

#### Research staff

In [18]:
res_staff = hesa_parser('https://www.hesa.ac.uk/data-and-analysis/staff/table-1.csv','staff',skip=24)

We also download the staff qualifications table, but we probably won't use it

In [19]:
qual_staff = hesa_parser('https://www.hesa.ac.uk/data-and-analysis/staff/table-8.csv','qual_staff')

### Research spaces

In [20]:
spaces = hesa_parser('https://www.hesa.ac.uk/data-and-analysis/estates/data.csv','spaces',11)

### Stem graduates

This is a larger zip file so we have to use a different approach

In [21]:
#Request
rs = requests.get('https://www.hesa.ac.uk/data-and-analysis/students/table-13.csv')

In [22]:
#Unzip and save the file

#Note that the file contains tables for various years. We keep all of them
years = ['2014-15','2015-16','2016-17','2017-18','2018-19']

#Open the downloaded archive once (and close it when done) instead of rebuilding it for every year.
#extract returns the path of the file it has just written
with zipfile.ZipFile(io.BytesIO(rs.content)) as archive:
    out_files = [archive.extract(f'table-13-({year}).csv','../../data/raw/hesa/') for year in years]

In [23]:
#We use a pipe to assign a year to each df and concatenate into a single df
# graduates_all_years = pd.concat(
#     [pd.read_csv(out_files[n],skiprows=14).pipe(lambda x: x.assign(academic_year = year)) for n,year in enumerate(years)],axis=0)


#Read the extracted file of every year and stack them into a single df
graduates_all_years = pd.concat(
    [pd.read_csv(path,skiprows=14) for path in out_files],axis=0)

In [24]:
graduates_all_years.columns = tidy_cols(graduates_all_years)

In [25]:
graduates_all_years.head()

Unnamed: 0,ukprn,he_provider,level_of_study,mode_of_study,country_of_he_provider,region_of_he_provider,academic_year,subject_of_study_marker,subject_of_study,number
0,10007783.0,The University of Aberdeen,All,All,All,All,2014/15,Subject area,(1) Medicine and dentistry,1110
1,10007783.0,The University of Aberdeen,All,All,All,All,2014/15,Subject area,(2) Subjects allied to medicine,645
2,10007783.0,The University of Aberdeen,All,All,All,All,2014/15,Subject area,(3) Biological sciences,1655
3,10007783.0,The University of Aberdeen,All,All,All,All,2014/15,Subject area,(4) Veterinary science,0
4,10007783.0,The University of Aberdeen,All,All,All,All,2014/15,Subject area,(5) Agriculture and related subjects,20


## 2. Processing

Processing involves:

1. Select variables we want to use for the indicators (eg year, mode of study)
2. Label the data with the NUTS information
3. Group over NUTS and generate estimate

We can probably create a function to do 2 and 3 taking the subset data as input


### Make indicators

#### 1. Number of research staff

In [26]:
res_staff.head()

Unnamed: 0,ukprn,he_provider,country_of_he_provider,region_of_he_provider,mode_of_employment,atypical_marker,contract_marker,academic_year,activity_standard_occupational_classification,number
0,10007783.0,The University of Aberdeen,All,All,All,Non-atypical,Academic,2014/15,"Managers, directors and senior officials",0
1,10007783.0,The University of Aberdeen,All,All,All,Non-atypical,Academic,2014/15,Professional occupations,1655
2,10007783.0,The University of Aberdeen,All,All,All,Non-atypical,Academic,2014/15,Associate professional and technical occupations,10
3,10007783.0,The University of Aberdeen,All,All,All,Non-atypical,Academic,2014/15,Clerical and manual occupations,0
4,10007783.0,The University of Aberdeen,All,All,All,Non-atypical,Academic,2014/15,Total academic staff,1665


In [27]:
#We check categories in interesting columns
interesting_cols = ['mode_of_employment','atypical_marker','contract_marker','academic_year','activity_standard_occupational_classification']

#check_categories(res_staff,interesting_cols)

In [28]:
res_staff_filter = {'mode_of_employment':'All','contract_marker':'Academic',
                   'activity_standard_occupational_classification':'Total academic staff',
                   'country_of_he_provider':'All','region_of_he_provider':'All'}

In [29]:
res_filtered = filter_data(res_staff,res_staff_filter)

len(res_filtered)

660

In [30]:
nuts_academics = make_nuts_estimate(res_filtered,uni_nuts,'number','academic_staff','academic_year')

In [31]:
nuts_academics.sort_values(ascending=False)

nuts_name                                   nuts_code  academic_year
Inner London - West                         UKI3       2016/17          30130.0
                                                       2017/18          29250.0
                                                       2015/16          28465.0
                                                       2014/15          27300.0
Berkshire, Buckinghamshire and Oxfordshire  UKJ1       2014/15          16355.0
                                                                         ...   
Cornwall and Isles of Scilly                UKK3       2014/15            235.0
Highlands and Islands                       UKM6       2017/18             75.0
                                                       2016/17             40.0
                                                       2015/16             40.0
                                                       2014/15             35.0
Name: academic_staff, Length: 160, dtype: float64

#### 2. Research space

See some variable definitions for estates [here](https://www.hesa.ac.uk/support/definitions/estates)

In [32]:
spaces.head()

Unnamed: 0,ukprn,he_provider,academic_year,country_of_he_provider,region_of_he_provider,table,category_marker,value
0,10007783,The University of Aberdeen,2017/18,Scotland,Scotland,Table-5,Total income (£),219471000
1,10007783,The University of Aberdeen,2017/18,Scotland,Scotland,Table-5,Teaching income (£),107977000
2,10007783,The University of Aberdeen,2017/18,Scotland,Scotland,Table-5,Research income (£),78696000
3,10007783,The University of Aberdeen,2017/18,Scotland,Scotland,Table-5,Other non-residential income (£),21315000
4,10007783,The University of Aberdeen,2017/18,Scotland,Scotland,Table-5,Non-residential income total (£),207988000


In [33]:
sp_interesting_cols = ['academic_year','country_of_he_provider','region_of_he_provider','category_marker','table']

#check_categories(spaces,sp_interesting_cols)

This contains a lot of information. We will only focus on a few variables:

* Total number of buildings
* Research income
* Research student FTE
* Total site area (hectares)

In [34]:
space_vars = ['Research income (£)','Research student FTE','Total number of buildings','Total site area (hectares)']

nuts_spaces = multiple_nuts_estimates(spaces,uni_nuts,space_vars,'category_marker','value',year_var='academic_year')

nuts_spaces.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Research income (£),Research student FTE,Total number of buildings,Total site area (hectares)
nuts_name,nuts_code,academic_year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bedfordshire and Hertfordshire,UKH2,2015/16,68415000.0,1305.0,419.0,385.402
Bedfordshire and Hertfordshire,UKH2,2016/17,65938000.0,1280.0,413.0,385.133
Bedfordshire and Hertfordshire,UKH2,2017/18,66830000.0,1270.0,407.0,385.078
"Berkshire, Buckinghamshire and Oxfordshire",UKJ1,2015/16,760879000.0,6210.0,684.0,341.8
"Berkshire, Buckinghamshire and Oxfordshire",UKJ1,2016/17,794974000.0,6150.0,661.0,722.01


### 3. Number of STEM graduates

The graduates file is quite big, so I focus on `grad_short`, which keeps the full-time rows aggregated over all levels of study

In [35]:
grad_short = graduates_all_years.loc[(graduates_all_years['level_of_study']=='All')&((graduates_all_years['mode_of_study']=='Full-time'))]

In [36]:
grad_short.head()

Unnamed: 0,ukprn,he_provider,level_of_study,mode_of_study,country_of_he_provider,region_of_he_provider,academic_year,subject_of_study_marker,subject_of_study,number
736,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(1) Medicine and dentistry,1080
737,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(2) Subjects allied to medicine,520
738,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(3) Biological sciences,1590
739,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(4) Veterinary science,0
740,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(5) Agriculture and related subjects,20


In [37]:
grad_interesting_columns = ['level_of_study','mode_of_study','country_of_he_provider',
                            'region_of_he_provider','subject_of_study_marker','subject_of_study']

#check_categories(grad_short,grad_interesting_columns)

In [38]:
grad_filter = {'country_of_he_provider':'All','region_of_he_provider':'All'}

grad_filtered = filter_data(grad_short,grad_filter)

grad_filtered

Unnamed: 0,ukprn,he_provider,level_of_study,mode_of_study,country_of_he_provider,region_of_he_provider,academic_year,subject_of_study_marker,subject_of_study,number
0,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(1) Medicine and dentistry,1080
1,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(2) Subjects allied to medicine,520
2,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(3) Biological sciences,1590
3,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(4) Veterinary science,0
4,10007783.0,The University of Aberdeen,All,Full-time,All,All,2014/15,Subject area,(5) Agriculture and related subjects,20
...,...,...,...,...,...,...,...,...,...,...
153083,,Total,All,Full-time,All,All,2018/19,Total,(X2) Research and study skills in education,490
153084,,Total,All,Full-time,All,All,2018/19,Total,(X3) Academic studies in education,30465
153085,,Total,All,Full-time,All,All,2018/19,Total,(X9) Others in education,1440
153086,,Total,All,Full-time,All,All,2018/19,Total,(Y0) Combined,4695


In [39]:
disciplines = set(grad_filtered['subject_of_study'])

nuts_disciplines = multiple_nuts_estimates(grad_filtered,uni_nuts,disciplines,'subject_of_study','number',year_var='academic_year')

In [40]:
nuts_disciplines.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,(P1) Information services,(J5) Materials technology not otherwise specified,(F0) Broadly-based programmes within physical sciences,(L8) Development studies,(I2) Information systems,(H2) Civil engineering,(M9) Others in law,(I) Education,(D4) Agriculture,(1) Medicine and dentistry,...,(C2) Botany,(R6) Scandinavian studies,(C1) Biology,(H) Creative arts and design,(N7) Office skills,(W3) Music,(Q1) Linguistics,(I5) Health informatics,(F3) Physics,(A2) Pre-clinical dentistry
nuts_name,nuts_code,academic_year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Bedfordshire and Hertfordshire,UKH2,2014/15,0.0,25.0,0.0,0.0,55.0,175.0,0.0,1720.0,45.0,5.0,...,0.0,0.0,500.0,2855.0,0.0,420.0,140.0,0.0,115.0,0.0
Bedfordshire and Hertfordshire,UKH2,2015/16,0.0,20.0,0.0,0.0,80.0,175.0,0.0,1675.0,45.0,0.0,...,0.0,0.0,570.0,2940.0,0.0,455.0,130.0,0.0,110.0,0.0
Bedfordshire and Hertfordshire,UKH2,2016/17,0.0,25.0,0.0,0.0,65.0,170.0,0.0,1630.0,35.0,0.0,...,0.0,0.0,475.0,2915.0,0.0,405.0,135.0,0.0,100.0,0.0
Bedfordshire and Hertfordshire,UKH2,2017/18,0.0,10.0,0.0,0.0,85.0,175.0,0.0,1575.0,50.0,0.0,...,0.0,0.0,350.0,2815.0,0.0,375.0,120.0,20.0,90.0,0.0
Bedfordshire and Hertfordshire,UKH2,2018/19,0.0,10.0,0.0,0.0,75.0,180.0,0.0,1535.0,40.0,0.0,...,0.0,0.0,300.0,2745.0,0.0,385.0,90.0,10.0,70.0,0.0


### Number of postgraduates

This is a flavour of the variable above where we count the number of research postgraduates

In [41]:
#We will filter the data to focus on full time postgraduate researchers
post_grad_filter = grad_filter.copy()

post_grad_filter['level_of_study'] = 'Postgraduate (research)'
post_grad_filter['mode_of_study'] = 'Full-time'
post_grad_filter['subject_of_study_marker']= 'Subject area'

In [42]:
post_grad_filtered = filter_data(graduates_all_years,post_grad_filter)

In [43]:
nuts_postgrads = make_nuts_estimate(post_grad_filtered,uni_nuts,'number','postgrad_research',year_var='academic_year')

In [44]:
nuts_postgrads.head(n=10)

nuts_name                                   nuts_code  academic_year
Bedfordshire and Hertfordshire              UKH2       2014/15          1005.0
                                                       2015/16           945.0
                                                       2016/17           870.0
                                                       2017/18           895.0
                                                       2018/19           850.0
Berkshire, Buckinghamshire and Oxfordshire  UKJ1       2014/15          6085.0
                                                       2015/16          6485.0
                                                       2016/17          6340.0
                                                       2017/18          5765.0
                                                       2018/19          5905.0
Name: postgrad_research, dtype: float64

## 3. Output indicators

Produce output indicators

#### a. Research students (issue 90)

This is simply the number of postgraduates

In [46]:
#Save the interim file
nuts_postgrads.to_csv(f'../../data/interim/{today_str}_hesa_postgraduates.csv')

  nuts_postgrads.to_csv(f'../../data/interim/{today_str}_hesa_postgraduates.csv')


In [45]:
nuts_postgrads

nuts_name                       nuts_code  academic_year
Bedfordshire and Hertfordshire  UKH2       2014/15          1005.0
                                           2015/16           945.0
                                           2016/17           870.0
                                           2017/18           895.0
                                           2018/19           850.0
                                                             ...  
West Yorkshire                  UKE4       2014/15          3110.0
                                           2015/16          3210.0
                                           2016/17          3210.0
                                           2017/18          3305.0
                                           2018/19          3285.0
Name: postgrad_research, Length: 200, dtype: float64

In [49]:
make_indicator(convert_academic_year(nuts_postgrads),'hesa',{'postgrad_research':'total_postgraduates'},'academic_year',decimals=0)

   year nuts_id  nuts_year_spec  total_postgraduates
0  2014    UKH2            2018                 1005
1  2015    UKH2            2018                  945
2  2016    UKH2            2018                  870
3  2017    UKH2            2018                  895
4  2018    UKH2            2018                  850


**Or the number of FTE research students? Perhaps focus on this one, as it has been subject to fewer transformations**

In [72]:
make_indicator(convert_academic_year(nuts_spaces),'hesa',{'Research student FTE':'fte_research_students'},'academic_year',decimals=0)

   year nuts_id  nuts_year_spec  fte_research_students
0  2015    UKH2            2018                   1305
1  2016    UKH2            2018                   1280
2  2017    UKH2            2018                   1270
3  2015    UKJ1            2018                   6210
4  2016    UKJ1            2018                   6150


#### b. Students in STEM disciplines (issue 91)

Load definition of STEM disciplines (which needs to be checked by BEIS)

In [49]:
# Save the processed file

nuts_disciplines.to_csv(f'../../data/interim/hesa/{today_str}_students_disciplines_nuts.csv')

In [50]:
with open('../../data/aux/stem_hesa.txt','r') as infile:
    
    stem_hesa = infile.read().split('\n')
    

In [51]:
stem_students = nuts_disciplines[stem_hesa].sum(axis=1)

stem_students.name = 'stem_students'

stem_students.head()

nuts_name                       nuts_code  academic_year
Bedfordshire and Hertfordshire  UKH2       2014/15          9210.0
                                           2015/16          8775.0
                                           2016/17          8785.0
                                           2017/18          8435.0
                                           2018/19          8440.0
Name: stem_students, dtype: float64

In [73]:
make_indicator(convert_academic_year(stem_students),'hesa',{'stem_students':'total_stem_students'},'academic_year',decimals=0)

   year nuts_id  nuts_year_spec  total_stem_students
0  2014    UKH2            2018                 9210
1  2015    UKH2            2018                 8775
2  2016    UKH2            2018                 8785
3  2017    UKH2            2018                 8435
4  2018    UKH2            2018                 8440


#### c. STEM postgraduates (issue 112)

We need to recalculate the STEM values focusing only on research postgraduates


In [53]:
post_grad_filter = {'country_of_he_provider':'All','region_of_he_provider':'All','mode_of_study':'Full-time','level_of_study':'Postgraduate (research)',
                   'subject_of_study_marker':'Subject area'}

#We filter the full table: grad_short only keeps the rows where level_of_study is 'All', so it
#has no 'Postgraduate (research)' rows. The result is stored under the name that the cells below use
post_grad_filtered = filter_data(graduates_all_years,post_grad_filter)

post_grad_filtered.head()

Unnamed: 0,ukprn,he_provider,level_of_study,mode_of_study,country_of_he_provider,region_of_he_provider,academic_year,subject_of_study_marker,subject_of_study,number
0,10007783.0,The University of Aberdeen,Postgraduate (research),Full-time,All,All,2014/15,Subject area,(1) Medicine and dentistry,40
1,10007783.0,The University of Aberdeen,Postgraduate (research),Full-time,All,All,2014/15,Subject area,(2) Subjects allied to medicine,125
2,10007783.0,The University of Aberdeen,Postgraduate (research),Full-time,All,All,2014/15,Subject area,(3) Biological sciences,165
3,10007783.0,The University of Aberdeen,Postgraduate (research),Full-time,All,All,2014/15,Subject area,(4) Veterinary science,0
4,10007783.0,The University of Aberdeen,Postgraduate (research),Full-time,All,All,2014/15,Subject area,(5) Agriculture and related subjects,5


We have noticed that the postgraduate data only seems to be available for subject areas.

In [54]:
#Extract information
nuts_postgrad_discipline = multiple_nuts_estimates(post_grad_filtered,uni_nuts,disciplines,'subject_of_study','number',year_var='academic_year')

In [55]:
#Extract STEM subjects
stem_postgrads_detailed = nuts_postgrad_discipline[stem_hesa]

stem_postgrads_detailed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,(3) Biological sciences,(6) Physical sciences,(7) Mathematical sciences,(8) Computer science,(9) Engineering and technology
nuts_name,nuts_code,academic_year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bedfordshire and Hertfordshire,UKH2,2014/15,90.0,90.0,0.0,95.0,485.0
Bedfordshire and Hertfordshire,UKH2,2015/16,80.0,85.0,0.0,65.0,480.0
Bedfordshire and Hertfordshire,UKH2,2016/17,75.0,80.0,0.0,50.0,450.0
Bedfordshire and Hertfordshire,UKH2,2017/18,85.0,70.0,0.0,60.0,450.0
Bedfordshire and Hertfordshire,UKH2,2018/19,90.0,65.0,0.0,50.0,430.0


In [56]:
#Aggregate them
stem_postgraduates = stem_postgrads_detailed.sum(axis=1)

stem_postgraduates.name = 'stem_postgraduate_students'

stem_postgraduates


nuts_name                       nuts_code  academic_year
Bedfordshire and Hertfordshire  UKH2       2014/15           760.0
                                           2015/16           710.0
                                           2016/17           655.0
                                           2017/18           665.0
                                           2018/19           635.0
                                                             ...  
West Yorkshire                  UKE4       2014/15          1585.0
                                           2015/16          1615.0
                                           2016/17          1600.0
                                           2017/18          1660.0
                                           2018/19          1695.0
Name: stem_postgraduate_students, Length: 200, dtype: float64

In [74]:
make_indicator(convert_academic_year(stem_postgraduates),'hesa',{'stem_postgraduate_students':'total_stem_postgraduates'},'academic_year',decimals=0)

   year nuts_id  nuts_year_spec  total_stem_postgraduates
0  2014    UKH2            2018                       760
1  2015    UKH2            2018                       710
2  2016    UKH2            2018                       655
3  2017    UKH2            2018                       665
4  2018    UKH2            2018                       635


### d. Area of university estates (Issue 56)

These indicators will require little processing

In [58]:
nuts_spaces.to_csv(f'../../data/interim/hesa/{today_str}_university_spaces.csv')

In [75]:
make_indicator(convert_academic_year(nuts_spaces),'hesa',{'Total site area (hectares)':'area_university_site'},'academic_year',decimals=2)

   year nuts_id  nuts_year_spec  area_university_site
0  2015    UKH2            2018                385.40
1  2016    UKH2            2018                385.13
2  2017    UKH2            2018                385.08
3  2015    UKJ1            2018                341.80
4  2016    UKJ1            2018                722.01


### e. Number of buildings (issue 55)

In [76]:
make_indicator(convert_academic_year(nuts_spaces),'hesa',{'Total number of buildings':'total_university_buildings'},'academic_year',decimals=0)

   year nuts_id  nuts_year_spec  total_university_buildings
0  2015    UKH2            2018                         419
1  2016    UKH2            2018                         413
2  2017    UKH2            2018                         407
3  2015    UKJ1            2018                         684
4  2016    UKJ1            2018                         661


### f. Research income (issue 53)

In [61]:
nuts_spaces

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Research income (£),Research student FTE,Total number of buildings,Total site area (hectares)
nuts_name,nuts_code,academic_year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bedfordshire and Hertfordshire,UKH2,2015/16,68415000.0,1305.0,419.0,385.402
Bedfordshire and Hertfordshire,UKH2,2016/17,65938000.0,1280.0,413.0,385.133
Bedfordshire and Hertfordshire,UKH2,2017/18,66830000.0,1270.0,407.0,385.078
"Berkshire, Buckinghamshire and Oxfordshire",UKJ1,2015/16,760879000.0,6210.0,684.0,341.800
"Berkshire, Buckinghamshire and Oxfordshire",UKJ1,2016/17,794974000.0,6150.0,661.0,722.010
...,...,...,...,...,...,...
West Wales and The Valleys,UKL1,2016/17,116378000.0,1705.0,710.0,631.246
West Wales and The Valleys,UKL1,2017/18,125192000.0,1705.0,716.0,645.369
West Yorkshire,UKE4,2015/16,202427000.0,3565.0,519.0,261.142
West Yorkshire,UKE4,2016/17,207754000.0,3515.0,522.0,273.271


In [77]:
make_indicator(convert_academic_year(nuts_spaces),'hesa',{'Research income (£)':'gbp_research_income'},'academic_year',decimals=0)

   year nuts_id  nuts_year_spec  gbp_research_income
0  2015    UKH2            2018             68415000
1  2016    UKH2            2018             65938000
2  2017    UKH2            2018             66830000
3  2015    UKJ1            2018            760879000
4  2016    UKJ1            2018            794974000
