# HESA 

Some code to collect HESA data and merge it with university metadata in order to create indicators about skills supply in the UK.

We are interested in the following indicators:

* Research staff: https://www.hesa.ac.uk/data-and-analysis/staff/working-in-he
* Research spaces: https://www.hesa.ac.uk/data-and-analysis/estates/table-1
* STEM graduates produced: https://www.hesa.ac.uk/data-and-analysis/students/what-study#
* PhD graduates produced: this is also in https://www.hesa.ac.uk/data-and-analysis/students/what-study#


See [this table](https://docs.google.com/spreadsheets/d/1V2fAQcvuLsoImwo6uLdyIK3x80pBNoX97CxsxkjvRP4/edit?usp=sharing) for more information.


## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import csv
import zipfile
import io
import os
import requests

In [None]:
import seaborn as sn
from nuts_finder import NutsFinder

In [None]:
from itertools import combinations

### Functions

#### Simple utilities

In [None]:
def make_dirs(name,dirs=('raw','processed')):
    '''
    Utility that creates directories to save the data

    Creates ../../data/<d>/<name> for every d in dirs. Missing parent directories are
    created as well and directories that already exist are left alone, so the call
    can be repeated safely.

    Args:
        name (str) is the name of the subdirectory we want to create (eg 'hesa')
        dirs (iterable of str) are the folders inside ../../data that should get the subdirectory

    '''

    for d in dirs:
        #makedirs (instead of listdir + mkdir) also works when ../../data/<d> does not exist yet
        os.makedirs(f'../../data/{d}/{name}',exist_ok=True)

def tidy_cols(my_csv):
    '''
    Tidies column names ie lower case and spaces replaced with underscores

    Args:
        my_csv (df) is the dataframe whose column names we want to tidy

    Returns:
        A list with the tidied column names, in the original column order

    '''

    #str.replace does the same job as the regex and does not need `re` to be imported
    return [col.lower().replace(' ','_') for col in my_csv.columns]

In [None]:
def filter_data(data,var_val_pairs):
    '''
    Keeps the rows of a dataframe that match every variable - value pair we pass.
    This is shorter than chaining pandas subsetting by hand.

    Args:
        data (df) is a dataframe. It is not modified
        var_val_pairs (dict) has the variables as keys and the value each of them must take as values

    Returns:
        A new dataframe with the matching rows and a fresh 0..n-1 index

    '''
    subset = data.copy()

    #Narrow the subset one condition at a time
    for column,wanted in var_val_pairs.items():
        matches = subset[column]==wanted
        subset = subset[matches]

    return subset.reset_index(drop=True)
    

In [None]:
def check_categories(data,columns):
    '''
    Prints the frequencies of a set of categorical variables followed by the crosstab of every pair of them.
    We use it to decide what values to choose, and to avoid double counting

    Args:
        data (df) is the data
        columns (list) are the names of the categorical variables we want to check

    '''
    rule = '====='

    for heading in ['FREQUENCIES','===========','\n']:
        print(heading)

    #One frequency table per variable
    for column in columns:
        print(column)
        print(rule)
        print(data[column].value_counts())
        print('\n')

    for heading in ['CROSSTABS','===========']:
        print(heading)

    #One crosstab per (unordered) pair of variables
    for left,right in combinations(columns,2):
        print(left+' x '+right)
        print(rule)
        print(pd.crosstab(data[left],data[right]))
        print('\n')
        
        
    


#### Data collection

In [None]:
def hesa_parser(url,out_name,skip=16,encoding='utf-8'):
    '''
    Function to obtain and parse data from the HESA website

    The downloaded text is saved in ../../data/raw/hesa/{out_name}.txt and then read back with pandas.

    Args:
        url (str) is the location of the csv file
        out_name (str) is the saved name of the file (without extension)
        skip (int) is the number of rows to skip (we could automate this by looking for rows at the top with lots of nans)
        encoding (str) is the encoding used to decode the response and to write and re-read the saved copy

    Returns:
        A dataframe with the parsed table and tidied column names

    Raises:
        requests.HTTPError if the server answers with an error status

    '''

    raw_path = f'../../data/raw/hesa/{out_name}.txt'

    #Request the file. Stop here on an HTTP error rather than saving and parsing an error page
    rs = requests.get(url)
    rs.raise_for_status()

    #Parse the file
    parsed = rs.content.decode(encoding)

    #Save it with the encoding we decoded with, so the result does not depend on the platform default.
    #newline='' keeps the line endings exactly as downloaded
    with open(raw_path,'w',encoding=encoding,newline='') as outfile:
        outfile.write(parsed)

    #Read it.
    my_csv = pd.read_csv(raw_path,skiprows=skip,encoding=encoding)

    #Clean column names
    my_csv.columns = tidy_cols(my_csv)

    return my_csv

    
    

#### Data processing

In [None]:
def gimme_nuts(lat,lon,level=2):
    '''
    Function to extract nuts information from a pair of coordinates

    It uses the module-level NutsFinder object `nf`, which has to exist before this is called.

    Args:
        lat (float) is the latitude
        lon (float) is the longitude
        level (int) is the NUTS level we want

    Returns:
        A list [nuts_id, nuts_name]. Both are np.nan (and a message is printed) when the lookup
        does not contain a region at the requested level

    '''

    info = nf.find(lat=lat,lon=lon)

    try:
        #Take the first region reported at the requested level
        region = [x for x in info if x['LEVL_CODE']==level][0]
        nuts_id = region['NUTS_ID']
        nuts_name = region['NUTS_NAME']

    #Only catch the failures the lookup above can produce (no match, missing key, unusable result),
    #so that interrupting a long run or a genuine bug is not silently turned into a nan
    except (IndexError,KeyError,TypeError):
        print(f'failed with {np.round(lat,2)},{np.round(lon,2)}')
        nuts_id = np.nan
        nuts_name = np.nan

    return [nuts_id,nuts_name]

In [None]:
def compare_data(df_1,df_2,id_1,id_2,name_1,name_2):
    '''
    Prints the names of the records whose id appears in one dataset but not in the other, in both directions.
    We use it to check if the ids in two datasets we are merging are consistent.

    Args:
        df_1, df_2 (df) are the dfs we want to compare
        id_1, id_2 (str) are the id columns we want to check in each df
        name_1, name_2 (str) are the name columns we print to explore the unmatched records

    '''

    directions = [('In 1 but not in 2',df_1,id_1,name_1,df_2,id_2),
                  ('In 2 but not in 1',df_2,id_2,name_2,df_1,id_1)]

    for position,(title,own,own_id,own_name,other,other_id) in enumerate(directions):

        print(title)
        print('==================')

        #Ids (ignoring missing ones) that the other dataset does not have
        unmatched = set(own[own_id].dropna()).difference(set(other[other_id]))

        #Names of the rows carrying one of those ids
        flags = [value in unmatched for value in own[own_id]]
        print(set(own.loc[flags][own_name]))

        #Blank separator between the two reports only
        if position==0:
            print('\n')

    

#### Create NUTS aggregations

In [None]:
def make_nuts_estimate(data,nuts_lookup,counter,name,year_var=None):
    '''
    This function takes hesa data and creates a nuts estimate

    Args:
        data (df) where we have already selected variables of interest eg mode of employment. It needs a ukprn column
        nuts_lookup (dict) is the ukprn - nuts name and code lookup (ukprn -> {'nuts_name':..., 'nuts_code':...})
        counter (str) is the variable with counts that we are interested in
        name (str) is the name we give to the output series
        year_var (str) is the variable containing the years we want to group by. If None, then we are not grouping by year

    Returns:
        A series called `name` with the sum of counter, indexed by nuts_name and nuts_code (and year_var if given).
        Providers missing from nuts_lookup are labelled with nan and so drop out of the grouped result

    '''

    d = data.copy()

    #Add the nuts names and codes (nan when the provider is not in the lookup)
    for field in ['nuts_name','nuts_code']:
        d[field] = [nuts_lookup[ukprn][field] if ukprn in nuts_lookup else np.nan for ukprn in d['ukprn']]

    #We are focusing on numbers
    d[counter] = d[counter].astype(float)

    #Group results by year?
    group_vars = ['nuts_name','nuts_code']

    if year_var is not None:
        group_vars.append(year_var)

    out = d.groupby(group_vars)[counter].sum()

    out.name = name

    return out

In [None]:
def multiple_nuts_estimates(data,nuts_lookup,variables,select_var,value,year_var=None):
    '''
    Creates NUTS estimates for multiple variables.

    Args:
        data (df) is the filtered dataframe
        nuts_lookup (dict) is the lookup between universities and nuts
        variables (list) is the list of variables for which we want to generate the analysis
        select_var (str) is the variable we want to use to select values
        value (str) is the field that contains the numerical value we want to aggregate in the dataframe
        year_var (str) is the year variable. If None, then we are not interested in years

    Returns:
        A dataframe with one column per variable. Rows are indexed by nuts name and code (and year if year_var is given)

    '''

    #One estimate per variable. year_var is passed through as given (it used to be replaced by a
    #hard-coded 'academic_year'), so None simply produces aggregates without a year level
    estimates = [make_nuts_estimate(data.loc[data[select_var]==m],nuts_lookup,value,m,year_var=year_var)
                 for m in variables]

    #Concatenate over columns
    return pd.concat(estimates,axis=1)
        
    

In [None]:
def make_indicator(table,target_path,var_lookup,year_var,nuts_var='nuts_code',nuts_spec=2018):
    '''
    We use this function to create and save indicators using our standardised format.

    The first rows of the indicator are printed and the indicator is saved in
    ../../data/processed/{target_path}/{var_code}.csv

    Args:
        table (df or series) has the relevant information. Its index levels are turned into columns, so
            the year and nuts code can be in the index
        target_path (str) is the name of the directory inside ../../data/processed where we want to save the data
        var_lookup (dict) is a lookup to rename the variable into our standardised name. Only its first item is used
        year_var (str) is the name of the year variable
        nuts_var (str) is the name of the NUTS code variable. We assume it is nuts_code
        nuts_spec (int) is the value of the NUTS specification. We assume we are working with 2018 NUTS

    '''

    #This is the variable name and code (first item of the lookup)
    var_name,var_code = list(var_lookup.items())[0]

    #Turn the index into columns, focus on the variables we need, add the nuts specification and rename.
    #assign and rename return new dataframes, so we never write into a slice of the table
    t = (table.reset_index(drop=False)[[year_var,nuts_var,var_name]]
         .assign(nuts_year_spec=nuts_spec)
         .rename(columns={var_name:var_code,year_var:'year',nuts_var:'nuts_id'}))

    #Reorder variables
    t = t[['year','nuts_id','nuts_year_spec',var_code]]

    print(t.head())

    #Save in the processed folder
    #NOTE(review): the row index is saved as a first unnamed column - confirm this is wanted in the standardised format
    t.to_csv(f'../../data/processed/{target_path}/{var_code}.csv')
    
    

#### Directories etc

In [None]:
# Create a hesa directory in raw and processed

In [None]:
make_dirs('hesa',['raw','processed','interim'])

## Collect data

### University metadata

The [learning providers website](http://learning-provider.data.ac.uk/) contains information about universities.



In [None]:
uni_meta = pd.read_csv('http://learning-provider.data.ac.uk/data/learning-providers-plus.csv')

In [None]:
uni_meta.columns = tidy_cols(uni_meta)

In [None]:
uni_meta.head()

Label universities

In [None]:
#This initialises an object to label lons and lats with their NUTS code
nf = NutsFinder(scale=1)

In [None]:
#Create a dict from ukprn to name, nuts2
#Each provider is looked up once and the result reused for the code and the name
#(calling gimme_nuts once per field doubled the lookups and the failure messages)
uni_nuts = {}

for rid,row in uni_meta.iterrows():

    nuts_code,nuts_name = gimme_nuts(lat=row['latitude'],lon=row['longitude'])

    uni_nuts[row['ukprn']] = {'name':row['view_name'],'nuts_code':nuts_code,'nuts_name':nuts_name}

#### Research staff

In [None]:
res_staff = hesa_parser('https://www.hesa.ac.uk/data-and-analysis/staff/table-1.csv','staff',skip=24)

We also downloaded staff qualifications but probably won't use them

In [None]:
qual_staff = hesa_parser('https://www.hesa.ac.uk/data-and-analysis/staff/table-8.csv','qual_staff')

### Research spaces

In [None]:
spaces = hesa_parser('https://www.hesa.ac.uk/data-and-analysis/estates/data.csv','spaces',11)

### Stem graduates

This is a larger zip file so we have to use a different approach

In [None]:
#Request
rs = requests.get('https://www.hesa.ac.uk/data-and-analysis/students/table-13.csv')

In [None]:
#Unzip and save the file

#Note that the file contains tables for various years. We keep all of them
years = ['2014-15','2015-16','2016-17','2017-18','2018-19']

#Open the archive once (it used to be re-read for every year) and close it when we are done.
#extract returns the path of each extracted file
with zipfile.ZipFile(io.BytesIO(rs.content)) as hesa_zip:
    out_files = [hesa_zip.extract(f'table-13-({year}).csv','../../data/raw/hesa/') for year in years]

In [None]:
#Read the table for every year and concatenate them into a single df

#NOTE(review): an earlier version (kept below) also used a pipe to assign academic_year from `years`.
#It is disabled, presumably because the tables already have that column - confirm
# graduates_all_years = pd.concat(
#     [pd.read_csv(out_files[n],skiprows=14).pipe(lambda x: x.assign(academic_year = year)) for n,year in enumerate(years)],axis=0)


graduates_all_years = pd.concat(
    [pd.read_csv(out_file,skiprows=14) for out_file in out_files],axis=0)

In [None]:
graduates_all_years.columns = tidy_cols(graduates_all_years)

In [None]:
graduates_all_years.head()

## 2. Processing

Processing involves:

1. Select variables we want to use for the indicators (eg year, mode of study)
2. Label the data with the NUTS information
3. Group over NUTS and generate estimate

We can probably create a function to do 2 and 3 taking the subset data as input


### Do all the universities in HESA have metadata?

In [None]:
compare_data(res_staff,uni_meta,'ukprn','ukprn','he_provider','view_name')

In [None]:
compare_data(spaces,uni_meta,'ukprn','ukprn','he_provider','view_name')

In [None]:
# Graduates takes too long to run!

#We create a shorter version

In [None]:
#Keep the rows for all levels of study combined and full-time students only
all_levels = graduates_all_years['level_of_study']=='All'
full_time = graduates_all_years['mode_of_study']=='Full-time'

grad_short = graduates_all_years.loc[all_levels & full_time]

In [None]:
compare_data(grad_short,uni_meta,'ukprn','ukprn','he_provider','view_name')

These are small universities - I have checked names and found that the difference between sets isn't caused by mismatches in codes (eg the same university having different codes in different sources)

### Make indicators

#### 1. Number of research staff

In [None]:
res_staff.head()

In [None]:
#We check categories in interesting columns
interesting_cols = ['mode_of_employment','atypical_marker','contract_marker','academic_year','activity_standard_occupational_classification']

#check_categories(res_staff,interesting_cols)

In [None]:
res_staff_filter = {'mode_of_employment':'All','contract_marker':'Academic',
                   'activity_standard_occupational_classification':'Total academic staff',
                   'country_of_he_provider':'All','region_of_he_provider':'All'}

In [None]:
res_filtered = filter_data(res_staff,res_staff_filter)

len(res_filtered)

In [None]:
nuts_academics = make_nuts_estimate(res_filtered,uni_nuts,'number','academic_staff','academic_year')

In [None]:
nuts_academics.sort_values(ascending=False)

#### 2. Research space

See some variable definitions for estates [here](https://www.hesa.ac.uk/support/definitions/estates)

In [None]:
spaces.head()

In [None]:
sp_interesting_cols = ['academic_year','country_of_he_provider','region_of_he_provider','category_marker','table']

#check_categories(spaces,sp_interesting_cols)

This contains a lot of information. We will only focus on a few variables:

* Total number of buildings
* Total site area (hectares)
* Research income
* Research student FTE

In [None]:
space_vars = ['Research income (£)','Research student FTE','Total number of buildings','Total site area (hectares)']

nuts_spaces = multiple_nuts_estimates(spaces,uni_nuts,space_vars,'category_marker','value',year_var='academic_year')

nuts_spaces.head()

### 3. Number of STEM graduates

The graduates file is quite big so I focus on grad short, which considers all full time graduates

In [None]:
grad_short.head()

In [None]:
grad_interesting_columns = ['level_of_study','mode_of_study','country_of_he_provider',
                            'region_of_he_provider','subject_of_study_marker','subject_of_study']

#check_categories(grad_short,grad_interesting_columns)

In [None]:
grad_filter = {'country_of_he_provider':'All','region_of_he_provider':'All'}

grad_filtered = filter_data(grad_short,grad_filter)

grad_filtered

In [None]:
#Sorted list of subjects (missing values dropped) so the columns of the estimates come out in the
#same order on every run - a set of strings has no stable order between sessions
disciplines = sorted(grad_filtered['subject_of_study'].dropna().unique())

nuts_disciplines = multiple_nuts_estimates(grad_filtered,uni_nuts,disciplines,'subject_of_study','number',year_var='academic_year')

In [None]:
nuts_disciplines.head()

### Number of postgraduates

This is a flavour of the variable above where we count the number of research postgraduates

In [None]:
#We will filter the data to focus on full time postgraduate researchers
post_grad_filter = grad_filter.copy()

post_grad_filter['level_of_study'] = 'Postgraduate (research)'
post_grad_filter['mode_of_study'] = 'Full-time'
post_grad_filter['subject_of_study_marker']= 'Subject area'

In [None]:
post_grad_filtered = filter_data(graduates_all_years,post_grad_filter)

In [None]:
nuts_postgrads = make_nuts_estimate(post_grad_filtered,uni_nuts,'number','postgrad_research',year_var='academic_year')

In [None]:
nuts_postgrads.head(n=10)

## 3. Output indicators

Produce output indicators

#### a. Research students (issue 90)

This is simply the number of postgraduates

In [None]:
#Save the interim file
nuts_postgrads.to_csv(f'../../data/interim/{today_str}_hesa_postgraduates.csv')

In [None]:
make_indicator(nuts_spaces,'hesa',{'Research student FTE':'fte_research_students'},'academic_year')

Or number of FTE research students? Perhaps focus on this one as it has been subject to fewer transformations

In [None]:
make_indicator(nuts_spaces,'hesa',{'Research income (£)':'gbp_research_income'},'academic_year')

#### b. Students in STEM disciplines (issue 91)

Load definition of STEM disciplines (which needs to be checked by BEIS)

In [None]:
# Save the processed file

nuts_disciplines.to_csv(f'../../data/interim/hesa/{today_str}_students_disciplines_nuts.csv')

In [None]:
with open('../../data/aux/stem_hesa.txt','r') as infile:

    #One discipline per line. Blank lines (eg the one left by a trailing newline) are skipped,
    #otherwise they would be looked up as column names later on
    stem_hesa = [line for line in infile.read().split('\n') if line.strip()]
    

In [None]:
stem_students = nuts_disciplines[stem_hesa].sum(axis=1)

stem_students.name = 'stem_students'

stem_students.head()

In [None]:
make_indicator(stem_students,'hesa',{'stem_students':'total_stem_students'},'academic_year')

#### c. Stem postgraduates (issue 112)

We need to recalculate the STEM values focusing only on research postgraduates


In [None]:
post_grad_filter = {'country_of_he_provider':'All','region_of_he_provider':'All','mode_of_study':'Full-time','level_of_study':'Postgraduate (research)',
                   'subject_of_study_marker':'Subject area'}

#Filter the full graduates table: grad_short only keeps level_of_study == 'All', so it has no
#research postgraduates. The result is stored under the name the following cells use (post_grad_filtered)
post_grad_filtered = filter_data(graduates_all_years,post_grad_filter)

post_grad_filtered.head()

We have noticed that the postgraduate data only seems to be available for subject areas.

In [None]:
#Extract information
nuts_postgrad_discipline = multiple_nuts_estimates(post_grad_filtered,uni_nuts,disciplines,'subject_of_study','number',year_var='academic_year')

In [None]:
#Extract STEM subjects
stem_postgrads_detailed = nuts_postgrad_discipline[stem_hesa]

stem_postgrads_detailed.head()

In [None]:
#Aggregate them
stem_postgraduates = stem_postgrads_detailed.sum(axis=1)

stem_postgraduates.name = 'stem_postgraduate_students'

stem_postgraduates


In [None]:
make_indicator(stem_postgraduates,'hesa',{'stem_postgraduate_students':'total_stem_postgraduates'},'academic_year')

### d. Area of university estates (Issue 56)

These indicators will require little processing

In [None]:
nuts_spaces.to_csv(f'../../data/interim/hesa/{today_str}_university_spaces.csv')

In [None]:
make_indicator(nuts_spaces,'hesa',{'Total site area (hectares)':'area_university_site'},'academic_year')

### e. Number of buildings (issue 55)

In [None]:
make_indicator(nuts_spaces,'hesa',{'Total number of buildings':'total_university_buildings'},'academic_year')

### f. Research income (issue 53)

In [None]:
nuts_spaces

In [None]:
make_indicator(nuts_spaces,'hesa',{'Research income (£)':'gbp_research_income'},'academic_year')