# Research Excellence Framework results

We read and convert the HEFCE Research Excellence Framework (REF) data, which is available from the REF results website (https://results.ref.ac.uk/(S(hlvnuqzwkag44jp3df3d4q14))/ - note that the link appears to embed a session identifier, so it may need refreshing).

This is another university page with learning provider (`ukprn`) codes so the process that we will use to geocode the institutions is similar to what we did before.


## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import csv
import zipfile
import io

In [None]:
from ast import literal_eval

In [None]:
import seaborn as sn
from nuts_finder import NutsFinder

### Functions

#### Simple utilities

In [None]:
def tidy_cols(my_csv):
    '''
    Returns tidied column names for a table: lower case, with spaces replaced by underscores.
    
    Args:
        my_csv (df) is the table whose column names we want to tidy
    
    Returns:
        A list with one tidied name per column, in the original column order.
        The table itself is not modified.
    
    '''
    
    tidy_names = []
    
    for col in my_csv.columns:
        tidy_names.append(col.lower().replace(' ','_'))
    
    return(tidy_names)

#### Create NUTS aggregations

In [None]:
def make_nuts_estimate(data,nuts_lookup,counter,name):
    '''
    Adds up a count variable over NUTS areas.
    
    Institutions are matched to their NUTS area through their ukprn. Rows whose ukprn is not
    in the lookup get a missing area and therefore drop out of the aggregation.
    
    Args:
        data (df) where we have already selected the rows of interest. It needs a 'ukprn' column
        nuts_lookup (dict) maps each ukprn to a dict with the keys 'nuts_name' and 'nuts_code'
        counter (str) is the variable with counts that we are interested in
        name (str) is the name given to the output series
    
    Returns:
        A series of totals indexed by (nuts_name, nuts_code). The input dataframe is not modified.
    
    '''
    
    df = data.copy()
    
    #Add the nuts names and codes (missing if the institution is not in the lookup)
    for field in ['nuts_name','nuts_code']:
        
        df[field] = [nuts_lookup[ukprn][field] if ukprn in nuts_lookup else np.nan
                     for ukprn in data['ukprn']]
    
    #We are focusing on numbers
    df[counter] = df[counter].astype(float)
    
    totals = df.groupby(['nuts_name','nuts_code'])[counter].sum()
    
    totals.name = name
    
    return(totals)

In [None]:
def multiple_nuts_estimates(data,nuts_lookup,variables,select_var,value):
    '''
    Builds a table of NUTS estimates with one column per selected value.
    
    For every entry in variables we keep the rows of data where select_var equals that entry,
    aggregate them over NUTS areas with make_nuts_estimate and name the resulting column after
    the entry. The columns are then joined side by side.
    
    Args:
        data (df) is the filtered dataframe
        nuts_lookup (dict) is the lookup between universities and nuts
        variables (list) is the list of values of select_var for which we want an estimate
        select_var (str) is the variable we want to use to select values
        value (str) is the field that contains the numerical value we want to aggregate
    
    Returns:
        A dataframe indexed by NUTS name and code with one column per entry in variables.
    
    '''
    
    estimates = []
    
    for var in variables:
        
        #Rows for this value only
        selected = data.loc[data[select_var]==var]
        
        estimates.append(make_nuts_estimate(selected,nuts_lookup,value,var))
    
    return(pd.concat(estimates,axis=1))
        
    

#### Directories etc

In [None]:
# Create a ref directory in raw, interim and processed

for stage in ['raw','interim','processed']:
    
    #exist_ok=True makes this a no-op when the folder is already there, and makedirs also
    #creates missing parent folders, so we do not need to list the parent directory first
    os.makedirs(f'../../data/{stage}/ref',exist_ok=True)
    

## 1 Load data

### University metadata

In [None]:
#Read and evaluate the university NUTS dictionary
#The file is parsed as a Python literal with literal_eval, which does not execute code.
#Judging by how make_nuts_estimate uses the lookup, it maps each ukprn to a dict with
#'nuts_name' and 'nuts_code' entries (TODO confirm against the file)
with open('../../data/metadata/uni_nuts.txt','r') as infile:
    uni_nuts = literal_eval(infile.read())

### HEFCE data

In [None]:
#Read data
#skiprows=7 skips the first seven rows of the sheet (presumably notes above the header - TODO confirm)
#and cells containing '-' are read as missing values.
#NOTE(review): the URL embeds what looks like a session token, so the download may stop working - confirm
ref = pd.read_excel('https://results.ref.ac.uk/(S(hlvnuqzwkag44jp3df3d4q14))/DownloadFile/AllResults/xlsx',skiprows=7,na_values='-')

In [None]:
#Quick look at the first rows of the raw table
ref.head()

In [None]:
#Standardise the column names (lower case, underscores instead of spaces)
ref.columns = tidy_cols(ref)

In [None]:
#Variables we keep: institution and unit identifiers, the profile type, the submitted FTE
#and the columns for each star rating
focus_vars = [
    'institution_code_(ukprn)',
    'institution_name',
    'unit_of_assessment_name',
    'profile',
    'fte_category_a_staff_submitted',
    '4*',
    '3*',
    '2*',
    '1*',
    'unclassified',
]



In [None]:
#Keep the variables of interest. We take an explicit copy because new columns are added to
#ref_2 later; assigning to a slice of ref would raise pandas' SettingWithCopyWarning
ref_2 = ref[focus_vars].copy()

## 2. Processing

We want to do the following: 

* Estimate FTE in each category (multiply submitted by percentages)
* Create NUTS aggregates
* Save

**FTE in category**

In [None]:
#Create the full time estimate equivalents in each category

for x in ['4*','3*','2*','1*','unclassified']:
    
    #Submitted FTE times the rating column, divided by 100 (the rating is treated as a percentage).
    #Done as column arithmetic, which pandas evaluates row by row in one step
    ref_2[x+'_fte'] = ref_2['fte_category_a_staff_submitted']*ref_2[x]/100

In [None]:
#Focus on the overall variable rather than its components
#The mask is built from ref_2 itself so that the selection does not depend on ref and ref_2
#happening to share the same index
ref_3 = ref_2.loc[ref_2['profile']=='Overall']

In [None]:
#Now we melt the ref df so it is easier to create the aggregations later.
#The first three variables identify the institution and unit of assessment; the rest are the FTE estimates
focus_vars_2 = ['institution_code_(ukprn)','institution_name','unit_of_assessment_name',
                '4*_fte','3*_fte','2*_fte','1*_fte','unclassified_fte']

#Long format: one row per institution, unit of assessment and FTE category
ref_long = ref_3[focus_vars_2].melt(id_vars=focus_vars_2[:3])

In [None]:
#Check the first rows of the long table
ref_long.head()

In [None]:
#We rename the variable with the ukprn code so it works with our functions
ref_long.rename(columns={'institution_code_(ukprn)':'ukprn'},inplace=True)

#We remove the institute of zoology, which does not have a UKPRN (its code is the placeholder 'ZZZZZZZZ').
#The explicit copy makes the result an independent dataframe, so assigning columns to it
#afterwards does not raise pandas' SettingWithCopyWarning

ref_long = ref_long.loc[ref_long['ukprn']!='ZZZZZZZZ'].copy()

In [None]:
#Cast the codes to numbers; presumably this matches the type of the keys in uni_nuts (TODO confirm)
ref_long['ukprn'] = ref_long['ukprn'].astype('float')

**Convert to NUTS**

We will subset by discipline and aggregate over FTEs


In [None]:
out = []

#For each unique discipline, subset by that discipline.
#groupby yields the disciplines in sorted order, so the row order of the result is the same
#on every run (iterating over a set of strings is not) and the data is only scanned once
for disc,df_in_unit in ref_long.groupby('unit_of_assessment_name'):
    
    #FTE categories present in this discipline, in order of first appearance
    fte_categories = df_in_unit['variable'].unique()
    
    #Aggregate over nuts: one column per FTE category
    nuts_in_unit = multiple_nuts_estimates(df_in_unit,uni_nuts,fte_categories,'variable','value')
    
    #Add the discipline (unit of assessment) name so we know what everything is when we concatenate
    nuts_in_unit['unit_of_assessment_name'] = disc
    
    #Put in the list
    out.append(nuts_in_unit)

In [None]:
#Concatenate: stack the per-discipline tables on top of each other
nuts_ref_ftes = pd.concat(out,axis=0)

**Tidy up variables**

In [None]:
#FTE variables ordered
fte_vars = ['4*_fte','3*_fte','2*_fte','1*_fte','unclassified_fte']

#Discipline first, followed by the FTE categories from the highest to the lowest rating
nuts_ref_ftes = nuts_ref_ftes[['unit_of_assessment_name',*fte_vars]]

In [None]:
#Total FTE in each row: sum over the five FTE categories
nuts_ref_ftes['total_fte'] = nuts_ref_ftes[fte_vars].sum(axis=1)

In [None]:
#Mini exploration

#Area x discipline table of 4* FTEs; combinations that are not in the data are filled with 0
high_score_discipline = nuts_ref_ftes.pivot_table(index='nuts_name',columns='unit_of_assessment_name',values='4*_fte').fillna(0)

#Clustered heatmap of the correlations between disciplines (computed across areas)
sn.clustermap(high_score_discipline.corr())

The above suggests that there is a 'classics' cluster with traditional disciplines, a more applied cluster, and a cluster of newer perhaps less academic disciplines. It would be very interesting to dig into this much deeper.

**Save**

In [None]:
#Save the NUTS-level table in interim. today_str is not defined in this notebook
#(presumably it is set by the preamble - TODO confirm)
nuts_ref_ftes.to_csv(f'../../data/interim/ref/{today_str}_ref_nuts.csv')

## 3. Create indicators

### Subsection 1: Comparative advantage in performing excellent public research

#### 1. REF Scores 

We assume that this means the overall REF score, that is: the mean star rating (4 to 1, with unclassified counted as 0) weighted by FTE across all disciplines

In [None]:
#In order to calculate this we need to melt the data

In [None]:
#Long format: one row per area, discipline and FTE category. The area names and codes
#live in the index, so we turn them into columns first
ref_melted = nuts_ref_ftes.reset_index(drop=False).melt(
    id_vars=['nuts_name','nuts_code','unit_of_assessment_name','total_fte'])

ref_melted.head()

In [None]:
#Numeric score for each FTE category: the number of stars in its name, or 0 if unclassified
ref_melted['score'] = [0 if 'unclassified' in var else int(var.split('*')[0])
                       for var in ref_melted['variable']]

In [None]:
#Mean score in each area, weighting every score by its share of the area's FTEs; best areas first
ref_weighted_scores = (
    ref_melted
    .groupby(['nuts_code','nuts_name'])
    .apply(lambda g: np.sum((g['value']/g['value'].sum())*g['score']))
    .sort_values(ascending=False)
)

ref_weighted_scores.head(n=10)

#### 2. REF scores in STEM disciplines

We need to define what STEM disciplines are! We will load a text file (one discipline per line) stored in `aux` and change it if needed.

In [None]:
#Read the list of STEM disciplines, one name per line.
#NOTE(review): if the file ends with a newline the list will have an empty string as its last element
with open('../../data/aux/ref_stem.txt','r') as infile:
    stem = infile.read().split('\n')

In [None]:
#Keep the rows for disciplines in the STEM list
ref_stem = ref_melted.loc[ref_melted['unit_of_assessment_name'].isin(stem)]

In [None]:
#Same FTE-weighted mean score as before, restricted to the STEM disciplines; best areas first
ref_stem_weighted_scores = (
    ref_stem
    .groupby(['nuts_code','nuts_name'])
    .apply(lambda g: np.sum((g['value']/g['value'].sum())*g['score']))
    .sort_values(ascending=False)
)

In [None]:
#First ten entries of the STEM weighted scores
ref_stem_weighted_scores.head(n=10)

#### 3. Excellent researchers submitted to REF

This is the 4* FTEs

In [None]:
#Total 4* FTEs in each area, summed over disciplines, largest first
ref_excellent = nuts_ref_ftes.groupby(['nuts_code','nuts_name'])['4*_fte'].sum().sort_values(ascending=False)

In [None]:
#The ten areas with the most 4* FTEs
ref_excellent.head(n=10)

## 4. Save indicators

In [None]:
#Some processing to save the files in the right format etc

indicator_series = [ref_weighted_scores,ref_stem_weighted_scores,ref_excellent]
indicator_names = ['mean_ref','mean_ref_stem','total_4_fte']

for file,name in zip(indicator_series,indicator_names):
    
    #The series name becomes the name of the value column once the index is reset
    file.name = name
    
    #Area code and name become columns; the code is stored as nuts_id
    f = file.reset_index(drop=False).rename(columns={'nuts_code':'nuts_id'})
    
    f['year'] = 2014
    f['nuts_year_spec'] = 2018
    
    #One csv per indicator with the year, area id, NUTS specification year and the value
    f[['year','nuts_id','nuts_year_spec',name]].to_csv(f'../../data/processed/ref/{name}.csv',index=False)
      