# HE-BCI

Here we collect data from the Higher-Education Business Community Interaction Survey available from the HESA website ([link](https://www.hesa.ac.uk/data-and-analysis/business-community))

The structure is very similar to the other HESA data we collected in `01_jmg`, so eventually we might want to merge both notebooks. I will definitely be reusing a lot of that code here.

In terms of indicators, we would like to create the following:

* Graduate start-ups rate (HE-BCI)
* Research resource (income) per spin-out (HE-BCI)
* Average external investment per formal spin-out (HE-BCI)
* Licensing and other IP income as proportion of research income (HE-BCI)
* Contract research income with businesses (HE-BCI)
* Consultancy income with businesses (HE-BCI)
* Contract research income with the public and third sector (HE-BCI)
* Consultancy income with the public and third sector (HE-BCI)

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
import csv
import zipfile
import io
from ast import literal_eval

In [None]:
import seaborn as sn
from nuts_finder import NutsFinder

In [None]:
#Today's date as YYYY-MM-DD, used to stamp the files we save
today_str = datetime.date.today().isoformat()

### Functions

#### Simple utilities

In [None]:
def tidy_cols(my_csv):
    '''
    Returns the column names of a table in lower case, with spaces swapped for underscores

    Args:
        my_csv (df) is the table whose column names we want to tidy

    '''

    tidied = []

    for name in my_csv.columns:
        tidied.append(name.lower().replace(' ','_'))

    return(tidied)

In [None]:
def filter_data(data,var_val_pairs):
    '''
    Keeps only the rows of a dataframe that match every variable - value pair we pass

    Args:
        data (df) is the dataframe we want to filter
        var_val_pairs (dict) maps each variable (column) to the value its rows need to take

    Returns a filtered copy of the data with a fresh 0..n-1 index

    '''
    subset = data.copy()

    #Apply the conditions one after the other
    for variable in var_val_pairs:
        wanted = var_val_pairs[variable]
        subset = subset[subset[variable]==wanted]

    return(subset.reset_index(drop=True))
    

In [None]:
def check_categories(data,columns):
    '''
    Prints the frequencies of each categorical variable followed by the crosstab of every pair of them.
    We use it to decide what variables to choose, and to avoid double counting

    Args:
        data (df) is the data
        columns (list) are the categorical variables we want to check

    '''
    print('FREQUENCIES','===========','\n',sep='\n')

    #One frequency table per variable
    for column in columns:

        print(column,'=====',sep='\n')
        print(data[column].value_counts())
        print('\n')

    print('CROSSTABS','===========',sep='\n')

    #One crosstab per pair of variables
    for first,second in combinations(columns,2):

        print(first+' x '+second)
        print('=====')
        print(pd.crosstab(data[first],data[second]))
        print('\n')
        
        
    


#### Data collection

In [None]:
def hesa_parser(url,out_name,skip=16,encoding='utf-8'):
    '''
    Function to obtain and parse data from the HESA website 
    
    Args:
        url (str) is the location of the csv file
        out_name (str) is the saved name of the file
        skip (int) is the number of rows to skip (we could automate this by looking for rows at the top with lots of nans)
        encoding (str) is the encoding used to decode the download. The raw copy is written and read back with the same encoding
    
    Returns:
        A dataframe with the parsed table and tidied column names
    
    Raises:
        requests.HTTPError if the server answers with an error status
    
    '''
    
    #Request the file and stop here if the download failed, so we never save and parse an error page
    rs = requests.get(url)
    rs.raise_for_status()
    
    #Parse the file
    parsed = rs.content.decode(encoding)
    
    #NOTE(review): the raw copy goes to data/raw/hesa, not the data/raw/hebci directory this notebook creates - confirm this is intended
    raw_path = f'../../data/raw/hesa/{out_name}.txt'
    
    #Save it with an explicit encoding so the copy on disk does not depend on the platform default
    with open(raw_path,'w',encoding=encoding) as outfile:
        outfile.write(parsed)
        
    #Read it back with the same encoding
    my_csv = pd.read_csv(raw_path,skiprows=skip,encoding=encoding)
    
    #Clean column names
    my_csv.columns = tidy_cols(my_csv)
    
    return(my_csv)

    
    

#### Data processing

In [None]:
def gimme_nuts(lat,lon,level=2):
    '''
    Function to extract nuts information from a pair of coordinates
    
    Args:
        lat (float) is the latitude
        lon (float) is the longitude
        level (int) is the NUTS level we want
        
    Returns:
        A list [nuts_id, nuts_name] for the first area found at that level, or [nan, nan] if there is none
    
    NOTE(review): this relies on a global `nf` object with a find(lat=,lon=) method. It is not created in the
    cells shown in this notebook - presumably nf = NutsFinder(); confirm before using the function.
    
    '''
    
    info = nf.find(lat=lat,lon=lon)
    
    try:
        #First area at the requested level
        match = [x for x in info if x['LEVL_CODE']==level][0]
        nuts_id = match['NUTS_ID']
        nuts_name = match['NUTS_NAME']
    
    #Only treat "nothing usable came back" as a miss; anything else (eg an interrupt) should surface
    except (IndexError,KeyError,TypeError):
        print(f'failed with {np.round(lat,2)},{np.round(lon,2)}')
        nuts_id = np.nan
        nuts_name = np.nan
    
    return([nuts_id,nuts_name])

In [None]:
def compare_data(df_1,df_2,id_1,id_2,name_1,name_2):
    '''
    Prints the names attached to ids that appear in one dataset but not in the other, in both directions.
    We use it to check if the ids in two datasets we are merging are consistent.
    
    Args:
        df_1, df_2 (df) are the dfs we want to compare
        id_1, id_2 (str) are the id variables we want to check in each df
        name_1, name_2 (str) are the name variables we print to explore the mismatches
    
    '''
    
    reports = [('In 1 but not in 2',df_1,id_1,name_1,df_2,id_2),
               ('In 2 but not in 1',df_2,id_2,name_2,df_1,id_1)]
    
    for position,(title,own,own_id,own_name,other,other_id) in enumerate(reports):
        
        #Blank lines between the two reports
        if position>0:
            print('\n')
        
        print(title)
        print('==================')
        
        #Ids (ignoring missing ones) that the other dataset does not have
        missing = set(own[own_id].dropna())-set(other[other_id])
        unmatched = [x in missing for x in own[own_id]]
        
        print(set(own.loc[unmatched][own_name]))

    

#### Create NUTS aggregations

In [None]:
def make_nuts_estimate(data,nuts_lookup,counter,name,year_var='academic_year'):
    '''
    This function takes hesa data and creates a nuts estimate
    
    Args:
        data (df) where we have already selected variables of interest eg mode of employment. It needs a ukprn column
        nuts_lookup (dict) is the ukprn - nuts name and code lookup, ie {ukprn: {'nuts_name':..., 'nuts_code':...}}
        counter (str) is the variable with counts that we are interested in
        name (str) is the name given to the output series
        year_var (str) is the variable containing the years we want to group by. If None, then we are not grouping by year
    
    Returns:
        A series of sums indexed by nuts_name, nuts_code and (unless year_var is None) year_var.
        Rows whose ukprn is not in the lookup get a missing nuts name / code and drop out of the groupby
    
    '''
    
    d = data.copy()
    
    #Add the nuts names and codes (missing when the provider is not in the lookup)
    for field in ['nuts_name','nuts_code']:
        d[field] = [nuts_lookup[ukprn][field] if ukprn in nuts_lookup else np.nan for ukprn in d['ukprn']]
    
    #We are focusing on numbers
    d[counter] = d[counter].astype(float)
    
    #Group results by year?
    group_vars = ['nuts_name','nuts_code']
    
    if year_var is not None:
        group_vars.append(year_var)
    
    out = d.groupby(group_vars)[counter].sum()
    
    out.name = name
    
    return(out)

In [None]:
def multiple_nuts_estimates(data,nuts_lookup,variables,select_var,value,year_var='academic_year'):
    '''
    Creates NUTS estimates for multiple variables.
    
    Args:
        data (df) is the filtered dataframe
        nuts_lookup (dict) is the lookup between universities and nuts
        variables (list) is the list of values of select_var for which we want to generate the analysis
        select_var (str) is the variable we want to use to select values
        value (str) is the field that contains the numerical value we want to aggregate in the dataframe
        year_var (str) is the year variable. If None, then we are not interested in years
    
    Returns:
        A dataframe with one column per element of variables, indexed by nuts name, nuts code and (unless year_var is None) year
    
    '''
    
    #One estimate per value of select_var. We pass year_var through so that the estimates are grouped
    #by the year variable the caller asked for (or not grouped by year at all when it is None)
    estimates = [make_nuts_estimate(data.loc[data[select_var]==m],nuts_lookup,value,m,year_var=year_var)
                 for m in variables]
    
    #Concatenate over columns
    concat = pd.concat(estimates,axis=1)
    
    return(concat)

In [None]:
def convert_academic_year(df,year_var = 'academic_year',position=0):
    '''
    This function converts an academic year variable from HESA (eg 2017/18) into a year (int)
    
    Args:
        df (df) with the academic year we want to convert as one of the levels of its index
        year_var (str) is the name of the year variable (index level)
        position (int) is the position of the year. We default to 0 (first year)
    
    Returns:
        A copy of df where the year level has been converted to int and moved to the end of the index
    
    '''
    
    def to_year(label):
        '''Turns an academic year label such as 2017/18 into the year found at the requested position'''
        parts = label.split('/')
        first_year = int(parts[0])
        
        if position==0:
            return(first_year)
        
        #Later years only come with two digits: borrow the century from the first year
        #and roll over when the academic year crosses a century (1999/00 -> 2000)
        year = int(parts[0][:-2]+parts[position])
        
        if year<first_year:
            year += 100
        
        return(year)
    
    #Move the year out of the index (by name, wherever it sits) so we can work with it easily. This returns a new df
    df_2 = df.reset_index(level=year_var)
    
    #Create the new year variable
    df_2[year_var] = [to_year(x) for x in df_2[year_var]]
    
    #Reappend the year index
    df_2 = df_2.set_index(year_var,append=True)
    
    return(df_2)

In [None]:
def make_indicator(table,target_path,var_lookup,year_var,nuts_var='nuts_code',nuts_spec=2018,decimals=3):
    '''
    We use this function to create and save indicators using our standardised format.
    
    Args:
        table (df) is a df with relevant information. Its index is moved into columns before selecting variables
        target_path (str) is the name of the directory inside data/processed where we want to save the data
        var_lookup (dict) is a lookup to rename the variable into our standardised name. Only its first entry is used
        year_var (str) is the name of the year variable
        nuts_var (str) is the name of the NUTS code variable. We assume it is nuts_code
        nuts_spec (y) is the value of the NUTS specification. We assume we are working with 2018 NUTS
        decimals (int) is the number of decimals to round the indicator to. If 0, values are converted to int
    
    The indicator is printed (first rows) and saved as {var_code}.csv with the columns year, nuts_id, nuts_year_spec and var_code
    
    '''
    #This is the variable name and its standardised code
    var_name,var_code = list(var_lookup.items())[0]
    
    #Move the index into columns and focus on the variables we need. We take an independent copy so that
    #the columns we add below are not written into a slice of another table
    t = table.reset_index(drop=False)[[year_var,nuts_var,var_name]].copy()
    
    #Add the nuts specification
    t['nuts_year_spec'] = nuts_spec
    
    #Rename variables
    t = t.rename(columns={var_name:var_code,year_var:'year',nuts_var:'nuts_id'})
    
    #Round variables. Missing values stay missing instead of breaking the conversion to int
    if decimals>0:
        t[var_code] = [np.round(x,decimals) for x in t[var_code]]
    else:
        t[var_code] = [x if pd.isna(x) else int(x) for x in t[var_code]]
    
    #Reorder variables
    t = t[['year','nuts_id','nuts_year_spec',var_code]]
    
    print(t.head())
    
    #Save in the processed folder
    t.to_csv(f'../../data/processed/{target_path}/{var_code}.csv',index=False)

#### Directories etc

In [None]:
# Create the hebci directories in raw, interim and processed

In [None]:
#Make sure every data directory this notebook writes to exists. Besides the hebci ones, hesa_parser saves
#the raw downloads under raw/hesa and the combined table at the end is saved under processed/hesa.
#makedirs with exist_ok also creates missing parent directories and does nothing if the directory is already there
for data_dir in ['../../data/raw/hebci',
                 '../../data/interim/hebci',
                 '../../data/processed/hebci',
                 '../../data/raw/hesa',
                 '../../data/processed/hesa']:
    os.makedirs(data_dir,exist_ok=True)

## 1. Collect data

### University metadata

The [learning providers website](http://learning-provider.data.ac.uk/) contains information about universities. 

We have geocoded them in `0-jmg-university...`



In [None]:
#Load the university - NUTS lookup. literal_eval parses the saved text back into a Python object
#NOTE(review): make_nuts_estimate reads it as uni_nuts[ukprn]['nuts_name'] / ['nuts_code'] - confirm the file has that structure
with open('../../data/metadata/uni_nuts.txt','r') as infile:
    
    uni_nuts = literal_eval(infile.read())

### Spin-out activity

In [None]:
url_1 = 'https://www.hesa.ac.uk/data-and-analysis/providers/business-community/table-4e.csv'

In [None]:
spin = hesa_parser(url_1,'spin',skip=11)

In [None]:
spin.head()

### Licensing income

In [None]:
url_2 = 'https://www.hesa.ac.uk/data-and-analysis/providers/business-community/table-4d.csv'

In [None]:
ip = hesa_parser(url_2,'ip',skip=11)

In [None]:
ip.head()

### Services income

In [None]:
url_3 = 'https://www.hesa.ac.uk/data-and-analysis/providers/business-community/table-2a.csv'

In [None]:
services = hesa_parser(url_3,'services',skip=11)

In [None]:
services.head()

### Collaborative research involving public funding

In [None]:
url_4 = 'https://www.hesa.ac.uk/data-and-analysis/providers/business-community/table-1.csv'

In [None]:
collab = hesa_parser(url_4,'collab',skip=11)

In [None]:
collab.head()

## 2. Create indicators

### spinout related

Here we will focus on the number of spinouts in different categories and the levels of external investment that they have received.

This includes issues `77`, `78`, `79`.

In [None]:
spin.head()

In [None]:
interesting_columns_spin = ['country_of_he_provider','region_of_he_provider','academic_year','metric','category_marker']

#check_categories(spin,interesting_columns_spin)

In [None]:
#This creates a df with the number of active firms per category. This can be subset later

#Create a dict to filter the data
spin_n_filter = {'metric':'Number of active firms'}

#Take the categories in order of appearance. Iterating over a set of strings gives a different
#order from one session to the next, which would shuffle the columns of the output
spin_n_categories = list(spin['category_marker'].dropna().unique())

spins_number= multiple_nuts_estimates(filter_data(spin,spin_n_filter),uni_nuts,spin_n_categories,'category_marker','value',year_var='academic_year')

spins_number.columns = [x+'_n' for x in tidy_cols(spins_number)]

spins_number

In [None]:
spins_number.sort_values('graduate_start-ups_n',ascending=False).head()

In [None]:
#This one focuses on levels of investment received

#Create a dict to filter the data
spin_inv_filter = {'academic_year':'2017/18','metric':'Estimated external investment received (£ thousands)'}

#Take the categories in order of appearance. Iterating over a set of strings gives a different
#order from one session to the next, which would shuffle the columns of the output
spin_inv_categories = list(spin['category_marker'].dropna().unique())

spins_inv= multiple_nuts_estimates(filter_data(spin,spin_inv_filter),uni_nuts,spin_inv_categories,'category_marker','value')

#NOTE(review): the suffix is spelled thGPB here but thGBP for the services columns. A later cell sorts on the thGPB name, so rename with care
spins_inv.columns = [x+'_inv_thGPB' for x in tidy_cols(spins_inv)]

In [None]:
spins_inv.sort_values('staff_start-ups_inv_thGPB',ascending=False).head()

In [None]:
spin_nuts = pd.concat([spins_number,spins_inv],axis=1)

### Licensing income related

We will extract total IP revenues from the licensing income table.

In [None]:
#The licensing income table was loaded above as `ip`, while the cells in this section refer to it as `income`
income = ip

income.head()

In [None]:
interesting_columns_income = ['country_of_he_provider','region_of_he_provider','academic_year','category_marker','unit']

#check_categories(income,interesting_columns_income)

In [None]:
income_filter = {'academic_year':'2017/18','category_marker':'Total IP revenues'}

In [None]:
income_nuts = make_nuts_estimate(filter_data(income,income_filter),uni_nuts,'value','total_ip_revenues')

In [None]:
income_nuts.sort_values(ascending=False).head()

### services related

In [None]:
services.head()

In [None]:
interesting_columns = ['type_of_service','type_of_organisation','number/value_marker']

check_categories(services,interesting_columns)

In [None]:
services_filter = {'academic_year':'2017/18','type_of_organisation':'Total','number/value_marker':'Value'}

#Take the types of service in order of appearance. Iterating over a set of strings gives a different
#order from one session to the next, which would shuffle the columns of the output
service_types = list(services['type_of_service'].dropna().unique())

services_nuts = multiple_nuts_estimates(filter_data(
    services,services_filter),uni_nuts,service_types,'type_of_service','number/value')

services_nuts.columns = [x+'_thGBP' for x in tidy_cols(services_nuts)]

### Research collab related

In [None]:
collab.head()

In [None]:
interesting_columns_collab = ['academic_year','source_of_public_funding','type_of_income']

check_categories(collab,interesting_columns_collab)

None of these looks particularly relevant / new compared to information we have collected from other HESA sources - we will leave them out for now.

### Combine everything

In [None]:
hebci_nuts = pd.concat([spin_nuts,income_nuts,services_nuts],axis=1)

In [None]:
from scipy.stats import zscore

In [None]:
#Correlations do not change when we z-score the columns, so we correlate the raw columns directly.
#zscore propagates missing values by default, which turns any column with a gap into all-NaN,
#while corr() excludes missing values pair by pair
sn.clustermap(hebci_nuts.corr(),cmap='Oranges')

In [None]:
hebci_nuts.to_csv(f'../../data/processed/hesa/{today_str}_hebci_nuts.csv')