# Gateway to Research Discipline Labelling

We use an older version of GtR with better topic coverage to:

* Analyse the community structure of the topic co-occurrence network to identify disciplines
* Train a discipline classifier that we can use with the latest GtR data.

**Observations**

During data exploration we have identified a large number of missing abstracts for the social sciences. These abstracts were not missing from previous versions of the data like the one we use here to train our machine learning model. For now, we replace missing abstracts with the older ones but we still need to identify what is the source of the problem - is it GtR or our data collection process?


## Preamble

In [None]:
#import matplotlib
#matplotlib.use('cairo')

In [None]:
import ast
import os
import random

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# %load lda_pipeline.py
from gensim import corpora, models
from string import punctuation
from string import digits
import re
import pandas as pd
import numpy as np

#Characters to drop: all ASCII punctuation except the hyphen, plus the digits
drop_characters = punctuation.replace('-','')+digits

#Stopwords
from nltk.corpus import stopwords

#NLTK stores its stopword lists under lowercase file names; 'English' only
#resolves on case-insensitive filesystems
stop = stopwords.words('english')

#Stem functions (the stemmer is shared by CleanTokenize.stem)
from nltk.stem import *
stemmer = PorterStemmer()

In [None]:
# Utility functions

def flatten_list(my_list):
    '''
    Removes one level of nesting: the items of every element of my_list
    are put, in order, into a single list
    '''

    flat = []

    for el in my_list:
        flat.extend(el)

    return(flat)

def flatten(my_list):
    '''
    Returns a single list with the items of each element of my_list,
    keeping their order (one level of nesting is removed)
    '''

    items = []

    for group in my_list:
        for item in group:
            items.append(item)

    return(items)


def clean_tokenise(string,drop_characters=drop_characters,stopwords=stop):
    '''
    Takes a string and cleans it: makes it lowercase, deletes unwanted
    characters, splits it on spaces and removes stopwords.

    Args:
        string: the text to clean
        drop_characters: string with the characters to delete from the text
        stopwords: collection of tokens to discard

    Returns:
        A list with the remaining tokens in their original order

    '''

    #Lowercase
    str_low = string.lower()

    #Remove symbols and numbers. The characters are escaped so that those with a
    #special meaning inside a regex set (backslash, ], ^, -) are matched literally.
    #An empty drop_characters would build the invalid set '[]', so we skip it.
    if drop_characters:
        str_letters = re.sub('[{drop}]'.format(drop=re.escape(drop_characters)),'',str_low)
    else:
        str_letters = str_low

    #Remove stopwords. We use the stopwords argument (it used to be ignored in
    #favour of the module-level list) and a set for fast membership tests
    stopword_set = set(stopwords or [])
    clean = [x for x in str_letters.split(' ') if x!='' and x not in stopword_set]

    return(clean)


class CleanTokenize():
    '''
    This class takes a list of strings and returns a tokenised, clean list of token lists ready
    to be processed with the LdaPipeline

    It has a clean method to remove symbols and stopwords

    It has a bigram method to detect collocated words

    It has a stem method to stem words

    clean has to be called first: it creates the tokenised attribute that
    stem and bigram work on. Every method returns self so calls can be chained.

    '''

    def __init__(self,corpus):
        '''
        Takes a corpus (list where each element is a string)
        '''

        #Store
        self.corpus = corpus

    def clean(self,drop=drop_characters,stopwords=stop):
        '''
        Removes unwanted characters and stopwords from every document and tokenises it

        Args:
            drop: string with the characters to delete
            stopwords: collection of tokens to discard

        Returns:
            self, with the token lists stored in the tokenised attribute

        '''

        #The stopwords argument is passed through (it used to be replaced by the global list)
        cleaned = [clean_tokenise(doc,drop_characters=drop,stopwords=stopwords) for doc in self.corpus]

        self.tokenised = cleaned
        return(self)

    def stem(self):
        '''
        Optional: stems words with the module-level stemmer

        Returns:
            self, with tokenised replaced by its stemmed version

        '''
        #Stems each word in each tokenised sentence
        stemmed = [[stemmer.stem(word) for word in sentence] for sentence in self.tokenised]

        self.tokenised = stemmed
        return(self)


    def bigram(self,threshold=10):
        '''
        Optional: create bigrams.

        Args:
            threshold: threshold passed to the gensim Phrases model

        Returns:
            self, with tokenised replaced by the output of the phraser

        '''

        #Colocation detector trained on the data
        phrases = models.Phrases(self.tokenised,threshold=threshold)

        bigram = models.phrases.Phraser(phrases)

        #NOTE(review): the phraser output may be a lazy gensim object rather than a list - confirm before indexing it
        self.tokenised = bigram[self.tokenised]

        return(self)
        
        
        
        

class LdaPipeline():
    '''
    This class processes lists of keywords.
    How does it work?
    -It is initialised with a list where every element is a collection of keywords
    -It has a method to filter keywords removing those that do not appear often enough
    -It has a clean method that is a placeholder and currently does nothing
    -It has a method to process the filtered corpus into an object that gensim can work with
    -It has an optional method to reweight that object with tf-idf
    -It has a method to train the LDA model with the right parameters
    -It has a method to predict the topics in a corpus

    Every method returns self so that calls can be chained.

    '''

    def __init__(self,corpus):
        '''
        Takes the list of terms
        '''

        #Store the corpus
        self.tokenised = corpus

    def filter(self,minimum=5):
        '''
        Removes the keywords that appear minimum times or fewer in the whole corpus
        (only those with a count strictly above minimum are kept).

        Args:
            minimum: count that a keyword has to exceed to be kept

        Returns:
            self. tokenised holds the filtered corpus and empty_groups the
            number of documents left without any keyword

        '''

        #Load
        tokenised = self.tokenised

        #Count tokens
        token_counts = pd.Series([x for el in tokenised for x in el]).value_counts()

        #Tokens to keep (as a set for fast membership tests)
        keep = set(token_counts.index[token_counts>minimum])

        #Filter
        tokenised_filtered = [[x for x in el if x in keep] for el in tokenised]

        #Store
        self.tokenised = tokenised_filtered
        self.empty_groups = np.sum([len(x)==0 for x in tokenised_filtered])

        return(self)

    def clean(self):
        '''
        Placeholder for removing symbols and numbers. Nothing is implemented yet,
        so the corpus is left untouched.

        Returns:
            self, so that the method can sit in a chain of calls

        '''

        return(self)

    def process(self):
        '''
        This creates the bag of words we use in the gensim analysis

        Returns:
            self, with the bag of words in corpus and the gensim dictionary in dictionary

        '''
        #Load the list of keywords
        tokenised = self.tokenised

        #Create the dictionary
        dictionary = corpora.Dictionary(tokenised)

        #Create the Bag of words. This converts keywords into ids
        corpus = [dictionary.doc2bow(x) for x in tokenised]

        self.corpus = corpus
        self.dictionary = dictionary
        return(self)

    def tfidf(self):
        '''
        This is optional: We extract the term-frequency inverse document frequency of the words in
        the corpus. The idea is to identify those keywords that are more salient in a document by normalising over
        their frequency in the whole corpus

        Needs process() to have been called. Returns self, with corpus replaced
        by its tf-idf transformation.

        '''
        #Load the corpus
        corpus = self.corpus

        #Fit a TFIDF model on the data
        tfidf = models.TfidfModel(corpus)

        #Transform the corpus and save it
        self.corpus = tfidf[corpus]

        return(self)

    def fit_lda(self,num_topics=20,passes=5,iterations=75,random_state=1803):
        '''

        This fits the LDA model taking a set of keyword arguments.

        Args:
            num_topics: number of topics to extract
            passes: passed to gensim's LdaModel
            iterations: passed to gensim's LdaModel
            random_state: seed passed to gensim's LdaModel for reproducibility

        Returns:
            self, with the model in lda_model and its topics in lda_topics

        '''

        #Load the corpus
        corpus = self.corpus

        #Train the LDA model with the parameters we supplied
        lda = models.LdaModel(corpus,id2word=self.dictionary,
                              num_topics=num_topics,passes=passes,iterations=iterations,random_state=random_state)

        #Save the outputs
        self.lda_model = lda
        self.lda_topics = lda.show_topics(num_topics=num_topics)


        return(self)

    def predict_topics(self):
        '''
        This predicts the topic mix for every observation in the corpus

        Returns:
            self, with predicted_df holding one row per document and one column
            per topic id. Topics not returned for a document are set to 0

        '''
        #Load the attributes we will be working with
        lda = self.lda_model
        corpus = self.corpus

        #Apply the model to the corpus: each document becomes a list of (topic id, weight) pairs
        predicted = lda[corpus]

        #Convert this into a dataframe: one single-row df per document, stacked together
        predicted_df = pd.concat([pd.DataFrame({x[0]:x[1] for x in topics},
                                              index=[num]) for num,topics in enumerate(predicted)]).fillna(0)

        self.predicted_df = predicted_df

        return(self)
    

## Import Data

We are using as an input the processed data from `01`


In [None]:
#Seed Python's built-in random module (the seed is the string '8').
#NOTE(review): this does not seed numpy's generator, and no numpy seed is set in the visible cells - confirm
random.seed('8')

In [None]:
#Load the processed GtR table. The converters parse the list columns from their string form.
#NOTE(review): 'researc_subjects' looks like a typo of 'research_subjects' - check the csv header before changing it
raw_gtr_df = pd.read_csv(
    #'../data/raw/gtr/gtr_projects.csv',
    '../data/processed/13_6_2019_gtr_for_prediction.csv',
    converters={
        'research_topics': ast.literal_eval,
        'researc_subjects': ast.literal_eval,
    }
)

# gtr_df = raw_gtr_df[(raw_gtr_df['start_year'] >= 2006) & (raw_gtr_df['start_year'] < 2017)]
# gtr_df = gtr_df[(gtr_df['funder_name'] != 'BBSRC') & (gtr_df['funder_name'] != 'MRC')]

#Keep the projects with a year from 2006 up to (not including) 2019
gtr_df = raw_gtr_df[(raw_gtr_df['year'] >= 2006) & (raw_gtr_df['year'] < 2019)]
#gtr_df = gtr_df[(gtr_df['leadFunder'] != 'BBSRC') & (gtr_df['leadFunder'] != 'MRC')]

#gtr_df = gtr_df[(gtr_df['leadFunder'] != 'MRC')]

#Drop the projects whose list of topics contains 'Unclassified'
gtr_df = gtr_df.loc[['Unclassified' not in x for x in gtr_df['research_topics']]]

#Displayed by the notebook: rows and columns left after filtering
gtr_df.shape

## Identify topic communities in the period

#### Create all years co-occurrence graph

In [None]:
import itertools
import networkx as nx
import community

In [None]:
# We will create an edgelist from the groups in each project

#This creates every pair of topics that appear together in a project. Each pair is sorted
#so that (a,b) and (b,a) are counted as the same edge.
#(The loop variable used to be called tuple, which hid the builtin of that name)
combs = [sorted(pair) for pair in flatten([list(itertools.combinations(vals,2)) for vals in gtr_df['research_topics']])]

#Count combinations (this is the weight). Pairs are joined with '__' so they can be counted as strings
#NOTE(review): this assumes that no topic name contains '__' - confirm
combs_df = pd.Series(['__'.join(x) for x in combs]).value_counts().reset_index(drop=False)

combs_df.columns = ['vars','weight']

#This is an edgelist: split the joined pair back into its two topics
combs_df['source'],combs_df['target'] = [[v.split('__')[num] for v in combs_df['vars']] for num in [0,1]]

combs_df.drop(['vars'],axis=1,inplace=True)

In [None]:
#Create a network from the edge list, keeping 'weight' as an edge attribute
network = nx.from_pandas_edgelist(combs_df,edge_attr='weight')

#Extract the best partition. part is used below as a mapping from topic to community id
#NOTE(review): no random_state is passed here - confirm that the community ids are stable between
#runs, because the id to discipline name lookup that follows is hard-coded
part = community.best_partition(network,resolution=0.7)

NB This is only a preliminary analysis that we still need to tune

In [None]:
#What do these look like?
#Group the topics by community id and print the topics of each community on one line
pd.Series(part).reset_index(drop=False).groupby(0)['index'].apply(lambda x: print(' '.join(list(x))+'\n'))

The communities look intuitive. We will create a lookup and convert each topic into its discipline.

We will assign projects to their top discipline distinguishing between 'pure' discipline projects and mixed ones.

In [None]:
#Hand-made lookup from community id to discipline name
#NOTE(review): these ids are only valid for the partition that was inspected when the lookup was written - confirm after re-running
category_name_lookup = {6:'biological_sciences',
                        1:'physics',
                        0:'engineering_technology',
                        2:'environmental_sciences',
                        4:'social_sciences',
                        3:'arts_humanities',
                       5:'mathematics_computing'}


#Topic -> discipline name, going through the community id of the topic
topic_discipline_lookup = {top:category_name_lookup[disc] for top,disc in part.items()}

Now we look-up the disciplines

In [None]:
#Replace every topic of a project with its discipline (repeated disciplines are kept at this point)
gtr_df['discipline'] = gtr_df['research_topics'].apply(lambda x: [topic_discipline_lookup[val] for val in x])

#Projects with MRC as lead funder are labelled as medical sciences regardless of their topics
gtr_df['discipline'] = [['medical_sciences'] if lf=='MRC' else x for x,lf in zip(gtr_df['discipline'],gtr_df['leadFunder'])]

In [None]:
#Unique disciplines of each project
gtr_df['discipline_sets'] = [set(x) for x in gtr_df['discipline']]

#True when the project has exactly one discipline, False when it has several, missing when it has none
gtr_df['single_disc'] = [True if len(x)==1 else np.nan if len(x)==0 else False for x in gtr_df['discipline_sets']]

#Displayed by the notebook: mean of the single discipline flag
gtr_df['single_disc'].mean()

92% of projects are pure discipline

In [None]:
#Bar chart with the number of single-discipline projects in each discipline
gtr_df.loc[gtr_df.single_disc==True,'discipline_sets'].value_counts().plot.bar(color='darkblue',title='Disciplines')

Note the absence of medical sciences

In [None]:
#Now we very crudely assume that any project funded by the MRC is in 'Health'
#(its discipline set becomes {'medical_sciences'})

gtr_df['discipline_sets'] = [set(['medical_sciences']) if f =='MRC' else x for f,x in zip(gtr_df['leadFunder'],
                                                                                       gtr_df['discipline_sets'])]

In [None]:
#And now we create the training set: projects with exactly one discipline

#Also dropping the cases with no abstracts
gtr_pure = gtr_df.loc[[len(x)==1 for x in gtr_df['discipline_sets']]].dropna(axis=0,subset=['abstractText'])

#Displayed by the notebook: number of training projects by lead funder
gtr_pure.leadFunder.value_counts()

### Classification

In [None]:
#ML imports
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

import warnings

warnings.simplefilter('ignore',UserWarning)

In [None]:
# Utility functions

def flatten_list(my_list):
    '''
    Puts the items of every element of my_list, in order, into one flat list
    (a single level of nesting is removed)
    '''

    flat = []

    for el in my_list:
        flat += list(el)

    return(flat)


def dummies_from_list(list_of_categories):
    '''
    This function takes a list of categories and returns a df where every column is a dummy for each unique variable
    in the category.

    Args:
        list_of_categories: iterable where every element is a collection (list, set...)
            with the categories of one observation

    Returns:
        A dataframe with one row per observation and one boolean column per unique category
        (True when the observation contains the category). Columns are sorted so that
        their order is the same in every run.

    '''

    #We go through the observations several times, so we store them first
    observations = list(list_of_categories)

    #Unique categories. Iterating over a bare set gives an arbitrary order (for strings it
    #changes between sessions), which would shuffle the columns of the target, so we sort them
    cats = sorted({category for obs in observations for category in obs})

    #Create all the columns in one go instead of inserting them one at a time
    df = pd.DataFrame({category:[category in obs for obs in observations] for category in cats})

    return(df)

In [None]:
# %load text_classifier.py
# CLasses

#One class for text classification based on text inputs

class TextClassification():
    '''
    This class takes a corpus (could be a list of strings or a tokenised corpus) and a target (could be multiclass or single class).

    When it is initialised it vectorises the list of tokens using sklearn's count vectoriser.

    It has a grid search method that takes a list of models and parameters and trains the model.

    It returns the output of grid search for diagnosis

    Attributes set on initialisation:
        mode: 'multiclass' or 'single_class'
        Y: the target as it was supplied
        X: the document-term count matrix
        count_vect: the fitted count vectoriser (needed to transform new data)

    '''

    def __init__(self,corpus,target):
        '''

        Initialise. The class will recognise if we are feeding it a list of strings or a list of
        tokenised documents and vectorise accordingly.

        It will also recognise is this a multiclass or one class problem based on the dimensions of the target array

        Later on, it will use control flow to modify model parameters depending on the type of data we have

        Args:
            corpus: list of strings or list of token lists
            target: array-like with one row per document. With more than one column
                it is treated as multiclass. A 1-d target is treated as single class

        '''

        #Is this a multiclass classification problem or a single class classification problem?
        #A 1-d target has no second dimension to look at, so we check how many there are first
        target_shape = np.shape(target)

        if len(target_shape)>1 and target_shape[1]>1:
            self.mode = 'multiclass'

        else:
            self.mode = 'single_class'


        #Store the target
        self.Y = target

        #Did we feed the model a bunch of strings or a list of tokenised docs? If the former, we clean and tokenise.

        if isinstance(corpus[0],str):
            #corpus = CleanTokenize(corpus).clean().bigram().tokenised
            corpus = CleanTokenize(corpus).clean().tokenised

        #Turn every list of tokens into a string for count vectorising
        corpus_string =  [' '.join(words) for words in corpus]


        #And then we count vectorise in a hacky way.
        count_vect = CountVectorizer(stop_words='english',min_df=5).fit(corpus_string)

        #Store the features
        self.X = count_vect.transform(corpus_string)

        #Store the count vectoriser (we will use it later on for prediction on new data)
        self.count_vect = count_vect

    def grid_search(self,models):
        '''
        The grid search method takes a list with models and their parameters and it does grid search crossvalidation.

        Args:
            models: list where every element is [estimator, parameter grid]. The list is
                not modified, so it can be passed to grid_search again

        Returns:
            self, with the fitted GridSearchCV objects (one per model, in the same order) in results

        '''

        #Load inputs and targets into the model
        Y = self.Y
        X = self.X

        if self.mode=='multiclass':
            #If the model is multiclass we wrap every estimator in a one vs rest classifier and
            #add the matching prefix to its parameters. We build new containers rather than writing
            #into the caller's list: doing that wrapped and prefixed everything again on a second call
            search_space = [[OneVsRestClassifier(mod[0]),
                             {'estimator__'+k:v for k,v in mod[1].items()}] for mod in models]

        else:
            search_space = [[mod[0],mod[1]] for mod in models]


        #Container with results
        results = []

        #For each model, run the analysis.
        for num,mod in enumerate(search_space):
            #Progress indicator
            print(num)

            #Run the classifier
            clf = GridSearchCV(mod[0],mod[1])

            #Fit
            clf.fit(X,Y)

            #Append results
            results.append(clf)

        self.results = results
        return(self)

    
#Class to visualise the outputs of multilabel models.

#I call it OrangeBrick after YellowBrick, the package for ML output visualisation 
#(which currently doesn't support multilabel classification)


class OrangeBrick():
    '''
    This class takes a df with the true classes for a multilabel classification exercise and produces some charts visualising findings.

    The methods include:

        .make_metrics: calculates the confusion counts, precision and recall of each class (call it before the charts)
        .confusion_chart: creates a stacked barchart with the confusion matrices stacked by category, sorting classes by performance
        .prec_rec_chart: creates a barchart showing each class precision and recall;
        #Tobe done: Consider mixes between classes?

    '''

    def __init__(self,true_labels,predicted_labels,var_names):
        '''
        Initialise with a true labels, predicted labels and the variable names

        Args:
            true_labels: 2-d array, one column per class
            predicted_labels: 2-d array with the same layout as true_labels
            var_names: names of the classes, in the order of the columns
        '''

        self.true_labels = true_labels
        self.predicted_labels = predicted_labels
        self.var_names = var_names

    def make_metrics(self):
        '''
        Estimates performance metrics (for now just confusion charts by class and precision/recall scores for the 0.5
        decision rule.

        Returns:
            self, with score_store holding a [confusion counts, precision and recall] pair for every class

        '''
        #NB we pass the predictions as the first argument of confusion_matrix, so the rows of cf
        #are indexed by the predicted class and the columns by the ground truth.
        #This means that:
            #cf[0,0]-> TN
            #cf[1,1]-> TP
            #cf[0,1]-> FN (prediction is false, groundtruth is true)
            #cf[1,0]-> FP (prediction is true, ground truth is false)



        #Predictions and true labels
        true_labels = self.true_labels
        pred_labels = self.predicted_labels

        #Variable names
        var_names = self.var_names

        #Store confusion matrices
        score_store = []


        for num,var_name in enumerate(var_names):

            #This is the confusion matrix
            cf = confusion_matrix(pred_labels[:,num],true_labels[:,num])

            #This is a melted confusion matrix. Melting goes column by column, which gives the order of the labels below
            melt_cf = pd.melt(pd.DataFrame(cf).reset_index(drop=False),id_vars='index')['value']
            melt_cf.index = ['true_negative','false_positive','false_negative','true_positive']
            melt_cf.name = var_name

            #Order variables to separate failed vs correct predictions
            melt_cf = melt_cf.loc[['true_positive','true_negative','false_positive','false_negative']]

            #We are also interested in precision and recall
            prec = cf[1,1]/(cf[1,1]+cf[1,0])
            rec = cf[1,1]/(cf[1,1]+cf[0,1])

            prec_rec = pd.Series([prec,rec],index=['precision','recall'])
            prec_rec.name = var_name
            score_store.append([melt_cf,prec_rec])

        self.score_store = score_store

        return(self)

    def confusion_chart(self,ax):
        '''
        Plot the confusion charts

        Args:
            ax: matplotlib axis to draw on

        '''

        #Visualise confusion matrix outputs (axis is passed by keyword: the positional form is deprecated in pandas)
        cf_df = pd.concat([x[0] for x in self.score_store],axis=1)

        #This ranks categories by the error rates
        failure_rate = cf_df.apply(lambda x: x/x.sum(),axis=0).loc[['false' in x for x in cf_df.index]].sum().sort_values(
            ascending=False).index


        #Plot and add labels
        cf_df.T.loc[failure_rate,:].plot.bar(stacked=True,ax=ax,width=0.8,cmap='Accent')

        ax.legend(bbox_to_anchor=(1.01,1))
        #ax.set_title('Stacked confusion matrix for disease areas',size=16)


    def prec_rec_chart(self,ax):
        '''

        Plot a precision-recall chart

        Args:
            ax: matplotlib axis to draw on

        '''


        #Again, we sort them here (by precision) to assess model performance in the different classes
        prec_rec = pd.concat([x[1] for x in self.score_store],axis=1).T.sort_values('precision')
        prec_rec.plot.bar(ax=ax)

        #Add legend and title
        ax.legend(bbox_to_anchor=(1.01,1))
        #ax.set_title('Precision and Recall by disease area',size=16)

In [None]:
#Here is the corpus: the abstracts of the single-discipline projects
corpus = list(gtr_pure['abstractText'])

#We use a utility function to create a df for a one vs rest classification
#(one boolean column per discipline)
target = dummies_from_list(gtr_pure['discipline_sets'])


In [None]:
#Run grid search with these model parameters
#Every entry is [estimator, parameter grid to search over]
my_models = [
    [RandomForestClassifier(),
     {'class_weight':['balanced',None],'min_samples_leaf':[1,5]}],

    #The grid includes the l1 penalty. The lbfgs solver, which newer versions of scikit-learn
    #use by default, does not support it, so we ask for liblinear explicitly
    [LogisticRegression(solver='liblinear'),
     {'class_weight':['balanced',None],'penalty':['l1','l2'],
      'C':[0.1,1,100]}]]

In [None]:
# Predict groups

#Initialise the TextClassification class (this cleans and count-vectorises the abstracts)
gtr_t = TextClassification(corpus,target)

In [None]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [None]:
#Grid search over every model in my_models. The fitted searches are stored in gtr_t.results
gtr_t.grid_search(my_models)

In [None]:
#Check scores and best estimators
for res in gtr_t.results:
    print(res.best_score_)
    print(res.best_estimator_)

#We keep the tuned estimator of the second entry of my_models (the logistic regression)
#NOTE(review): the position is hard-coded rather than picked by comparing best_score_ - confirm this is intended
best_est = gtr_t.results[1].best_estimator_

In [None]:
#Diagnostics for the selected model. The predictions are made on gtr_t.X, the same
#features the grid search was fitted on, so these are in-sample metrics
gtr_diag = OrangeBrick(true_labels=np.array(target),
                      predicted_labels=best_est.predict(gtr_t.X),
                      var_names=target.columns).make_metrics()

In [None]:
#Two stacked panels: confusion counts on top, precision and recall below
fig,ax = plt.subplots(nrows=2,figsize=(10,7.5))

gtr_diag.confusion_chart(ax=ax[0])
gtr_diag.prec_rec_chart(ax=ax[1])

#fig.suptitle('Model evaluation for GTR disciplines',y=1.01,size=16)

plt.tight_layout()

Not bad

## Load unlabelled data for prediction

We will label all the projects in the df we loaded initially

In [None]:
def get_latest_file(date_list,date_format='%d-%m-%Y'):
    '''
    This function takes a list of date strings and returns the most recent one

    Args:
        date_list: a list of strings with the format date_filename, where the date is made of
            the first three chunks separated by underscores (eg 13_6_2019_gtr_for_prediction.csv)
        date_format: the format for the date once its chunks are joined with '-', defaults to %d-%m-%Y

    Returns:
        The element with the latest date. If several share it, the last one in date_list

    Raises:
        ValueError: if date_list is empty or an element does not start with a date in the expected format

    '''

    #Imported here because the import cells of the notebook do not bring datetime into scope
    import datetime

    date_list = list(date_list)

    if len(date_list)==0:
        raise ValueError('date_list is empty: there is no latest file to return')

    most_recent,latest_date = None,None

    for name in date_list:

        #This gets the date at the beginning of the file name
        date = datetime.datetime.strptime('-'.join(name.split('_')[:3]),date_format)

        #>= so that ties go to the later element, as they did when we sorted the list
        if (latest_date is None) or (date>=latest_date):
            most_recent,latest_date = name,date

    #Return the most recent file
    return(most_recent)
                                        
    
    

In [None]:
#Files in the processed data folder with 'for_prediction' in their name
prediction_files = [x for x in os.listdir('../data/processed/') if 'for_prediction' in x]


#The one with the most recent date at the beginning of its name
latest_file = get_latest_file(prediction_files)

#Displayed by the notebook
latest_file

In [None]:
#gtr_unlabelled = pd.read_csv('../data/processed/'+latest_file)

#latest_file is not read here: we label a copy of the table that was loaded at the beginning
gtr_unlabelled = raw_gtr_df.copy()

During EDA we noticed several garbage abstracts. Let's weed them out.

In [None]:
#How often does each abstract beginning (first 100 characters) appear?
#NOTE(review): slicing assumes that every abstract is a string (no missing values) - confirm
beginning_abs = pd.Series([x[:100] for x in gtr_unlabelled['abstractText']]).value_counts()

#Displayed by the notebook: the ten most repeated beginnings
beginning_abs[:10]

#### Issue: Missing abstracts in 2010 ESRC data. Are they present in the older dataset / can we replace them from there?


In [None]:
#Abstracts of all the ESRC projects with a year before 2014 (the query does not check whether the abstract is missing)
missing_abstracts_esrc = gtr_unlabelled.query('leadFunder == "ESRC" & year < 2014')['abstractText']
missing_abstracts_esrc.shape

In [None]:
#Index labels of those projects in gtr_unlabelled
missing_abstracts_esrc_indices = list(missing_abstracts_esrc.index)

In [None]:
#Load the older GtR table, parsing the list columns in the same way as above
#NOTE(review): 'researc_subjects' looks like a typo of 'research_subjects' - check the csv header before changing it
old_gtr_df = pd.read_csv(
    '../data/raw/gtr/gtr_projects.csv',
    #'../data/processed/13_6_2019_gtr_for_prediction.csv',
    converters={
        'research_topics': ast.literal_eval,
        'researc_subjects': ast.literal_eval,
    }
)

In [None]:
#Short id: what comes after the last '/' in project_id
old_gtr_df['project_id_short'] = old_gtr_df['project_id'].apply(lambda x: x.split('/')[-1])

In [None]:
#Lookup with the (non missing) older abstract of the ESRC projects identified above
#NOTE(review): the keys tested here are short project ids while the lookup holds index labels of
#gtr_unlabelled - confirm that both are the same identifiers

#A set makes every membership test constant time (with the list the whole lookup was quadratic)
esrc_index_lookup = set(missing_abstracts_esrc_indices)

older_abstracts = {index:abstract for index,abstract in zip(old_gtr_df['project_id_short'],
                                                            old_gtr_df['abstract_texts'])
                   if (index in esrc_index_lookup) and pd.notnull(abstract)}

#Most of these projects actually have abstracts - odd!

In [None]:
#Flag the abstracts whose first 10 characters appear inside the most repeated abstract beginning
gtr_unlabelled['missing_abs'] = [abst[:10] in beginning_abs.index[0] for abst in gtr_unlabelled['abstractText']]

fig,ax = plt.subplots()

#Flagged vs not flagged abstracts by year, normalised over columns
pd.crosstab(gtr_unlabelled['year'],gtr_unlabelled['missing_abs'],normalize=1).plot(ax=ax)

ax.set_xlim(2006,2018)

And now we replace these in the unlabelled data

In [None]:
#Use the older abstract when the index label of the project is a key of older_abstracts,
#otherwise keep the abstract we already had
gtr_unlabelled['abstractText'] = [abst if ind not in older_abstracts.keys() else older_abstracts[ind] for
                                         ind,abst in zip(gtr_unlabelled.index,
                                                            gtr_unlabelled['abstractText'])]

### Now we redo the analysis

For now we will drop any project with abstract appearing > 25 times

In [None]:
#Abstract beginnings that appear more than 25 times. beginning_abs was counted before the older abstracts were swapped in
uninformative_abstracts = list(beginning_abs.index[beginning_abs>25])

In [None]:
#Drop the projects whose abstract starts with one of the uninformative beginnings
gtr_unlabelled_cleaned = gtr_unlabelled.loc[[x[:100] not in uninformative_abstracts for x in gtr_unlabelled['abstractText']]]

In [None]:
#Vectorise the raw abstracts with the count vectoriser that was fitted on the training corpus
gtr_unlab_features = gtr_t.count_vect.transform(gtr_unlabelled_cleaned['abstractText'])

In [None]:
#Predicted probabilities, with one column per discipline (named after the columns of the target)
gtr_unlab_probs = pd.DataFrame(best_est.predict_proba(gtr_unlab_features),columns=target.columns)

#Get discipline names to subset things easily later
discs = target.columns

In [None]:
#Put projects and probabilities side by side. The index of the projects is reset
#(and kept as a column) so that the two tables are aligned by row position
gtr_predicted = pd.concat([gtr_unlabelled_cleaned.reset_index(drop=False),gtr_unlab_probs],axis=1)

In [None]:
import datetime
import pickle

#Date prefix for the output files. It is only created when an earlier cell has not defined it.
#We write it as day_month_year, like the existing processed files (eg 13_6_2019_gtr_for_prediction.csv)
try:
    today_str
except NameError:
    today = datetime.date.today()
    today_str = f'{today.day}_{today.month}_{today.year}'

#Save the TextClassification object (it includes the fitted count vectoriser and the grid search results)
with open(f'../models/{today_str}_gtr_discipline_classifier.p','wb') as outfile:
    pickle.dump(gtr_t,outfile)

## Some checks

In [None]:
import seaborn as sns

#### What disciplines co-occur?

In [None]:
#Correlation between disciplines after turning each probability into a 0/1 flag (1 when it is above 0.1)
sns.heatmap(gtr_predicted[discs].applymap(lambda x: 1 if x>0.1 else 0).corr())

#### Who funds what disciplines?

In [None]:
fig,ax = plt.subplots(figsize=(5,7))

#Mean predicted probability of every discipline for each lead funder
gtr_predicted.groupby(['leadFunder'])[discs].mean().plot.barh(ax=ax,width=0.8,title='Mean predicted probabilities by funder')

ax.legend(bbox_to_anchor=(1,1))

Makes sense. Each funder seems to focus on its 'core discipline'

#### What are the trends over time?

In [None]:
fig,ax = plt.subplots(figsize=(8,5))

#Number of projects per year by top discipline (the one with the highest predicted probability)
pd.crosstab(gtr_predicted['year'],gtr_predicted[discs].idxmax(axis=1)).plot(ax=ax)

ax.set_xlim(2006,2018)

#### What about the grant type category?

In [None]:
fig,ax = plt.subplots(figsize=(5,9))

#Mean predicted probability of every discipline for each grant category
gtr_predicted.groupby(['grantCategory'])[discs].mean().plot.barh(ax=ax,width=0.8,title='Mean predicted probabilities by grant category')

ax.legend(bbox_to_anchor=(1,1))

Some important differences here. For example...

* Procurement is dominated by maths. Is this because it picks up IT procurement?
* Proof of market / concept projects dominated by engineering and technology.
* Vouchers / SIGs / SME support have a strong presence of physics, picking up access to facilities?

Tidy up the variable names

In [None]:
#Columns to keep: project metadata, outputs, topics and the predicted probability of each discipline
columns_clean = ['id_y','title','abstractText','year',
                 'leadFunder','status','grantCategory','amount','currencyCode',
                 'prods','tech','spin','pubs','databases',
                 'research_topics','research_activities',
                 'mathematics_computing','engineering_technology','environmental_sciences','social_sciences','physics',
                 'medical_sciences','biological_sciences','arts_humanities']
gtr_predicted = gtr_predicted[columns_clean]

gtr_new = gtr_predicted.copy()

#Shorter names: outputs get the out_ prefix and discipline probabilities the disc_ prefix
gtr_new.rename(columns={'id_y':'project_id','abstractText':'abstract',
                             'leadFunder':'funder','grantCategory':'grant_category','currencyCode':'currency',
                             'prods':'out_prod','tech':'out_tech','spin':'out_spin','pubs':'out_pubs','databases':'out_db',
                              'mathematics_computing':'disc_maths_comp',
                              'engineering_technology':'disc_eng_tech',
                              'environmental_sciences':'disc_env',
                              'social_sciences':'disc_social',
                              'physics':'disc_physics',
                              'medical_sciences':'disc_medical',
                              'biological_sciences':'disc_biological',
                              'arts_humanities':'disc_arts_humanities'},inplace=True)

In [None]:
#Save the labelled table
#NOTE(review): today_str is not defined in the visible cells - confirm where it is set
#NOTE(review): zip compression is requested but the file name ends in .csv - confirm that whoever reads it expects a zip archive
gtr_new.to_csv(f'../data/processed/{today_str}_gtr_labelled.csv',compression='zip')

In [None]:
def random_check(corpus,num,length):
    '''
    Prints the beginning of num random examples from corpus

    Args:
        corpus: sequence of strings
        num: number of examples to print. If the corpus is smaller, all of it is printed
        length: number of characters printed from every example

    Returns:
        Nothing, the examples are printed (in the order they have in the corpus)

    '''

    #We cannot print more examples than there are. Nothing to do with an empty corpus
    n_docs = len(corpus)
    n_sample = min(num,n_docs)

    if n_sample<=0:
        return

    #Sample positions without replacement: with replacement, repeated draws meant that
    #fewer than num examples were printed
    selected = set(np.random.choice(n_docs,size=n_sample,replace=False).tolist())

    texts  = [text for position,text in enumerate(corpus) if position in selected]

    for t in texts:
        print(t[:length])
        print('====')