# Other relevant topic labelling

**Context**
We are monitoring drivers for the diffusion of AI research in different industries. In a first pass of the analysis we operationalise these drivers with an expanded keyword search. We briefly outlined the problems with this in `01a_jmg_`, namely that our vocabularies are small and we don't have an index of certainty in our classification. Here we address this by training a model on a labelled dataset including information about projects related to ethics and projects related to law. We will then use that model to predict the probability that any project in the data is considering ethical issues.

**Activities**
1. Load and process GtR data
2. Identify relevant keywords
3. Train and evaluate models
4. Test results


## 0. Preamble

In [None]:
%run notebook_preamble.ipy
%run lda_pipeline.py
%run text_classifier.py
%run keyword_searches.py
%run utils.py

In [None]:
import ast
import numpy as np


In [None]:
# Silence pandas' chained-assignment warning: later cells modify and add columns to
# filtered subsets of the raw projects table, which would otherwise trigger it.
pd.options.mode.chained_assignment = None  # default='warn'

## 1. Load data

In [None]:
# Load the GtR projects table. The converters run ast.literal_eval on the listed
# columns so that values stored as Python-literal strings come back as objects.
# NOTE(review): 'researc_subjects' looks like a typo for 'research_subjects'.
# If no column has that exact name the converter is presumably never applied -
# confirm against the CSV header before renaming the key.
raw_gtr_df = pd.read_csv(
    '../data/raw/gtr/gtr_projects.csv',
    converters={
        'research_topics': ast.literal_eval,
        'researc_subjects': ast.literal_eval,
    }
)

# Keep projects whose start year is 2006 or later and earlier than 2017
gtr_df = raw_gtr_df[(raw_gtr_df['start_year'] >= 2006) & (raw_gtr_df['start_year'] < 2017)]

# Drop projects without an abstract. Reassigning the result (instead of calling
# dropna(inplace=True) on a filtered slice) avoids mutating a derived frame in place.
gtr_df = gtr_df.dropna(axis=0, subset=['abstract_texts'])

# Keep only projects that have at least one research topic; a later cell
# divides by the number of topics per project.
gtr_df = gtr_df.loc[[len(x) > 0 for x in gtr_df['research_topics']]]

#gtr_df = gtr_df[(gtr_df['funder_name'] != 'BBSRC') & (gtr_df['funder_name'] != 'MRC')]

In [None]:
# Research topics whose lower-cased, space-separated words include at least one
# of the legal / ethical keywords below.
legal_ethical = [
    topic
    for topic in set(flatten_list(gtr_df['research_topics']))
    if {'law', 'rights', 'jurisprudence', 'legal',
        'ethics', 'ethical', 'moral', 'privacy'} & set(topic.lower().split(' '))
]

In [None]:
# Flag a project as legal/ethical when more than half of its research topics
# are in the legal_ethical topic list.
# The lookup set is built once here instead of once per project.
legal_ethical_lookup = set(legal_ethical)

gtr_df['legal_ethical'] = [
    (len(set(topics) & legal_ethical_lookup) / len(topics)) > 0.5
    for topics in gtr_df['research_topics']
]

## 2. Train model

In [None]:
# One-hot encode the boolean flag: one indicator column per distinct value of 'legal_ethical'
target = pd.get_dummies(gtr_df['legal_ethical'])

In [None]:
# Pair the abstracts (corpus) with the one-hot labels (target) in the project's
# TextClassification helper. NOTE(review): presumably defined in text_classifier.py - confirm.
tc = TextClassification(corpus=gtr_df['abstract_texts'],target=target)

In [None]:
#Run grid search with these model parameters
#Each entry is an [estimator, parameter grid] pair
models = [
    [RandomForestClassifier(),
     {'class_weight':['balanced'],'min_samples_leaf':[1,5]}],

    #The solver is pinned to liblinear because the grid includes penalty='l1':
    #liblinear supports both l1 and l2, whereas the default solver in
    #scikit-learn >= 0.22 (lbfgs) rejects l1. liblinear was the default before that.
    [LogisticRegression(),
     {'class_weight':['balanced'],'penalty':['l1','l2'],
      'C':[0.1,1,100],'solver':['liblinear']}]]

In [None]:
# Run the grid search over the candidate models; the fitted searches are read back from tc.results
tc.grid_search(models)

In [None]:
#Check scores and best estimators: one score / estimator pair per fitted search
for search in tc.results:
    print(search.best_score_, search.best_estimator_, sep='\n')

#This is the best estimator
#NOTE(review): the index is hard-coded; presumably entry 1 is the LogisticRegression
#search (second entry of `models`) - re-check against the scores printed above.
best_est = tc.results[1].best_estimator_

In [None]:
# Build evaluation metrics by comparing the one-hot labels with the chosen model's predictions.
# NOTE(review): predictions are made on tc.X, which looks like the same feature matrix
# the grid search was fitted on - if so these are in-sample metrics; confirm.
diag = OrangeBrick(true_labels=np.array(target),
                      predicted_labels=best_est.predict(tc.X),
                      var_names=target.columns).make_metrics()

In [None]:
# Two stacked panels: confusion chart on top, precision/recall chart below
fig,ax = plt.subplots(nrows=2,figsize=(10,7.5))

diag.confusion_chart(ax=ax[0])
diag.prec_rec_chart(ax=ax[1])

# Optional figure title, currently disabled
#fig.suptitle('Model evaluation for GTR disciplines',y=1.01,size=16)

plt.tight_layout()

In [None]:
# Column-wise sum of the predictions.
# NOTE(review): presumably these are 0/1 indicators, making this the number of projects predicted per class - confirm.
pd.DataFrame(best_est.predict(tc.X)).sum()

In [None]:
# Put labels, predictions and abstracts side by side for manual inspection.
# predict() returns a plain array, so a DataFrame built from it would get a fresh
# 0..n-1 index, while target and gtr_df keep the index of the filtered projects
# table. concat(axis=1) aligns on index, so the predictions are given the labels'
# index explicitly to keep every row on the right project.
# NOTE(review): assumes the rows of tc.X are in the same order as the corpus passed
# to TextClassification - confirm.
var_comb = pd.concat(
    [
        pd.DataFrame(target),
        pd.DataFrame(best_est.predict(tc.X), index=target.index),
        gtr_df['abstract_texts'],
    ],
    axis=1,
)
var_comb.columns = ['actual_no_legal','actual_legal','pred_no_legal','pred_legal','abstract']

In [None]:
# Spot-check abstracts labelled legal/ethical that the model also predicts as legal/ethical (true positives)
random_check(var_comb.loc[(var_comb.actual_legal==1) & (var_comb.pred_legal==1)]['abstract'],length=1000,num=5)

In [None]:
# Spot-check abstracts not labelled legal/ethical that the model predicts as legal/ethical (false positives)
random_check(var_comb.loc[(var_comb.actual_legal==0) & (var_comb.pred_legal==1)]['abstract'],length=1000,num=5)

In [None]:
# Spot-check abstracts labelled legal/ethical that the model does not predict as legal/ethical (false negatives)
random_check(var_comb.loc[(var_comb.actual_legal==1) & (var_comb.pred_legal==0)]['abstract'],length=1000,num=5)

### Modelling approach 2: Using document vectors

We will use document vectors (in a 300-dimensional space) to predict the labels.

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
#Get the tokenised corpus: run the project's CleanTokenize helper over the abstracts
#(clean step, then bigram step) and keep its tokenised output
corpus_tokenised = CleanTokenize(gtr_df['abstract_texts']).clean().bigram().tokenised

In [None]:
#Create the tagged documents: each tokenised abstract is tagged with its position in the corpus
tagged_docs = [TaggedDocument(w,[i]) for i,w in enumerate(corpus_tokenised)]

#Train the doc2vec model with 300-dimensional document vectors.
#`vector_size` is the supported name for the dimensionality argument; the older
#`size` alias is deprecated in gensim 3.x and removed in gensim 4.
d2v = Doc2Vec(documents=tagged_docs,vector_size=300,window=5,min_count=2)

In [None]:
#Run grid search with these model parameters
#Each entry is an [estimator, parameter grid] pair
models = [
    [RandomForestClassifier(),
     {'class_weight':['balanced'],'min_samples_leaf':[1,5]}],

    #The solver is pinned to liblinear because the grid includes penalty='l1':
    #liblinear supports both l1 and l2, whereas the default solver in
    #scikit-learn >= 0.22 (lbfgs) rejects l1. liblinear was the default before that.
    [LogisticRegression(),
     {'class_weight':['balanced'],'penalty':['l1','l2'],
      'C':[0.1,1,100],'solver':['liblinear']}]]

In [None]:
def grid_search(target, features, models):
    '''
    Grid search over several models, each wrapped in a one-vs-rest classifier.

    Arguments:
        target: the variable(s) we want to predict
        features: the predictors
        models: sequence of [estimator, parameter grid] pairs. The parameter grid
            maps the estimator's own parameter names to lists of candidate values.

    Returns:
        A list with one fitted GridSearchCV object per entry in models,
        in the same order.

    The entries of models are not modified, so the same list can be passed
    to this function more than once.
    '''

    #Container with results
    results = []

    #For each model, run the analysis.
    for num, mod in enumerate(models):
        #Progress indicator
        print(num)

        #Make ovr (built as a new object instead of overwriting mod[0])
        ovr_estimator = OneVsRestClassifier(mod[0])

        #Add the estimator prefix so the parameters reach the wrapped estimator.
        #A new dict is built so the caller's parameter grid is left untouched.
        ovr_params = {'estimator__' + k: v for k, v in mod[1].items()}

        #Run the classifier
        clf = GridSearchCV(ovr_estimator, ovr_params)

        #Fit
        clf.fit(features, target)

        #Append results
        results.append(clf)

    return results

In [None]:
# Matrix of trained document vectors, used as model features.
# NOTE(review): presumably row i is the vector for the document tagged i, i.e. rows follow corpus order - confirm.
doc2vec_features = np.array(d2v.docvecs.vectors_docs)

# Grid search the candidate models on the document vectors, one fitted search per entry of `models`
doc_models = grid_search(target=target,features=doc2vec_features,models=models)

In [None]:
#Check scores and best estimators: one score / estimator pair per fitted search
for search in doc_models:
    print(search.best_score_, search.best_estimator_, sep='\n')

#This is the best estimator
#NOTE(review): the index is hard-coded; entry 1 corresponds to the second entry of
#`models` (LogisticRegression) - re-check against the scores printed above.
best_est = doc_models[1].best_estimator_

In [None]:
# Build evaluation metrics for the document-vector model by comparing the one-hot labels with its predictions.
# NOTE(review): predictions are made on doc2vec_features, the same matrix passed to
# grid_search, so these look like in-sample metrics - confirm.
eth_diag = OrangeBrick(true_labels=np.array(target),
                      predicted_labels=best_est.predict(doc2vec_features),
                      var_names=target.columns).make_metrics()

In [None]:
# Two stacked panels: confusion chart on top, precision/recall chart below
fig,ax = plt.subplots(nrows=2,figsize=(10,7.5))

eth_diag.confusion_chart(ax=ax[0])
eth_diag.prec_rec_chart(ax=ax[1])

# Optional figure title, currently disabled
#fig.suptitle('Model evaluation for GTR disciplines',y=1.01,size=16)

plt.tight_layout()