# Topic modelling

This notebook performs hierarchical topic modelling (and possibly other types of topic modelling) on two corpora:

* The corpus of arXiv AI papers
* The corpus of Gateway to Research

In both cases we seek to extract topics that can be used as features in additional analyses. In the case of GtR, we want to identify topics related to ethics, prediction and data that can be used to model the diffusion of AI in different fields. In the case of arXiv, we want to identify the purpose of AI projects and explore differences across countries and across teams with different levels of gender diversity.

## Preamble

In [None]:
%matplotlib inline

In [None]:
# Imports

#Imports
from sbmtm import sbmtm
import graph_tool.all as gt

In [None]:
# %load lda_pipeline.py
from gensim import corpora, models
from string import punctuation
from string import digits
import re
import pandas as pd
import numpy as np

# Characters deleted during cleaning: ASCII punctuation without the hyphen,
# followed by the ten digits
drop_characters = punctuation.replace('-', '') + digits

# Stopwords
from nltk.corpus import stopwords

# NLTK names its stopword lists in lowercase ('english'). The capitalised
# spelling only resolves on case-insensitive filesystems, so the canonical
# name is used to keep the notebook portable.
stop = stopwords.words('english')

# Stem functions
# NOTE(review): the wildcard import is what brings PorterStemmer into scope;
# it presumably exposes other NLTK stemmers too - confirm before narrowing it.
from nltk.stem import *
# Shared stemmer instance used by CleanTokenize.stem
stemmer = PorterStemmer()


def clean_tokenise(string,drop_characters=drop_characters,stopwords=stop):
    '''
    Lowercase a string, delete unwanted characters and drop stopwords.

    Inputs:
        -string: the text to clean
        -drop_characters: string whose individual characters are deleted from
         the text (defaults to the module-level punctuation + digits set)
        -stopwords: collection of tokens to discard (defaults to the
         module-level stopword list)

    Outputs:
        -A list with the remaining tokens, in their original order

    '''

    #Lowercase
    str_low = string.lower()

    #Remove symbols and numbers. The characters are escaped so that regex
    #metacharacters in the set are matched literally, and an empty set is
    #skipped because '[]' is not a valid pattern
    if drop_characters:
        drop_pattern = '[{drop}]'.format(drop=re.escape(drop_characters))
        str_letters = re.sub(drop_pattern,'',str_low)
    else:
        str_letters = str_low

    #Remove stopwords (using the argument that was passed in) and the empty
    #strings produced by consecutive spaces
    stopword_set = set(stopwords)
    clean = [x for x in str_letters.split(' ') if x and x not in stopword_set]

    return(clean)


class CleanTokenize():
    '''
    This class takes a list of strings and returns a tokenised, clean list of token lists ready
    to be processed with the LdaPipeline

    It has a clean method to remove symbols and stopwords

    It has a bigram method to detect collocated words

    It has a stem method to stem words

    Every method returns self so the steps can be chained. clean() must run
    first because it creates the `tokenised` attribute the other steps read.

    '''

    def __init__(self,corpus):
        '''
        Takes a corpus (list where each element is a string)
        '''

        #Store
        self.corpus = corpus

    def clean(self,drop=drop_characters,stopwords=stop):
        '''
        Removes symbols, numbers and stopwords from every document.

        Inputs:
            -drop: characters to delete from each document
            -stopwords: tokens to discard

        The result is stored in self.tokenised (one token list per document).

        '''

        #Forward the caller's stopwords rather than the module-level default
        cleaned = [clean_tokenise(doc,drop_characters=drop,stopwords=stopwords) for doc in self.corpus]

        self.tokenised = cleaned
        return self

    def stem(self):
        '''
        Optional: stems words

        '''
        #Stems each word in each tokenised sentence
        stemmed = [[stemmer.stem(word) for word in sentence] for sentence in self.tokenised]

        self.tokenised = stemmed
        return self


    def bigram(self,threshold=10):
        '''
        Optional: create bigrams.

        Inputs:
            -threshold: passed to gensim's Phrases model

        self.tokenised is replaced by the output of the trained phraser
        applied to the current documents.
        NOTE(review): that output is presumably a lazy gensim wrapper rather
        than a list - confirm before indexing into it.

        '''

        #Colocation detector trained on the data
        phrases = models.Phrases(self.tokenised,threshold=threshold)

        bigram = models.phrases.Phraser(phrases)

        self.tokenised = bigram[self.tokenised]

        return self
        
        
        
        

class LdaPipeline():
    '''
    This class processes lists of keywords.
    How does it work?
    -It is initialised with a list where every element is a collection of keywords
    -filter() removes keywords that appear fewer than a set number of times
    -process() turns the keywords into the dictionary and bag of words gensim works with
    -tfidf() optionally re-weights the bag of words
    -fit_lda() trains the LDA model
    -predict_topics() predicts the topic mix of every document in the corpus

    Every step except the clean() placeholder returns self so calls can be chained.

    '''

    def __init__(self,corpus):
        '''
        Takes the list of terms
        '''

        #Store the corpus
        self.tokenised = corpus

    def filter(self,minimum=5):
        '''
        Removes keywords that appear fewer than `minimum` times in the corpus.

        Inputs:
            -minimum: lowest number of occurrences a keyword needs to be kept

        Stores the filtered documents in self.tokenised and the number of
        documents left without any keyword in self.empty_groups.

        '''

        #Load
        tokenised = self.tokenised

        #Count tokens
        token_counts = pd.Series([x for el in tokenised for x in el]).value_counts()

        #Tokens to keep: a keyword occurring exactly `minimum` times stays,
        #as the docstring promises
        keep = set(token_counts.index[token_counts>=minimum])

        #Filter
        tokenised_filtered = [[x for x in el if x in keep] for el in tokenised]

        #Store
        self.tokenised = tokenised_filtered
        self.empty_groups = np.sum([len(x)==0 for x in tokenised_filtered])

        return self

    def clean(self):
        '''
        Remove symbols and numbers

        NOTE(review): placeholder - there is no implementation, so calling it
        returns None and cannot be chained.

        '''

    def process(self):
        '''
        This creates the bag of words we use in the gensim analysis.

        Stores the bag of words in self.corpus and the gensim dictionary in
        self.dictionary.

        '''
        #Load the list of keywords
        tokenised = self.tokenised

        #Create the dictionary
        dictionary = corpora.Dictionary(tokenised)

        #Create the Bag of words. This converts keywords into ids
        corpus = [dictionary.doc2bow(x) for x in tokenised]

        self.corpus = corpus
        self.dictionary = dictionary
        return self

    def tfidf(self):
        '''
        This is optional: We extract the term-frequency inverse document frequency of the words in
        the corpus. The idea is to identify those keywords that are more salient in a document by normalising over
        their frequency in the whole corpus.

        Requires process() to have run; self.corpus is replaced by its
        TF-IDF transformed version.

        '''
        #Load the corpus
        corpus = self.corpus

        #Fit a TFIDF model on the data
        tfidf = models.TfidfModel(corpus)

        #Transform the corpus and save it
        self.corpus = tfidf[corpus]

        return self

    def fit_lda(self,num_topics=20,passes=5,iterations=75,random_state=1803):
        '''
        This fits the LDA model taking a set of keyword arguments.

        Inputs:
            -num_topics: number of topics to extract
            -passes, iterations: training settings forwarded to gensim's LdaModel
            -random_state: seed forwarded to gensim's LdaModel for reproducibility

        Stores the model in self.lda_model and the output of its
        show_topics() in self.lda_topics.

        '''

        #Load the corpus
        corpus = self.corpus

        #Train the LDA model with the parameters we supplied
        lda = models.LdaModel(corpus,id2word=self.dictionary,
                              num_topics=num_topics,passes=passes,iterations=iterations,random_state=random_state)

        #Save the outputs
        self.lda_model = lda
        self.lda_topics = lda.show_topics(num_topics=num_topics)

        return self

    def predict_topics(self):
        '''
        This predicts the topic mix for every observation in the corpus.

        Stores a dataframe in self.predicted_df with one row per document and
        one column per topic id; topics missing from a document's prediction
        are filled with 0.

        '''
        #Load the attributes we will be working with
        lda = self.lda_model
        corpus = self.corpus

        #Apply the model to the corpus
        predicted = lda[corpus]

        #Convert this into a dataframe: each document yields (topic id, weight) pairs
        predicted_df = pd.concat([pd.DataFrame({x[0]:x[1] for x in topics},
                                              index=[num]) for num,topics in enumerate(predicted)]).fillna(0)

        self.predicted_df = predicted_df

        return self
    

In [None]:
import random

## Load arXiv

In [None]:
# Read the zipped arXiv/MAG table (paper ids parsed as strings), then keep
# only the three columns used below
arxiv = pd.read_csv(
    '../data/external/18_7_2019_arxiv_mag.csv',
    dtype={'paper_id':str},
    compression='zip',
)
arxiv = arxiv[['paper_id','abstract','is_ai']]

In [None]:
# Paper-to-institute matches; used below to drop the papers that have no GRID match.
# Article ids are parsed as strings.
grid_matches = pd.read_csv(
    '../data/external/1_8_2019_paper_institute_locations.csv',
    dtype={'article_id':str},
    compression='zip',
)

In [None]:
# Article ids of the rows that have an institute id
matched_paper_ids = set(grid_matches.loc[grid_matches['institute_id'].notna(),'article_id'])

# Number of distinct paper ids in the arXiv table
len(set(arxiv['paper_id']))

In [None]:
# Number of distinct article ids in the institute match table
len(set(grid_matches['article_id']))

In [None]:
# Keep only the papers whose id is among the matched ids
keep_rows = [paper_id in matched_paper_ids for paper_id in arxiv['paper_id']]
arxiv = arxiv.loc[keep_rows]

# Size of the table after filtering
arxiv.shape

In [None]:
# Focus on AI papers

In [None]:
# Rows whose is_ai flag equals True, renumbered from zero
ai = arxiv.loc[arxiv['is_ai'].eq(True)]
ai = ai.reset_index(drop=True)

In [None]:
# Put each abstract on a single line (newlines become spaces) and trim it
ai_text = [re.sub('\n',' ',abstract).strip() for abstract in ai['abstract']]

# Paper ids in the same row order as ai_text
project_ids = ai['paper_id'].tolist()

In [None]:
# Clean and tokenise the abstracts, then run the collocation step twice.
# Each method returns the same object, so the steps are applied one by one.
ct = CleanTokenize(ai_text)
ct = ct.clean()
ct = ct.bigram()
ct = ct.bigram()

In [None]:
# Materialise the tokenised documents as a list.
# NOTE(review): after bigram() the attribute holds whatever the gensim phraser
# returns - presumably a lazy wrapper, hence the list() call; confirm.
corpus_tokenised = list(ct.tokenised)

In [None]:
# Extract a random set of observations

def get_random(initial,size):
    '''
    This function extracts a random set of observations (without replacement)
    and returns them together with their indices in the original

    Inputs:
        -initial: the list of observations
        -size: the number of observations we want to extract

    Outputs:
        -A two-element list [indices, selected]: indices is the set of sampled
         positions in initial (plain ints) and selected holds the matching
         observations in their original order

    Raises:
        -ValueError (from random.sample) if size is negative or larger than len(initial)

    '''

    #Sample positions straight from a range: there is no need to build a numpy
    #array and copy it into a list first
    indices = set(random.sample(range(len(initial)),k=size))

    selected = [x for n,x in enumerate(initial) if n in indices]

    return([indices,selected])

In [None]:
# Draw a random subsample of up to 25000 documents. The size is capped at the
# corpus length so that a smaller corpus does not make the sampler raise.
# NOTE(review): the sampler is not seeded here, so the subsample differs between runs.
sample_size = min(25000,len(corpus_tokenised))
short_test = get_random(corpus_tokenised,sample_size)

# Ids and documents of the sampled positions, both in original corpus order
short_ids = [x for n,x in enumerate(project_ids) if n in short_test[0]]
short_corpus = short_test[1]

In [None]:
%%time
## Create an instance of the sbmtm class
model = sbmtm()

## Build the word-document network from the sampled corpus; the paper ids are
## passed as the document labels
model.make_graph(short_corpus,documents=short_ids)

## Fit the model. graph-tool's random number generator is seeded first so that
## repeated fits give the same results
gt.seed_rng(32) ## seed for graph-tool's random number generator --> same results
model.fit()

In [None]:
#model.topics(l=1)

In [None]:
# pickle and today_str are not imported or defined anywhere else in the
# notebook, so they are set up here to keep the cell runnable on a fresh kernel
import pickle
from datetime import date

# Date stamp in the day_month_year form used by the dated data files (e.g. 18_7_2019)
_today = date.today()
today_str = f'{_today.day}_{_today.month}_{_today.year}'

# Save the fitted model together with the sampled ids and documents it was fitted on
with open(f'../models/{today_str}_arxiv_sbm.p','wb') as outfile:
    pickle.dump([model,short_ids,short_corpus],outfile)

## Corex with the GtR data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Read the zipped Gateway to Research table and keep the id and abstract columns
gtr = pd.read_csv(
    '../data/processed/19_7_2019_gtr_processed.csv',
    compression='zip',
)
gtr = gtr[['project_id','abstract']]

In [None]:
# Replace newlines and the literal text 'nsbp' with spaces, then trim each abstract.
# NOTE(review): 'nsbp' looks like a misspelling of 'nbsp' (the HTML non-breaking
# space entity) - confirm against the data before changing the pattern.
gtr_abstract = [re.sub('\n|nsbp',' ',x).strip() for x in gtr['abstract']]

In [None]:
#ct_gtr = CleanTokenize(gtr_abstract).clean().bigram().bigram()

In [None]:
# Vectoriser over unigrams, bigrams and trigrams with English stopwords
# removed and accents stripped. ngram_range is given as a tuple, which is the
# documented type and the one recent scikit-learn versions validate against.
count_vect = CountVectorizer(
    ngram_range=(1,3),
    min_df=5,
    max_df=0.5,
    stop_words='english',
    strip_accents='unicode',
)

In [None]:
# Fit the vectoriser on the GtR abstracts and transform them in one step
doc_vec = count_vect.fit_transform(gtr_abstract)

In [None]:
# Vocabulary terms ordered by the integer the vectoriser assigned to each of them
word_vector = sorted(count_vect.vocabulary_,key=count_vect.vocabulary_.get)

In [None]:
# Anchor words, one inner list per anchored topic
anchor_list = [
    ['ethical','ethics','unethical','ethical issues'],
    ['legal','law','legality','illegal'],
    ['prediction','predictive','predict'],
    ['data','dataset','data infrastructure','database'],
    ['automate','automatic','automation'],
]

### Corex

In [None]:
import corextopic as ct

In [None]:
# Corex model with 50 hidden units.
# NOTE(review): `ct` is rebound to the corextopic import here, replacing the
# CleanTokenize object created earlier in the notebook under the same name.
topic_model = ct.Corex(n_hidden=50)

In [None]:
# Fit on the document-term matrix, passing the vocabulary and the anchor word lists
topic_model.fit(doc_vec, words=word_vector, anchors=anchor_list, anchor_strength=4)

In [None]:
# Print every topic as "<1-based number>: word1,word2,..."
topics = topic_model.get_topics()
for topic_n,topic in enumerate(topics):
    # Read the word from the first field of each entry instead of unpacking
    # fixed-width pairs: this also works for empty topics and for entries
    # that carry more than two fields
    words = [entry[0] for entry in topic]
    topic_str = str(topic_n+1)+': '+','.join(words)
    print(topic_str)

In [None]:
# Column labels: "<0-based topic index>:<topic words separated by spaces>"
topic_labels = [f'{n}:'+' '.join([x[0] for x in topic]) for n,topic in enumerate(topics)]

# Table built from the model's p_y_given_x, one row per project id
topic_df_probabilities = pd.DataFrame(
    topic_model.p_y_given_x,
    index=gtr['project_id'],
    columns=topic_labels,
)

In [None]:
# today_str is not defined anywhere else in the notebook, so it is built here
# in the day_month_year form used by the dated data files (e.g. 19_7_2019)
from datetime import date

_today = date.today()
today_str = f'{_today.day}_{_today.month}_{_today.year}'

# Save the topic mix as a zip-compressed csv
topic_df_probabilities.to_csv(f'../data/processed/{today_str}_gtr_corex_topic_mix.csv',compression='zip')

### This is what those topics look like

In [None]:
import seaborn as sns
# Clustered heatmap of the pairwise correlations between the topic columns
sns.clustermap(topic_df_probabilities.corr(),cmap='seismic')