# ArXiv keyword search

We tried two supervised models to classify arXiv papers into creative industries, without success.

Having abandoned that line of analysis, we adopt a keyword-based search here instead.



## Preamble

In [None]:
# Run the shared preamble script before anything else.
# NOTE(review): presumably this is what brings `pd` into scope for the data-loading cell,
# which uses it before pandas is imported further down - confirm
%run notebook_preamble.ipy

In [None]:
# Functions etc here

import random

def flatten_list(a_list):
    '''
    Flattens one level of nesting.

    Takes an iterable of iterables and returns a single list with the items of every
    inner iterable, in their original order.

    '''
    flat = []

    for inner in a_list:
        flat.extend(inner)

    return(flat)

## Load data etc

In [None]:
# Load the processed arXiv data. compression='zip' makes pandas read the file as a zip archive
# even though its name ends in .csv.
# NOTE(review): pd is used here before the `import pandas as pd` further down; presumably the
# preamble provides it - confirm
arx = pd.read_csv('../data/processed/6_8_2019_arxiv_processed.csv',compression='zip')

# One row per article_id, with the index renumbered from 0.
# NOTE(review): the visible cells below work on `arx`, not on this de-duplicated table - confirm that is intended
arx_papers = arx.drop_duplicates('article_id').reset_index(drop=True)

In [None]:
# %load lda_pipeline.py
from gensim import corpora, models
from string import punctuation
from string import digits
import re
import pandas as pd
import numpy as np

#Characters stripped from the text before tokenising: ASCII punctuation
#(the hyphen is kept out of the list so hyphenated terms survive) followed by the digits
drop_characters = punctuation.replace('-','')+digits

#Stopwords
from nltk.corpus import stopwords

#The NLTK stopword lists are stored under lowercase file ids ('english'). The capitalised
#form only resolves on case-insensitive file systems, so use the canonical lowercase id.
stop = stopwords.words('english')

#Stem functions
#NOTE(review): the wildcard import is what provides PorterStemmer here; it may also supply other
#names to cells outside this view, so it is left as is
from nltk.stem import *
#Single shared stemmer instance, used by CleanTokenize.stem
stemmer = PorterStemmer()


def clean_tokenise(string,drop_characters=drop_characters,stopwords=stop):
    '''
    Takes a string and returns its list of clean tokens.

    The string is lowercased, every character in drop_characters is deleted, the result is
    split on whitespace and the tokens found in stopwords are discarded.

    Arguments:
        string: the text to clean
        drop_characters: a string with the characters to delete (each one is treated literally)
        stopwords: a collection of tokens to discard

    Returns:
        A list of tokens, in the order they appear in the text

    '''

    #Lowercase
    str_low = string.lower()

    #Remove symbols and numbers. translate deletes each character literally, so characters
    #with a special meaning in a regular expression (backslash, brackets, caret) are removed too
    str_letters = str_low.translate(str.maketrans('','',drop_characters))

    #Use the stopwords that were passed in (not the module-level list) and turn them into
    #a set so that each membership test is a hash lookup
    stopword_set = set(stopwords)

    #Split on any run of whitespace: abstracts can contain newlines and tabs as well as spaces,
    #and split() never produces empty tokens
    clean = [x for x in str_letters.split() if x not in stopword_set]

    return(clean)


class CleanTokenize():
    '''
    This class takes a list of strings and returns a tokenised, clean list of token lists ready
    to be processed with the LdaPipeline

    Every method stores its result in self.tokenised and returns self, so the calls can be
    chained, e.g. CleanTokenize(corpus).clean().bigram()

    -It has a clean method to remove symbols and stopwords. It has to run first because it is
    the one that creates self.tokenised

    -It has a bigram method to detect collocated words

    -It has a stem method to stem words

    '''

    def __init__(self,corpus):
        '''
        Takes a corpus (list where each element is a string)
        '''

        #Store
        self.corpus = corpus

    def clean(self,drop=drop_characters,stopwords=stop):
        '''
        Tokenises every document in the corpus with clean_tokenise.

        Arguments:
            drop: a string with the characters to remove from the text
            stopwords: the tokens to discard

        '''

        #Pass on the stopwords the caller supplied rather than the module-level list
        cleaned = [clean_tokenise(doc,drop_characters=drop,stopwords=stopwords) for doc in self.corpus]

        self.tokenised = cleaned
        return(self)

    def stem(self):
        '''
        Optional: stems words

        '''
        #Stems each word in each tokenised sentence
        stemmed = [[stemmer.stem(word) for word in sentence] for sentence in self.tokenised]

        self.tokenised = stemmed
        return(self)

    def bigram(self,threshold=10):
        '''
        Optional: Create bigrams.

        Trains a gensim Phrases model on the tokenised corpus and applies it to that same corpus.

        Arguments:
            threshold: passed to gensim's Phrases as its threshold argument

        '''

        #Colocation detector trained on the data
        phrases = models.Phrases(self.tokenised,threshold=threshold)

        bigram = models.phrases.Phraser(phrases)

        #Applying the phraser to a corpus gives a lazy view that re-runs the detector on every
        #iteration. Store a plain list so later passes (another bigram call, word2vec training,
        #keyword counting) reuse the result instead of recomputing it each time
        self.tokenised = list(bigram[self.tokenised])

        return(self)
        
        
        
        

class LdaPipeline():
    '''
    This class processes lists of keywords.
    How does it work?
    -It is initialised with a list where every element is a collection of keywords
    -It has a method to filter keywords removing those that appear less than a set number of times

    -It has a method to process the filtered df into an object that gensim can work with
    -It has a method to train the LDA model with the right parameters
    -It has a method to predict the topics in a corpus

    Every method returns self so the steps can be chained, e.g.
    LdaPipeline(tokens).filter().process().fit_lda().predict_topics()

    '''

    def __init__(self,corpus):
        '''
        Takes the list of terms
        '''

        #Store the corpus
        self.tokenised = corpus

    def filter(self,minimum=5):
        '''
        Removes keywords that appear less than minimum times in the whole corpus.

        Also stores in self.empty_groups the number of documents left without any keyword.

        '''

        #Load
        tokenised = self.tokenised

        #Count tokens
        token_counts = pd.Series([x for el in tokenised for x in el]).value_counts()

        #Tokens to keep: those that appear at least minimum times. (A strict > would also
        #drop the tokens that appear exactly minimum times.) A set keeps the lookups below cheap
        keep = set(token_counts.index[token_counts>=minimum])

        #Filter
        tokenised_filtered = [[x for x in el if x in keep] for el in tokenised]

        #Store
        self.tokenised = tokenised_filtered
        self.empty_groups = np.sum([len(x)==0 for x in tokenised_filtered])

        return(self)

    def clean(self):
        '''
        Placeholder for removing symbols and numbers: no cleaning is implemented here, so the
        tokens are left untouched.

        '''

        #Return self like every other step so that a chain that includes clean() keeps working
        return(self)

    def process(self):
        '''
        This creates the bag of words we use in the gensim analysis

        It stores the gensim dictionary in self.dictionary and the bag of words in self.corpus

        '''
        #Load the list of keywords
        tokenised = self.tokenised

        #Create the dictionary
        dictionary = corpora.Dictionary(tokenised)

        #Create the Bag of words. This converts keywords into ids
        corpus = [dictionary.doc2bow(x) for x in tokenised]

        self.corpus = corpus
        self.dictionary = dictionary
        return(self)

    def tfidf(self):
        '''
        This is optional: We extract the term-frequency inverse document frequency of the words in
        the corpus. The idea is to identify those keywords that are more salient in a document by normalising over
        their frequency in the whole corpus

        It needs process to have run first and it replaces self.corpus with the transformed corpus

        '''
        #Load the corpus
        corpus = self.corpus

        #Fit a TFIDF model on the data
        tfidf = models.TfidfModel(corpus)

        #Transform the corpus and save it
        self.corpus = tfidf[corpus]

        return(self)

    def fit_lda(self,num_topics=20,passes=5,iterations=75,random_state=1803):
        '''

        This fits the LDA model taking a set of keyword arguments:
        number of topics, passes, iterations and random state for reproducibility.
        All of them are handed to gensim's LdaModel.

        It stores the fitted model in self.lda_model and its topics in self.lda_topics

        '''

        #Load the corpus
        corpus = self.corpus

        #Train the LDA model with the parameters we supplied
        lda = models.LdaModel(corpus,id2word=self.dictionary,
                              num_topics=num_topics,passes=passes,iterations=iterations,random_state=random_state)

        #Save the outputs
        self.lda_model = lda
        self.lda_topics = lda.show_topics(num_topics=num_topics)

        return(self)

    def predict_topics(self):
        '''
        This predicts the topic mix for every observation in the corpus

        It stores the result in self.predicted_df: one row per document (in corpus order) and one
        column per topic id that the model returned, with 0 where a topic was not returned for a document

        '''
        #Load the attributes we will be working with
        lda = self.lda_model
        corpus = self.corpus

        #Topic mix of each document, as (topic id, weight) pairs
        predicted = lda[corpus]

        #Build the dataframe in one go from one {topic id: weight} dict per document. This avoids
        #creating and concatenating a separate dataframe for every document
        predicted_df = pd.DataFrame([dict(topics) for topics in predicted]).fillna(0)

        self.predicted_df = predicted_df

        return(self)
    

### Keyword expansion

In [None]:
from gensim.models import Word2Vec

In [None]:
#Clean and tokenise the arXiv abstracts (the 'summary' column), then run collocation detection twice.
#NOTE(review): the second bigram() pass presumably lets already-joined pairs merge into longer phrases - confirm this is intended
arxiv_pro = CleanTokenize(arx['summary']).clean().bigram().bigram()

In [None]:
#Train a word2vec model on the tokenised abstracts, with window=5.
#NOTE(review): no seed is passed, so the vectors (and the keyword expansion built on them) may differ between runs - confirm whether this matters
w2v_arx = Word2Vec(arxiv_pro.tokenised,window=5)

In [None]:
#Seed keywords for each creative industry. Multi-word seeds are written with an underscore,
#and every seed is lowercase like the cleaned tokens.
#'Crafts' has no seed terms, so it is left out. The duplicated 'book' seed in Publishing has been removed.
creative_sector_seed = {
    'Advertising and marketing':['advertising','advert','marketing','search_engine','social_media'],

    'Architecture':['smart_city','building','construction'],

    'Design':['user_interface','user_interaction','designer','usability'],

    'Film, TV, video, radio and photography':['video','film','television','radio','photography','image','photo'],

    'IT, software and computer services':['software','programming'],

    'Museums, galleries and libraries':['museum','gallery','library','repository'],

    'Music, performing and visual arts':['music','song','audio','art','artist','creativity','theatre'],

    'Publishing': ['printing','book','e-book','journalism','newspaper']}

In [None]:
#For every creative industry, collect the terms that word2vec finds similar to its seed terms,
#keeping only those with a similarity above 0.8.
#NOTE(review): the seed terms themselves are not added to the expanded sets, so a paper that only
#mentions a seed term is not counted below - confirm this is intended

#Here we go.
expanded_kws = {}

for k,v in creative_sector_seed.items():

    similar_terms = set()

    for w in v:

        #most_similar raises a KeyError for terms the model has not learned (too rare, or never
        #joined into a bigram). Report those seeds and carry on instead of aborting the expansion
        if w not in w2v_arx.wv:
            print('Seed term not in the word2vec vocabulary, skipped: '+w)
            continue

        similar_terms.update(x[0] for x in w2v_arx.wv.most_similar(w) if x[1]>0.8)

    expanded_kws[k] = similar_terms

In [None]:
#Now we label papers with the number of distinct expanded keywords they contain

#Not used in the visible cells
out = []

#Build the token set of every abstract once, instead of rebuilding it (and walking the
#tokenised corpus again) for each creative industry
abstract_token_sets = [set(abst) for abst in arxiv_pro.tokenised]

for k in creative_sector_seed:

    #Count how many different keywords related to a creative industry appear in a paper.
    #Repeated uses of the same keyword count once because the tokens are compared as sets
    arx['kw_n_'+re.sub(' ','_',k.lower())] = [len(expanded_kws[k]&tokens) for tokens in abstract_token_sets]
    

In [None]:
#Names of the keyword-count columns, one per creative industry: the industry name in lowercase
#with spaces replaced by underscores, prefixed with 'kw_n_'
kw_names = ['kw_n_'+k.lower().replace(' ','_') for k in creative_sector_seed]

In [None]:
#Column totals of the keyword counts, one per creative industry
arx[kw_names].sum()

In [None]:
# Check: print the start of up to three random abstracts for each creative industry

for s in kw_names:

    print(s)
    print('===')

    #Papers with at least one keyword of this industry
    arx_temp = arx.loc[arx[s]>0]

    summaries = list(arx_temp['summary'])

    #random.sample raises a ValueError when asked for more items than there are, so cap the
    #sample size for industries that matched fewer than three papers
    choose_three = random.sample(summaries,min(3,len(summaries)))

    for des in choose_three:

        #Only the first 500 characters of each abstract
        print(des[:500])
        print('\n')
    

In [None]:
#NOTE(review): `a` is not defined anywhere in the visible code, so this cell raises a NameError.
#Presumably it is a deliberate stop so that "Run all" halts here - confirm, or delete it
a