### Some graph-tool experimentation

In [None]:
import graph_tool as gt

In [None]:
# Load the processed network from its GraphML file
gr = gt.load_graph('../data/processed/11_8_2019_network_all.graphml')

In [None]:
# Fit a nested stochastic block model to the network by minimising its description length
# NOTE(review): B_min=40 presumably sets a lower bound on the number of blocks - confirm against the installed graph-tool version
sbm_comm= gt.inference.minimize_nested_blockmodel_dl(gr,B_min=40)

In [None]:
# Draw the fitted block model and save the figure in the reports folder
sbm_comm.draw(output='../reports/figures/sbm_out.png')

In [None]:
# Print a summary of the fitted nested block model
sbm_comm.print_summary()

In [None]:
# Extract the levels of the fitted hierarchy
levels = sbm_comm.get_levels()

# Check the block assigned to vertex 0 in the first level
levels[0].get_blocks()[0]

In [None]:
#Group the vertex names into communities

# Look up both property maps once instead of once per vertex
vertex_names = gr.vertex_properties['_graphml_vertex_id']
vertex_blocks = levels[0].get_blocks()

# Map each vertex name to the block it has been allocated to in the first level
comm_allocations = {vertex_names[v]:vertex_blocks[v] for v in gr.get_vertices()}

# One row per block: the names of its vertices joined into a single space-separated string
gs = pd.Series(comm_allocations).reset_index(drop=False).groupby(0)['index'].apply(lambda x: ' '.join(x))

# gs is indexed by block label, and block labels are not guaranteed to be 0..len(gs)-1,
# so we iterate over the (label, names) pairs instead of looking labels up by position
for x, members in gs.items():
    
    print(x)
    print('===')
    
    
    print(members)

In [None]:
# Check the GraphML id stored for vertex 0
gr.vertex_properties['_graphml_vertex_id'][0]

### Word embedding analysis

In [None]:
import random


In [None]:
#Clean a random sample of (at most) 750,000 abstracts from arx before tokenising them

# random.sample raises a ValueError if we ask for more items than there are rows in arx
sample_size = min(750000,len(arx))

# Strip each sampled abstract and replace its line breaks with spaces
corpus = [re.sub('\n',' ',x.strip()) for x in arx['abstract'].iloc[random.sample(range(len(arx)),sample_size)]]


In [None]:
# Clean the sampled corpus and apply the bigram step twice
# NOTE(review): the second .bigram() presumably lets already-merged bigrams combine into longer phrases - confirm in CleanTokenize
ct = CleanTokenize(corpus).clean().bigram().bigram()

In [None]:
from gensim.models import Word2Vec

In [None]:
#Train word2vec model on the tokenised corpus with a context window of 15 tokens
w2v = Word2Vec(ct.tokenised,window=15)

In [None]:
def compare_conceps(concept_list,topic_list,w2v=w2v):
    '''
    This function compares a list of terms associated with a 'concept' and the list of names in a topic
    
    Args:
        -concept_list: a list of terms associated to a concept
        -topic_list: list of terms associated with a topic
        -w2v is the word embeddings representation (its .wv attribute is queried). The default is the
        w2v model that exists when this function is defined.
    
    Output:
        -The mean of the pairwise similarities (w2v.wv.similarity) between elements in the concept list
        and elements in the topic list. Pairs where a term is missing from the vocabulary are skipped.
        If no pair can be scored the output is np.nan.
    
    '''
    
    #Calculate the similarity for every (concept term, topic term) pair
    similarities = []
    
    for concept_term, topic_term in product(concept_list,topic_list):
        
        try:
            similarities.append(w2v.wv.similarity(concept_term,topic_term))
        except KeyError:
            #One of the terms is not in the model vocabulary: skip the pair.
            #Any other error is a real problem and should not be hidden.
            continue
    
    #np.mean of an empty list also gives nan, but with a RuntimeWarning
    if not similarities:
        return(np.nan)
    
    return(np.mean(similarities))
    
    

In [None]:
#These are the keywords for each topic: we keep the first element of every entry in each level 0 topic
#NOTE(review): the first element is presumably the word and the rest its weight - confirm against model.topics
#NOTE(review): `model` here must be the topic model; the Doc2Vec cell further down reuses the same name
topics_l0 = [[x[0] for x in word_mix] for word_mix in model.topics(l=0).values()]

In [None]:
# Now we create the concept - term dict
# Keys are the concept names, values are the terms we look up in the word embeddings for that concept

concept_dict = {
    'product':['product','service'],
    'ethics':['ethical','moral'],
    'social':['societal'],
    'user':['user','person'],
    'theory':['theoretical'],
    'military':['weapon','warfare'],
    'surveillance':['surveillance']
}

In [None]:
#Store for one single-column df of topic similarities per concept
topic_similarities = []

for k,v in concept_dict.items():
    
    #Score the terms of this concept against the keywords of every topic, one [topic name, score] row per topic
    comp = []
    
    for n,t in enumerate(topic_names):
        comp.append([t,compare_conceps(v,topics_l0[n])])
    
    #Index the scores by topic name so that the dfs line up when we concatenate them
    comp_df = pd.DataFrame(comp).set_index(0)
    comp_df.index.name='topic_name'
    
    #The remaining column is named after the concept
    comp_df.columns = [k]
    
    topic_similarities.append(comp_df)
    
#One row per topic, one column per concept
topic_concepts_df = pd.concat(topic_similarities,axis=1)

In [None]:
#Top 50 topics by their similarity with the 'social' concept
topic_concepts_df.sort_values('social',ascending=False).head(n=50)

In [None]:
#Pairwise correlations between the concept similarity columns
topic_concepts_df.corr()

In [None]:
#Correlate the coefficient in w_reg[1] with the concept similarities and keep the first row of the correlation matrix
#NOTE(review): assumes w_reg[1]['coefficient'] is a single column indexed like topic_concepts_df - confirm
pd.concat([w_reg[1]['coefficient'],topic_concepts_df],axis=1).corr().iloc[0]

### Make the analysis binary

In [None]:
#Names of the two topics (surveillance and faces) we use in the binary analysis below
surv = ['person-surveillance-persons-pedestrian-pedestrians',
       'face-faces-identity-face_recognition-facial']

In [None]:
#For each topic in surv: share of rows in every nf category where the topic weight is above 0.1
above_threshold_shares = []

for t in surv:
    
    #Crosstab of nf against the binary 'topic present' flag, normalised within each nf category
    shares = pd.crosstab(nf_df['nf'],nf_df[t]>0.1,normalize=0)
    
    #Keep only the share of rows where the topic is present
    above_threshold_shares.append(shares[True])

#One column per topic, named after the topic
surv_cross = pd.concat(above_threshold_shares,axis=1)
surv_cross.columns = surv

In [None]:
#Bar chart of the transposed shares: one group of bars per topic
surv_cross.T.plot.bar()

### Look at trends

In [None]:
#Years we consider: 2006 to 2018 (np.arange excludes the end point)
trend_years = np.arange(2006,2019)

#For each year, count the rows where each topic in surv has a weight above 0.1
yearly_counts = []

for y in trend_years:
    
    in_year = analysis_df.loc[analysis_df.year==y,surv]
    yearly_counts.append((in_year>0.1).sum())

#One row per topic, one column per year
year_trends = pd.concat(yearly_counts,axis=1)
year_trends.columns = trend_years

#Cumulative counts over the years (years as rows)
year_trends_cs = year_trends.T.cumsum()

In [None]:
#Plot the cumulative counts, one line per topic
ax = year_trends_cs.plot()

#Anchor the legend at the top right corner of the axes
ax.legend(bbox_to_anchor=(1,1))

In [None]:
#Abstracts of the rows where the surveillance topic has a weight above 0.1
abst = analysis_df.loc[(analysis_df['person-surveillance-persons-pedestrian-pedestrians']>0.1),'abstract']

In [None]:
#Print the first ten of those abstracts for a manual check
for a in abst[:10]:
    
    print(a)

### 3. Document Modelling


Here we want to measure the similarities between documents with certain topics and 'concept topics' that we have obtained from Wikipedia (see `aux_5`).

This involves:

1. Combining the arXiv documents with the wikipedia summaries (all AI?)
2. Preprocessing them
3. Training a doc2vec model on the data
4. Comparing document distances



In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
#This is the text we obtained from wikipedia before
wiki_text = pd.read_csv('../data/external/11_8_2019_wiki_text.csv',index_col=None)

#We give the text the same columns as in the arxiv papers so that we can concatenate them easily
#(this assignment fails if the csv does not have exactly two columns)
wiki_text.columns = ['paper_id','abstract']


In [None]:
#The original idea was to focus on articles in AI, but the model needs to be trained on all articles
#(note that this includes all, not just the articles where we have trained the topic models)
#so the AI filter below is switched off and arx_ai is simply another name for the full arx df
#arx_ai = arx.loc[arx['is_ai']==True]

arx_ai=arx

#We focus on the id and the abstract
corpus_1 = arx_ai[['paper_id','abstract']]

#Stack the wikipedia text under the arXiv papers (the row labels of both dfs are kept as they are)
corpus_2 = pd.concat([corpus_1,wiki_text],axis=0)

In [None]:
#Turn the combined df into a list of lists where the first element is the id and the second is the abstract
#We also clean some of the markup (eg line breaks)

#We zip the two columns instead of using iterrows(): iterrows builds a Series for every row, which is slow
#on a corpus of this size, and the value it yields first is the row label of the df, not the paper id
corpus = [[paper_id,re.sub('\n',' ',abstract).strip()] for paper_id, abstract in zip(corpus_2['paper_id'],corpus_2['abstract'])]

In [None]:
#Split the corpus into the two parallel lists we will use in Doc2Vec: the ids and the texts
doc_corpus_id = [doc[0] for doc in corpus]
doc_corpus_text = [doc[1] for doc in corpus]

#### Train Doc2Vec

In [None]:
# #preprocess the text
# documents_text = CleanTokenize(doc_corpus_text).clean().bigram()

In [None]:
#Create the tagged document corpus - each element is the tokenised text and its id
#NOTE(review): in this part of the notebook documents_text is only assigned in the commented-out
#preprocessing cell just above, so that cell has to be run (uncommented) before this one

documents = [TaggedDocument(words, [doc_id]) for doc_id, words in zip(doc_corpus_id,documents_text.tokenised)]

In [None]:
#Train the model! 200-dimensional document vectors, window of 10, words seen fewer than 2 times dropped, 20 epochs
#NOTE(review): this reuses the name `model`, which the topic keyword cell earlier reads as model.topics(l=0);
#re-running that cell after this one would hit the Doc2Vec model instead
model = Doc2Vec(documents,vector_size=200, window=10, min_count=2, workers=4,epochs=20)

#### Compare documents in different topics with the 'concept' topics

In [None]:
# Compare the docvecs of all documents that have a topic with the docvecs of the concept topics.

def concept_similarity(d2v,topic_mix,topic,concept_names,threshold=0):
    '''
    
    This function calculates the similarities between the documents with a topic and each of the concept topics.
    
    Args:
        d2v: the Doc2Vec model; similarities are read from d2v.docvecs.similarity
        topic_mix: the df with the topic distribution for each document (indexed by the document ids in d2v)
        topic: the column in topic_mix to compare with the concept vectors
        concept_names: the concept ids in the doc2vec model
        threshold: a document has the topic if its weight in the topic column is strictly above this value
    Returns:
        A list with two elements:
            -a dict with the list of similarities between each concept and every document with the topic
            -a pandas Series (named after the topic) with the median similarity for each concept
    
    '''
    
    #Documents that have the topic
    has_topic = topic_mix[topic]>threshold
    docs_with_topic = list(topic_mix.loc[has_topic].index)
    
    #Similarities between every concept and every document with the topic
    concept_store = {
        concept:[d2v.docvecs.similarity(concept,doc_id) for doc_id in docs_with_topic]
        for concept in concept_names
    }
    
    #Summarise every concept with the median of its similarities
    concept_stats = {}
    
    for concept,sims in concept_store.items():
        concept_stats[concept] = np.median(sims)
    
    return([concept_store,pd.Series(concept_stats,name=topic)])
    


In [None]:
#The ids of the wikipedia texts are the names of the concept topics
concept_names = list(wiki_text['paper_id'])