### Some graph-tool experimentation

In [None]:
import graph_tool as gt

In [None]:
# Load the network from its GraphML file into a graph-tool graph object.
gr = gt.load_graph('../data/processed/11_8_2019_network_all.graphml')

In [None]:
# Fit a nested block model to the graph by minimising its description length.
# B_min=40 is handed straight to graph-tool -- presumably a lower bound on the
# number of blocks; TODO confirm against the installed graph-tool version.
sbm_comm= gt.inference.minimize_nested_blockmodel_dl(gr,B_min=40)

In [None]:
# Draw the fitted nested block model and write the figure to the reports folder.
sbm_comm.draw(output='../reports/figures/sbm_out.png')

In [None]:
# Print graph-tool's summary of the fitted model.
sbm_comm.print_summary()

In [None]:
# Pull the per-level states out of the nested model.
levels = sbm_comm.get_levels()

# Peek at the level-0 block stored for vertex 0.
levels[0].get_blocks()[0]

In [None]:
#Group the vertex names into communities

# Fetch both property maps once rather than once per vertex.
vertex_ids = gr.vertex_properties['_graphml_vertex_id']
level0_blocks = levels[0].get_blocks()

# GraphML vertex id -> level-0 block label
comm_allocations = {vertex_ids[v]: level0_blocks[v] for v in gr.get_vertices()}

# One space-separated string of vertex ids per block, indexed by block label
gs = (pd.Series(comm_allocations)
      .reset_index(drop=False)
      .groupby(0)['index']
      .apply(lambda names: ' '.join(names)))

# Walk the real block labels. Looking labels up with 0..len(gs)-1 raises
# KeyError as soon as the labels are not a contiguous range starting at 0.
for block_label, members in gs.items():

    print(block_label)
    print('===')

    print(members)

In [None]:
# Look up the GraphML id stored for vertex 0.
gr.vertex_properties['_graphml_vertex_id'][0]

### Word embedding analysis

In [None]:
import random


In [None]:
#Clean a random sample of abstracts (line breaks replaced by spaces) ready for tokenisation
# NOTE(review): the sample is drawn from every row of `arx`; confirm whether `arx`
# has already been restricted to AI papers upstream.

# Cap the sample at the corpus size: random.sample raises ValueError when asked
# for more items than the population holds.
sample_size = min(750000, len(arx))

# Sampling from a range avoids materialising a list of every row position.
sampled_rows = random.sample(range(len(arx)), sample_size)

corpus = [re.sub('\n',' ',x.strip()) for x in arx['abstract'].iloc[sampled_rows]]


In [None]:
# Run the sampled abstracts through the project's CleanTokenize helper.
# bigram() is chained twice -- presumably so that already-merged bigrams can
# combine again into longer phrases; TODO confirm against CleanTokenize.
ct = CleanTokenize(corpus).clean().bigram().bigram()

In [None]:
from gensim.models import Word2Vec

In [None]:
#Train word2vec model
# Fitted on the tokenised corpus with a 15-token context window; every other
# hyperparameter is left at the gensim default.
w2v = Word2Vec(ct.tokenised,window=15)

In [None]:
def compare_conceps(concept_list,topic_list,w2v=w2v):
    '''
    Compare the terms associated with a 'concept' with the terms in a topic.

    Args:
        -concept_list: a list of terms associated to a concept
        -topic_list: list of terms associated with a topic
        -w2v: the word embedding model; similarities are read from w2v.wv.
         The default is whatever `w2v` was bound to when this function was defined.

    Output:
        -The mean similarity (w2v.wv.similarity) over every concept term / topic term pair.
         Pairs involving a term that is missing from the vocabulary are skipped.
         NaN is returned when no pair could be scored.

    '''
    # Imported here so the function does not depend on an import made in another cell
    from itertools import product

    #Calculate the similarities
    dists = []

    for concept_term, topic_term in product(concept_list,topic_list):

        try:
            dists.append(w2v.wv.similarity(concept_term,topic_term))
        except KeyError:
            # Out-of-vocabulary term: skip this pair. Any other error is a real
            # problem and is allowed to propagate instead of being swallowed.
            continue

    # Nothing could be scored: say so explicitly instead of averaging an empty list
    if not dists:
        return(np.nan)

    return(np.mean(dists))
    
    

In [None]:
#These are the keywords for each topic
# model.topics(l=0) is read as a mapping; each value is a sequence of entries and
# only the first element of every entry is kept (presumably the word itself).
# NOTE(review): `model` has to be the topic model here. A later cell rebinds
# `model` to a Doc2Vec instance, so this cell fails if it is run after that one.
topics_l0 = [[x[0] for x in word_mix] for word_mix in model.topics(l=0).values()]

In [None]:
# Now we create the concept - term dict
# Each key is a concept label; each value lists the seed terms that are compared
# with the topic keywords.

concept_dict = {
    'product':['product','service'],
    'ethics':['ethical','moral'],
    'social':['societal'],
    'user':['user','person'],
    'theory':['theoretical'],
    'military':['weapon','warfare'],
    'surveillance':['surveillance']
}

In [None]:
# One single-column frame per concept is collected here and joined at the end.
topic_similarities = []

#For each key-value pair in the dict
for k,v in concept_dict.items():
    
    #Compare terms in the concepts with concepts in all the topics
    # topic_names and topics_l0 are matched by position, so they must share an order
    comp = [[t,compare_conceps(v,topics_l0[n])] for n,t in enumerate(topic_names)]
    
    #Create a df and turn the topics into an index for concatenation later
    comp_df = pd.DataFrame(comp).set_index(0)
    comp_df.index.name='topic_name'
    
    #Rename the column
    comp_df.columns = [k]
    
    topic_similarities.append(comp_df)
    
# Columns are concepts, rows are topics (aligned on the topic_name index)
topic_concepts_df = pd.concat(topic_similarities,axis=1)

In [None]:
# The 50 topics with the highest score on the 'social' concept.
topic_concepts_df.sort_values('social',ascending=False).head(n=50)

In [None]:
# Pairwise correlations between the concept columns.
topic_concepts_df.corr()

In [None]:
# Put w_reg[1]['coefficient'] next to the concept columns and correlate them;
# iloc[0] keeps the row of the first concatenated column, i.e. the coefficient.
# NOTE(review): w_reg is defined outside this section -- presumably regression output.
pd.concat([w_reg[1]['coefficient'],topic_concepts_df],axis=1).corr().iloc[0]

### Make the analysis binary

In [None]:
# Names of the two topic columns treated as surveillance-related in the cells that follow.
surv = ['person-surveillance-persons-pedestrian-pedestrians',
       'face-faces-identity-face_recognition-facial']

In [None]:
# For each surveillance topic, cross-tabulate nf_df['nf'] against "topic weight
# above 0.1". normalize=0 normalises within each row of the crosstab, and [True]
# keeps the share of documents in which the topic is present.
surv_cross = pd.concat([pd.crosstab(nf_df['nf'],nf_df[t]>0.1,normalize=0)[True] for t in surv],axis=1)
# Label each column with the topic it was computed for.
surv_cross.columns = surv

In [None]:
# Bar chart with the topics along the x axis (hence the transpose).
surv_cross.T.plot.bar()

### Look at trends

In [None]:
# For every year, count the documents whose weight on each surveillance topic
# exceeds 0.1. np.arange(2006,2019) stops at 2018.
year_trends = pd.concat([(analysis_df.loc[analysis_df.year==y,surv]>0.1).sum() for y in np.arange(2006,2019)],axis=1)

# Label the columns with the years they were computed for.
year_trends.columns = np.arange(2006,2019)

# Years as rows, accumulated over time.
year_trends_cs = year_trends.T.cumsum()

In [None]:
# Plot the cumulative counts, one line per topic.
ax = year_trends_cs.plot()

# Anchor the legend at axes coordinates (1,1).
ax.legend(bbox_to_anchor=(1,1))

In [None]:
# Abstracts of the documents whose weight on the person-surveillance topic exceeds 0.1.
abst = analysis_df.loc[(analysis_df['person-surveillance-persons-pedestrian-pedestrians']>0.1),'abstract']

In [None]:
# Print the first ten abstracts in abst for a manual read-through.
for a in abst[:10]:
    
    print(a)

### 3. Document Modelling


Here we want to measure the similarities between documents with certain topics and 'concept topics' that we have obtained from Wikipedia (see `aux_5`).

This involves:

1. Combining the arXiv documents with the wikipedia summaries (all AI?)
2. Preprocessing them
3. Training a doc2vec model on the data
4. Consider document distances



In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
#This is the text we obtained from wikipedia before
wiki_text = pd.read_csv('../data/external/11_8_2019_wiki_text.csv',index_col=None)

#We give the text the same columns as in the arxiv papers so that we can concatenate them easily
# Assigning two names only works when the CSV holds exactly two columns.
wiki_text.columns = ['paper_id','abstract']


In [None]:
#We will focus on articles in AI (note that this includes all, not just the articles where we have trained the topic models)
#This needs to be trained on all articles
#arx_ai = arx.loc[arx['is_ai']==True]

# The AI-only filter above is switched off, so arx_ai is simply another name for the full arx table.
arx_ai=arx

#We focus on the id and the abstract
corpus_1 = arx_ai[['paper_id','abstract']]

# Stack the arXiv rows on top of the wikipedia rows (same two columns).
corpus_2 = pd.concat([corpus_1,wiki_text],axis=0)

In [None]:
#Concatenate and turn into a list of list where the first element is the id and the second is the abstract
#We also clean some of the markup (eg line breaks)
# Line breaks become spaces first, then leading/trailing whitespace is stripped.
# The index label `pid` from iterrows() is not used; the id comes from the paper_id column.

corpus = [[row['paper_id'],re.sub('\n',' ',row['abstract']).strip()] for pid, row in corpus_2.iterrows()]

In [None]:
#Turn the corpus into two lists we will use in Doc2Vec
# Position 0 of every corpus entry goes to doc_corpus_id, position 1 to doc_corpus_text.
doc_corpus_id, doc_corpus_text = [[x[num] for x in corpus] for num in [0,1]]

#### Train Doc2Vec

In [None]:
# #preprocess the text
# documents_text = CleanTokenize(doc_corpus_text).clean().bigram()

In [None]:
#Create the tagged document corpus - each element is the tokenised text and its id
# NOTE(review): documents_text is only assigned in the commented-out preprocessing
# cell above, so this line needs that cell (or an equivalent) to have been run.
# Ids and token lists are paired by position, so both must be in the same order.

documents = [TaggedDocument(words, [doc_id]) for doc_id, words in zip(doc_corpus_id,documents_text.tokenised)]

In [None]:
#Train the model!
# 200-dimensional document vectors, 10-token window, words seen fewer than twice
# are ignored, 4 worker threads, 20 passes over the corpus.
# NOTE(review): this rebinds `model`, which an earlier cell used as the topic model.
model = Doc2Vec(documents,vector_size=200, window=10, min_count=2, workers=4,epochs=20)

#### Compare documents in different topics with the 'concept' topics

In [None]:
# Write a function that takes all documents with a topic and compares their docvec with the concept topics.

def concept_similarity(d2v,topic_mix,topic,concept_names,threshold=0):
    '''
    Score the documents that carry a topic against each Wikipedia-derived concept document.

    Args:
        d2v: the Doc2Vec model; similarities are read from d2v.docvecs
        topic_mix: the df with the topic distribution for each document; its index supplies
            the document ids that are looked up in d2v
        topic: the topic (a column of topic_mix) whose documents are compared with the concepts
        concept_names: the concept ids in the doc2vec model
        threshold: a document counts as having the topic when its weight is strictly above this value
    Returns:
        A two-element list:
            a dict mapping each concept name to the list of its similarities with the selected documents,
            a pandas Series named after the topic holding the median similarity per concept.

    '''

    #Documents in which the topic is present
    has_topic = topic_mix[topic]>threshold
    doc_ids = list(topic_mix.loc[has_topic].index)

    #Similarities between every concept and every selected document
    concept_store = {}

    for concept in concept_names:
        concept_store[concept] = [d2v.docvecs.similarity(concept,doc_id) for doc_id in doc_ids]

    #Median similarity per concept
    concept_stats = {}

    for concept, sims in concept_store.items():
        concept_stats[concept] = np.median(sims)

    summary = pd.Series(concept_stats,name=topic)

    return([concept_store,summary])
    


In [None]:
# The wikipedia entries' ids double as the concept ids.
concept_names = list(wiki_text['paper_id'])

### Descriptive analysis

Add a bunch of exogenous variables to the analysis df

In [None]:
#Variables of interest
# Each entry is [column holding a list of values, value to look for in it].
interesting_cuts = [['freedom_list','NF'],
                    ['country_list','China'],['country_list','Russia'],['country_list','Turkey'],
                    ['type_list','Company'],['type_list','Government'],['type_list','Education'],
                    ['institute_list','Google'],['institute_list','Facebook'],['institute_list','IBM'],['institute_list','Microsoft']]

#Create the expanded df
# Work on a copy so analysis_df itself is left untouched.
analysis_df_expanded = analysis_df.copy()

#For each interesting variable we expand the df
for detect in interesting_cuts:
    
    # make_exog is a project helper defined outside this section -- presumably it adds
    # one indicator column per (container, value) pair; TODO confirm.
    analysis_df_expanded = make_exog(analysis_df_expanded,value_container=detect[0],value=detect[1])


In [None]:
#hf = topic_comparison(analysis_df_2,topics_filtered,'has_female',mean_sim_df)

In [None]:
#hf['health'].apply(lambda x: x/x.sum(),axis=0).plot.bar()

**This doesn't work very well**

There are several reasons for this:

* The documents I am using to measure ethics, surveillance etc are not very good
* The topics are too aggregated to pick up similarity with a concept
* Topics co-occur with each other. Their relationships with the concepts aren't linear.
* Let's park this for now


### Playing with Datashader



In [None]:
# Community ids to highlight. make_ds_network_from_doc_term_matrix keeps these as
# node categories and assigns category 0 to every other community.
my_comm_names = [7,13,31,3,18]

In [None]:
# Display comm_names (defined outside this section).
comm_names

In [None]:
import math
import numpy as np
import pandas as pd

import datashader as ds
import datashader.transfer_functions as tf
from datashader.layout import random_layout, circular_layout, forceatlas2_layout
from datashader.bundling import connect_edges, hammer_bundle

from itertools import chain

In [None]:
def make_ds_network_from_doc_term_matrix(mat,threshold,id_var):
    '''
    Build node and edge tables for datashader from a document-topic matrix.

    Two topics are linked once for every document in which both exceed the threshold.

    Args
        mat: document term matrix where the rows are documents and the columns are topics
        threshold: weight a topic has to exceed to count as present in a document
        id_var: name of the document id column obtained by resetting the index of mat

    Returns:
        A list [node_map, edge_list_mapped, node_list_mapped] where
            node_map maps each topic name to its position among the columns of mat,
            edge_list_mapped has 'source' and 'target' columns holding those positions,
            node_list_mapped has a 'name' column ('node' + position) and a categorical 'cat'
            column with the topic's community when it is in my_comm_names and 0 otherwise.

    Uses flatten_list, combinations, comms and my_comm_names from the notebook's global scope.

    '''

    #Long format: one row per document-topic pair, keeping those above the threshold
    long_form = pd.melt(mat.reset_index(drop=False),id_vars=[id_var])
    present = long_form.loc[long_form['value']>threshold]

    #Topics present in each document
    topics_per_doc = present.groupby(id_var)['variable'].apply(list)

    #All pairs of topics that share a document, one entry per shared document
    pairs_per_doc = [sorted(combinations(doc_topics,2)) for doc_topics in topics_per_doc]
    topic_pairs = [pair for pair in flatten_list(pairs_per_doc) if len(pair)>0]

    #Edge and node tables keyed by topic name
    edge_list = pd.DataFrame(topic_pairs,columns=['source','target'])
    node_list = pd.DataFrame(list(mat.columns),columns=['name'])

    #Topic name -> integer position
    node_map = {}
    for position, topic_name in enumerate(node_list['name']):
        node_map[topic_name] = position

    #Same tables expressed with the integer ids
    edge_list_mapped = edge_list.applymap(lambda topic_name: node_map[topic_name])
    node_list_mapped = node_list.applymap(lambda topic_name: 'node'+str(node_map[topic_name]))

    #Keep the community label only for the highlighted communities
    categories = []
    for topic_name in node_list['name']:
        community = comms[topic_name]
        categories.append(community if community in my_comm_names else 0)

    node_list_mapped['cat'] = categories
    node_list_mapped['cat'] = node_list_mapped['cat'].astype('category')

    return([node_map,edge_list_mapped,node_list_mapped])

In [None]:
# Build the node/edge tables for the filtered topics; a topic counts as present
# in a paper when its weight is above 0.05.
out = make_ds_network_from_doc_term_matrix(doc_topic_l0[topics_filtered],0.05,'paper_id')

In [None]:
# out is [node_map, edge table, node table]: pick the node table...
nodes = out[2]

# ...and the edge table.
edges=out[1]

In [None]:
# Number of nodes in each community category.
nodes['cat'].value_counts()

In [None]:
# Assign node positions with two of datashader's layouts.
circular  = circular_layout(nodes, uniform=False)
randomloc = random_layout(nodes)
# Preview the last rows of the random layout.
randomloc.tail()

In [None]:
from matplotlib.cm import Accent

# Canvas size (in pixels) shared by nodesplot, edgesplot and graphplot.
cvsopts = dict(plot_height=400, plot_width=400)

def nodesplot(nodes, name=None, canvas=None, cat=None):
    '''
    Rasterise node positions as points and return the shaded image.

    Args:
        nodes: table with 'x' and 'y' columns (and the `cat` column when one is given)
        name: name passed on to the returned image
        canvas: datashader Canvas to draw on; a new one sized by cvsopts is created when None
        cat: optional column name; when given, points are counted per category of that column

    Returns:
        The aggregate shaded with the Accent colormap and spread by 5 pixels.
    '''
    if canvas is None:
        canvas = ds.Canvas(**cvsopts)

    if cat is None:
        aggregator = None
    else:
        aggregator = ds.count_cat(cat)

    points = canvas.points(nodes,'x','y',aggregator)
    shaded = tf.shade(points, cmap=Accent)

    return tf.spread(shaded, px=5, name=name)

# Show the random and circular layouts together, coloured by community category.
tf.Images(nodesplot(randomloc,"Random layout",cat='cat'),
          nodesplot(circular, "Circular layout",cat='cat'))

In [None]:
import warnings

# numba relocated its warning classes from numba.errors to numba.core.errors.
# Try the current location first and fall back to the old one so that this cell
# runs on either side of that move.
try:
    from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning, NumbaWarning
except ImportError:
    from numba.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning, NumbaWarning

# Silence numba's deprecation chatter; NumbaWarning itself is imported but not filtered.
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

In [None]:
%time 
forcedirected = forceatlas2_layout(nodes, edges)
tf.Images(nodesplot(forcedirected, "ForceAtlas2 layout",cat='cat'))

In [None]:
def edgesplot(edges, name=None, canvas=None):
    '''
    Rasterise the edges as lines and return the shaded image.

    Args:
        edges: table with 'x' and 'y' columns describing the edge paths
        name: name passed on to the returned image
        canvas: datashader Canvas to draw on; a new one sized by cvsopts is created when None

    Returns:
        The shaded count of lines per pixel.
    '''
    if canvas is None:
        canvas = ds.Canvas(**cvsopts)

    line_counts = canvas.line(edges, 'x','y', agg=ds.count())

    return tf.shade(line_counts, name=name)

def graphplot(nodes, edges, name="", canvas=None, cat=None):
    '''
    Draw the nodes on top of the edges on one shared canvas.

    Args:
        nodes: table with 'x' and 'y' columns (and the `cat` column when one is given)
        edges: edge paths with 'x' and 'y' columns, as accepted by edgesplot
        name: name of the stacked image; the layers are named name + " nodes" / " edges"
        canvas: datashader Canvas to draw on; when None one is created that spans the
            node coordinates and is sized by cvsopts
        cat: optional category column forwarded to nodesplot

    Returns:
        The edge image and the node image stacked with how="over".
    '''
    if canvas is None:
        # Fit the canvas to the extent of the node positions
        x_extent = (nodes.x.min(), nodes.x.max())
        y_extent = (nodes.y.min(), nodes.y.max())
        canvas = ds.Canvas(x_range=x_extent, y_range=y_extent, **cvsopts)

    node_layer = nodesplot(nodes, name + " nodes", canvas, cat)
    edge_layer = edgesplot(edges, name + " edges", canvas)

    return tf.stack(edge_layer, node_layer, how="over", name=name)

In [None]:
# Short aliases for the two layouts.
cd = circular
fd = forcedirected

# Render each layout with straight edges, timing every call.
%time cd_d = graphplot(cd, connect_edges(cd,edges), "Circular layout",cat='cat')
%time fd_d = graphplot(fd, connect_edges(fd,edges), "Force-directed",cat='cat')
# The bundled-edge variants are switched off.
#%time cd_b = graphplot(cd, hammer_bundle(cd,edges), "Circular layout, bundled")
#%time fd_b = graphplot(fd, hammer_bundle(fd,edges), "Force-directed, bundled")

# Show the two renderings side by side.
tf.Images(cd_d,fd_d,
          #cd_b,fd_b
         ).cols(2)

**This doesn't work very well**

There are several reasons for this:

* The documents I am using to measure ethics, surveillance etc are not very good
* The topics are too aggregated to pick up similarity with a concept
* Topics co-occur with each other. Their relationships with the concepts aren't linear.
* Let's park this for now


#### National disruption

In [None]:
# Inspect the last 20 column names of data.
data.columns[-20:]

In [None]:
# The per-country version of this table is switched off:
# top_countries_2 = list(flatten_freq(data_w_countries['country_list'])[1:6].index)+['multinational']

# national_disr = pd.concat([make_disruption_tables(data_w_countries.loc[
#     [c in c_list for c_list in data_w_countries['country_list']]])[1] for c in top_countries_2],axis=1)
# national_disr.columns = top_countries_2

# For each organisation type, keep the rows whose type_list contains it, build the
# disruption tables with the project helper and keep element [1] of its result.
type_disr = pd.concat([make_disruption_tables(data_w_countries.loc[
    [t in t_list for t_list in data_w_countries['type_list']]])[1] for t in ['Company','Education']],axis=1)
# Label each column with the organisation type it was computed for.
type_disr.columns = ['Company','Education']



In [None]:
# Drop incomplete rows, smooth with a 3-row rolling mean, drop the rows the window
# leaves empty, and plot one line per organisation type.
ax = type_disr.dropna().rolling(window=3).mean().dropna().plot(cmap='Set1',linewidth=4,figsize=(6,8))
# Anchor the legend at axes coordinates (1,1).
ax.legend(bbox_to_anchor=(1,1))

In [None]:
# Store the 'entropy' column returned by the project's calculate_entropy helper
# (called with the topic columns) on data.
data['entropy'] = calculate_entropy(data,topics,'entropy')['entropy']

# Keep only the rows whose top field is machine learning / data.
d = data.loc[data['top_field']=='field_machine_learning_data']

In [None]:
# Years 2000-2018, as a plain list so it can be concatenated with [''] further down.
period=list(np.arange(2000,2019))

fig,ax = plt.subplots(figsize=(13,5))

# NOTE(review): this rebinds `gr`, which the graph-tool cells use for the graph.
gr = d.groupby('year')['entropy']

# One violin per year. get_group raises KeyError for a year with no rows in d.
ax.violinplot([gr.get_group(y) for y in period],widths=0.9,showmedians=True)
# matplotlib draws the violins at positions 1..len(period) by default, so the ticks
# run from 0 and an empty first label lines the years up with their violins.
ax.set_xticks(np.arange(0,len(period)+1))  
ax.set_xticklabels(['']+period,rotation=90)  

#d.groupby('year')['entropy'].median().plot(ax=ax)  

### Datashader test

In [None]:
import datashader as ds
from datashader import transfer_functions as tf
from datashader.colors import Greys9
# Greys9 in reverse order with the last two entries of the reversed list dropped.
# NOTE(review): the cells visible in this section pass Greys9, not Greys9_r.
Greys9_r = list(reversed(Greys9))[:-2]

In [None]:
# Inspect the columns available in grid_matched_clean.
grid_matched_clean.columns

In [None]:
# Number of rows of grid_matched_clean at every (latitude, longitude) pair,
# flattened into a three-column table.
paper_counts = grid_matched_clean.groupby(['institute_lat','institute_lon']).size().reset_index(drop=False)

paper_counts.columns = ['lat','lon','counts']

In [None]:
# Rasterise the institute locations onto a 1200x700 canvas.
cvs = ds.Canvas(plot_width=1200, plot_height=700)
# NOTE(review): ds.count('counts') counts the rows with a non-missing 'counts' value
# in each pixel; it does not add the values up. If the map is meant to show papers
# per pixel, ds.sum('counts') is probably what is wanted -- confirm.
agg = cvs.points(paper_counts, 'lon', 'lat',  ds.count('counts'))
# Linear white-to-dark-blue shading; img is not used by the cells visible in this section.
img = tf.shade(agg, cmap=["white", 'darkblue'], how='linear')

In [None]:
# Same aggregate shaded with the Greys9 palette and histogram equalisation.
tf.shade(agg, cmap=Greys9, how='eq_hist')

In [None]:
# Matplotlib alternative to the datashader map above.
fig,ax = plt.subplots(figsize=(12,7))



# Hexagonal binning of the institute coordinates, 75 hexagons across.
# NOTE(review): this reads grid_matched, not the grid_matched_clean table used for
# the datashader cells -- confirm that this is intended.
ax.hexbin(grid_matched['institute_lon'],grid_matched['institute_lat'],cmap='Reds',bins=10,gridsize=75,edgecolor='lightgrey',linewidth=0.1)