Networks and Word Vectors with MeSH Labels
==========================================

In [2]:
# %load_ext line_profiler
# %load_ext memory_profiler
%load_ext autoreload
%autoreload 2

In [3]:
import os
import ast
import json
import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from collections import defaultdict, Counter
from datetime import datetime
from itertools import zip_longest
from matplotlib.ticker import NullFormatter

from analysis.src.data.readnwrite import get_data_dir
from analysis.src.data.data_utilities import flatten, eval_column, grouper

pd.options.display.max_columns = 99

In [4]:
from rhodonite.phylomemetic import PhylomemeticGraph
from rhodonite.cooccurrence import CooccurrenceGraph
from rhodonite.spectral import association_strength

In [5]:
from gensim.corpora import Dictionary

In [6]:
from graph_tool.generation import price_network
from graph_tool.draw import graph_draw
from graph_tool.all import GraphView

In [7]:
%matplotlib inline

# Paths
# Root data directory for the project
data_path = get_data_dir()

# Sub-directories of the data root, one per stage of the pipeline
ext_data = os.path.join(data_path, 'external')     # external data
raw_data = os.path.join(data_path, 'raw')          # raw data
proc_data = os.path.join(data_path, 'processed')   # processed data
inter_data = os.path.join(data_path, 'interim')    # interim data
fig_path = os.path.join(data_path, 'figures')      # figures

# Current UTC date, formatted as year_month_day (no zero padding), for use
# when naming saved files
today = datetime.utcnow()
today_str = '{}_{}_{}'.format(today.year, today.month, today.day)

## 1. Load Data

We are going to load both the GDB and the RWJF Pioneer and Global projects, and join them into a single dataframe.

In [8]:
gdb_df = pd.read_csv(os.path.join(raw_data, 'gdb.csv'))

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
rwjf_df = pd.read_csv(os.path.join(inter_data, 'rwjf_pioneer_and_global_projects.csv'))

Now we need to join the other relevant data modules:

Dates for GDB:

In [10]:
gdb_dates_df = pd.read_csv(os.path.join(inter_data, 'gdb_dates.csv'))
gdb_df = pd.concat([gdb_df, gdb_dates_df], axis=1)

MeSH labels:

In [11]:
gdb_mesh_df = pd.read_csv(os.path.join(inter_data, 'gdb_mesh_labels.csv'))
rwjf_mesh_df = pd.read_csv(os.path.join(inter_data, 'rwjf_mesh_labels.csv'))

gdb_df = pd.concat([gdb_df, gdb_mesh_df], axis=1)
rwjf_df = pd.concat([rwjf_df, rwjf_mesh_df], axis=1)

We're going to remove projects from GitHub as they don't play nicely with MeSH terms, and Crunchbase as they're very short. There are also some projects with null descriptions.

In [12]:
# Drop GitHub and Crunchbase projects. Rows with a null source_id are kept,
# exactly as with the original `!=` comparisons. The explicit copy makes
# gdb_df an independent frame so the assignment below is not a write into a
# slice of the original.
gdb_df = gdb_df[~gdb_df['source_id'].isin(['GitHub', 'Crunchbase'])].copy()
# Replace null descriptions with empty strings. The previous chained
# assignment (gdb_df['description'][mask] = '') raises SettingWithCopyWarning
# and is not guaranteed to modify gdb_df.
gdb_df['description'] = gdb_df['description'].fillna('')

Let's concatenate the two sets of projects and extract their descriptions

In [13]:
# Stack the GDB and RWJF projects, index them by doc_id and keep only the
# first project for each distinct description.
# `sort` is passed explicitly: leaving it implicit emits a FutureWarning and
# makes the column order depend on the installed pandas version. sort=False
# keeps the columns in their existing order.
gdb_df = (
    pd.concat([gdb_df, rwjf_df], axis=0, sort=False)
    .set_index('doc_id')
    .drop_duplicates(subset='description')
)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [14]:
descriptions = list(gdb_df['description'].values)

## Building a MeSH Label Corpus

We need to build a corpus of MeSH label transformed documents that is appropriate for the network we want to build. This will require some filtering, however first we should build a vocabulary of all the terms that we have, so that we can reference any of them by a unique ID at any time.

In [15]:
description_mesh_labels = eval_column(gdb_df, 'mesh_labels')

For filtering later, we will calculate the counts of the MeSH labels. We know already that there are some labels which are highly over-represented, and many which occur only once in the data.

In [16]:
def frequency_filter(docs, high_threshold=None, low_threshold=None, remove=None, counter=None):
    """frequency_filter
    Filters tokens from a tokenised corpus using an explicit removal list and
    corpus-wide token frequencies.

    A token is dropped if it is in `remove`, if its count is strictly greater
    than `high_threshold`, or if its count is strictly less than
    `low_threshold`. Documents are never dropped; a document whose tokens are
    all filtered becomes an empty list.

    Args:
        docs (:obj:`list` of :obj:`list`): Corpus of tokenised documents.
        high_threshold (int): Upper limit for token frequency. Ignored if None.
        low_threshold (int): Lower limit for token frequency. Ignored if None.
        remove (:obj:`list`): Terms to remove regardless of frequency.
            Defaults to no terms.
        counter (:obj:`Counter`): Precomputed token counts. If None, counts
            are computed from `docs`, but only when a threshold is set.

    Returns:
        docs_filtered (:obj:`list` of :obj:`list`): One filtered document per
            input document, in the same order.
    """
    # A set gives constant-time membership tests; None avoids a mutable
    # default argument.
    removal_set = set(remove) if remove is not None else set()

    # Counting every token in the corpus is only worth doing if a threshold
    # will actually consult the counts.
    uses_counts = high_threshold is not None or low_threshold is not None
    if counter is None and uses_counts:
        counter = Counter(flatten(docs))

    def keep(token):
        if token in removal_set:
            return False
        if high_threshold is not None and counter[token] > high_threshold:
            return False
        if low_threshold is not None and counter[token] < low_threshold:
            return False
        return True

    return [[t for t in doc if keep(t)] for doc in docs]

def filter_description_labels(description_labels, fn):
    """filter_description_labels
    Applies a predicate to the labels of every description.

    Args:
        description_labels (:obj:`iter` of :obj:`iter`): Labels grouped by
            description.
        fn (function): Predicate passed to the built-in `filter`; labels for
            which it is falsy are dropped.

    Returns:
        (:obj:`list` of :obj:`list`): The kept labels for each description,
            in the original order.
    """
    kept_per_description = []
    for labels in description_labels:
        kept_per_description.append(list(filter(fn, labels)))
    return kept_per_description

In [17]:
mesh_label_counts = Counter(flatten(description_mesh_labels))
mesh_label_counts.most_common(20)

[('Students', 49316),
 ('Humans', 47839),
 ('Animals', 18113),
 ('Research', 17409),
 ('Goals', 16430),
 ('Universities', 16126),
 ('Research Personnel', 13132),
 ('United States', 12550),
 ('Female', 10022),
 ('Brain', 8834),
 ('Public Health', 7638),
 ('Child', 6428),
 ('Faculty', 5329),
 ('Mathematics', 5255),
 ('Awards and Prizes', 5209),
 ('HIV Infections', 5079),
 ('Fellowships and Scholarships', 4977),
 ('Polymers', 4968),
 ('Surveys and Questionnaires', 4793),
 ('Software', 4689)]

In [18]:
# Labels removed regardless of their frequency ('Research' was previously
# listed twice).
mesh_labels_to_remove = [
    'Students', 'Humans', 'Animals', 'Research', 'Goals',
    'Universities', 'Research Personnel', 'United States',
    'United Kingdom', 'Awards and Prizes',
    'Faculty', 'Mice', 'Mathematics', 'Fellowships and Scholarships',
    'Surveys and Questionnaires',
]

# mesh_label_counts already holds the counts of every label in
# description_mesh_labels, so pass it in rather than recounting the corpus.
description_mesh_labels_filtered = frequency_filter(
    description_mesh_labels,
    high_threshold=18000,
    low_threshold=5,
    remove=mesh_labels_to_remove,
    counter=mesh_label_counts,
)

In [19]:
from gensim.models.phrases import Phrases, Phraser

In [20]:
bigrams = Phrases(description_mesh_labels_filtered, min_count=3)
bigrammer = Phraser(bigrams)

In [21]:
description_mesh_labels_bigrams = [bigrammer[d] for d in description_mesh_labels_filtered]

In [22]:
trigrams = Phrases(description_mesh_labels_bigrams)
trigrammer = Phraser(trigrams)

In [23]:
description_mesh_labels_trigrams = [trigrammer[d] for d in description_mesh_labels_bigrams]

In [24]:
# Normalise every token: split it on '_' (presumably the delimiter the
# phrase models use to join labels - TODO confirm), de-duplicate and sort the
# parts, then re-join them with spaces. A token with no '_' has a single part
# and so comes out unchanged.
description_mesh_labels_final = [
    [' '.join(sorted(set(token.split('_')))) for token in doc]
    for doc in description_mesh_labels_trigrams
]

## Filtering Descriptions

In [25]:
gdb_df['cooccurrence_labels'] = description_mesh_labels_final

In [26]:
gdb_df_co = gdb_df[gdb_df['cooccurrence_labels'].str.len() > 2]

## Splitting Projects by Year

We'll take the projects from 2006 to 2017 inclusive (12 years).

In [27]:
gdb_df_co = gdb_df_co[(gdb_df_co['year'] >= 2006) & (gdb_df_co['year'] < 2018)]

In [28]:
gdb_df_co['year'].value_counts()

2016.0    6068
2015.0    5852
2017.0    5129
2013.0    4266
2014.0    4217
2012.0    3916
2010.0    3808
2009.0    3799
2011.0    3377
2008.0    3078
2007.0    2687
2006.0    1048
Name: year, dtype: int64

## Building Cooccurrence Networks

From here we will want to create a new set of labelled descriptions where the terms with very high counts and little semantic value are removed, and also those that appear very few times in the corpus. We will also need to map the labels to token IDs which can then act as the vertex values in our graph.

In [71]:
times = range(2006, 2018)

In [72]:
# Build the label <-> id vocabulary here so this cell runs on a fresh kernel;
# previously `dictionary` only existed if the load cell further down had
# already been executed.
# NOTE(review): confirm that the ids produced here match the dictionary saved
# alongside the phylomemetic graph that is loaded later in the notebook.
dictionary = Dictionary(gdb_df_co['cooccurrence_labels'])

# .assign returns a new frame, avoiding a write into a filtered slice of
# gdb_df (SettingWithCopyWarning).
gdb_df_co = gdb_df_co.assign(
    cooccurrence_ids=[dictionary.doc2idx(d) for d in gdb_df_co['cooccurrence_labels']]
)

# One array of id sequences per year, in the same order as `times`.
cooccurrence_ids_split = [
    gdb_df_co.loc[gdb_df_co['year'] == year, 'cooccurrence_ids'].values
    for year in times
]

In [73]:
# Build one cooccurrence graph per entry of cooccurrence_ids_split, keeping
# the same order.
co_graphs = []
for year_sequences in cooccurrence_ids_split:
    year_graph = CooccurrenceGraph()
    year_graph.from_sequences(year_sequences, dictionary, window_size=2)
    co_graphs.append(year_graph)

In [74]:
from rhodonite.spectral import association_strength

In [75]:
# Compute the association strength for every graph first...
association_strengths = list(map(association_strength, co_graphs))

# ...then store each result on its graph as the 'association_strength' edge
# property.
for graph, strength in zip(co_graphs, association_strengths):
    graph.ep['association_strength'] = strength

  np.multiply(occurrences, occurrences.transpose()))


In [49]:
from py_cfinder import CFinder

In [75]:
# Location of the CFinder command line binary. setdefault keeps a CFINDER
# value that is already set in the environment, so the notebook can run on
# other machines without editing this machine-specific fallback path.
os.environ.setdefault('CFINDER', '/home/ec2-user/cfinder_linux/CFinder_commandline64')

In [76]:
cf = CFinder()

In [63]:
a_s_thresh = [np.percentile(co.ep['association_strength'].get_array(), 5) for co in co_graphs]

In [65]:
# For each graph, keep only the edges whose association strength is strictly
# above that graph's threshold.
co_graphs_filt = []
for graph, threshold in zip(co_graphs, a_s_thresh):
    strengths = graph.ep['association_strength']
    # Default arguments bind this iteration's property map and threshold to
    # the lambda, so it does not depend on the loop variables afterwards.
    keep_edge = lambda e, s=strengths, t=threshold: s[e] > t
    co_graphs_filt.append(GraphView(graph, efilt=keep_edge))

In [52]:
from rhodonite.utilities import save_edgelist

In [66]:
# Write one edgelist per year to the interim data directory.
for year, graph_view in zip(times, co_graphs_filt):
    edgelist_path = os.path.join(inter_data, 'gdb_co_graph_{}'.format(year))
    save_edgelist(graph_view, edgelist_path)

In [77]:
# Run CFinder on each year's saved edgelist and collect the results in the
# same order as `times`.
cliques = []
for year in times:
    edgelist_path = os.path.join(inter_data, 'gdb_co_graph_{}'.format(year))
    cliques_path = os.path.join(inter_data, 'gdb_co_cliques_{}'.format(year))
    cliques.append(cf.find(i=edgelist_path, o=cliques_path))

In [87]:
clique_sets = [c['vertices'] for c in cliques]

In [89]:
# Print the distribution of clique sizes for each entry of clique_sets.
# (The loop previously iterated over the misspelled name `cliqe_sets`, which
# raises NameError on a fresh kernel.)
for cs in clique_sets:
    size_counts = Counter(len(c) for c in cs)
    print(size_counts)

Counter({3: 1897, 4: 69})
Counter({3: 7831, 4: 763, 5: 40})
Counter({3: 8749, 4: 815, 5: 27})
Counter({3: 12805, 4: 2131, 5: 125, 6: 1})
Counter({3: 14123, 4: 2850, 5: 292, 6: 18})
Counter({3: 11382, 4: 1514, 5: 65, 6: 1})
Counter({3: 15283, 4: 2499, 5: 144, 6: 5})
Counter({3: 16562, 4: 2983, 5: 225, 6: 7})
Counter({3: 17922, 4: 3657, 5: 330, 6: 13})
Counter({3: 26258, 4: 7308, 5: 892, 6: 61, 7: 5})
Counter({3: 27927, 4: 9695, 5: 1540, 6: 266, 7: 31, 8: 1})
Counter({3: 22021, 4: 6187, 5: 789, 6: 48, 7: 1})


In [None]:
pg = PhylomemeticGraph()
pg.from_communities(
    clique_sets,
    labels=times,
    min_clique_size=4,
    workers=14,
    parent_limit=3
)

In [98]:
pg_f = GraphView(pg, efilt=lambda e: pg.ep['link_strength'][e] > 0.4)

In [31]:
from rhodonite.phylomemetic import label_ages, label_density, label_emergence, label_special_events

In [72]:
density = label_density(pg, co_graphs, norm=np.median)

In [103]:
pg.vp['density'] = density

In [107]:
emergence = label_emergence(pg)
pg.vp['emergence'] = emergence
branching, merging = label_special_events(pg)
pg.vp['branching'] = branching
pg.vp['merging'] = merging

In [108]:
pg.save(os.path.join(inter_data, 'gdb_pg_graph.gt'))

In [118]:
pg_thresh = GraphView(pg, efilt=lambda e: pg.ep['link_strength'][e] > 0.45)
pg_thresh = GraphView(pg_thresh, vfilt=lambda v: v.out_degree() > 0)

In [None]:
dictionary.save(os.path.join(inter_data, 'dictionary'))

### Load Phylomemetic Graph

In [70]:
pg = PhylomemeticGraph()
pg.load(os.path.join(proc_data, 'gdb_phylomemetic_10102018/gdb_pg_graph.gt'))

dictionary = Dictionary.load(os.path.join(proc_data, 'gdb_phylomemetic_10102018/dictionary'))
# dictionary.load(os.path.join(proc_data, 'gdb_phylomemetic_10102018/dictionary'))

In [38]:
from rhodonite.tabular import vertices_to_dataframe

In [39]:
pg_thresh = GraphView(pg, efilt=lambda e: pg.ep['link_strength'][e] > 0.4)

In [76]:
a=3

In [40]:
from itertools import combinations

In [53]:
def agg_community_property(community, prop, agg):
    """agg_community_property
    Aggregates a pairwise property over every pair of members of a community.

    Args:
        community (:obj:`iter`): Members of the community. Every 2-element
            combination of members, in iteration order, is looked up.
        prop (:obj:`PropertyMap`): Mapping indexed with an `(i, j)` tuple of
            community members.
        agg (function): Called once with the list of looked-up values.

    Returns:
        agg_prop_val (:obj:`float`): The value returned by `agg`.
    """
    pair_values = [prop[pair] for pair in combinations(community, 2)]
    return agg(pair_values)

def label_cooccurrence_property(g, co_graphs, prop, agg, norm=None):
    """label_cooccurrence_property
    Labels every vertex of `g` with an aggregate of a cooccurrence graph edge
    property, computed over the pairs of members of the vertex's community.

    The vertices of `g` are grouped by their `label` and each group is paired,
    positionally, with one graph from `co_graphs`.

    Args:
        g (:obj:`Graph`): Graph to label. `vertices_to_dataframe(g)` must
            provide `label`, `item` and `vertex` columns.
        co_graphs (:obj:`iter` of :obj:`Graph`): Cooccurrence graphs, one per
            label group.
        prop (str): Name of the edge property to read from each graph in
            `co_graphs`.
        agg (function): Aggregates the list of property values of a community
            into a single value.
        norm (function): If given, each group's values are divided by
            `norm(values)` for that group.

    Returns:
        community_properties (:obj:`PropertyMap`): Float vertex property map
            of `g` holding the (optionally normalised) aggregate values.
    """
    community_properties = g.new_vertex_property('float')
    # Use the graph that was passed in; this previously read the
    # notebook-level `pg`, ignoring the `g` argument.
    df = vertices_to_dataframe(g)
    label_groups = df.groupby('label')
    # NOTE(review): zip pairs the label groups with co_graphs by position and
    # stops at the shorter of the two - confirm there is exactly one graph per
    # label, in the same order as the grouped labels.
    for (_, group), co in zip(label_groups, co_graphs):
        edge_prop = co.ep[prop]
        strengths = [agg_community_property(c, edge_prop, agg) for c in group['item']]

        if norm is not None:
            strengths = np.array(strengths) / norm(strengths)
        for v, d in zip(group['vertex'], strengths):
            community_properties[v] = d
    return community_properties

In [77]:
association_strength_means = label_cooccurrence_property(pg, co_graphs, 'association_strength', np.mean, norm=np.mean)

In [78]:
pg.vp['association_strength_mean'] = association_strength_means

In [79]:
pg_vertice_df = vertices_to_dataframe(pg)

In [80]:
pg_vertice_df['_emergence'] = pg_vertice_df['emergence'].map(
    {0: 'ephemeral', 1:'emerging', 2: 'steady', 3: 'declining'}
)

In [81]:
pg_vertice_df.groupby('_emergence').mean()

Unnamed: 0_level_0,label,density,emergence,branching,merging,association_strength_mean
_emergence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
declining,2014.355821,2.338779,3.0,0.0,0.785888,0.418636
emerging,2012.412836,2.489536,1.0,0.825165,0.0,0.548716
ephemeral,2012.873137,7.585991,0.0,0.0,0.0,1.753727
steady,2013.327889,1.527844,2.0,0.834882,0.820273,0.235351
