Networks and Word Vectors with MeSH Labels
==========================================

In [None]:
%load_ext line_profiler
%load_ext memory_profiler
%load_ext autoreload
%autoreload 2

In [None]:
import os
import ast
import json
import itertools

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from collections import defaultdict, Counter
from datetime import datetime
from itertools import zip_longest
from matplotlib.ticker import NullFormatter

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool

from analysis.src.data.readnwrite import get_data_dir
from analysis.src.data.data_utilities import flatten, eval_column, grouper

# Let pandas display up to 99 columns when rendering a dataframe.
pd.options.display.max_columns = 99
# Send bokeh plots to the notebook output (bokeh.io.output_notebook).
output_notebook()

In [None]:
from rhodonite.dynamics import PhylomemeticGraph
from rhodonite.graphs import SlidingWindowGraph
from rhodonite.spectral import association_strength

In [None]:
from gensim.corpora import Dictionary

In [None]:
from graph_tool.generation import price_network
from graph_tool.draw import graph_draw
from graph_tool.all import GraphView

In [None]:
%matplotlib inline

# Paths
# Top-level data directory, as returned by get_data_dir()
data_path = get_data_dir()

# Sub-directories of the data directory, in order: external data, raw data,
# processed data, interim data and figures
ext_data, raw_data, proc_data, inter_data, fig_path = (
    os.path.join(data_path, sub_dir)
    for sub_dir in ('external', 'raw', 'processed', 'interim', 'figures')
)

# Date stamp for saving files: year_month_day of the current UTC time,
# without zero padding
today = datetime.utcnow()

today_str = "_".join(map(str, (today.year, today.month, today.day)))

## 1. Load Data

We are going to load both the GDB and the RWJF Pioneer and Global projects, and join them into a single dataframe.

In [None]:
# GDB project records, read from the raw data directory.
gdb_df = pd.read_csv(os.path.join(raw_data, 'gdb.csv'))

In [None]:
# RWJF Pioneer and Global project records, read from the interim data directory.
rwjf_df = pd.read_csv(os.path.join(inter_data, 'rwjf_pioneer_and_global_projects.csv'))

Now we need to join the other relevant data modules:

Dates for GDB:

In [None]:
gdb_dates_df = pd.read_csv(os.path.join(inter_data, 'gdb_dates.csv'))
# Column-wise concat aligns rows on the index, so this assumes gdb_dates.csv
# has one row per row of gdb.csv, in the same order — TODO confirm.
gdb_df = pd.concat([gdb_df, gdb_dates_df], axis=1)

MeSH labels:

In [None]:
gdb_mesh_df = pd.read_csv(os.path.join(inter_data, 'gdb_mesh_labels.csv'))
rwjf_mesh_df = pd.read_csv(os.path.join(inter_data, 'rwjf_mesh_labels.csv'))

# Column-wise concats align rows on the index, so each label file is assumed
# to have one row per project, in the same order as its project frame —
# TODO confirm.
gdb_df = pd.concat([gdb_df, gdb_mesh_df], axis=1)
rwjf_df = pd.concat([rwjf_df, rwjf_mesh_df], axis=1)

We're going to remove projects from GitHub, as they don't play nicely with MeSH terms, and from Crunchbase, as their descriptions are very short. There are also some projects with null descriptions, which we replace with empty strings.

In [None]:
# Drop the GitHub and Crunchbase projects. The explicit copy makes the result
# an independent frame, so the assignment below modifies it directly instead
# of writing through chained indexing on a filtered slice.
gdb_df = gdb_df[~gdb_df['source_id'].isin(['GitHub', 'Crunchbase'])].copy()
# Replace null descriptions with empty strings.
gdb_df['description'] = gdb_df['description'].fillna('')

Let's concatenate the two sets of projects, drop projects with duplicate descriptions, and extract the descriptions.

In [None]:
# Stack the RWJF projects under the GDB ones, index the combined frame by
# 'doc_id', and keep only the first project for each distinct description.
gdb_df = (
    pd.concat([gdb_df, rwjf_df], axis=0)
    .set_index('doc_id')
    .drop_duplicates(subset='description')
)

In [None]:
# Plain list of the description values, in dataframe row order.
descriptions = list(gdb_df['description'].values)

## Building a MeSH Label Corpus

We need to build a corpus of MeSH label transformed documents that is appropriate for the network we want to build. This will require some filtering. First, however, we should build a vocabulary of all the terms that we have, so that we can reference any of them by a unique ID at any time.

In [None]:
# presumably parses the stringified label lists in the 'mesh_labels' column
# into Python lists, one per project — verify against eval_column.
description_mesh_labels = eval_column(gdb_df, 'mesh_labels')

For filtering later, we will calculate the counts of the MeSH labels. We know already that there are some labels which are highly over-represented, and many which occur only once in the data.

In [None]:
def frequency_filter(docs, high_threshold=None, low_threshold=None, remove=None, counter=None):
    """frequency_filter
    Filters tokens from a corpus that occur more frequently than
    high_threshold or less frequently than low_threshold, as well as any
    token listed in remove. Documents that end up empty are kept, so the
    result has one entry per input document, in the same order.

    Args:
        docs (:obj:`list` of :obj:`list`): Corpus of tokenised documents.
        high_threshold (int): Upper limit for token frequency. Tokens with a
            count strictly above it are removed. None disables the check.
        low_threshold (int): Lower limit for token frequency. Tokens with a
            count strictly below it are removed. None disables the check.
        remove (iterable): Hashable terms to remove regardless of frequency.
            None (the default) removes nothing.
        counter (:obj:`collections.Counter`): Precomputed token counts. If
            None, the counts are computed from docs.

    Returns:
        docs_filtered (:obj:`list` of :obj:`list`): Documents with elements
            removed based on frequency and the remove list.
    """
    # Materialise the stop terms once: set membership is constant time, and a
    # one-shot iterator passed as `remove` is not exhausted by the first test.
    stop_terms = frozenset(remove) if remove is not None else frozenset()
    if counter is None:
        counter = Counter(flatten(docs))

    def keep(token):
        # The checks run in the same order as the documented rules: explicit
        # removal first, then the upper and lower frequency bounds.
        if token in stop_terms:
            return False
        if high_threshold is not None and counter[token] > high_threshold:
            return False
        if low_threshold is not None and counter[token] < low_threshold:
            return False
        return True

    return [[t for t in doc if keep(t)] for doc in docs]

def filter_description_labels(description_labels, fn):
    """Filter every label list with the predicate fn.

    Returns a new list containing, for each entry of description_labels,
    the list of its labels kept by the built-in filter(fn, ...).
    """
    filtered = []
    for labels in description_labels:
        filtered.append(list(filter(fn, labels)))
    return filtered

In [None]:
# Count label occurrences over the flattened corpus of label lists.
mesh_label_counts = Counter(flatten(description_mesh_labels))
# Display the 20 most frequent labels with their counts.
mesh_label_counts.most_common(20)

In [None]:
# Drop labels that occur more than 40000 times or fewer than 5 times, plus
# the labels listed in `remove`, which are dropped whatever their frequency.
description_mesh_labels_filtered = frequency_filter(
    description_mesh_labels,
    high_threshold=40000,
    low_threshold=5,
    remove=[
        'Students', 'Humans', 'Animals', 'Research', 'Goals',
        'Universities', 'Research Personnel', 'United States',
        'United Kingdom', 'Awards and Prizes', 'Faculty', 'Mice',
        'Mathematics', 'Fellowships and Scholarships',
        'Surveys and Questionnaires',
    ],
)

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Train a gensim phrase model on the filtered label lists; min_count=3 makes
# it ignore labels and label pairs collected fewer than 3 times.
bigrams = Phrases(description_mesh_labels_filtered, min_count=3)
# Phraser is the lightweight wrapper used to apply the trained model.
bigrammer = Phraser(bigrams)

In [None]:
# Apply the bigram model to every filtered label list.
description_mesh_labels_bigrams = [bigrammer[d] for d in description_mesh_labels_filtered]

In [None]:
# Second phrase pass over the bigram output, so merged pairs can combine
# with a further label.
# NOTE(review): min_count is left at the gensim default here, unlike the
# bigram pass (min_count=3) — confirm this is intended.
trigrams = Phrases(description_mesh_labels_bigrams)
trigrammer = Phraser(trigrams)

In [None]:
# Apply the second-pass phrase model to every bigram-transformed label list.
description_mesh_labels_trigrams = [trigrammer[d] for d in description_mesh_labels_bigrams]

In [None]:
# Normalise merged tokens: a token containing '_' is split on it, its parts
# are de-duplicated and sorted, and they are re-joined with spaces, so the
# same set of parts always yields the same token. Other tokens pass through
# unchanged.
description_mesh_labels_final = [
    [
        ' '.join(sorted(set(token.split('_')))) if '_' in token else token
        for token in doc
    ]
    for doc in description_mesh_labels_trigrams
]

In [None]:
# gensim Dictionary mapping every final label to an integer ID.
dictionary_mesh_labels = Dictionary(description_mesh_labels_final)

## Filtering Descriptions

In [None]:
# Store the final label lists as a column; this relies on the list being in
# the same row order as gdb_df.
gdb_df['coocurrence_labels'] = description_mesh_labels_final

In [None]:
# Keep only the projects that have more than two labels.
gdb_df_co = gdb_df[gdb_df['coocurrence_labels'].str.len() > 2]

## Splitting Projects by Year

We'll keep the projects dated from 2006 to 2017 inclusive (twelve years).

In [None]:
# Keep projects whose year is 2006 or later and before 2018.
gdb_df_co = gdb_df_co[(gdb_df_co['year'] >= 2006) & (gdb_df_co['year'] < 2018)]

In [None]:
# Display the number of remaining projects per year.
gdb_df_co['year'].value_counts()

## Building a Sliding Window Co-occurrence Network

From here we will want to create a new set of labelled descriptions where the terms with very high counts and little semantic value are removed, and also those that appear very few times in the corpus. We will also need to map the labels to token IDs which can then act as the vertex values in our graph.

In [None]:
# One sliding-window graph per year from 2006 to 2017, each built from that
# year's label lists and sharing the corpus-wide dictionary.
times = range(2006, 2018)
co_graphs = [
    SlidingWindowGraph(
        gdb_df_co.loc[gdb_df_co['year'] == year, 'coocurrence_labels'],
        dictionary=dictionary_mesh_labels,
        window_size=2,
    )
    for year in times
]

In [None]:
# Run prepare() on every graph, then build() on every graph. The results are
# reassigned to co_graphs, so both methods are expected to return the graph —
# confirm in rhodonite.
co_graphs = [g.prepare() for g in co_graphs]
co_graphs = [g.build() for g in co_graphs]

In [None]:
# One association_strength result per yearly graph, in the same order as co_graphs.
association_strengths = [association_strength(g) for g in co_graphs]

In [None]:
# Slice bounds into `times`, `co_graphs` and `association_strengths`;
# [0:3] selects the first three years (2006-2008).
start_period = 0
end_period = 3

In [None]:
# Create the phylomemetic graph from the selected periods' co-occurrence
# graphs, their association strengths, the shared dictionary and the
# matching years.
# NOTE(review): max_weight/min_weight semantics are defined by rhodonite's
# PhylomemeticGraph — confirm there.
pg = PhylomemeticGraph(co_graphs[start_period:end_period], association_strengths[start_period:end_period],
                       dictionary_mesh_labels, times[start_period:end_period],
                       max_weight=None, min_weight=1)

In [None]:
# Time the prepare() call with IPython's %time magic.
# NOTE(review): both arguments are absolute paths on one user's machine (an
# output directory and a CFinder command-line binary); they must be changed
# to run this anywhere else.
%time pg = pg.prepare('/Users/grichardson/cfinder/pg_out', '/Users/grichardson/cfinder/CFinder_commandline_mac')
# %time pg = pg.prepare('/Users/grichardson/cfinder/pg_out')

In [None]:
# Set the graph's delta_0 and delta_1 attributes ahead of the build step.
# NOTE(review): their meaning is defined by rhodonite's PhylomemeticGraph —
# confirm there.
pg.delta_0 = 0.5
pg.delta_1 = 0.7

In [None]:
# For every clique set, print how many cliques there are of each size.
for clique_set in pg.clique_sets:
    size_counts = Counter(len(clique) for clique in clique_set)
    print(size_counts)

In [None]:
# Time the build() call with IPython's %time magic.
# NOTE(review): workers and min_clique_size are interpreted by rhodonite's
# PhylomemeticGraph.build — confirm their meaning there.
%time pg.build(workers=4, min_clique_size=4)

In [None]:
# View of the graph restricted to vertices with at least one edge, incoming
# or outgoing, drawn using the stored 'color' vertex property.
pg_thresh = GraphView(
    pg,
    vfilt=lambda v: v.out_degree() > 0 or v.in_degree() > 0,
)
graph_draw(pg_thresh, vertex_fill_color=pg_thresh.vp['color'])

In [None]:
# Print a random sample of the vertices that have at least one parent,
# followed by the times and terms of their parents and of their children.
for vertex in pg_thresh.vertices():
    # randint(0, 10) draws from 0-9, so only 6, 7, 8 and 9 pass: roughly 40%
    # of the vertices are printed.
    if np.random.randint(0, 10) <= 5:
        continue
    if vertex.in_degree() == 0:
        continue

    terms_s = pg_thresh.vp['terms'][vertex]
    print(pg_thresh.vp['times'][vertex], '-',
          ' + '.join(sorted([dictionary_mesh_labels[t] for t in terms_s])))
    print('\n=== Parents ===')

    for n in vertex.in_neighbors():
        terms_n = pg_thresh.vp['terms'][n]
        print(pg_thresh.vp['times'][n])
        print(' + '.join(sorted([dictionary_mesh_labels[t] for t in terms_n])))

    print('\n=== Children ===')

    # The children are the out-neighbours of `vertex` itself, not of the last
    # parent left over in `n` by the loop above.
    for n in vertex.out_neighbours():
        terms_n = pg_thresh.vp['terms'][n]
        print(pg_thresh.vp['times'][n])
        print(' + '.join(sorted([dictionary_mesh_labels[t] for t in terms_n])))

    print('\n')