In [116]:
import cairo
import json
import matplotlib.pyplot as plt
import os
import psycopg2
import pandas as pd
import numpy as np

from datetime import datetime
from itertools import combinations
import networkx as nx
import community #Get this package: http://perso.crans.org/aynaud/communities/

In [3]:
def write_network(g):
    out_dir = os.path.dirname(os.path.dirname(os.getcwd()))
    out_dir = out_dir + '/data/networks/'
    out_file = out_dir + 'gtr_network_' + datetime.now().strftime('%Y%m%d%H%M') + '.gml'
    try:
        os.makedirs(out_dir, exist_ok=True)
    except OSError:
        "Print output directory already exists. Saving network to {}".format(out_file)
    nx.write_gml(g, out_file)

def extract_network_from_corpus(label_corpus, edge_attr="None", stop_words=None):
    """
    Extracts a network object from a list or series where every observation is a list of co-ocurring observations
    (e.g. interacting agents, or co-occurring words). 
    In the network output, the labels are nodes and co-occurrences between them are edges. 
    Weight is number of co-occurrences. There is the option to consider additional edge attributes.

    Inputs:
    label_corpus: corpus of documents with label co-occurrence or node interaction.
    edge_attr: an edge attribute associated to the interaction. It defaults to the string none (which we use
        for control flow)
    stop_words: labels to remove (e.g. generic terms)
    
    Returns a networkx object.
    
    """
    label_pairs_container =[]
    
    #!!! TODO Using type to control flow feels a bit hacky.
    if type(edge_attr) != str:
        #Container for pairs of labels
        edge_attr_name = edge_attr.name
    
    #For each pair
    for i in range(len(label_corpus)):
        #List of tuples where every tuple is a combination of topics
        pairs = list(combinations(label_corpus.iloc[i], 2))

        #Extract these as lists of dicts capturing connections between topics, and add those to the container.
        #NB we are sorting the tuples to make sure that we don't duplicate edges.
        if len(pairs)>0:
            cont = [{'e1':sorted(tup)[0],'e2':sorted(tup)[1]} for tup in pairs]

            #Give each edge its corresponding attribute (if we have one)
            if type(edge_attr) != str:
                for d in cont:
                    d.update({edge_attr_name: edge_attr[i]})

            label_pairs_container.append(cont)

    #Flatten dict list and turn into DF
    label_pairs_df = pd.DataFrame([ds for sublist in label_pairs_container for ds in sublist])
    
    #Extract weights (depends on edge_attr)
    if type(edge_attr) != str:
        label_edgelist = pd.DataFrame(label_pairs_df.groupby(['e1','e2',edge_attr_name]
                                                        ).size()).reset_index().sort(columns=0,ascending=False)
    else:
        label_edgelist = pd.DataFrame(label_pairs_df.groupby(['e1','e2']
                                                        ).size()).reset_index().sort(columns=0,ascending=False)
    
    label_edgelist.rename(columns={0:'weight'},inplace=True)
    
    #Remove stop-words
    if stop_words != None:
        has_ttm =  [x in stop_words or y in stop_words for x,y in zip(label_edgelist['e1'],label_edgelist['e2'])]
        label_edgelist = label_edgelist[-pd.Series(has_ttm)]
    
    #Create label graph
    if type(edge_attr) != str:
        label_graph = nx.from_pandas_dataframe(label_edgelist,'e1','e2',['weight',edge_attr_name])
    else:
        label_graph = nx.from_pandas_dataframe(label_edgelist,'e1','e2',['weight'])
    
    return(label_graph) 

In [4]:
config_file = os.path.dirname(os.path.dirname(os.getcwd())) + '/config.json'

#Load config file
with open(config_file, 'r') as f:
    config = json.load(f)

#Create connection string
conn_string = 'host={} dbname={} user={} password={}'.format(
                    config.get("host"),
                    config.get("database"),
                    config.get("user"),
                    config.get("passw"))

#Create connection
conn = psycopg2.connect(conn_string)

In [5]:
#Read the data
#We'll read it in chunks
chunk= 500

#Create sql query string
sql_query_str="""
    SELECT
        *
    FROM
        gtr.projects      
"""

#Read the data
results = pd.read_sql(sql_query_str,con=conn,chunksize=chunk)

#Create df from results
projects_df = pd.DataFrame()
for result in results:
     projects_df = projects_df.append(result)
        
pd.options.mode.chained_assignment = None  # default='warn'

#reindex
projects_df.reset_index(drop=True,inplace=True)

#Select relevant columns and subset
rel_vars = ['pkey','abstract_texts','grant_cats','href','identifiers',
            'lead_org_dpts','links','research_subjects','research_topics','status']
projects_rel_df = projects_df[rel_vars]

#Extract the json elements form their 1 element-dicts
projects_rel_df['research_subjects'] = projects_rel_df[
    'research_subjects'].map(lambda x: x['researchSubject'])

projects_rel_df['research_topics'] = projects_rel_df[
    'research_topics'].map(lambda x: x['researchTopic'])

In [6]:
#Focus analysis on research grant and fellowship projects
#Subset
projects_academic_df = projects_rel_df[[i in ['Research Grant','Fellowship'] for
                                                 i in projects_rel_df.grant_cats]]
        
#Only consider projects with subject data (drop nas and )
projects_w_subject_df = projects_academic_df[projects_academic_df.research_topics.notnull()]
projects_w_subject_df = projects_academic_df[[len(i)>0 for i in
                                             projects_academic_df.research_topics]]

#Extract research subjects and research topics.
projects_w_subject_df['subject_list'] = [[i['text'] for i in sublist] for sublist in 
                                       projects_w_subject_df['research_subjects']]
projects_w_subject_df['topic_list'] = [[i['text'] for i in sublist] for sublist in 
                                       projects_w_subject_df['research_topics']]

In [7]:
project_all_subjects = pd.Series([i for sublist in projects_w_subject_df['subject_list'] for
                       i in sublist])
projects_all_topics = pd.Series([i for sublist in projects_w_subject_df['topic_list'] for
                       i in sublist])

In [8]:
# Network Object Creation
net = extract_network_from_corpus(projects_w_subject_df.topic_list, stop_words="Research approaches")

#Extract communities (discipline aggregates)
topic_communities = community.best_partition(net)



In [9]:
#The community objects are dicts where keys are subjects or topics, and values their communities
#Use this group dict to relabel variables
grouped_topics_dict = {
    0: "Engineering and Technology",
    1: "Life Sciences",
    2: "Physics",
    3: "Arts and Humanities",
    4: "Environmental Sciences",
    5: "Mathematics and Computing",
    6: "Social Sciences"
}

In [10]:
#Lookup disciplines
projects_w_subject_df['aggregated_topics'] = projects_w_subject_df.topic_list.map(
    lambda x: [grouped_topics_dict[topic_communities[i]] for i in x])

#If there are more than 2 disciplines, we call the project Mixed (a.k.a. multi/interdisciplinary)
projects_w_subject_df['topic_classified'] = projects_w_subject_df['aggregated_topics'].map(
    lambda x: list(set(x))[0] if len(set(x))==1 else "Mixed")

#Check outputs
projects_w_subject_df.ix[1:10, ['pkey', 'topic_list', 'aggregated_topics', 'topic_classified']]

Unnamed: 0,pkey,topic_list,aggregated_topics,topic_classified
1,37256,"[Television HTC, Media & Communication Studies]","[Engineering and Technology, Engineering and T...",Engineering and Technology
3,37258,"[New & Emerging Comp. Paradigms, Fundamentals ...","[Arts and Humanities, Arts and Humanities]",Arts and Humanities
7,37262,"[Ageing: chemistry/biochemistry, Animal & huma...","[Life Sciences, Life Sciences, Life Sciences]",Life Sciences
9,37264,"[Agricultural systems, Land - Atmosphere Inter...","[Physics, Physics, Physics, Physics, Physics]",Physics


### Higher Resolution Communities

The community detection above gives just 6 communities. We would like to, if possible, identify higher resolution communities. We can try to accompish this using igraph

In [11]:
# Import igraph
import igraph

In [12]:
# Save the networkx object to gml
write_network(net)

In [17]:
# The following methodology comes from
# http://stackoverflow.com/questions/25254151/using-igraph-in-python-for-community-detection-and-writing-community-number-for

# Read it from igraph. Specify the graph you want here.
g = igraph.read('../../data/networks/gtr_network_201608111227.gml')

# calculate dendrogram
edge_betweenness_dendrogram = g.community_edge_betweenness()

# convert it into a flat clustering
edge_betweenness_clusters = dendrogram.as_clustering()

# get the membership vector
edge_betweenness_membership = clusters.membership

In [195]:
edge_betweenness_clusters.summary()

'Clustering with 607 elements and 54 clusters'

In [176]:
walktrap_dendogram = g.community_walktrap()
walktrap_clusters = walktrap_dendogram.as_clustering()
walktrap_membership = walktrap_clusters.membership

In [194]:
walktrap_clusters.summary()

'Clustering with 607 elements and 7 clusters'

In [197]:
infomap_clusters = g.community_infomap()
infomap_membership = infomap_clusters.membership

In [198]:
infomap_clusters.summary()

'Clustering with 607 elements and 5 clusters'

In [203]:
g.es[1]

igraph.Edge(<igraph.Graph object at 0x132eb2e58>, 1, {'weight': 2.0})

In [219]:
eigen_clusters = g.community_leading_eigenvector()
eigen_clusters_membership = eigen_clusters.membership

In [218]:
eigen_clusters.summary()

'Clustering with 607 elements and 3 clusters'

In [None]:
g.save('test.svg', format='svg', layout='kk', vertex_size=3, height=1000, width=1000)