# Topic classification with Gateway To Research data

Our input is a database with research projects in the [Gateway to Research website](http://gtr.rcuk.ac.uk/). Some projects contain information about their research subject.

We will use the co-occurrence of research subjects in projects to identify a reduced subset of research domains. We will then use that information to classify projects into research domains depending on their subject distribution.

## 1. Preamble: imports and functions

In [3]:
%matplotlib inline

#Imports
import json
import matplotlib.pyplot as plt
import psycopg2
import pandas as pd
import numpy as np
import seaborn as sns

#For network analysis and community detection
from itertools import combinations
import networkx as nx
import community #Get this package: http://perso.crans.org/aynaud/communities/

#Paths for network and visual outputs.
#Both values are placeholders: replace them with real locations before running the notebook.
net_path = "insert_path_here"
img_path = "insert_path_here"


In [15]:
def extract_network_from_corpus(label_corpus,edge_attr="None",stop_words=None):
    """
    Extracts a network object from a list or series where every observation is a list of co-occurring labels
    (e.g. interacting agents, or co-occurring words).
    In the network output, the labels are nodes and co-occurrences between them are edges.
    Weight is number of co-occurrences. There is the option to consider additional edge attributes.

    Inputs:
    label_corpus: list or pandas Series of documents, where each document is a list of labels.
        It is read by position, so a Series with a non-consecutive index (e.g. a column of a
        filtered data frame) is handled correctly.
    edge_attr: an edge attribute associated to the interaction, given as a named pandas Series aligned
        by position with label_corpus. It defaults to the string "None"; any string means
        "no attribute" (which we use for control flow).
    stop_words: a label, or a collection of labels, to remove (e.g. generic terms). Labels are compared
        for exact equality and every edge touching a stop word is dropped.

    Returns a networkx Graph. The graph is empty when no document contains at least two labels.
    NB a Graph holds a single edge per pair of nodes, so if a pair occurs with several values of
    edge_attr only one (weight, attribute) combination is kept on that edge.

    """
    #A string in edge_attr means "no edge attribute"
    has_attr = not isinstance(edge_attr,str)

    #Materialise the inputs so they are traversed by position rather than by index label
    documents = list(label_corpus)
    if has_attr:
        edge_attr_name = edge_attr.name
        attr_values = list(edge_attr)

    #Normalise stop words: a single string is one label, not a sequence of characters
    if stop_words is None:
        stop_words = []
    elif isinstance(stop_words,str):
        stop_words = [stop_words]
    stop_words = set(stop_words)

    #One record per pair of labels co-occurring in a document.
    #NB we are sorting the pairs to make sure that (a,b) and (b,a) count as the same edge.
    edge_records = []
    for position,labels in enumerate(documents):
        for pair in combinations(labels,2):
            first,second = sorted(pair)

            #Remove stop-words
            if first in stop_words or second in stop_words:
                continue

            record = {'e1':first,'e2':second}

            #Give each edge its corresponding attribute (if we have one)
            if has_attr:
                record[edge_attr_name] = attr_values[position]

            edge_records.append(record)

    #Nothing co-occurs: return an empty graph instead of failing in the groupby below
    if len(edge_records)==0:
        return nx.Graph()

    #Extract weights: number of times each edge (and attribute value, if any) is observed
    group_cols = ['e1','e2']
    if has_attr:
        group_cols.append(edge_attr_name)

    label_edgelist = pd.DataFrame(edge_records).groupby(group_cols).size().reset_index(name='weight')
    label_edgelist = label_edgelist.sort_values('weight',ascending=False)

    #Create label graph. Edges are added one by one so that this does not depend on the pandas
    #helpers of a particular networkx release; weights are stored as plain Python integers.
    edges = []
    for record in label_edgelist.to_dict('records'):
        attributes = {'weight':int(record['weight'])}
        if has_attr:
            attributes[edge_attr_name] = record[edge_attr_name]
        edges.append((record['e1'],record['e2'],attributes))

    label_graph = nx.Graph()
    label_graph.add_edges_from(edges)

    return label_graph
    

## 2. Data collection

In [2]:
#Open a connection to our database.
#The connection settings are read from a JSON config file stored next to the scripts.
with open("../scripts/config.json",'r') as f:
    config = json.load(f)

#Build the connection string from the config entries (missing entries come back as None)
conn_fields = [config.get(key) for key in ("host","database","user","passw")]
conn_string = 'host={} dbname={} user={} password={}'.format(*conn_fields)

#Connect
conn = psycopg2.connect(conn_string)

In [3]:
#Read the data
#We'll read it in chunks
chunk= 500

#Create sql query string
sql_query_str="""
    SELECT
        *
    FROM
        gtr.projects      
"""

#Read the data: with chunksize set, read_sql yields one data frame per chunk
results = pd.read_sql(sql_query_str,con=conn,chunksize=chunk)

#Create df from results.
#The chunks are collected first and concatenated once; appending to a data frame inside
#the loop copies everything read so far at every step.
chunks = list(results)
projects_df = pd.concat(chunks) if len(chunks)>0 else pd.DataFrame()

pd.options.mode.chained_assignment = None  # default='warn'

#reindex (each chunk carries its own index)
projects_df.reset_index(drop=True,inplace=True)

#Select relevant columns and subset.
#An explicit copy is taken so the columns rewritten below belong to an independent data frame.
rel_vars = ['pkey','abstract_texts','grant_cats','href','identifiers',
            'lead_org_dpts','links','research_subjects','research_topics','status']
projects_rel_df = projects_df[rel_vars].copy()

#Extract the json elements from their 1 element-dicts
projects_rel_df['research_subjects'] = projects_rel_df[
    'research_subjects'].map(lambda x: x['researchSubject'])

projects_rel_df['research_topics'] = projects_rel_df[
    'research_topics'].map(lambda x: x['researchTopic'])

In [10]:
#Focus analysis on research grant and fellowship projects
#Subset
projects_academic_df = projects_rel_df[
    projects_rel_df.grant_cats.isin(['Research Grant','Fellowship'])]

#Only consider projects with subject data (drop nas and projects with no research topics).
#The second filter is applied to the result of the first one so that both conditions hold.
projects_w_subject_df = projects_academic_df[projects_academic_df.research_subjects.notnull()]
has_topics = projects_w_subject_df.research_topics.map(len)>0
#Copy the subset so the columns added below do not write into a slice of the parent frame
projects_w_subject_df = projects_w_subject_df[has_topics].copy()

#Extract research subjects and research topics (the 'text' field of each entry).
projects_w_subject_df['subject_list'] = [[i['text'] for i in sublist] for sublist in 
                                       projects_w_subject_df['research_subjects']]
projects_w_subject_df['topic_list'] = [[i['text'] for i in sublist] for sublist in 
                                       projects_w_subject_df['research_topics']]

projects_w_subject_df.head()

Unnamed: 0,pkey,abstract_texts,grant_cats,href,identifiers,lead_org_dpts,links,research_subjects,research_topics,status,subject_list,topic_list
0,51709,The widely acclaimed writer W.G. Sabald is oft...,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/3EBF...,"{u'identifier': [{u'type': u'RCUK', u'value': ...",Modern Languages and Cultures,"{u'link': [{u'otherAttributes': {}, u'href': u...","[{u'text': u'Languages & Literature', u'percen...","[{u'text': u'German, inc. Dutch & Yiddish', u'...",Closed,[Languages & Literature],"[German, inc. Dutch & Yiddish]"
2,51711,Highly concentrated photovoltaic (HCPV) system...,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/3F0F...,"{u'identifier': [{u'type': u'RCUK', u'value': ...",Mechanical Engineering,"{u'link': [{u'otherAttributes': {}, u'href': u...","[{u'text': u'Energy', u'percentage': 100, u'id...","[{u'text': u'Solar Technology', u'percentage':...",Active,[Energy],[Solar Technology]
3,51712,"This multi-partner, bi-lingual community herit...",Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/74A9...,"{u'identifier': [{u'type': u'RCUK', u'value': ...",School of Arts and Humanities,"{u'link': [{u'otherAttributes': {}, u'href': u...","[{u'text': u'History', u'percentage': 0, u'id'...","[{u'text': u'Economic & Social History', u'per...",Closed,[History],[Economic & Social History]
6,51715,Autism Spectrum Conditions (ASCs) are neurodev...,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/76F1...,"{u'identifier': [{u'type': u'RCUK', u'value': ...","Culture, Communication and Media","{u'link': [{u'otherAttributes': {}, u'href': u...","[{u'text': u'Education', u'percentage': 20, u'...","[{u'text': u'Artificial Intelligence', u'perce...",Closed,"[Education, Info. & commun. Technol.]","[Artificial Intelligence, Human-Computer Inter..."
7,51716,Given a Fortran program which evaluates numeri...,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/9B8A...,"{u'identifier': [{u'type': u'RCUK', u'value': ...",Science and Technology RI,"{u'link': [{u'otherAttributes': {}, u'href': u...","[{u'text': u'Info. & commun. Technol.', u'perc...","[{u'text': u'Fundamentals of Computing', u'per...",Closed,[Info. & commun. Technol.],[Fundamentals of Computing]


In [11]:
#Flatten the per-project lists into one long series of subjects and one of topics
subject_values = []
for subject_labels in projects_w_subject_df['subject_list']:
    subject_values.extend(subject_labels)
project_all_subjects = pd.Series(subject_values)

topic_values = []
for topic_labels in projects_w_subject_df['topic_list']:
    topic_values.extend(topic_labels)
projects_all_topics = pd.Series(topic_values)

## 3. Community detection

* We create a network based on subject co-occurrences
* Perform community detection on the set of topics for each project in order to identify a smaller set.
* We can attach these to organisations and regions.

In [1416]:
#Extract communities (discipline aggregates)
#Stop words are given as a list of labels so that each one is matched as a whole label.
#NOTE(review): the community ids returned here are looked up in grouped_topics_dict below;
#confirm the partition (number of communities and their ids) is the same on every run.
topic_communities = community.best_partition(extract_network_from_corpus(projects_w_subject_df.topic_list,
                                            stop_words=["Research approaches"]))




In [1417]:
#Community detection returns a dict mapping each subject or topic to a community id.
#This lookup gives every community id (0 to 6, in list order) a readable discipline name.
grouped_topics_dict = dict(enumerate([
    "Social Sciences",
    "Environmental Sciences",
    "Life Sciences",
    "Arts and Humanities",
    "Physics",
    "Mathematics and Computing",
    "Engineering and Technology",
]))

In [1418]:
#Lookup disciplines
def _lookup_disciplines(topics):
    #Translate each topic into the name of its community.
    #Topics that are not nodes of the community network (stop words, or topics that never
    #co-occur with another topic) have no community and are skipped instead of raising.
    return [grouped_topics_dict[topic_communities[t]] for t in topics if t in topic_communities]

def _classify_disciplines(disciplines):
    #One distinct discipline -> that discipline; several -> "Mixed"; none -> missing value
    distinct = set(disciplines)
    if len(distinct)==0:
        return None
    if len(distinct)==1:
        return distinct.pop()
    return "Mixed"

projects_w_subject_df['aggregated_topics'] = projects_w_subject_df.topic_list.map(_lookup_disciplines)

#If there is more than one discipline, we call the project Mixed (a.k.a. multi/interdisciplinary)
projects_w_subject_df['topic_classified'] = projects_w_subject_df['aggregated_topics'].map(
    _classify_disciplines)

#Check outputs (label-based slice on the index)
projects_w_subject_df.loc[1:10,['pkey','topic_list','aggregated_topics','topic_classified']]

Unnamed: 0,pkey,topic_list,aggregated_topics,topic_classified
2,51711,[Solar Technology],[Engineering and Technology],Engineering and Technology
3,51712,[Economic & Social History],[Arts and Humanities],Arts and Humanities
6,51715,"[Artificial Intelligence, Human-Computer Inter...","[Mathematics and Computing, Mathematics and Co...",Mixed
7,51716,[Fundamentals of Computing],[Mathematics and Computing],Mathematics and Computing
8,51717,[Data Handling & Storage],[Physics],Physics
9,51718,"[Climate & Climate Change, Pollution, Hydrolog...","[Environmental Sciences, Environmental Science...",Environmental Sciences
10,51719,"[Pollution, Boundary Layer Meteorology, Land -...","[Environmental Sciences, Environmental Science...",Environmental Sciences


In [1419]:
#Attach the topic labels to the full project table.
#A left join on pkey keeps every project, including those without topic labels.
label_cols = ['pkey','topic_list','aggregated_topics','topic_classified']
project_labelled_df = pd.merge(projects_df,projects_w_subject_df[label_cols],
                               on="pkey",how='left')
project_labelled_df.head()

Unnamed: 0,pkey,abstract_texts,created,grant_cats,href,health_categories,id,identifiers,lead_org_dpts,links,potential_impacts,research_activities,research_subjects,research_topics,status,titles,tech_abstracts,topic_list,aggregated_topics,topic_classified
0,51709,The widely acclaimed writer W.G. Sabald is oft...,2016-03-31 19:16:25,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/3EBF...,{u'healthCategory': []},3EBFD795-CD31-4850-9676-C08B769B3A46,"{u'identifier': [{u'type': u'RCUK', u'value': ...",Modern Languages and Cultures,"{u'link': [{u'otherAttributes': {}, u'href': u...",,{u'researchActivity': []},{u'researchSubject': [{u'text': u'Languages & ...,"{u'researchTopic': [{u'text': u'German, inc. D...",Closed,W.G. Sebald: Modernity and the Archive,,"[German, inc. Dutch & Yiddish]",[Arts and Humanities],Arts and Humanities
1,51710,Lignin is a natural polymer in abundance and p...,2016-03-31 19:16:25,Innovation Voucher,http://gtr.rcuk.ac.uk:80/gtr/api/projects/3F0E...,{u'healthCategory': []},3F0EC91C-7E09-4EE7-ABEB-BE1BA33556A6,"{u'identifier': [{u'type': u'RCUK', u'value': ...",,"{u'link': [{u'otherAttributes': {}, u'href': u...",,{u'researchActivity': []},{u'researchSubject': []},{u'researchTopic': []},Closed,Lignin: Crop Intellect,,,,
2,51711,Highly concentrated photovoltaic (HCPV) system...,2016-03-31 19:16:25,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/3F0F...,{u'healthCategory': []},3F0F02FA-2942-4D17-A01E-BD3DD5C08B35,"{u'identifier': [{u'type': u'RCUK', u'value': ...",Mechanical Engineering,"{u'link': [{u'otherAttributes': {}, u'href': u...",The impact of our proposed international colla...,{u'researchActivity': []},"{u'researchSubject': [{u'text': u'Energy', u'p...",{u'researchTopic': [{u'text': u'Solar Technolo...,Active,Newton Fund-Integrating water cooled concentra...,,[Solar Technology],[Engineering and Technology],Engineering and Technology
3,51712,"This multi-partner, bi-lingual community herit...",2016-03-31 19:16:25,Research Grant,http://gtr.rcuk.ac.uk:80/gtr/api/projects/74A9...,{u'healthCategory': []},74A9A074-1577-4116-82CD-D1A12178C5B7,"{u'identifier': [{u'type': u'RCUK', u'value': ...",School of Arts and Humanities,"{u'link': [{u'otherAttributes': {}, u'href': u...","First and foremost, the seven All our Stories ...",{u'researchActivity': []},"{u'researchSubject': [{u'text': u'History', u'...",{u'researchTopic': [{u'text': u'Economic & Soc...,Closed,Cymunedau Cysylltiedig 2: Researching the Indu...,,[Economic & Social History],[Arts and Humanities],Arts and Humanities
4,51713,Fleetfoot is our driver engagement and behavio...,2016-03-31 19:16:25,Feasibility Study,http://gtr.rcuk.ac.uk:80/gtr/api/projects/74E1...,{u'healthCategory': []},74E11FB1-09B2-4A0D-92D9-D00DCC71799D,"{u'identifier': [{u'type': u'RCUK', u'value': ...",,"{u'link': [{u'otherAttributes': {}, u'href': u...",,{u'researchActivity': []},{u'researchSubject': []},{u'researchTopic': []},Active,Harnessing Driver Data in Rental Vehicles,,,,


In [1420]:
#Add attributes to networkx and output for plotting
topic_network = extract_network_from_corpus(projects_w_subject_df.topic_list)

#Per-node attribute dictionaries: exposed as `node` by older networkx releases and as
#`nodes` by newer ones, so use whichever this installation provides.
node_attributes = topic_network.node if hasattr(topic_network,"node") else topic_network.nodes

#Add labels
#For each label in a node
for topic in topic_network.nodes():
    #Add a category attribute based on the grouped_topic_dict and topic_communities lookups.
    #This network is built without stop words whereas topic_communities was computed with them,
    #so a node may have no community; such nodes are left without a category.
    if topic in topic_communities:
        node_attributes[topic]["category"] = grouped_topics_dict[topic_communities[topic]]



In [1421]:
#Export the labelled network for plotting.
#The file goes into net_path (set in the preamble); joining the path works whether or not
#net_path ends with a separator.
import os
nx.write_graphml(topic_network,os.path.join(net_path,"topic_network_14may2016.graphml"))

## Quick network graph via gephi

### Node size
* Degree centrality (number of edges with other topics)

### Colours = subjects
* Orange = Life sciences
* Blue = Engineering + Technology
* Red = Physics
* Dark brown = Environmental sciences
* Dark green = Maths and computing
* Light green = Arts and Humanities
* Purple = Social sciences

<img src="http://i.imgur.com/QdTsGW6.png">