# Subject Detection with Topic Cooccurrence Networks - Gateway to Research

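This tutorial shows how to detect research subjects (disciplines) in Gateway to Research projects by building a topic cooccurrence network and applying community detection to it.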

## Preamble

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# importing useful Python utility libraries we'll need
import ast
import smart_open

from collections import Counter, defaultdict
import itertools

# matplotlib for static plots
import matplotlib.pyplot as plt
# numpy for mathematical functions
import numpy as np
# pandas for handling tabular data
import pandas as pd

from im_tutorials.utilities import chunks

## Import Data

In [None]:
# Location of the Gateway to Research projects table on S3.
bucket = 'innovation-mapping-tutorials'
gtr_projects_key = 'gateway-to-research/gtr_projects.csv'
list_cols = ['research_topics', 'research_subjects']
# We use ast.literal_eval to convert the two columns above from
# string representations of lists to actual lists.
# The remote file is opened in a `with` block so the handle is closed
# as soon as pandas has finished reading it.
with smart_open.smart_open('s3://{}/{}'.format(bucket, gtr_projects_key)) as gtr_projects_file:
    gtr_projects_df = pd.read_csv(
        gtr_projects_file,
        converters={k: ast.literal_eval for k in list_cols}
    )

In [None]:
gtr_projects_df.head()

In [None]:
# Flatten the per-project lists of research subjects and research topics,
# then count how often each distinct value occurs.
# `itertools.chain` is referenced through the module because only
# `import itertools` is in scope (a bare `chain` would raise NameError).
research_subject_counter = Counter(
    itertools.chain.from_iterable(gtr_projects_df['research_subjects']))
research_topic_counter = Counter(
    itertools.chain.from_iterable(gtr_projects_df['research_topics']))
print('There are {} unique research subjects in the GtR projects dataset.'.format(len(research_subject_counter)))
print('There are {} unique research topics in the GtR projects dataset.'.format(len(research_topic_counter)))

In [None]:
# Print the 20 most frequent research topics as two aligned columns.
print("Top Research Topics by Frequency", '\n')
print('{:<40}{}'.format('Topic', 'Frequency'))
for k, v in research_topic_counter.most_common(20):
    print('{:<40}{}'.format(k, v))

# Median count taken over all distinct topics.
print('\nMedian Topic Frequency:')
print(np.median(list(research_topic_counter.values())))

We can see that the top research topic is _Climate & Climate Change_ by some margin. However, we can also see that the top spots are populated by topics from several disciplines. 50% of the topics occur 69 times or fewer, again highlighting the skewness of the distribution.

### Field Definition Through Community Detection

We are going to define communities of research topics as groups of topics which commonly occur together. An effective way of finding these clusters, and visualising the results, is by creating a topic cooccurrence graph.

A cooccurrence graph is a network structure, where nodes are elements and an edge represents the elements of two nodes having cooccurred at least once. The edges can then be "weighted" by the frequencies of each cooccurring pair. In the case of our research projects, we can say that two topics have cooccurred if they appear in at least one project together. To find all cooccurrences we therefore need to find the pairwise combinations of research topics for every project. For example, a single project with the topics
```
['Materials Characterisation', 'High Performance Computing', 'Condensed Matter Physics']
```

will become a set of topic pairs:

In [None]:
# itertools.combinations yields every unordered selection of r elements
# from an iterable; with r=2 it produces all possible pairs.
# It is referenced through the module because only `import itertools`
# is in scope.
list(itertools.combinations(['Materials Characterisation', 'High Performance Computing', 'Condensed Matter Physics'], 2))

These cooccurrences would form a triangular network, where each edge has a frequency weight of 1.

To create a cooccurrence network across all projects, we need to repeat this process for every project. We can do this in a Python list comprehension, and then chain together all of the cooccurring pairs into one long list.

In [None]:
# Generate every pairwise combination of research topics from each project
# and collect them in one flat list (project order, then pair order).
# Each pair is sorted alphabetically to make sure that there is only one
# possible permutation of each edge.
cooccurrences = [
    tuple(sorted(pair))
    for topics in gtr_projects_df['research_topics']
    for pair in itertools.combinations(topics, 2)
]
# Count the frequency of each cooccurring pair.
research_topic_co_counter = Counter(cooccurrences)

In [None]:
# Print the 20 most frequent topic pairs as two aligned columns.
print("Top Research Topic Cooccurrences by Frequency", '\n')
print('{:<70}{}'.format('Cooccurrence', 'Frequency'))
for k, v in research_topic_co_counter.most_common(20):
    print('{:<70}{}'.format((k[0] + ' + ' + k[1]), v))

# Median count taken over all distinct topic pairs.
print('\nMedian Topic Cooccurrence Frequency:')
print(np.median(list(research_topic_co_counter.values())))

Looking at the most frequently cooccurring topics we can see pairs that make intuitive sense and are all generally captured neatly within higher order academic disciplines.

However this, along with the individual topic frequencies, also shows us that using the cooccurrence frequency as our edge weight might not be such a good idea. High frequency elements are simply more likely to cooccur due to chance. Therefore we should normalise our edge weights. One method for this is to calculate the association strength, a probabilistic measure, where the cooccurrence frequency is normalised by the product of the individual terms' occurrence counts. It is defined as

$$ a = \frac{2 n c_{ij}}{o_{i}o_{j}} $$

where $n$ is the total number of elements, $c_{ij}$ is the number of cooccurrences between elements $i$ and $j$, and $o_{i}$ and $o_{j}$ are the individual frequency counts of each element.

In [None]:
def association_strength(combo, occurrences, cooccurrences, total):
    """Return the association strength of one cooccurring pair.

    Computes ``2 * total * c_ij / (o_i * o_j)``.

    Args:
        combo: The pair; ``combo[0]`` and ``combo[1]`` are the two elements
            and ``combo`` itself is the key into ``cooccurrences``.
        occurrences: Mapping from element to its individual count.
        cooccurrences: Mapping from pair to its cooccurrence count.
        total: The normalising total ``n`` in the formula.

    Returns:
        The association strength as a float.
    """
    numerator = 2 * total * cooccurrences[combo]
    first_count = occurrences[combo[0]]
    second_count = occurrences[combo[1]]
    return numerator / (first_count * second_count)

To build our cooccurrence network, we need to generate a list of unique edges from our long list of cooccurrences and then calculate the association strength for each edge.

In [None]:
# Generate a set of cooccurrences (the unique pairs).
# This will form the edges of our cooccurrence graph.
edges = set(cooccurrences)
# Calculate the total number of elements, i.e. the number of topic
# occurrences summed over all projects. Summing the list lengths avoids
# materialising a flattened copy (and the undefined bare `chain`).
n = sum(len(topics) for topics in gtr_projects_df['research_topics'])
# Calculate the association strength for each edge.
# We take log10 of the association strength to reduce the skew of
# its distribution.
assoc_strengths = np.log10([
    association_strength(
        edge,
        research_topic_counter,
        research_topic_co_counter,
        n)
    for edge in edges
])

In [None]:
# Histogram (100 bins) of the edge weights.
# NOTE: `assoc_strengths` holds log10-transformed values, so the x axis is
# on a log10 scale even though the axis label does not say so.
fig, ax = plt.subplots()
ax.hist(assoc_strengths, bins=100)
ax.set_xlabel('Association Strength')
plt.show()

### Build the Graph

In [None]:
import networkx as nx

In [None]:
# Build (source, target, strength) triples. `edges` is not modified between
# computing `assoc_strengths` and this zip, so both iterate in the same order.
weighted_edges = [
    (source, target, strength)
    for (source, target), strength in zip(edges, assoc_strengths)
]

# (Alternative weighting, not used here: the raw cooccurrence counts held
# in research_topic_co_counter.)

# Undirected graph whose edges store the value under 'association_strength'.
g = nx.Graph()
g.add_weighted_edges_from(weighted_edges, weight='association_strength')

In [None]:
print(g.edges[('Materials Characterisation', 'Materials Synthesis & Growth')])

### Community Detection

In [None]:
import community

In [None]:
# Partition the topic graph into communities (fixed seed for repeatability)
# and report how many distinct community IDs were assigned.
# NOTE(review): the edges of `g` were added with
# weight='association_strength', but weight='weight' is requested here.
# No edge has a 'weight' attribute, so this presumably treats every edge
# as equally weighted. Confirm before changing: the stored strengths are
# log10 values and may be negative, and the hand-written community labels
# further down depend on the current output.
part = community.best_partition(g, resolution=0.6, random_state=42, weight='weight')
n_communities = len(set(part.values()))
print('{} communities detected.'.format(n_communities))

### Interactive Network Visualisation

First we add some extra properties to the nodes in our graph.

In [None]:
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.palettes import Category20, Spectral4
from bokeh.models import Circle, MultiLine, HoverTool, TapTool
from bokeh.models.graphs import from_networkx, NodesAndLinkedEdges

output_notebook()

In [None]:
# Store each node's own label under 'topic_name' (used by the hover tooltip).
names = dict(zip(part, part))
nx.set_node_attributes(g, names, name='topic_name')
# Colour every node by its community, using the Category20 palette that has
# exactly `n_communities` entries.
community_colors = {
    topic: Category20[n_communities][community_id]
    for topic, community_id in part.items()
}
nx.set_node_attributes(g, community_colors, name='color')

# Spot-check the attributes of one node.
print(g.nodes['Materials Characterisation'])

Then we calculate positions for the visual graph layout.

In [None]:
pos = nx.spring_layout(g, weight='association_strength', scale=2, random_state=42)

`bokeh` has built-in support for `networkx` graphs, which makes plotting and interacting with them easy.

In [None]:
# Canvas slightly larger than the layout extent (the layout uses scale=2).
plot = figure(title="Research Topic Cooccurrence Network",
              x_range=(-2.1,2.1), y_range=(-2.1,2.1),
             )

# Turn the networkx graph plus the precomputed positions into a bokeh
# graph renderer.
graph_renderer = from_networkx(g, pos, center=(0,0))
# Node glyphs: fill colour is read from the 'color' field, i.e. the node
# attribute set earlier; separate glyphs cover the selected, hovered and
# muted states.
graph_renderer.node_renderer.glyph = Circle(size=7, fill_color='color', line_color=None)
graph_renderer.node_renderer.selection_glyph = Circle(size=7, fill_color='color')
graph_renderer.node_renderer.hover_glyph = Circle(size=7, fill_color='color')
graph_renderer.node_renderer.muted_glyph = Circle(size=7, fill_color='color', fill_alpha=0.9)


# Edge glyphs: faint grey by default, highlighted when selected or hovered.
graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.2, line_width=1)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=1.5)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=1.5)

# Selecting a node also selects the edges attached to it.
graph_renderer.selection_policy = NodesAndLinkedEdges()

# Tooltip shows the 'topic_name' field; TapTool enables click selection.
node_hover_tool = HoverTool(tooltips=[("Topic", "@topic_name")])
plot.add_tools(node_hover_tool, TapTool())

plot.renderers.append(graph_renderer)

show(plot)

### Investigating the Communities

Let's manually inspect the topics in each community to see if we can see what disciplines they might form.

In [None]:
# Invert the node -> community mapping into community -> list of topics.
reverse_part = defaultdict(list)
for topic, community_id in part.items():
    reverse_part[community_id].append(topic)

# Print each community ID followed by its topics, four topics per line.
for community_id, members in reverse_part.items():
    print(community_id)
    for group in chunks(members, 4):
        print(', '.join(group))
    print('')

We can now create a community ID to discipline mapping.

In [None]:
# Hand-assigned discipline label for each community ID (0-8).
# Several communities (0, 4 and 8) share the 'social_sciences' label.
# NOTE(review): these IDs presumably correspond to the partition printed
# above; if the data, seed or library version changes, the numbering may
# change too, so re-check the mapping against the printed topic lists.
community_discipline_map = {
    0: 'social_sciences',
    1: 'arts_humanities',
    2: 'environmental_sciences',
    3: 'maths_computing_ee',
    4: 'social_sciences',
    5: 'biological_sciences',
    6: 'physics_chemistry_engineering',
    7: 'astro_particle_physics',
    8: 'social_sciences'
}

In [None]:
topic_discipline_mapping = {top: community_discipline_map[disc] for top, disc in part.items()}

### Assigning Subjects to Projects

In [None]:
# Map topics to disciplines using pandas' apply method on
# the `research_topics` column. Only topics that are nodes of the graph
# have a community (and hence a discipline); a topic that never cooccurs
# with another one is skipped instead of raising a KeyError.
gtr_projects_df['disciplines'] = gtr_projects_df['research_topics'].apply(
    lambda x: [topic_discipline_mapping[val] for val in x
               if val in topic_discipline_mapping])
# Collapse each project's disciplines to a set. Projects funded by MRC or
# NC3Rs are labelled 'medical_sciences' regardless of their topics. The
# column is built in one pass rather than patched afterwards through
# chained indexing (`df[col][mask] = ...`), which may write to a temporary
# copy and leave the frame unchanged.
is_medical = gtr_projects_df['funder_name'].isin(['MRC', 'NC3Rs'])
gtr_projects_df['discipline_set'] = [
    {'medical_sciences'} if medical else set(d)
    for d, medical in zip(gtr_projects_df['disciplines'], is_medical)
]
# Number of distinct disciplines per project.
gtr_projects_df['n_disciplines'] = [len(x) for x in gtr_projects_df['discipline_set']]
# True for exactly one discipline, False for several, NaN when the project
# has no discipline at all (so it is ignored by the mean below).
gtr_projects_df['is_single_discipline'] = [
    np.nan if len(x) == 0 else len(x) == 1
    for x in gtr_projects_df['discipline_set']
]

# Percentage of projects (with at least one discipline) that have exactly one.
gtr_projects_df['is_single_discipline'].mean() * 100

In [None]:
import seaborn as sns

In [None]:
# Every pair of disciplines within each project, including a discipline
# paired with itself (combinations_with_replacement), sorted so that each
# pair has a single canonical ordering.
# `itertools` is referenced through the module because a bare `chain`
# is not in scope.
discipline_cooccurrences = [
    tuple(sorted(pair))
    for disciplines in gtr_projects_df['discipline_set']
    for pair in itertools.combinations_with_replacement(disciplines, 2)
]
# Count the frequency of each cooccurring pair.
discipline_edge_counter = Counter(discipline_cooccurrences)

In [None]:
# Long-form table with one row per discipline pair and its count ...
discipline_pair_rows = [
    (pair[0], pair[1], frequency)
    for pair, frequency in discipline_edge_counter.items()
]
discipline_pairs_long = pd.DataFrame(
    discipline_pair_rows, columns=['subj0', 'subj1', 'count'])
# ... pivoted into a subj0 x subj1 matrix of counts.
discipline_cooccurrence_df = discipline_pairs_long.pivot_table(
    index='subj0', columns='subj1')['count']

In [None]:
def format_discipline_labels(labels):
    """Return display strings for tick labels.

    Each label's text (read via ``get_text()``) has underscores replaced by
    spaces and is converted to title case, e.g. ``'social_sciences'`` becomes
    ``'Social Sciences'``.
    """
    formatted = []
    for label in labels:
        text = label.get_text()
        formatted.append(text.replace('_', ' ').title())
    return formatted

# Annotated heatmap of discipline pair counts (no colour bar; the numbers
# are written into the cells instead).
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(discipline_cooccurrence_df, annot=True, fmt='.0f', ax=ax, cbar=False)
# Replace the raw snake_case tick labels with readable ones.
ax.set_xticklabels(format_discipline_labels(ax.get_xticklabels()), rotation=30, ha='right')
ax.set_yticklabels(format_discipline_labels(ax.get_yticklabels()))
ax.invert_yaxis()
# Empty strings clear the axis titles explicitly rather than relying on
# how None is rendered.
ax.set_xlabel('')
ax.set_ylabel('')
ax.set_title('Discipline Crossover in Multidisciplinary Projects')
plt.show()

In [None]:
# Bar chart of how many projects have each number of distinct disciplines.
fig, ax = plt.subplots()
gtr_projects_df['n_disciplines'].value_counts().plot.bar(color='C0', ax=ax)
plt.show()

## Extra Stuff

In [None]:
class CommunityPartition:
    """Summarise repeated community partitions of a graph.

    The graph is partitioned several times with different seeds, and the
    number of runs in which two nodes share a community becomes the weight
    of the edge between them in a new graph.
    """

    def __init__(self, graph):
        # Graph passed to community.best_partition on every run.
        self.graph = graph

    def edgelist_to_cooccurrence(self, repeats, **best_partition_kwargs):
        """Build the co-membership graph from ``repeats`` partition runs.

        Run ``i`` uses ``random_state=i``; any further keyword arguments are
        forwarded to ``community.best_partition``.

        Returns:
            A ``networkx.Graph`` with one edge per node pair that shared a
            community at least once, weighted by the number of such runs.
        """
        pair_counts = Counter()
        for seed in range(repeats):
            membership = community.best_partition(
                self.graph, random_state=seed, **best_partition_kwargs)
            pair_counts.update(self.partition_to_edgelist(membership))

        consensus = nx.Graph()
        consensus.add_weighted_edges_from(
            [(pair[0], pair[1], count) for pair, count in pair_counts.items()])
        return consensus

    def partition_to_edgelist(self, partition):
        """Return every sorted pair of nodes that share a community.

        ``partition`` maps node -> community ID. Communities with a single
        member contribute no pairs.
        """
        pairs = []
        for members in self.reverse_index_partition(partition).values():
            for pair in itertools.combinations(members, 2):
                pairs.append(tuple(sorted(pair)))
        return pairs

    def reverse_index_partition(self, partition):
        """Invert node -> community ID into community ID -> list of nodes."""
        members_by_community = defaultdict(list)
        for node, community_id in partition.items():
            members_by_community[community_id].append(node)
        return members_by_community

In [None]:
cp = CommunityPartition(g)

In [None]:
c_co = cp.edgelist_to_cooccurrence(5, resolution=.8)

In [None]:
# Partition the co-membership graph `c_co` (fixed seed) using its 'weight'
# edge attribute, then report the number of distinct community IDs.
part_c_co = community.best_partition(c_co, resolution=0.4, random_state=42, weight='weight')
n_c_co_communities = len(set(part_c_co.values()))
print('{} communities detected.'.format(n_c_co_communities))

In [None]:
# Label and colour the nodes of `c_co` from its own partition `part_c_co`
# (rather than from the partition of the original graph), so both
# attributes are derived from the same node set.
names = {topic: topic for topic in part_c_co}
nx.set_node_attributes(c_co, names, name='topic_name')
c_co_community_colors = {k: Category20[n_c_co_communities][c] for k, c in part_c_co.items()}
nx.set_node_attributes(c_co, c_co_community_colors, name='color')

In [None]:
# Layout for the co-membership graph, using its 'weight' edge attribute
# (fixed seed).
pos = nx.spring_layout(c_co, weight='weight', scale=2, random_state=42)

# Canvas slightly larger than the layout extent (scale=2).
plot = figure(title="Research Topic Cooccurrence Network",
              x_range=(-2.1,2.1), y_range=(-2.1,2.1),
             )

# Turn the networkx graph plus positions into a bokeh graph renderer.
graph_renderer = from_networkx(c_co, pos, center=(0,0))
# Node glyphs: fill colour is read from the 'color' field; separate glyphs
# cover the selected, hovered and muted states.
graph_renderer.node_renderer.glyph = Circle(size=7, fill_color='color', line_color=None)
graph_renderer.node_renderer.selection_glyph = Circle(size=7, fill_color='color')
graph_renderer.node_renderer.hover_glyph = Circle(size=7, fill_color='color')
graph_renderer.node_renderer.muted_glyph = Circle(size=7, fill_color='color', fill_alpha=0.9)


# Edge glyphs: faint grey by default, highlighted when selected or hovered.
graph_renderer.edge_renderer.glyph = MultiLine(line_color="#CCCCCC", line_alpha=0.2, line_width=1)
graph_renderer.edge_renderer.selection_glyph = MultiLine(line_color=Spectral4[2], line_width=1.5)
graph_renderer.edge_renderer.hover_glyph = MultiLine(line_color=Spectral4[1], line_width=1.5)

# Selecting a node also selects the edges attached to it.
graph_renderer.selection_policy = NodesAndLinkedEdges()

# Tooltip shows the 'topic_name' field; TapTool enables click selection.
node_hover_tool = HoverTool(tooltips=[("Topic", "@topic_name")])
plot.add_tools(node_hover_tool, TapTool())

plot.renderers.append(graph_renderer)

show(plot)

## Apply to CORDIS

In [None]:
# Load the CORDIS projects table (file name suggests FP7 and H2020).
# NOTE(review): `os` is not imported and `inter_data_path` is not defined
# anywhere in the visible notebook — confirm where they come from.
cordis_projects_df = pd.read_csv(os.path.join(inter_data_path, 'fp7_h2020_projects.csv'))

In [None]:
# Preprocess the first 25 project objectives step by step.
# NOTE(review): remove_markup, normalise_digits, lemmatize, bigram,
# stringify_docs, nlp and bigrammer are not defined or imported in the
# visible notebook — presumably they come from another tutorial; confirm.
cordis_abstracts = [remove_markup(a) for a in cordis_projects_df['objective'][:25]]
cordis_abstracts = [normalise_digits(a) for a in cordis_abstracts]
cordis_abstracts = lemmatize(cordis_abstracts, nlp)
cordis_abstracts = bigram(cordis_abstracts, phraser=bigrammer)
cordis_abstracts = list(stringify_docs(cordis_abstracts))

In [None]:
# Print each prediction next to the raw (unprocessed) objective text it
# was made for, separated by a divider line.
# NOTE(review): `pipe` is not defined in the visible notebook — confirm.
for abstract, pred in zip(cordis_projects_df['objective'][:25], pipe.predict(cordis_abstracts)):
    print(pred)
    print(abstract)
    print('\n==============')

In [None]:
cordis_tfidf_vecs = tfidf.transform(cordis_abstracts)

In [None]:
cordis_subject_probs = rf_random.best_estimator_.predict_proba(cordis_tfidf_vecs)
cordis_subjects = rf_random.best_estimator_.predict(cordis_tfidf_vecs)

In [None]:
subject_probs = np.zeros((len(cordis_projects_df), 8))

In [None]:
# Copy column 0 of each of the 8 per-label probability arrays into the
# matching column of `subject_probs`.
# NOTE(review): `subject_probs` has one row per CORDIS project, while the
# abstracts above were limited to the first 25 — check the shapes agree.
# NOTE(review): column 0 is presumably the probability of the *negative*
# class; confirm whether column 1 was intended.
for i in range(8):
    subject_probs[:, i] = cordis_subject_probs[i][:, 0]

In [None]:
n = 101

In [None]:
cordis_projects_df['objective'][n]

In [None]:
pd.DataFrame(cordis_subjects, columns=mlb.classes_).sum()

## Alternative Feature Selection

In [None]:
# For every discipline, keep the vocabulary terms whose chi2 statistic
# against that discipline's labels is above the 90th percentile.
# NOTE(review): chi2, X_train, y_train and tfidf are not defined in the
# visible notebook — presumably chi2 is sklearn.feature_selection.chi2.
feature_terms = []
indices = np.arange(X_train.shape[1])
# The vocabulary does not change inside the loop, so build the array once
# instead of once per discipline.
feature_names = np.array(tfidf.get_feature_names())
for discipline in y_train.columns:
    # chi2 returns (statistics, p-values); only the statistics are used.
    features_chi2 = chi2(X_train, y_train[discipline])[0]
    # NaN statistics are excluded when computing the threshold.
    threshold = np.percentile(features_chi2[~pd.isnull(features_chi2)], 90)
    discipline_indices = indices[features_chi2 > threshold]
    feature_terms.extend(feature_names[discipline_indices])

In [None]:
tfidf_stop_words = set(tfidf.get_feature_names()).difference(set(feature_terms))

In [None]:
# Refit the vectoriser with every non-selected term treated as a stop word.
# `stop_words` is documented as a list, so the set is converted (sorted for
# a reproducible order) instead of being passed as-is.
# NOTE(review): TfidfVectorizer and abstracts_str are not defined in the
# visible notebook — confirm where they come from.
tfidf = TfidfVectorizer(
#     max_df=0.5, 
    min_df=5, 
    sublinear_tf=True, 
    norm='l2',
    stop_words=sorted(tfidf_stop_words)
)
tfidf_vecs_filt = tfidf.fit_transform(abstracts_str)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vecs_filt, target_binarized, train_size=0.9, test_size=0.1)