In [1]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from umap import UMAP
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme()  # noqa
import dash_bio as dashbio
from python.cogtext.datasets.pubmed import PubMedDataLoader
from python.cogtext.similarity_matrix import get_similarity_matrix
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [2]:
# load weights, clusters and metadata (takes ~ 20sec)

clusters = pd.read_csv('models/gpt3/abstracts_gpt3ada_clusters.csv.gz', index_col=0)

# Read the array inside a `with` block so the .npz archive is closed right away.
with np.load('models/gpt3/abstracts_gpt3ada_weights.npz') as weights_archive:
  weights = weights_archive['arr_0']

# Positional assignment: assumes the rows of `weights` are in the same order as
# the rows of `clusters` -- TODO confirm against the code that wrote both files.
clusters['weights'] = list(weights)

# load abstracts
pubmed = PubMedDataLoader(preprocessed=False, drop_low_occurred_labels=False).load()
pubmed = pubmed.merge(clusters, on='pmid', how='left')

# Keep only the rows that received a cluster from the left merge.
pubmed = pubmed.dropna(subset=['cluster'])

# Unique labels per category; the category names contain either "Task" or "Construct".
tasks = pubmed.query('category.str.contains("Task")')['label'].unique()
constructs = pubmed.query('category.str.contains("Construct")')['label'].unique()

print(f'Successfully loaded a list of {len(pubmed)} topic-embeddings.')

Successfully loaded a list of 293014 topic-embeddings.


In [3]:
from sklearn.metrics.pairwise import pairwise_kernels

# One feature entry per (category, label) pair: np.mean applied to the
# `weights` values of all abstracts in that group.
weights_by_label = pubmed.groupby(['category', 'label'])['weights']
node_features = weights_by_label.apply(np.mean)


def calc_similarity(a,b):
  """Return the cosine kernel between two 1-D vectors as a single scalar.

  Both vectors are reshaped into single-row matrices because
  `pairwise_kernels` works on 2-D inputs; the only entry of the resulting
  1x1 kernel matrix is returned.
  """
  row_a = a.reshape(1, -1)
  row_b = b.reshape(1, -1)
  kernel = pairwise_kernels(row_a, row_b, metric='cosine')
  return kernel[0][0]

# Expand every feature vector into its own row, transpose so that each label is
# a column, and let DataFrame.corr evaluate calc_similarity for every column pair.
feature_matrix = node_features.apply(pd.Series)
sim = feature_matrix.T.corr(method=calc_similarity)
sim

Unnamed: 0_level_0,category,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,CognitiveConstruct,...,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask,CognitiveTask
Unnamed: 0_level_1,label,Attention,AttentionalControl,AttentionalInertia,BehavioralControl,BehavioralRegulation,Binding,CentralExecutive,CognitiveFlexibility,CognitiveInhibition,CogntiveControl,...,StimSSS,StopSignalTask,Stroop,TMT_-_Trail_Making_Task,TaskSwitching,TowerOfHanoi,TowerOfLondon,TwoStep,Verbal_fluency_task,WCST_-_Wisconsin_Card_Sort_Test
category,label,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
CognitiveConstruct,Attention,1.000000,0.752932,0.295641,0.483556,0.430609,0.130825,0.612105,0.727945,0.614340,0.673749,...,0.094237,0.277683,0.656375,0.588599,0.338202,0.613160,0.558000,0.085150,0.610400,0.459221
CognitiveConstruct,AttentionalControl,0.752932,1.000000,0.245826,0.407276,0.325410,0.106058,0.548220,0.607454,0.545379,0.697154,...,0.116986,0.257208,0.624316,0.458210,0.413081,0.511157,0.445758,0.056423,0.467205,0.361333
CognitiveConstruct,AttentionalInertia,0.295641,0.245826,1.000000,0.129154,0.114072,0.031252,0.136751,0.241828,0.282768,0.230889,...,0.036845,0.075522,0.177773,0.106224,0.411305,0.154857,0.106378,0.049944,0.117599,0.068427
CognitiveConstruct,BehavioralControl,0.483556,0.407276,0.129154,1.000000,0.530490,0.058796,0.233511,0.373270,0.296424,0.326214,...,0.082714,0.185369,0.283505,0.229247,0.136315,0.264296,0.215750,0.032941,0.237450,0.147182
CognitiveConstruct,BehavioralRegulation,0.430609,0.325410,0.114072,0.530490,1.000000,0.047390,0.213489,0.371935,0.249873,0.264224,...,0.032234,0.132225,0.250542,0.204852,0.116077,0.234020,0.236616,0.028873,0.219875,0.123726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CognitiveTask,TowerOfHanoi,0.613160,0.511157,0.154857,0.264296,0.234020,0.096193,0.572009,0.623308,0.573776,0.529309,...,0.057285,0.199276,0.597647,0.606739,0.289196,1.000000,0.672605,0.038179,0.653462,0.652692
CognitiveTask,TowerOfLondon,0.558000,0.445758,0.106378,0.215750,0.236616,0.094464,0.643257,0.650526,0.612336,0.487086,...,0.045447,0.192500,0.603550,0.701378,0.238993,0.672605,1.000000,0.026739,0.760810,0.849813
CognitiveTask,TwoStep,0.085150,0.056423,0.049944,0.032941,0.028873,0.015965,0.034573,0.050176,0.063463,0.057728,...,0.008971,0.021352,0.048498,0.033977,0.118708,0.038179,0.026739,1.000000,0.036083,0.020179
CognitiveTask,Verbal_fluency_task,0.610400,0.467205,0.117599,0.237450,0.219875,0.095398,0.588772,0.648358,0.577283,0.469523,...,0.049980,0.166557,0.663901,0.826354,0.243050,0.653462,0.760810,0.036083,1.000000,0.742899


The clustergram below shows the pairwise cosine similarity between construct vectors, regardless of the tasks the constructs rely on.

In [4]:
# Reduce the (category, label) MultiIndex to plain labels on both axes, then
# drop every task row and column; labels absent from an axis are ignored.
label_sim = sim.droplevel(0).droplevel(0,1)
plot_data = label_sim.drop(index=tasks, columns=tasks, errors='ignore')

# Display options shared by rows and columns: both are clustered, using cosine distance.
clustergram_options = dict(
  cluster='all',
  center_values=False,
  height=800,
  width=900,
  display_ratio=[0.001, 0.001],
  color_map='turbo',
  # hidden_labels=['col'],
  row_dist='cosine',
  col_dist='cosine',
  # standardize='col'
)

dashbio.Clustergram(
  data=plot_data,
  column_labels=plot_data.columns.tolist(),
  row_labels=plot_data.index.tolist(),
  **clustergram_options
)


In [5]:
# create a graph

import stellargraph as sg
from stellargraph.data import UniformRandomMetaPathWalk

sim.columns.name = 'source'
sim.index.name = 'target'

# Long-format edge list: drop the category level on both axes so that nodes are
# identified by label only, then stack the matrix into one row per cell.
# NOTE(review): the whole matrix is stacked, so every pair of labels shows up
# twice, once per direction. The disabled mask below looks like an attempt to
# keep a single triangle -- confirm whether duplicated edges are intended.
# keep = np.triu(np.ones(sim.shape)).astype('bool').reshape(sim.size)
adj = sim.droplevel(0).droplevel(0,1).stack()
adj.index.names = ['source', 'target']
adj = adj.to_frame().rename(columns={0:'weight'}).reset_index()

# remove self-loops
adj = adj.query('target != source')

# TODO use separate filtering for each of the edge types
# Keep only the edges whose weight is at least one standard deviation above the
# median weight; the threshold is computed once and reused in the query.
edge_threshold = adj['weight'].median() + adj['weight'].std()
edges = adj.query('weight >= @edge_threshold')

# Node features per node type, indexed by label only (the category level is dropped).
label_features = node_features.apply(pd.Series)
tasks = label_features.query('category == "CognitiveTask"').reset_index(0,drop=True)
constructs = label_features.query('category == "CognitiveConstruct"').reset_index(0,drop=True)

G = sg.StellarGraph(
  nodes = {'task': tasks,
           'construct': constructs},
  edges=edges)
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 170, Edges: 5916

 Node types:
  task: [98]
    Features: float32 vector, length 509
    Edge types: task-default->construct, task-default->task
  construct: [72]
    Features: float32 vector, length 509
    Edge types: construct-default->construct, construct-default->task

 Edge types:
    construct-default->task: [2790]
        Weights: range=[0.297252, 0.960602], mean=0.442998, std=0.117636
        Features: none
    construct-default->construct: [1912]
        Weights: range=[0.297113, 0.967654], mean=0.457343, std=0.121061
        Features: none
    task-default->task: [1214]
        Weights: range=[0.297287, 0.9871], mean=0.461797, std=0.1414
        Features: none


2022-01-26 13:53:26.945397: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Now we use Metapath2Vec for the graph embedding.

In [83]:
# %pip install csrgraph


import networkx as nx
import csrgraph as cg

def weighted_metapath_random_walk(graph,
                                  walk_length:int = None, n_walks: int = None, metapaths = None):
  """Sample unique random walks whose node-type sequence matches a metapath.

  Walks are drawn in batches with csrgraph and filtered by node type; matching
  walks are accumulated (without duplicates, in first-seen order) across
  batches until `n_walks` of them are available or 1000 batches were drawn.

  Args:
    graph: networkx-style graph; `graph.nodes()` must list the node labels and
      `graph.nodes[label]['type']` must give the type of each node.
    walk_length: number of nodes per walk, forwarded to csrgraph.
    n_walks: number of walks to return. It is also forwarded as the second
      positional argument of csrgraph's `random_walks` (presumably the number
      of walks started per node -- confirm against the csrgraph docs).
    metapaths: iterable of node-type sequences; a walk is kept only if its
      sequence of node types equals one of them.

  Returns:
    A list of at most `n_walks` walks, each a list of node labels. The list is
    shorter (possibly empty) when too few distinct matching walks were found.
  """
  _g = cg.csrgraph(graph)

  # TODO walks of length>len(metapaths[0]) are also valid if it uses the same methapath

  # csrgraph walks hold integer node positions; look labels and types up once.
  node_labels = np.array(list(graph.nodes()))
  node_types = [graph.nodes[label]['type'] for label in node_labels]
  allowed_paths = {tuple(path) for path in metapaths}

  # dict keys serve as an insertion-ordered set of the walks found so far
  unique_walks = {}
  max_iter = 1000

  for _ in range(max_iter):
    for walk in _g.random_walks(walk_length, n_walks, start_nodes=None):
      if tuple(node_types[node] for node in walk) in allowed_paths:
        unique_walks[tuple(node_labels[walk].tolist())] = None

    if len(unique_walks) >= n_walks:
      break

  return [list(walk) for walk in list(unique_walks)[:n_walks]]


# Node-type patterns a walk has to follow; the task-centred pattern is disabled.
metapaths = [
    # ['task', 'construct', 'task'],
    ['construct', 'task', 'construct'],
]

# The walker expects a networkx graph carrying each node's type in the 'type' attribute.
nx_graph = G.to_networkx(node_type_attr='type')
walks = weighted_metapath_random_walk(nx_graph, walk_length=3, n_walks=1000, metapaths=metapaths)

print(f'Generated {len(walks)} random walks.')


from gensim.models import Word2Vec
# Skip-gram (sg=1) embedding of the node labels, treating every walk as a sentence.
word2vec_options = dict(
  vector_size=8,
  min_count=0,
  window=3,
  sg=1,
  workers=1,
  epochs=10000,
)
model = Word2Vec(walks, **word2vec_options)

Generated 1000 random walks.


In [96]:
# Nearest neighbours of one task label in the learned embedding space.
query_label = 'CategoryFluencyTask'
model.wv.most_similar(query_label)

[('Semantic_Fluency_test', 0.9444797039031982),
 ('Verbal_fluency_task', 0.9288606643676758),
 ('AttentionNetworkTest', 0.9154469966888428),
 ('TMT_-_Trail_Making_Task', 0.9137340784072876),
 ('Span_Task', 0.8961082100868225),
 ('Sorting_task', 0.8767808079719543),
 ('CorsiBlockTask', 0.8364273309707642),
 ('AntiSaccadeTask', 0.831874430179596),
 ('NBackTask', 0.8313265442848206),
 ('ContiniousPerformanceTask', 0.8152111172676086)]