In [1]:
%reload_ext autoreload
%autoreload 2

import hypernetx as hnx
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import xarray as xr
from python.cogtext.datasets.pubmed import PubMedDataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm
from umap import UMAP

import seaborn as sns; sns.set_theme()  # noqa


In [2]:
# Load the PubMed label metadata and the per-abstract topic embeddings,
# then join them on `pmid` (takes ~ 20sec).

pubmed_loader = PubMedDataLoader(
    preprocessed=False,
    drop_low_occurred_labels=False,
    usecols=['pmid', 'subcategory', 'category'],
)
PUBMED = pubmed_loader.load()

EMBEDDINGS = xr.open_dataset('models/gpt3/abstracts_gpt3ada.nc')

# One row per `pmid` coordinate of the embeddings dataset.
# NOTE(review): assumes `topic_weights` and `topics` are ordered along the same
# axis as the `pmid` coordinate — confirm against the file that produced the .nc.
doc_topic_embeddings = EMBEDDINGS.coords['pmid'].to_dataframe().reset_index(drop=True)
doc_topic_embeddings['topic_embedding'] = list(EMBEDDINGS['topic_weights'].values)
doc_topic_embeddings['topic'] = list(EMBEDDINGS['topics'].values)

# Keep only documents that have an embedding and whose topic is not -1
# (presumably the "no topic assigned" marker — verify against the topic model).
PUBMED = (
    PUBMED
    .merge(doc_topic_embeddings, on='pmid', how='left')
    .dropna(subset=['topic_embedding'])
    .query('topic != -1')
)

print(f'Successfully load a dataset of {len(PUBMED)} topic-embeddings.')

Successfully load a dataset of 300251 topic-embeddings.


In [3]:
# Unique labels of the rows whose `category` contains "Task" / "Construct".
task_labels = PUBMED.loc[PUBMED['category'].str.contains("Task"), 'label'].unique()
construct_labels = PUBMED.loc[PUBMED['category'].str.contains("Construct"), 'label'].unique()

In [95]:
# Build a task-by-construct structure from the topic embeddings:
#   1. average the document embeddings per (category, label) node,
#   2. compute pairwise cosine similarity between the node embeddings,
#   3. keep task/construct pairs whose similarity is unusually high,
#   4. turn the surviving pairs into a hypergraph and export its edges to CSV.

# A task/construct pair is kept when its similarity exceeds
# mean + SIMILARITY_N_STD * std of all task/construct similarities.
SIMILARITY_N_STD = 2

data = PUBMED.drop(columns=['topic'])

# Mean topic embedding of all documents sharing a (category, label).
node_embeddings = (
    data
    .groupby(['category', 'label'])['topic_embedding']
    .apply(lambda x: x.mean(axis=0))
    .reset_index()
)

X = np.stack(node_embeddings['topic_embedding'].values)

# Square label-by-label cosine-similarity matrix.
sim_matrix = pd.DataFrame(
    cosine_similarity(X),
    columns=node_embeddings['label'],
    index=node_embeddings['label'],
)

# Optional overview of the full matrix:
# sns.clustermap(sim_matrix, cmap='RdBu', figsize=(25,28))

# Dropping task columns and construct rows leaves rows = tasks, columns = constructs.
sim_matrix = (
    sim_matrix
    .drop(columns=task_labels, index=construct_labels)
    .rename_axis(index='task', columns='construct')
)

# Long format: one row per (task, construct, similarity).
sim_long = sim_matrix.stack().reset_index().rename(columns={0: 'similarity'})

# `sim` ends up as the long table restricted to the high-similarity pairs.
sim = sim_long.query(
    'similarity > similarity.mean() + @SIMILARITY_N_STD * similarity.std()'
)

# Back to wide form; pairs removed by the threshold become NaN.
adj = sim.pivot(index='task', columns='construct', values='similarity')

# NOTE(review): relies on hypernetx's `from_dataframe` treating the index as
# nodes and the columns as hyperedges, with NaN cells meaning "no incidence" —
# confirm against the installed hypernetx version.
HG = hnx.Hypergraph.from_dataframe(adj)

# Optional drawing of the hypergraph:
# fig, ax = plt.subplots(figsize=(30,30))
# hnx.draw(HG, ax=ax)

# Flatten the hypergraph's incidence table into (task, construct) edge rows.
# NOTE(review): the rename assumes `HG.dataframe()` has unnamed index/columns so
# that `reset_index()` yields `level_0` / `level_1` — confirm.
hg_df = (
    HG.dataframe()
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'task',
                     'level_1': 'construct',
                     0: 'has_edge'})
)

hg_df = hg_df.query('has_edge == 1').drop(columns='has_edge')

hg_df.to_csv('~/hg.csv', index=False)