# 3 Topic Embedding

This notebook implements an analysis that first produces a topic embedding for cognitive tasks and constructs. Topic embedding refers to the probabilities of assigning a topic to a given task/construct corpus. For example, task A could be assigned the following topic embedding: `[1., .5, .1]` which basically shows the probability of observing the three topics in the corpus A.

## Data

**Input**: `pubmed_abstracts.csv.gz`.

**Output**: `topic_embeddings` is a table in that each row denotes a document, columns are topics, and values are the probabilities of being assigned to topics.

In [6]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.datasets import make_classification

from bertopic import BERTopic

import torch
from torch import nn
import torch.nn.functional as F

In [7]:
# load the data

pubmed_path = Path('data/pubmed_abstracts_preprocessed.csv.gz')
model_name = 'pubmed100pct_bertopic'
version = 'v202110141'

indices = np.load(f'models/{model_name}_{version}.idx.npz')['arr_0']
model: BERTopic = BERTopic.load(f'models/{model_name}_{version}.model')
topics = np.load(f'models/{model_name}_{version}.topics.npz')['arr_0']
probs = np.load(f'models/{model_name}_{version}.probs.npz')['arr_0']

data = pd.read_csv(pubmed_path)
data = data[data.index.isin(indices)]

# DEBUG
with pd.option_context('display.max_rows', 10000):
  display(model.get_topic_info())

# model.visualize_topics()

# model.visualize_barchart()
# model.visualize_distribution(probabilities=probs[0])

# model.get_params()
# model.get_topics()

Unnamed: 0,Topic,Count,Name
0,-1,206609,-1_dopamine_autism_hyperactivity disorder_defi...
1,0,8162,0_parenting_parent_parental_self regulation
2,1,6815,1_sound_speech_auditory_phonological
3,2,4826,2_tumor_glioma_brain tumor_glioblastoma
4,3,3942,3_dyslexia_reading_phonological_dyslexic
5,4,3488,4_bci_eeg_retrieval_fmri
6,5,3426,5_eat_obesity_eating_obese
7,6,2844,6_concussion_brain injury_traumatic brain_trau...
8,7,2787,7_aβ_amyloid_microglia_alzheimer disease
9,8,2704,8_distractor_capture attention_visual search_a...


In [124]:
from tqdm import tqdm

n_topics = 300
batch_size = 200
n_samples = 10000
n_labels = 2

X, y = make_classification(n_samples=n_samples, n_features=n_topics, n_classes=n_labels, random_state=0)

X = torch.tensor(X).type(torch.float)
y = torch.tensor(y).type(torch.long)

class TopicEmbeddingNet(nn.Module):
  def __init__(self, n_topics: int, n_labels: int):
    super(TopicEmbeddingNet, self).__init__()

    self.encoder = nn.Sequential(
        nn.Linear(n_topics, n_topics), nn.BatchNorm1d(196), nn.LeakyReLU(0.1),
        nn.Linear(196, 128), nn.BatchNorm1d(128), nn.LeakyReLU(0.1),
        nn.Linear(128, n_topics)
    )
    self.hidden2mu = nn.Linear(n_topics, n_topics)
    self.hidden2log_var = nn.Linear(n_topics, n_topics)
    self.decoder = nn.Sequential(
      nn.Linear(n_topics, 128), nn.BatchNorm1d(128), nn.LeakyReLU(0.1),
      nn.Linear(128, 196), nn.BatchNorm1d(196), nn.LeakyReLU(0.1),
      nn.Linear(196, n_topics),
    )

    self.encoder = nn.Sequential(
      nn.Linear(n_topics, n_topics), nn.BatchNorm1d(196), nn.LeakyReLU(0.1)
    )
    self.embedding = nn.Embedding(n_labels, n_topics)

  def encode(self, x, label):
    # h = self.encoder(x)
    # mu = self.hidden2mu(h)
    # log_var = self.hidden2log_var(h)
    # sigma = torch.exp(0.5*log_var)
    # z = torch.randn_like(sigma)
    # h = mu + sigma * z
    # return mu, log_var, h

    x = self.encoder(x)
    h = self.embedding(label)

  def decode(self, x):
    x = self.decoder(x)
    return x


  def forward(self, x):
    mu, log_var, h = self.encode(x)
    y = self.decoder(h)
    return mu, log_var, y

model = TopicEmbeddingNet(n_topics, n_labels)

print('before train:', (model(X)[2].argmax(dim=1) == y).sum().item())

# criterion = nn.CrossEntropyLoss()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

for epoch in tqdm(range(3)):
  model.train()
  model.zero_grad()
  _, _, x_pred = model(X)
  loss = criterion(x_pred, X)
  loss.backward()
  optimizer.step()

  # TODO eval

print('after train:', (model(X)[2].argmax(dim=1) == y).sum().item())

before train: 47


100%|██████████| 3/3 [00:01<00:00,  1.90it/s]

after train: 34





In [116]:
print(dict(model.named_parameters())['hidden2mu.weight'].shape)

torch.Size([300, 300])


In [None]:

import skorch
from skorch.callbacks import TensorBoard
from torch.utils.tensorboard import SummaryWriter

cols = ['category', 'subcategory']
X = np.vstack([data[col].astype('category').cat.codes for col in cols]).T
y = probs

X = torch.tensor(X, dtype=torch.int)
y = torch.tensor(y, dtype=torch.float)


net = skorch.NeuralNetRegressor(
  TopicEmbeddingNet(1,1,1),
  max_epochs=100,
  lr=0.1,
  iterator_train__shuffle=True,
  # DEBUG: callbacks=[TensorBoard(writer=SummaryWriter())]
)

# DEBUG: net.fit(X, y)

from sklearn.model_selection import GridSearchCV

params = {
  'lr': [0.01],
  'max_epochs': [100],
  'module__n_cats': [data['category'].nunique()],
  'module__n_subcats': [data['subcategory'].nunique()],
  'module__embeddings_dim': range(1, y.shape[1]),
}

gs = GridSearchCV(net, params, scoring='accuracy')

gs.fit(X, y)
print(gs.best_score_, gs.best_params_, gs.best_estimator_)

In [81]:
model = gs.best_params_

gs.estimator

# with torch.no_grad():
#   params = list(model.get_params()['module'].parameters())
#   cat_embeddings = params[0]
#   subcat_embeddings = params[1]

# model.save_params(f_params='model.pkl')

<class 'skorch.regressor.NeuralNetRegressor'>[uninitialized](
  module=TopicEmbeddingNet(
    (cat_embedding): Embedding(1, 1)
    (subcat_embedding): Embedding(1, 1)
    (fc): Linear(in_features=1, out_features=65, bias=True)
  ),
)

In [None]:
# test/train RSA
from scipy.stats import spearmanr

sim_train = cosine_similarity(model.topic_embeddings)
sim_test = cosine_similarity(result.H_test)
rho = spearmanr(sim_train, sim_test)
print(f'[RSA] mean test/train correlation: {rho[0].mean():.2f}')
