In [1]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, explained_variance_score, r2_score

from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

from python.cogtext.utils import select_relevant_journals
from python.cogtext.coappearance_matrix import generate_coappearance_matrix_fast

sns.set()

In [2]:
# parameters
# DEV_MODE: development-mode switch. It is not referenced by any cell visible
# in this part of the notebook — NOTE(review): confirm where it is consumed.
DEV_MODE = True
# INPUT_FILE: relative path of the CSV (.csv.gz) that the data-preparation cell
# loads with `pd.read_csv`.
INPUT_FILE = 'data/pubmed_abstracts.csv.gz'

In [3]:
# prepare data

# load the raw table, pass it through `select_relevant_journals`, and discard
# rows that have no abstract
PUBMED = pd.read_csv(INPUT_FILE).pipe(select_relevant_journals)
PUBMED = PUBMED.dropna(subset=['abstract'])

# only corpora with # of articles < DEV_MAX_CORPUS_SIZE
# subcats_cnt = PUBMED['subcategory'].value_counts()
# small_subcats = subcats_cnt[subcats_cnt < DEV_MAX_CORPUS_SIZE].index.to_list()
# PUBMED = PUBMED.query('subcategory in @small_subcats',).copy()

# KEEP only tasks/constructs with at least 6 articles (count > 5), so that a
# stratified split can still place 1 in test, 1 in validation, and 4 in train
subcat_counts = PUBMED['subcategory'].value_counts()
valid_subcats = subcat_counts[subcat_counts > 5].index.to_list()
PUBMED = PUBMED.query('subcategory in @valid_subcats')

# train/test split (80% train 20% test), stratified by subcategory
PUBMED_train, PUBMED_test = train_test_split(
    PUBMED, test_size=0.2, stratify=PUBMED['subcategory'], random_state=0)

# number of distinct subcategories within each top-level category
subcats_per_category = PUBMED.groupby('category')['subcategory'].nunique()
n_constructs = subcats_per_category['CognitiveConstruct']
n_tasks = subcats_per_category['CognitiveTask']

In [27]:
class MFNet(nn.Module):
  """Matrix-factorization model that scores (construct, task) index pairs.

  Each task and each construct gets an embedding vector and a scalar bias.
  The score of a pair is the dot product of the two embeddings plus both
  biases.
  """

  def __init__(self, n_tasks, n_constructs, n_embeddings):
    """Create the embedding and bias tables.

    Args:
      n_tasks: number of distinct task indices (rows of the task tables).
      n_constructs: number of distinct construct indices.
      n_embeddings: dimensionality of each embedding vector.
    """
    super().__init__()
    self.task_embeddings = nn.Embedding(n_tasks, n_embeddings)
    self.construct_embeddings = nn.Embedding(n_constructs, n_embeddings)
    self.task_biases = nn.Embedding(n_tasks, 1)
    self.construct_biases = nn.Embedding(n_constructs, 1)
    # NOTE: not used by forward(); kept so the module's parameters/repr stay
    # compatible with previously created instances.
    self.decoder = nn.Linear(n_embeddings, 1)

  def forward(self, construct, task):
    """Score each (construct, task) pair.

    Args:
      construct: integer tensor of construct indices.
      task: integer tensor of task indices, same shape as `construct`.

    Returns:
      Tensor holding one score per pair, with a trailing dimension of size 1.
    """
    M = self.task_embeddings(task)
    C = self.construct_embeddings(construct)
    bias = self.task_biases(task) + self.construct_biases(construct)
    # Row-wise dot product. Equivalent to diag(M @ C.T) but without
    # materialising the full batch x batch matrix.
    y = (M * C).sum(dim=-1, keepdim=True) + bias
    return y

  def fit(self,
      X, y,
      train_split_size=.8,
      n_epochs=1000,
      batch_size=100,
      logger: "SummaryWriter | None" = None):
    """Train the model with Adam on an MSE objective.

    The samples are randomly split into a training part and a held-out part;
    the held-out loss is evaluated after every epoch.

    Args:
      X: integer tensor with two columns; column 0 holds construct indices and
        column 1 holds task indices.
      y: float tensor of targets, one row per row of `X`.
      train_split_size: fraction of samples used for training; must lie
        strictly between 0 and 1.
      n_epochs: number of passes over the training part.
      batch_size: mini-batch size.
      logger: object with the `SummaryWriter` interface (`add_graph`,
        `add_scalar`). When omitted, a fresh `SummaryWriter()` is created for
        this call and closed when training ends.

    Returns:
      self, to allow call chaining.

    Raises:
      AssertionError: if `train_split_size` is not in the open interval (0, 1).
    """
    assert 0. < train_split_size < 1.0, \
      'train_split_size must be strictly between 0 and 1'

    # Create the writer lazily: a `SummaryWriter()` default argument would be
    # built once at class-definition time and shared by every call.
    owns_logger = logger is None
    if owns_logger:
      logger = SummaryWriter()

    n_samples = X.shape[0]
    train_size = int(n_samples * train_split_size)
    test_size = n_samples - train_size

    dataset = TensorDataset(X, y)
    train_subset, test_subset = random_split(dataset, lengths=(train_size, test_size))
    # the loader is stateless between epochs, so build it once
    train_loader = DataLoader(train_subset, batch_size=batch_size)
    X_test, y_test = dataset[test_subset.indices]

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(self.parameters(), lr=.001)

    try:
      logger.add_graph(self, [X[:, 0], X[:, 1]])

      for epoch in tqdm(range(n_epochs)):

        # train model
        self.train()
        loss_sum, n_seen = 0., 0
        for X_batch, y_batch in train_loader:
          optimizer.zero_grad()

          construct, task = X_batch[:, 0], X_batch[:, 1]
          y_pred = self(construct, task)
          loss = criterion(y_pred, y_batch)
          loss.backward()
          optimizer.step()

          # accumulate a sample-weighted sum so that the epoch mean is exact
          # even when the last batch is smaller
          batch_len = y_batch.shape[0]
          loss_sum += loss.item() * batch_len
          n_seen += batch_len

        # one point per epoch (logging every batch at step=epoch would stack
        # several values on the same global step)
        if n_seen:
          logger.add_scalar('loss/train', loss_sum / n_seen, epoch)

        # eval mode
        self.eval()
        with torch.no_grad():
          construct, task = X_test[:, 0], X_test[:, 1]
          y_pred = self(construct, task)
          loss = criterion(y_pred, y_test)
          logger.add_scalar('loss/test', loss.item(), epoch)
    finally:
      # only close writers created here; a caller-supplied logger stays open
      if owns_logger:
        logger.close()

    return self

In [28]:
# TODO
cols = ['construct','task']

# create CO_APPEARANCE
CO_APPEARANCE = PUBMED.pipe(generate_coappearance_matrix_fast, probability=True, group_categories=True)
# integer-encode the labels: column 0 = construct code, column 1 = task code
X = np.vstack([CO_APPEARANCE[c].astype('category').cat.codes for c in cols]).T
y = CO_APPEARANCE[['probability']].values

# The embedding tables are sized from PUBMED (n_constructs / n_tasks) while the
# codes come from CO_APPEARANCE; fail early with a readable message if they
# disagree instead of hitting an index error inside the embedding lookup.
if len(X):
  assert X.min() >= 0, 'CO_APPEARANCE contains missing construct/task labels'
  assert X[:, 0].max() < n_constructs, 'construct codes exceed n_constructs'
  assert X[:, 1].max() < n_tasks, 'task codes exceed n_tasks'

# TODO drop 0 probabilities from X
# TODO n_embeddings should be a hyper parameter (use Ax to optimize)

# seed torch so that embedding initialisation and the random train/test split
# inside `fit` are reproducible across kernel restarts
torch.manual_seed(0)

model = MFNet(n_tasks, n_constructs, 7)
model.fit(torch.tensor(X, dtype=torch.int), torch.tensor(y, dtype=torch.float))

# %reload_ext tensorboard
# %tensorboard --logdir=runs/

100%|██████████| 1000/1000 [01:40<00:00,  9.94it/s]


MFNet(
  (task_embeddings): Embedding(83, 7)
  (construct_embeddings): Embedding(71, 7)
  (task_biases): Embedding(83, 1)
  (construct_biases): Embedding(71, 1)
  (decoder): Linear(in_features=7, out_features=1, bias=True)
)