In [2]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, explained_variance_score, r2_score

from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

from skorch import NeuralNetRegressor
from skorch.callbacks import TensorBoard

from python.cogtext.utils import select_relevant_journals
from python.cogtext import co_occurrence_matrix

sns.set()

In [3]:
# parameters
# NOTE(review): DEV_MODE is not read by any of the visible cells — confirm it is still needed.
DEV_MODE = True
# Path (relative to the working directory) of the CSV passed to pd.read_csv below.
INPUT_FILE = 'data/pubmed/abstracts.csv.gz'

In [4]:
# prepare data

# Load the abstracts, keep only the relevant journals, and drop rows
# that have no abstract text.
PUBMED = select_relevant_journals(pd.read_csv(INPUT_FILE)).dropna(subset=['abstract'])

# only corpora with # of articles < DEV_MAX_CORPUS_SIZE
# labels_cnt = PUBMED['label'].value_counts()
# small_sets = labels_cnt[labels_cnt < DEV_MAX_CORPUS_SIZE].index.to_list()
# PUBMED = PUBMED.query('label in @small_sets',).copy()

# Keep only tasks/constructs with more than 5 articles, i.e. at least 6
# (1/test + 1/valid + 4/train = 6).
label_counts = PUBMED['label'].value_counts()
valid_labels = label_counts[label_counts > 5].index.to_list()
PUBMED = PUBMED[PUBMED['label'].isin(valid_labels)]

# Stratified 80/20 train/test split on the label column.
PUBMED_train, PUBMED_test = train_test_split(
    PUBMED, test_size=0.2, stratify=PUBMED['label'], random_state=0)

# Number of distinct labels in each category.
labels_per_category = PUBMED.groupby('category')['label'].nunique()
n_constructs = labels_per_category['CognitiveConstruct']
n_tasks = labels_per_category['CognitiveTask']

In [5]:
class MFNet(nn.Module):
  """Biased matrix-factorization model over (construct, task) index pairs.

  Every task and every construct owns an ``n_embeddings``-dimensional
  embedding and a scalar bias. The prediction for a pair is the dot product
  of the two embeddings plus both biases.

  Args:
    n_tasks: number of rows in the task embedding and bias tables.
    n_constructs: number of rows in the construct embedding and bias tables.
    n_embeddings: dimensionality of the task/construct embeddings.
  """

  def __init__(self, n_tasks, n_constructs, n_embeddings):
    super().__init__()
    self.task_embeddings = nn.Embedding(n_tasks, n_embeddings)
    self.construct_embeddings = nn.Embedding(n_constructs, n_embeddings)
    self.task_biases = nn.Embedding(n_tasks, 1)
    self.construct_biases = nn.Embedding(n_constructs, 1)
    # Not used by forward(); kept so parameter initialisation order and
    # state_dict keys stay the same as before.
    self.decoder = nn.Linear(n_embeddings, 1)

  def forward(self, x):
    """Predict one score per row of ``x``.

    Args:
      x: 2-D integer tensor; column 0 holds construct indices and column 1
        holds task indices.

    Returns:
      Tensor of shape ``(x.shape[0], 1)``.
    """
    construct, task = x[:, 0], x[:, 1]
    M = self.task_embeddings(task)
    C = self.construct_embeddings(construct)
    bias = self.task_biases(task) + self.construct_biases(construct)
    # Row-wise dot product. Same values as diag(M @ C.T), but without
    # materialising the full (batch x batch) product.
    return (M * C).sum(dim=1, keepdim=True) + bias

  def fit(self,
      X, y,
      train_split_size=.8,
      n_epochs=1000,
      batch_size=100,
      logger: "SummaryWriter" = None):
    """Train with Adam on an MSE loss, logging losses to TensorBoard.

    The samples are randomly split into a training part and a held-out part;
    the held-out loss is logged after every epoch.

    Args:
      X: 2-D integer tensor of (construct, task) index pairs, see ``forward``.
      y: target tensor with one row per row of ``X``.
      train_split_size: fraction of samples used for training, in (0, 1).
      n_epochs: number of passes over the training part.
      batch_size: mini-batch size for the training loader.
      logger: TensorBoard writer. A new ``SummaryWriter()`` is created per
        call when omitted (a default instance in the signature would be
        built once at class-definition time and shared by every call).

    Returns:
      ``self``, to allow chaining.
    """
    assert 0. < train_split_size < 1.0

    if logger is None:
      logger = SummaryWriter()

    n_samples = X.shape[0]
    train_size = int(n_samples * train_split_size)
    test_size = n_samples - train_size

    dataset = TensorDataset(X, y)
    train_subset, test_subset = random_split(dataset, lengths=(train_size, test_size))
    X_test, y_test = dataset[test_subset.indices]

    # The loader does not change between epochs, so build it once.
    train_loader = DataLoader(train_subset, batch_size=batch_size)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(self.parameters(), lr=.001)

    # forward() takes a single (n, 2) tensor, so trace with X itself rather
    # than with its two columns as separate inputs.
    logger.add_graph(self, X)

    for epoch in tqdm(range(n_epochs)):

      # train model
      self.train()
      running_loss = 0.
      for X_batch, y_batch in train_loader:
        optimizer.zero_grad()

        y_pred = self(X_batch)
        loss = criterion(y_pred, y_batch)  # (input, target) order
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch mean is exact even when the
        # last batch is smaller.
        running_loss += loss.item() * X_batch.shape[0]

      # One point per epoch; guard against an empty training split.
      if train_size > 0:
        logger.add_scalar('loss/train', running_loss / train_size, epoch)

      # eval mode
      self.eval()
      with torch.no_grad():
        loss = criterion(self(X_test), y_test)
        logger.add_scalar('loss/test', loss.item(), epoch)

    return self

In [7]:
# TODO
cols = ['construct','task']

# Co-occurrence table; the `construct`, `task` and `jaccard_coefficient`
# columns are the ones used below.
COOC = co_occurrence_matrix(PUBMED, jaccard_coefficient=True, groupby_category=True)

# Integer-encode each column in `cols` and place the codes side by side,
# one row per COOC row, columns ordered as in `cols`.
pair_codes = np.stack([COOC[c].astype('category').cat.codes for c in cols], axis=1)
jaccard = COOC[['jaccard_coefficient']].values

# TODO drop 0 probabilities from X
# TODO n_embeddings should be a hyper parameter (use Ax to optimize)

# model = MFNet(n_tasks, n_constructs, 7)
# model.fit(torch.tensor(X, dtype=torch.int), torch.tensor(y, dtype=torch.float))

X = torch.tensor(pair_codes, dtype=torch.int32)
y = torch.tensor(jaccard, dtype=torch.float32)

mf_module = MFNet(n_tasks, n_constructs, 7)
tensorboard_cb = TensorBoard(writer=SummaryWriter())

net = NeuralNetRegressor(
    mf_module,
    max_epochs=1000,
    lr=0.1,
    iterator_train__shuffle=True,  # reshuffle the training data every epoch
    callbacks=[tensorboard_cb],
)

# fit() returns the fitted net, so the prediction call can be chained.
y_proba = net.fit(X, y).predict_proba(X)

# %reload_ext tensorboard
# %tensorboard --logdir=runs/

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        8.0981        8.5962  0.1922
      2        5.4807        7.6374  0.1165
      3        3.9541        6.9579  0.1216
      4        2.9701        6.4529  0.1171
      5        2.2922        6.0639  0.1309
      6        1.8065        5.7578  0.1274
      7        1.4453        5.5125  0.1133
      8        1.1715        5.3124  0.1133
      9        0.9595        5.1481  0.1156
     10        0.7932        5.0115  0.1132
     11        0.6614        4.8968  0.1718
     12        0.5556        4.7998  0.1648
     13        0.4699        4.7177  0.1634
     14        0.4000        4.6474  0.1308
     15        0.3426        4

In [8]:
# Mean signed residual (target minus prediction) over all rows of X.
y_pred = net.predict(X)
torch.mean(y - y_pred)

tensor(-0.1276)