In [None]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, explained_variance_score, r2_score

import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

from python.cogtext.utils import select_relevant_pubmed_articles
from python.cogtext.coappearance_matrix import generate_coappearance_matrix_fast

sns.set()

In [228]:
# parameters
DEV_MODE = True
INPUT_FILE = 'data/pubmed_abstracts.csv.gz'

In [None]:
# prepare data

PUBMED = (pd.read_csv(INPUT_FILE)
            .pipe(select_relevant_pubmed_articles)
            .dropna(subset=['abstract']))

# only corpora with # of articles < DEV_MAX_CORPUS_SIZE
# subcats_cnt = PUBMED['subcategory'].value_counts()
# small_subcats = subcats_cnt[subcats_cnt < DEV_MAX_CORPUS_SIZE].index.to_list()
# PUBMED = PUBMED.query('subcategory in @small_subcats',).copy()

# DROP tasks/constructs with less than 5 articles (1/test + 1/valid + 4/train = 6)
valid_subcats = PUBMED['subcategory'].value_counts()[lambda cnt: cnt > 5].index.to_list()
PUBMED = PUBMED.query('subcategory in @valid_subcats')

# train/test split (80% train 20% test)
PUBMED_train, PUBMED_test = train_test_split(
    PUBMED,
    test_size=0.2,
    stratify=PUBMED['subcategory'],
    random_state=0)

n_constructs = PUBMED.groupby('category')['subcategory'].nunique()['CognitiveConstruct']
n_tasks = PUBMED.groupby('category')['subcategory'].nunique()['CognitiveTask']

In [261]:
class MFNet(nn.Module):
  def __init__(self, n_tasks, n_constructs, n_embeddings):
    super(MFNet, self).__init__()
    self.task_embeddings = nn.Embedding(n_tasks, n_embeddings)
    self.construct_embeddings = nn.Embedding(n_constructs, n_embeddings)
    self.task_biases = torch.nn.Embedding(n_tasks, 1)
    self.construct_biases = torch.nn.Embedding(n_constructs, 1)
    self.decoder = nn.Linear(n_embeddings, 1)

  def forward(self, construct, task):
    B = self.task_biases(task) + self.construct_biases(construct)
    M = self.task_embeddings(task)
    C = self.construct_embeddings(construct)
    y = B + M @ C.T
    # y = self.decoder(H)
    return y

  def fit(self,
      X, y,
      train_split_size=.8,
      n_epochs=1000,
      batch_size=100,
      logger: SummaryWriter=SummaryWriter()):

    assert 0. < train_split_size < 1.0

    n_samples = X.shape[0]

    train_size = int(n_samples * train_split_size)
    test_size = n_samples - train_size

    dataset = TensorDataset(X, y)
    train_subset, test_subset = random_split(dataset, lengths=(train_size,test_size))

    X_test, y_test = dataset[test_subset.indices]

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(self.parameters(), lr=.001)

    logger.add_graph(model, X)

    for epoch in range(n_epochs):

      # train model
      model.train()
      for X_batch, y_batch in DataLoader(train_subset, batch_size=batch_size):
        model.zero_grad()

        # X_train = torch.tensor(X).type(torch.int)
        # y_train = torch.tensor(y).type(torch.float)
        # y_pred = self(X_train[:, 0], X_train[:, 1])

        y_pred = model(X_batch)
        loss = criterion(y_batch, y_pred)
        logger.add_scalar('loss/train', loss.detach(), epoch)
        loss.backward()
        optimizer.step()

      # eval mode
      model.eval()
      with torch.no_grad():
        y_pred = model(X_test)
        loss = criterion(y_test, y_pred)
        logger.add_scalar('loss/test', loss.detach(), epoch)
        
        ev = explained_variance_score(y_test, y_pred)
        logger.add_scalar('explained_variance/test', ev, epoch)

    return self

In [262]:
# TODO
cols = ['construct','task']

# create CO_APPEARANCE
CO_APPEARANCE = PUBMED.pipe(generate_coappearance_matrix_fast, probability=True, group_categories=True)
X = np.vstack([CO_APPEARANCE[c].astype('category').cat.codes for c in cols]).T
y = CO_APPEARANCE[['probability']].values

# TODO drop 0 probabilities from X
# TODO n_embeddings should be a hyper parameter (use Ax to optimize)

model = MFNet(n_tasks, n_constructs, 7)
model.fit(X, y)

  return F.mse_loss(input, target, reduction=self.reduction)


epoch=0, loss=111.417
epoch=1, loss=111.341
epoch=2, loss=111.265
epoch=3, loss=111.190
epoch=4, loss=111.114
epoch=5, loss=111.039
epoch=6, loss=110.964
epoch=7, loss=110.888
epoch=8, loss=110.813
epoch=9, loss=110.738
epoch=10, loss=110.663
epoch=11, loss=110.589
epoch=12, loss=110.514
epoch=13, loss=110.439
epoch=14, loss=110.365
epoch=15, loss=110.291
epoch=16, loss=110.216
epoch=17, loss=110.142
epoch=18, loss=110.068
epoch=19, loss=109.994
epoch=20, loss=109.920
epoch=21, loss=109.847
epoch=22, loss=109.773
epoch=23, loss=109.699
epoch=24, loss=109.626
epoch=25, loss=109.552
epoch=26, loss=109.479
epoch=27, loss=109.405
epoch=28, loss=109.332
epoch=29, loss=109.259
epoch=30, loss=109.186
epoch=31, loss=109.113
epoch=32, loss=109.040
epoch=33, loss=108.967
epoch=34, loss=108.894
epoch=35, loss=108.821
epoch=36, loss=108.748
epoch=37, loss=108.675
epoch=38, loss=108.602
epoch=39, loss=108.530
epoch=40, loss=108.457
epoch=41, loss=108.384
epoch=42, loss=108.311
epoch=43, loss=108.23

KeyboardInterrupt: 