In this chapter we used already fine-tuned BERT models to extract the chunk embeddings of our books. The chunk embeddings correspond to either the meaned embeddings of all the words in the sequence or the embedding of the [CLS] token. We will explore the results of both. We will then run a variety of classifiers over these embeddings directly. 

1.   Meaned pooled output --> single layer NN
2.   SVM
3.   RoBERT
4.   ToBERT 

# Installs, Imports, Configuration

In [1]:
!pip install datasets
!pip install "ray[default]"==1.5.2
!pip install wandb
!pip install tensorboardX
!pip install pytorch_lightning
!pip install transformers

!pip install httplib2==0.15.0
!pip install google-api-python-client==1.6

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 41.4 MB/s eta 0:00:01[K     |██▌                             | 20 kB 25.3 MB/s eta 0:00:01[K     |███▊                            | 30 kB 18.7 MB/s eta 0:00:01[K     |█████                           | 40 kB 16.2 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 7.2 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 8.5 MB/s eta 0:00:01[K     |████████▋                       | 71 kB 8.0 MB/s eta 0:00:01[K     |██████████                      | 81 kB 9.0 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 9.6 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 7.3 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 7.3 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 7.3 MB/s eta 0:00:01[K     |████████████████                | 133 kB 7.3 MB/s eta 0:00:01

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cd drive

In [3]:
import pickle
import numpy as np
import torch
import pytorch_lightning as pl
from pathlib import Path
import sys
import os
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from scipy.special import softmax
import math

# from ray import tune
# from ray.tune.logger import DEFAULT_LOGGERS
# from ray.tune.integration.wandb import WandbLoggerCallback
# from ray.tune.schedulers import ASHAScheduler
# from functools import partial

In [4]:
import configparser

config = configparser.ConfigParser()
config.read('/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini')

drive_base_path = Path(config['Drive']['drive_base_path'])

sys.path.append(str(drive_base_path / 'BookSuccessPredictor' / '_utils'))
sys.path.append(str(drive_base_path / 'BookSuccessPredictor' / 'datasets' / 'goodreads_maharjan_super' / 'MultiModal' / 'dataset_loader'))

In [5]:
# saves our models to artifacts in WandB
import wandb
%env WANDB_LOG_MODEL=true
%env WANDB_PROJECT=goodreads_success_predictor

env: WANDB_LOG_MODEL=true
env: WANDB_PROJECT=goodreads_success_predictor


In [6]:
wandb.login(key = config['WandB']['api_key'])

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Get Transformer Model from Stage 1

In [7]:
from transformers.modeling_outputs import SequenceClassifierOutput
from torch import nn
import torch
from torch.nn import CrossEntropyLoss, MSELoss

from transformers import BertPreTrainedModel, BertModel

from transformers import DistilBertPreTrainedModel, DistilBertModel

class DistilBERTForMultipleSequenceClassification(DistilBertPreTrainedModel):
    def __init__(self, config, num_labels1 = 2, num_labels2 = 8):
        super().__init__(config)
        self.num_labels1 = num_labels1
        self.num_labels2 = num_labels2
        print(self.num_labels1, self.num_labels2)
        self.alpha = config.alpha
        self.config = config

        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier1 = nn.Linear(config.dim, self.num_labels1)
        self.classifier2 = nn.Linear(config.dim, self.num_labels2)
        self.dropout = nn.Dropout(config.dropout)

        self.init_weights()


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        distilbert_output = self.distilbert(
              input_ids=input_ids,
              attention_mask=attention_mask,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
          )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits1 = self.classifier1(pooled_output)
        logits2 = self.classifier2(pooled_output)
        logits = torch.cat([logits1, logits2], 1)

        loss = None
        if labels is not None:
            #if self.config.problem_type is None:
            #self.config.problem_type = "single_label_classification"
            
            if self.num_labels1 > 1:
                loss_fct1 = CrossEntropyLoss()
                loss1 = loss_fct1(logits1.view(-1, self.num_labels1), labels[:, 0].view(-1))
            else:
                loss_fct1 = MSELoss()
                loss1 = loss_fct1(logits1.view(-1), labels[:, 0].view(-1))

            if self.num_labels2 > 1:
                loss_fct2 = CrossEntropyLoss()
                loss2 = loss_fct2(logits2.view(-1, self.num_labels2), labels[:, 1].view(-1))
            else:
                loss_fct2 = MSELoss()
                loss2 = loss_fct2(logits2.view(-1), labels[:, 1].view(-1))
            loss = self.alpha*loss1 + (1-self.alpha)*loss2 

        if not return_dict:
            output = (logits,) + outputs[2:] #not sure if this works
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states, #hidden_states,
            attentions=distilbert_output.attentions, #attentions,
        )

In [8]:
import wandb
run = wandb.init()

if config['Model']['name'] == 'distilbert-base-uncased':
  if eval(config['Tokenizer']['overlap']):
    # artifact = run.use_artifact('lucaguarro/goodreads_success_predictor/model-nlpbosie:v0', type='model')
    artifact = run.use_artifact('lucaguarro/DistilbertMultitaskHPSearch/model-3vvi0uoq:v0', type='model')
    print("using model trained on overlap dataset")
  else:
    print("using model trained on sentence tokenized dataset")
    artifact = run.use_artifact('lucaguarro/goodreads_success_predictor/model-2giwtwvy:v0', type='model')
    
artifact_dir = artifact.download()

transformer_model = DistilBERTForMultipleSequenceClassification.from_pretrained(artifact_dir, num_labels1 = 2, num_labels2 = 8)
transformer_model.cuda()

[34m[1mwandb[0m: Currently logged in as: [33mlucaguarro[0m (use `wandb login --relogin` to force relogin)


using model trained on overlap dataset


[34m[1mwandb[0m: Downloading large artifact model-3vvi0uoq:v0, 255.48MB. 3 files... Done. 0:0:0


2 8


DistilBERTForMultipleSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30523, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.38767857660247906, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.24363502971184062, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout

# Getting the Data

## Getting the Pooled Outputs

### Creating the Dataset

#### From script

first we have to get the tokenized dataset

In [None]:
load_path = Path("/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/already_tokenized/80_20/DistilBERT_UNCASED_NER_512_w50overlap")

In [None]:
from datasets import DatasetDict, Dataset, concatenate_datasets
train_paths = [f for f in os.listdir(load_path) if f.startswith('train')]
val_paths = [f for f in os.listdir(load_path) if f.startswith('val')]
test_paths = [f for f in os.listdir(load_path) if f.startswith('test')]

train_datasets = []
val_datasets = []
test_datasets = []

for trainp in train_paths:
  with open(load_path / trainp, "rb") as input_file:
    train_datasets.append(Dataset.from_dict(pickle.load(input_file)))

for valp in val_paths:
  with open(load_path / valp, "rb") as input_file:
    val_datasets.append(Dataset.from_dict(pickle.load(input_file)))

for testp in test_paths:
  with open(load_path / testp, "rb") as input_file:
    test_datasets.append(Dataset.from_dict(pickle.load(input_file)))

train_dataset = concatenate_datasets(train_datasets)
del train_datasets

val_dataset = concatenate_datasets(val_datasets)
del val_datasets

test_dataset = concatenate_datasets(test_datasets)
del test_datasets

chunked_encoded_dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset, 'test': test_dataset})

In [None]:
chunked_encoded_dataset

In [None]:
import torch as th
import time

def get_book_changes_idx(book_titles):
  book_changes_idx = np.where(np.array(book_titles[:-1]) != np.array(book_titles[1:]))[0]
  book_changes_idx += 1
  book_changes_idx = np.insert(book_changes_idx, 0, 0)
  return book_changes_idx

def getPooledOutputs(model, encoded_dataset, batch_size = 32):
  model.eval()

  # pooled_outputs = []
  pooled_outputs = torch.empty([0,768]).cuda()

  num_iters = (len(encoded_dataset['input_ids']) - 1)//batch_size + 1
  print("total number of iters ", num_iters)
  
  for i in range(num_iters):
    print(i)
    up_to = i*batch_size + batch_size
    if len(encoded_dataset['input_ids']) < up_to:
      up_to = len(encoded_dataset['input_ids'])
    input_ids = th.LongTensor(encoded_dataset['input_ids'][i*batch_size:up_to]).cuda()
    attention_mask = th.LongTensor(encoded_dataset['attention_mask'][i*batch_size:up_to]).cuda()

    with torch.no_grad():
      embeddings = model.forward(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)['hidden_states'][-1][:,0] # Pooled output
      pooled_outputs = th.cat([pooled_outputs, embeddings],0)
      th.cuda.empty_cache()

  return pooled_outputs

In [None]:
train_set_embeddings = getPooledOutputs(transformer_model, chunked_encoded_dataset['train'])

In [None]:
val_set_embeddings = getPooledOutputs(transformer_model, chunked_encoded_dataset['validation'])

In [None]:
test_set_embeddings = getPooledOutputs(transformer_model, chunked_encoded_dataset['test'])

In [None]:
from datasets import Dataset
train_set_embeddings = Dataset.from_dict({'pooled_outputs': train_set_embeddings})
val_set_embeddings = Dataset.from_dict({'pooled_outputs': val_set_embeddings})
test_set_embeddings = Dataset.from_dict({'pooled_outputs': test_set_embeddings})

In [None]:
from datasets import concatenate_datasets
dataset_w_embeddings = DatasetDict({
    'train': concatenate_datasets([chunked_encoded_dataset['train'], train_set_embeddings], axis = 1), 
    'validation': concatenate_datasets([chunked_encoded_dataset['validation'], val_set_embeddings], axis = 1), 
    'test': concatenate_datasets([chunked_encoded_dataset['test'], test_set_embeddings], axis = 1)
})
dataset_w_embeddings = dataset_w_embeddings.remove_columns(['attention_mask', 'input_ids', 'token_type_ids'])

In [None]:
dataset_w_embeddings

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

with open('train_ds.pkl', 'wb') as output_file:
  pickle.dump(dataset_w_embeddings['train'], output_file)

with open('val_ds.pkl', 'wb') as output_file:
  pickle.dump(dataset_w_embeddings['validation'], output_file)

with open('test_ds.pkl', 'wb') as output_file:
  pickle.dump(dataset_w_embeddings['test'], output_file)

folder_id = '1TVBJzrWhS-yLq0xic2eXnw0mA4lY-FU8'
# get the folder id where you want to save your file
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('train_ds.pkl')
file.Upload() 

# get the folder id where you want to save your file
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('val_ds.pkl')
file.Upload() 

# get the folder id where you want to save your file
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('test_ds.pkl')
file.Upload() 

### Loading the Dataset from Drive

In [None]:
# base_path = Path("/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/Pooled_Output/60_40/DistilBERT_multitask_sentence_tokenized_dataset_embeddings")
base_path = Path("/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/Pooled_Output/80_20/3vvi0uoq:v0")

In [None]:
from datasets import DatasetDict
with open(base_path / 'train_ds.pkl', "rb") as input_file:
  train_set_embeddings = pickle.load(input_file)

with open(base_path / 'val_ds.pkl', "rb") as input_file:
  val_set_embeddings = pickle.load(input_file)

with open(base_path / 'test_ds.pkl', "rb") as input_file:
  test_set_embeddings = pickle.load(input_file)

dataset_w_embeddings = DatasetDict({'train': train_set_embeddings, 'validation': val_set_embeddings, 'test': test_set_embeddings})
dataset_w_embeddings

DatasetDict({
    train: Dataset({
        features: ['book_title', 'genre', 'success_label', 'pooled_outputs'],
        num_rows: 21539
    })
    validation: Dataset({
        features: ['book_title', 'genre', 'success_label', 'pooled_outputs'],
        num_rows: 5236
    })
    test: Dataset({
        features: ['book_title', 'genre', 'success_label', 'pooled_outputs'],
        num_rows: 10816
    })
})

In [None]:
train_set_embeddings['book_title'][0:50]

## Average Pooled Outputs for Shallow Neural Network and SVM

### Generating the Data from Pooled Outputs

#### From Script

In [None]:
def getAveragePooledOutputs(dataset_w_embeddings):
  book_embeddings_dataset = {'meaned_pooled_output': [], 'book_title': [], 'genre': [], 'success_label': []}
  book_changes = get_book_changes_idx(dataset_w_embeddings['book_title'])
  # print(len(book_changes))
  for i in range(len(book_changes)):
      start = book_changes[i]
      end = None
      if i != len(book_changes) - 1:
        end = book_changes[i+1]
      else:
        end = len(dataset_w_embeddings['pooled_outputs'])

      segment_embeddings = dataset_w_embeddings['pooled_outputs'][start:end]
      book_embeddings = torch.mean(segment_embeddings, dim=0)

      book_embeddings_dataset['meaned_pooled_output'].append(book_embeddings)
      book_embeddings_dataset['book_title'].append(dataset_w_embeddings['book_title'][start])
      book_embeddings_dataset['genre'].append(dataset_w_embeddings['genre'][start])
      book_embeddings_dataset['success_label'].append(dataset_w_embeddings['success_label'][start])
    
  return book_embeddings_dataset

In [None]:
type(dataset_w_embeddings['train']['pooled_outputs'])

torch.Tensor

In [None]:
dataset_w_embeddings.set_format(type='pt', columns=['pooled_outputs', 'success_label'])

In [None]:
avg_pld_outs_train = getAveragePooledOutputs(dataset_w_embeddings['train'])
avg_pld_outs_val = getAveragePooledOutputs(dataset_w_embeddings['validation'])
avg_pld_outs_test = getAveragePooledOutputs(dataset_w_embeddings['test'])

In [None]:
len(avg_pld_outs_hf_ds['train']['meaned_pooled_output'])

555

In [None]:
# full_ds = DatasetDict({'train': Dataset.from_dict(full_ds['train']), 'validation': Dataset.from_dict(full_ds['validation']), 'test': Dataset.from_dict(full_ds['test'])})
from datasets import Dataset
avg_pld_outs_hf_ds = DatasetDict({'train': Dataset.from_dict(avg_pld_outs_train), 'validation': Dataset.from_dict(avg_pld_outs_val), 'test': Dataset.from_dict(avg_pld_outs_test)})

In [None]:
from datasets import DatasetDict, Dataset
# avg_pld_outs_hf_ds = DatasetDict({'train': Dataset.from_dict(avg_pld_outs_train), 'validation': Dataset.from_dict(avg_pld_outs_val), 'test': Dataset.from_dict(avg_pld_outs_test)})

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)  

with open('avg_pld_outs_hf_ds.pkl', 'wb') as output_file:
  pickle.dump(avg_pld_outs_hf_ds, output_file)

folder_id = '1TVBJzrWhS-yLq0xic2eXnw0mA4lY-FU8'
# get the folder id where you want to save your file
file = drive.CreateFile({'parents':[{u'id': folder_id}]})
file.SetContentFile('avg_pld_outs_hf_ds.pkl')
file.Upload() 

#### Load from Drive

In [9]:
!pip install datasets



In [10]:
from datasets import DatasetDict
with open(r"/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/Pooled_Output/80_20/3vvi0uoq:v0/avg_pld_outs_hf_ds.pkl", "rb") as input_file:
  avg_pld_outs_hf_ds = pickle.load(input_file)

In [11]:
len(avg_pld_outs_hf_ds['train']['meaned_pooled_output'])

555

# Simple Shallow Neural Network

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# from transformers.modeling_outputs import SequenceClassifierOutput

class Net(nn.Module):

    def __init__(self, pre_classifier_init, classifier_init, do_rate=0.1):
        super(Net, self).__init__()

        self.pre_classifier = nn.Linear(768, 768)
        self.classifier = nn.Linear(768, 2)
        self.dropout = nn.Dropout(do_rate)

        self.pre_classifier.weight.data.copy_(pre_classifier_init.weight.data)
        self.classifier.weight.data.copy_(classifier_init.weight.data)

        # print(pre_classifier_init.bias.data)
        self.pre_classifier.bias.data.copy_(pre_classifier_init.bias.data)
        self.classifier.bias.data.copy_(classifier_init.bias.data)

        # DOUBLE CHECK IF BIASES ARE BEING SET AS WELL

    def forward(self, x, labels = None):
        # Max pooling over a (2, 2) window
        x = self.pre_classifier(x)
        x = nn.ReLU()(x)
        x = self.dropout(x)
        return self.classifier(x)

        # loss = None
        # if labels is not None:
        #   loss_fct = CrossEntropyLoss()
        #   loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        # return SequenceClassifierOutput(
        #     loss = loss,
        #     logits = logits
        # )

net = Net(transformer_model.pre_classifier, transformer_model.classifier1)

#### Results with no Training

In [13]:
net.eval()

Net(
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
with torch.no_grad():
  logits = net.forward(torch.FloatTensor(avg_pld_outs_hf_ds['validation']['meaned_pooled_output']))
y_score = softmax(logits, axis = 1)[:, 1].tolist()
y_pred = [math.floor(input) if input < 0.50 else math.ceil(input) for input in y_score]
f1_score(avg_pld_outs_hf_ds['validation']['success_label'], y_pred, average = 'weighted')

0.7507210669380432

In [None]:
with torch.no_grad():
  logits = net.forward(torch.FloatTensor(avg_pld_outs_hf_ds['test']['meaned_pooled_output']))
y_score = softmax(logits, axis = 1)[:, 1].tolist()
y_pred = [math.floor(input) if input < 0.50 else math.ceil(input) for input in y_score]
f1_score(avg_pld_outs_hf_ds['test']['success_label'], y_pred, average = 'weighted')

0.6645011319979305

#### Training w Hyperparameter Tuning and Results

In [30]:
def load_data():
  with open("/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/Pooled_Output/80_20/3vvi0uoq:v0/avg_pld_outs_hf_ds.pkl", "rb") as input_file:
    avg_pld_outs_hf_ds = pickle.load(input_file)
  avg_pld_outs_hf_ds.set_format(type='pt', columns=['meaned_pooled_output', 'success_label'])
  trainset = avg_pld_outs_hf_ds['train']
  valset = avg_pld_outs_hf_ds['validation']
  return trainset, valset

def load_test_data():
  with open("/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/Pooled_Output/80_20/3vvi0uoq:v0/avg_pld_outs_hf_ds.pkl", "rb") as input_file:
    avg_pld_outs_hf_ds = pickle.load(input_file)
  avg_pld_outs_hf_ds.set_format(type='pt', columns=['meaned_pooled_output', 'success_label'])
  testset = avg_pld_outs_hf_ds['test']
  return testset

In [31]:
trainset, valset = load_data()

In [35]:
trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)

In [36]:
len(trainloader)

18

In [15]:
print(type(net))

<class '__main__.Net'>


In [46]:
from ray import tune
# from ray.tune.integration.wandb import wandb_mixin
# '''@wandb_mixin
# run = wandb.init()

def train_nn(config, checkpoint_dir=None, data_dir=None):
  net = Net(transformer_model.pre_classifier, transformer_model.classifier1, config['do_rate'])
  net.train()
  device = "cpu"
  if torch.cuda.is_available():
      device = "cuda:0"
      if torch.cuda.device_count() > 1:
          net = nn.DataParallel(net)
  print(type(net))
  net.to(device)
  # net.cuda()

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

  trainset, valset = load_data()

  trainloader = torch.utils.data.DataLoader(trainset, batch_size=config["batch_size"], shuffle=True)
  valloader = torch.utils.data.DataLoader(valset, batch_size=config["batch_size"], shuffle=True)

  total_iter_steps = len(trainloader)
  for epoch in range(config['num_epochs']):
    running_loss = 0.0
    epoch_steps = 0
    for i, data in enumerate(trainloader, 0):

      inputs = data['meaned_pooled_output']
      labels = data['success_label']

      inputs, labels = inputs.to(device), labels.to(device)

      optimizer.zero_grad()

      outputs = net(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      running_loss += loss.item()
      epoch_steps += 1

      if i % 10 == 9:
        print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                        running_loss / epoch_steps))
        running_loss = 0.0

      # Validation loss
      val_loss = 0.0
      val_steps = 0
      total = 0
      correct = 0

      all_predictions = np.array([])
      all_labels = np.array([])

      net.eval()
      with torch.no_grad():
        for j, data in enumerate(valloader, 0):

          inputs_cpu = data['meaned_pooled_output']
          labels_cpu = data['success_label']

          inputs, labels = inputs_cpu.to(device), labels_cpu.to(device)
          # inputs.cuda()
          # labels.cuda()

          outputs = net(inputs)
          _, predicted = torch.max(outputs.data, 1)

          all_predictions = np.append(all_predictions, predicted.to('cpu').numpy())
          all_labels = np.append(all_labels, labels_cpu.numpy())

          total += labels.size(0)
          correct += (predicted == labels).sum().item()

          loss = criterion(outputs, labels)
          val_loss += loss.cpu().numpy()
          val_steps += 1

      with tune.checkpoint_dir(total_iter_steps * epoch + i) as checkpoint_dir:
          print("saving in checkpoint dir")
          path = os.path.join(checkpoint_dir, "checkpoint")
          torch.save((net.state_dict(), optimizer.state_dict()), path)

      net.train()

      s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
      # s_acc = accuracy_score(all_labels, all_predictions)
      # wandb.log({"val_loss": val_loss / val_steps, "val_accuracy": correct / total})
      tune.report(loss=(val_loss / val_steps), accuracy=correct / total, f1=s_f1, precision=s_precision, recall=s_recall)
  print("Finished Training")

In [47]:
def test_results(net, device="cpu", for_test_set=True):
    testset = load_test_data()

    if for_test_set:
      testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False)
    else:
      _, valset = load_data()
      testloader = torch.utils.data.DataLoader(valset, batch_size=4, shuffle=False)

    all_predictions = np.array([])
    all_labels = np.array([])

    net.eval()
    with torch.no_grad():
        for i, data in enumerate(testloader, 0):
            inputs_cpu = data['meaned_pooled_output']
            labels_cpu = data['success_label']

            inputs, labels = inputs_cpu.to(device), labels_cpu.to(device)
            outputs = net(inputs)
            _, predicted = torch.max(outputs.data, 1)

            all_predictions = np.append(all_predictions, predicted.to('cpu').numpy())
            all_labels = np.append(all_labels, labels_cpu.numpy())

    s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
    return {
        'precision': s_precision,
        'recall': s_recall,
        'f1': s_f1
    }

In [48]:
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
import torch.optim as optim
from functools import partial

def main(num_samples = 15, max_num_epochs = 10):
  tune_config = {
      "lr": tune.loguniform(5e-3, 1e-2),
      "batch_size": tune.choice([16,32]),
      "num_epochs": tune.choice([2,3,5,10]),#,2,3]),#,2,3,5,10,20]),
      "do_rate": tune.uniform(0.1, 0.4),
    }

  scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2,
    metric='f1',
    mode='max',)

  # scheduler = PopulationBasedTraining(
  #     time_attr='time_total_s',
  #     metric='f1',
  #     mode='max',
  #     perturbation_interval=6.0,
  #     hyperparam_mutations={
  #         "lr": [5e-4, 1e-4, 5e-5, 1e-5]
  #     })

  result = tune.run(
    run_or_experiment = partial(train_nn, checkpoint_dir='/tmp/ShallowNNModels'),
    config = tune_config,
    resources_per_trial={'gpu': 1},
    # metric = 'loss',
    # mode = 'min',
    num_samples = num_samples,
    scheduler = scheduler)
  
  best_trial = result.get_best_trial(metric="f1", mode="max", scope="all")
  print("Best trial config: {}".format(best_trial.config))
  print("Best trial final validation loss: {}".format(
      best_trial.last_result["loss"]))
  print("Best trial final validation f1: {}".format(
      best_trial.last_result["f1"]))
  
  best_trained_model = Net(transformer_model.pre_classifier, transformer_model.classifier1, best_trial.config['do_rate'])
  device = "cpu"
  if torch.cuda.is_available():
      device = "cuda:0"
      # if gpus_per_trial > 1:
      #     best_trained_model = nn.DataParallel(best_trained_model)
  best_trained_model.to(device)

  best_checkpoint_dir = best_trial.checkpoint.value
  print("best_checkpoint_dir", best_checkpoint_dir)
  model_state, optimizer_state = torch.load(os.path.join(
      best_checkpoint_dir, "checkpoint"))
  best_trained_model.load_state_dict(model_state)

  # model_save_name = "yungclassifier.pt"
  path = F"/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/ShallowNNModels/yungclassifier1.pt"
  torch.save(best_trained_model.state_dict(), path)

  print("Test results")
  print(test_results(best_trained_model, device))
  return result

In [49]:
tune_run_result = main(num_samples=1)

2021-08-31 00:31:03,523	INFO registry.py:67 -- Detected unknown callable for trainable. Converting to class.


Trial name,status,loc,batch_size,do_rate,lr,num_epochs
DEFAULT_ba97d_00000,RUNNING,,16,0.393788,0.00679726,10


[2m[36m(pid=1491)[0m <class '__main__.Net'>
Result for DEFAULT_ba97d_00000:
  accuracy: 0.7769784172661871
  date: 2021-08-31_00-31-18
  done: false
  experiment_id: a53a105163d44c0aa27751fabd8afbe8
  f1: 0.7693014391519823
  hostname: 3cea2778e88a
  iterations_since_restore: 1
  loss: 0.5295071502526602
  node_ip: 172.28.0.2
  pid: 1491
  precision: 0.7726071357609067
  recall: 0.7769784172661871
  should_checkpoint: true
  time_since_restore: 0.21792078018188477
  time_this_iter_s: 0.21792078018188477
  time_total_s: 0.21792078018188477
  timestamp: 1630369878
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: ba97d_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,iter,total time (s),loss,accuracy,f1
DEFAULT_ba97d_00000,RUNNING,172.28.0.2:1491,16,0.393788,0.00679726,10,1,0.217921,0.529507,0.776978,0.769301


[2m[36m(pid=1491)[0m saving in checkpoint dir




[2m[36m(pid=1491)[0m saving in checkpoint dir




[2m[36m(pid=1491)[0m saving in checkpoint dir




[2m[36m(pid=1491)[0m saving in checkpoint dir




Trial name,status,loc,batch_size,do_rate,lr,num_epochs,iter,total time (s),loss,accuracy,f1
DEFAULT_ba97d_00000,RUNNING,172.28.0.2:1491,16,0.393788,0.00679726,10,4,4.78655,0.540947,0.784173,0.780616


Result for DEFAULT_ba97d_00000:
  accuracy: 0.7841726618705036
  date: 2021-08-31_00-31-24
  done: false
  experiment_id: a53a105163d44c0aa27751fabd8afbe8
  f1: 0.7806158999868829
  hostname: 3cea2778e88a
  iterations_since_restore: 5
  loss: 0.5288541350099776
  node_ip: 172.28.0.2
  pid: 1491
  precision: 0.7802200100384808
  recall: 0.7841726618705036
  should_checkpoint: true
  time_since_restore: 6.2629783153533936
  time_this_iter_s: 1.4764275550842285
  time_total_s: 6.2629783153533936
  timestamp: 1630369884
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: ba97d_00000
  
[2m[36m(pid=1491)[0m saving in checkpoint dir




[2m[36m(pid=1491)[0m saving in checkpoint dir




[2m[36m(pid=1491)[0m saving in checkpoint dir




[2m[36m(pid=1491)[0m saving in checkpoint dir




Trial name,status,loc,batch_size,do_rate,lr,num_epochs,iter,total time (s),loss,accuracy,f1
DEFAULT_ba97d_00000,RUNNING,172.28.0.2:1491,16,0.393788,0.00679726,10,8,10.8311,0.527504,0.791367,0.788578


Result for DEFAULT_ba97d_00000:
  accuracy: 0.7985611510791367
  date: 2021-08-31_00-31-30
  done: false
  experiment_id: a53a105163d44c0aa27751fabd8afbe8
  f1: 0.7938942463050835
  hostname: 3cea2778e88a
  iterations_since_restore: 9
  loss: 0.5215916236241659
  node_ip: 172.28.0.2
  pid: 1491
  precision: 0.7950947355604815
  recall: 0.7985611510791367
  should_checkpoint: true
  time_since_restore: 12.330201625823975
  time_this_iter_s: 1.4991514682769775
  time_total_s: 12.330201625823975
  timestamp: 1630369890
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: ba97d_00000
  
[2m[36m(pid=1491)[0m saving in checkpoint dir




Result for DEFAULT_ba97d_00000:
  accuracy: 0.7841726618705036
  date: 2021-08-31_00-31-31
  done: true
  experiment_id: a53a105163d44c0aa27751fabd8afbe8
  f1: 0.7806158999868829
  hostname: 3cea2778e88a
  iterations_since_restore: 10
  loss: 0.5357038411829207
  node_ip: 172.28.0.2
  pid: 1491
  precision: 0.7802200100384808
  recall: 0.7841726618705036
  should_checkpoint: true
  time_since_restore: 13.812340021133423
  time_this_iter_s: 1.4821383953094482
  time_total_s: 13.812340021133423
  timestamp: 1630369891
  timesteps_since_restore: 0
  training_iteration: 10
  trial_id: ba97d_00000
  
[2m[36m(pid=1491)[0m [1,    10] loss: 0.498
[2m[36m(pid=1491)[0m saving in checkpoint dir




Trial name,status,loc,batch_size,do_rate,lr,num_epochs,iter,total time (s),loss,accuracy,f1
DEFAULT_ba97d_00000,TERMINATED,,16,0.393788,0.00679726,10,10,13.8123,0.535704,0.784173,0.780616


2021-08-31 00:31:33,259	INFO tune.py:550 -- Total run time: 29.74 seconds (27.26 seconds for the tuning loop).


Best trial config: {'lr': 0.006797255020711621, 'batch_size': 16, 'num_epochs': 10, 'do_rate': 0.3937879223831988}
Best trial final validation loss: 0.5357038411829207
Best trial final validation f1: 0.7806158999868829
best_checkpoint_dir /root/ray_results/DEFAULT_2021-08-31_00-31-04/DEFAULT_ba97d_00000_0_batch_size=16,do_rate=0.39379,lr=0.0067973,num_epochs=10_2021-08-31_00-31-08/checkpoint_000009/
Test results
{'precision': 0.6959298817805587, 'recall': 0.7034482758620689, 'f1': 0.6979866542244342}


In [50]:
best_trial = tune_run_result.get_best_trial(metric="f1", mode="max", scope="all")
best_ckpt = tune_run_result.get_best_checkpoint(best_trial, metric="f1", mode="max")

In [51]:
best_trained_model = Net(transformer_model.pre_classifier, transformer_model.classifier1, 0)
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
    # if gpus_per_trial > 1:
    #     best_trained_model = nn.DataParallel(best_trained_model)
best_trained_model.to(device)

Net(
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0, inplace=False)
)

In [52]:
  model_state, optimizer_state = torch.load(os.path.join(
      best_ckpt, "checkpoint"))
  best_trained_model.load_state_dict(model_state)

<All keys matched successfully>

In [53]:
test_results(best_trained_model, device)

{'f1': 0.7132518689832534,
 'precision': 0.7124129162880418,
 'recall': 0.7206896551724138}

In [54]:
test_results(best_trained_model, device, False)

{'f1': 0.7938942463050835,
 'precision': 0.7950947355604815,
 'recall': 0.7985611510791367}

In [55]:
test_results(best_trained_model, device, True)

{'f1': 0.7132518689832534,
 'precision': 0.7124129162880418,
 'recall': 0.7206896551724138}

In [56]:
path = os.path.join("/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/ShallowNNModels", "f1_7939.pt")
torch.save(best_trained_model.state_dict(), path)

# SVM

In [None]:
from sklearn import svm
import numpy as np

In [None]:
cs = np.arange(6, 6.2, 0.02).tolist()
best_clf = None
best_score = 0
best_c = None
for c in cs:
  clf = svm.SVC(kernel='rbf', gamma='scale', C=c)
  clf.fit(avg_pld_outs_hf_ds['train']['meaned_pooled_output'], avg_pld_outs_hf_ds['train']['success_label'])
  predictions = clf.predict(avg_pld_outs_hf_ds['validation']['meaned_pooled_output'])
  (_, pred_counts) = np.unique(predictions, return_counts=True)
  val_score = f1_score(avg_pld_outs_hf_ds['validation']['success_label'], predictions, average = 'weighted')
  print('Clf with C = {} obtained val-score of {}'.format(c, val_score))
  if (val_score > best_score):
    best_score = val_score
    best_clf = clf
    best_c = c

print('\nBest C: {}; Val-score: {}'.format(best_c, best_score))
test_predictions = best_clf.predict(avg_pld_outs_hf_ds['test']['meaned_pooled_output'])
test_score = f1_score(avg_pld_outs_hf_ds['test']['success_label'], test_predictions, average = 'weighted')
print('Yields score of {} on test set'.format(test_score))

Clf with C = 6.0 obtained val-score of 0.7805388682566458
Clf with C = 6.02 obtained val-score of 0.7805388682566458
Clf with C = 6.039999999999999 obtained val-score of 0.7805388682566458
Clf with C = 6.059999999999999 obtained val-score of 0.7805388682566458
Clf with C = 6.079999999999998 obtained val-score of 0.7805388682566458
Clf with C = 6.099999999999998 obtained val-score of 0.7805388682566458
Clf with C = 6.119999999999997 obtained val-score of 0.7805388682566458
Clf with C = 6.139999999999997 obtained val-score of 0.7805388682566458
Clf with C = 6.159999999999997 obtained val-score of 0.7805388682566458
Clf with C = 6.179999999999996 obtained val-score of 0.7805388682566458
Clf with C = 6.199999999999996 obtained val-score of 0.7805388682566458

Best C: 6.0; Val-score: 0.7805388682566458
Yields score of 0.7363360213723841 on test set


# RoBERT

In [None]:
dataset_w_embeddings.set_format('pytorch', columns=['pooled_outputs', 'success_label', 'genre'])

In [None]:
import numpy as np
from datasets import DatasetDict, Dataset

def get_book_changes_idx(book_titles):
  book_changes_idx = np.where(np.array(book_titles[:-1]) != np.array(book_titles[1:]))[0]
  book_changes_idx += 1
  return book_changes_idx

def convert_to_LSTM_dataset_full(dataset):
  full_ds = {}
  full_ds['train'] = convert_to_LSTM_dataset_sub(dataset['train'])
  full_ds['validation'] = convert_to_LSTM_dataset_sub(dataset['validation'])
  full_ds['test'] = convert_to_LSTM_dataset_sub(dataset['test'])

  full_ds = DatasetDict({'train': Dataset.from_dict(full_ds['train']), 'validation': Dataset.from_dict(full_ds['validation']), 'test': Dataset.from_dict(full_ds['test'])})
  return full_ds

def convert_to_LSTM_dataset_sub(dataset):
  ds = {'grouped_pooled_outs': None, 'success_label': None, 'genre': None}

  book_start_idx = get_book_changes_idx(dataset['book_title'])
  book_start_idx_w_end = np.append(book_start_idx, len(dataset['book_title']))
  book_start_idx_w_zero = np.insert(book_start_idx, 0, 0)

  book_lengths = book_start_idx_w_end - np.concatenate((np.array([0]), np.roll(book_start_idx_w_end, 1)[1:]))
  # print(type(dataset['pooled_outputs']))
  book_grouped_embeddings = dataset['pooled_outputs'].split_with_sizes(list(book_lengths))
  # book_grouped_embeddings = torch.stack(dataset['pooled_outputs'].split_with_sizes(list(book_lengths)), dim=0)

  # print(type(book_grouped_embeddings))
  ds['grouped_pooled_outs'] = book_grouped_embeddings
  ds['success_label'] = np.take(dataset['success_label'], book_start_idx_w_zero)
  ds['genre'] = np.take(dataset['genre'], book_start_idx_w_zero)
  return ds

In [None]:
class RoBERT_Model(nn.Module):

    def __init__(self, layer_size = 100):
        self.layer_size = layer_size
        super(RoBERT_Model, self).__init__()
        self.lstm = nn.LSTM(768, layer_size, num_layers=1, bidirectional=False)
        self.out = nn.Linear(layer_size, 2)

    def forward(self, grouped_pooled_outs):
        """ Define how to performed each call
        Parameters
        __________
        pooled_output: array
            -
        lengt: int
            -
        Returns:
        _______
        -
        """
        # chunks_emb = pooled_out.split_with_sizes(lengt) # splits the input tensor into a list of tensors where the length of each sublist is determined by lengt

        seq_lengths = torch.LongTensor([x for x in map(len, grouped_pooled_outs)]) # gets the length of each sublist in chunks_emb and returns it as an array

        batch_emb_pad = nn.utils.rnn.pad_sequence(grouped_pooled_outs, padding_value=-91, batch_first=True) # pads each sublist in chunks_emb to the largest sublist with value -91
        batch_emb = batch_emb_pad.transpose(0, 1)  # (B,L,D) -> (L,B,D)
        lstm_input = nn.utils.rnn.pack_padded_sequence(batch_emb, seq_lengths, batch_first=False, enforce_sorted=False) # seq_lengths.cpu().numpy()

        packed_output, (h_t, h_c) = self.lstm(lstm_input, )  # (h_t, h_c))
        # output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, padding_value=-91)

        h_t = h_t.view(-1, self.layer_size) # (-1, 100)

        return self.out(h_t) # logits

In [None]:
def my_collate1(batches):
  # for batch in batches:
  #   print(type(batch['grouped_pooled_outs']), len(batch['grouped_pooled_outs']))
  #   print(type(torch.FloatTensor(batch['grouped_pooled_outs'])))
    return {
        'grouped_pooled_outs': [torch.stack(x['grouped_pooled_outs']) for x in batches],
        'success_label': torch.LongTensor([x['success_label'] for x in batches])
    }

In [None]:
from torch.optim import AdamW
import time

def load_test_data():
  full_ds = convert_to_LSTM_dataset_full(dataset_w_embeddings)
  full_ds.set_format(type='torch', columns = ['grouped_pooled_outs', 'success_label', 'genre'])
  testset = full_ds['test']
  return testset

def load_data():
  full_ds = convert_to_LSTM_dataset_full(dataset_w_embeddings)
  full_ds.set_format(type='torch', columns = ['grouped_pooled_outs', 'success_label', 'genre'])
  trainset = full_ds['train']
  valset = full_ds['validation']
  return trainset, valset

# def loss_fun(outputs, targets):
#     loss = nn.CrossEntropyLoss()
#     return loss(outputs, targets)

def rnn_train_fun1(config, checkpoint_dir='/tmp/LSTMModels'):
  model = RoBERT_Model(config["layer_size"])
  model.train()
  device = "cpu"
  # if torch.cuda.is_available():
  #   device = "cuda:0"
  #   if torch.cuda.device_count() > 1:
  #       model = nn.DataParallel(model)
  # # print(type(model))
  # model.to(device)


  criterion = nn.CrossEntropyLoss()
  # optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=0.9)
  optimizer=AdamW(model.parameters(), lr=config["lr"])

  trainset, valset = load_data()

  trainloader = torch.utils.data.DataLoader(trainset, batch_size=config["batch_size"], collate_fn=my_collate1)
  valloader = torch.utils.data.DataLoader(valset, batch_size=config["batch_size"], collate_fn=my_collate1)

  for epoch in range(config['num_epochs']):
    running_loss = 0.0
    epoch_steps = 0

    for batch_idx, batch in enumerate(trainloader):
      grouped_pooled_outs = batch['grouped_pooled_outs'] # .to(device)
      targets = batch['success_label'] #.to(device)

      optimizer.zero_grad()
      outputs = model(grouped_pooled_outs)
      loss = loss_fun(outputs, targets)
      loss.backward()
      model.float()
      optimizer.step()

      running_loss += loss.item()
      epoch_steps += 1

      if batch_idx % 10 == 9:
        print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                        running_loss / epoch_steps))
        running_loss = 0.0

      # Validation loss
      val_loss = 0.0
      val_steps = 0
      total = 0
      correct = 0

      all_predictions = np.array([])
      all_labels = np.array([])

      with torch.no_grad():
          for i, data in enumerate(valloader, 0):

              grouped_pooled_outs = data['grouped_pooled_outs'] # .to(device)
              targets = data['success_label'] # .to(device)

              outputs = model(grouped_pooled_outs)
              _, predicted = torch.max(outputs.data, 1)

              all_predictions = np.append(all_predictions, predicted.numpy())
              all_labels = np.append(all_labels, targets.numpy())

              loss = criterion(outputs, targets)
              val_loss += loss.cpu().numpy()
              val_steps += 1

      with tune.checkpoint_dir(epoch) as checkpoint_dir:
          print("saving in checkpoint dir")
          path = os.path.join(checkpoint_dir, "checkpoint")
          torch.save((model.state_dict(), optimizer.state_dict()), path)

      s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
      tune.report(loss=(val_loss / val_steps), f1=s_f1, precision=s_precision, recall=s_recall)

In [None]:
def test_results(net, device="cpu"):
  testset = load_test_data()
  testloader = torch.utils.data.DataLoader(testset, batch_size=8, collate_fn=my_collate1)

  all_predictions = np.array([])
  all_labels = np.array([])

  net.eval()
  with torch.no_grad():
    for i, data in enumerate(testloader, 0):
        grouped_pooled_outs = data['grouped_pooled_outs'] # .to(device)
        targets = data['success_label'] # .to(device)

        outputs = net(grouped_pooled_outs)
        _, predicted = torch.max(outputs.data, 1)

        all_predictions = np.append(all_predictions, predicted.numpy())
        all_labels = np.append(all_labels, targets.numpy())

  s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

  return {
      'precision': s_precision,
      'recall': s_recall,
      'f1': s_f1
  }

In [None]:
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLogger
from ray.tune.schedulers import ASHAScheduler
from functools import partial

def main(num_samples = 6, max_num_epochs = 15):
  config = {
    "lr": tune.loguniform(5e-4, 5e-2),
    "batch_size": tune.choice([16,32,64]),
    "num_epochs": tune.choice([1,2,3,5]),
    "layer_size": tune.choice([100]),
    "wandb": {
      "project": "LSTMClassifier",
      "api_key": config['WandB']['api_key'],
      "log_config": True
    }
  }

  scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2)

  result = tune.run(
    partial(rnn_train_fun1, checkpoint_dir='/tmp/LSTMModels'),
    config = config,
    resources_per_trial={'gpu': 1},
    metric = 'loss',
    mode = 'min',
    num_samples = num_samples,
    scheduler = scheduler,
    callbacks=[WandbLoggerCallback(
        project="LSTMClassifier",
        group='raytune_hpsearch',
        api_key=config['WandB']['api_key'],
        log_config=True
    )])

  
  best_trial = result.get_best_trial(metric="f1", mode="max", scope="last")
  print("Best trial config: {}".format(best_trial.config))
  print("Best trial final validation loss: {}".format(
      best_trial.last_result["loss"]))
  print("Best trial final validation accuracy: {}".format(
      best_trial.last_result["f1"]))
  
  best_trained_model = RoBERT_Model(best_trial.config['layer_size'])
  device = "cpu"
  # if torch.cuda.is_available():
  #     device = "cuda:0"
      # if gpus_per_trial > 1:
      #     best_trained_model = nn.DataParallel(best_trained_model)
  best_trained_model.to(device)
                        
  best_checkpoint_dir = best_trial.checkpoint.value
  model_state, optimizer_state = torch.load(os.path.join(
      best_checkpoint_dir, "checkpoint"))
  best_trained_model.load_state_dict(model_state)

  # model_save_name = "yungclassifier.pt"
  path = F"/content/drive/MyDrive/Thesis/Models/LSTMModels/yungclassifier1.pt"
  torch.save(best_trained_model.state_dict(), path)
  return test_results(best_trained_model, device)

In [None]:
test_results = main()

# ToBERT

### ToBERT dataset

In [None]:
import numpy as np
from datasets import DatasetDict, Dataset
from torch.nn.utils.rnn import pad_sequence

def get_book_changes_idx(book_titles):
  book_changes_idx = torch.from_numpy(np.where(np.array(book_titles[:-1]) != np.array(book_titles[1:]))[0])
  book_changes_idx += 1
  return book_changes_idx

def convert_to_transformer_dataset_full(dataset):
  full_ds = {}

  full_ds['train'] = convert_to_transformer_dataset_sub(dataset['train'])
  full_ds['validation'] = convert_to_transformer_dataset_sub(dataset['validation'])
  full_ds['test'] = convert_to_transformer_dataset_sub(dataset['test'])

  full_ds = DatasetDict({'train': Dataset.from_dict(full_ds['train']), 'validation': Dataset.from_dict(full_ds['validation']), 'test': Dataset.from_dict(full_ds['test'])})
  return full_ds

def convert_to_transformer_dataset_sub(dataset):
  ds = {'grouped_pooled_outs': None, 'success_label': None, 'genre': None}

  book_titles = dataset['book_title']
  book_start_idx = get_book_changes_idx(book_titles)
  book_start_idx_w_end = np.append(book_start_idx, len(book_titles))
  book_lengths = book_start_idx_w_end - np.concatenate((np.array([0]), np.roll(book_start_idx_w_end, 1)[1:]))
  # print(type(dataset['pooled_outputs']))
  book_grouped_embeddings = dataset['pooled_outputs'].split_with_sizes(list(book_lengths))
  book_grouped_embeddings = pad_sequence(list(book_grouped_embeddings), batch_first=True)
  # book_grouped_embeddings = torch.stack(dataset['pooled_outputs'].split_with_sizes(list(book_lengths)), dim=0)

  book_start_idx_w_zero = np.insert(book_start_idx, 0, 0)
  ds['book_lengths'] = torch.from_numpy(book_lengths)
  ds['grouped_pooled_outs'] = book_grouped_embeddings
  ds['success_label'] = torch.take(dataset['success_label'], book_start_idx_w_zero)
  ds['genre'] = torch.take(dataset['genre'], book_start_idx_w_zero)
  return ds

# def get_max_seq_length(train_book_lengths, val_book_lengths, test_book_lengths):
#   return max(max(train_book_lengths),max(val_book_lengths),max(test_book_lengths))

In [None]:
import pytorch_lightning as pl
from functools import partial
class GoodReadsDataModule(pl.LightningDataModule):

  def prepare_data(self):
    dataset_w_embeddings.set_format('pytorch', columns=['pooled_outputs', 'success_label', 'genre'])
    self.full_ds = convert_to_transformer_dataset_full(dataset_w_embeddings)

  def train_dataloader(self):
    trainset = Dataset.from_dict(convert_to_transformer_dataset_sub(dataset_w_embeddings['train']))
    trainset.set_format('pytorch', columns=['book_lengths', 'grouped_pooled_outs', 'success_label', 'genre'])
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, collate_fn=partial(self.my_collate1, up_to=None))
    return trainloader

  def val_dataloader(self):
    valset = Dataset.from_dict(convert_to_transformer_dataset_sub(dataset_w_embeddings['validation']))
    valset.set_format('pytorch', columns=['book_lengths', 'grouped_pooled_outs', 'success_label', 'genre'])
    valloader = torch.utils.data.DataLoader(valset, batch_size=128, collate_fn=partial(self.my_collate1, up_to=None))
    return valloader

  def test_dataloader(self):
    testset = Dataset.from_dict(convert_to_transformer_dataset_sub(dataset_w_embeddings['test']))
    testset.set_format('pytorch', columns=['book_lengths', 'grouped_pooled_outs', 'success_label', 'genre'])
    testloader = torch.utils.data.DataLoader(testset, batch_size=128, collate_fn=partial(self.my_collate1, up_to=None))
    return testloader

  def get_batch_mask(self, max_seq_len, book_lens):
    mask = torch.zeros(len(book_lens),max_seq_len+1) # batch_size, seq_len
    mask[(torch.arange(len(book_lens)),book_lens)] = 1
    mask = mask.cumsum(dim=1)[:, :-1]
    return mask

  def my_collate1(self, batches, up_to=None):
    max_seq_len = len(batches[0]['grouped_pooled_outs']) # all sequences were previously padded to the max length
    src_key_padding_mask = self.get_batch_mask(max_seq_len, [x['book_lengths'] for x in batches])

    if up_to == None:
      up_to = max_seq_len

    return {
        'src_key_padding_mask': src_key_padding_mask[:,:up_to],
        'grouped_pooled_outs': torch.stack([torch.stack(x['grouped_pooled_outs']) for x in batches])[:,:up_to,:],
        'success_label': torch.LongTensor([x['success_label'] for x in batches])
    }

### Model Debugging

In [None]:
gr_dm = GoodReadsDataModule()
gr_dm.prepare_data()
valloader = gr_dm.val_dataloader()

NameError: ignored

In [None]:
for a, b in enumerate(valloader):
  src_key_padding_mask = b['src_key_padding_mask']
  grouped_pooled_outs = b['grouped_pooled_outs']
  targets = b['success_label']
  print(b['src_key_padding_mask'])
  input("")

In [None]:
src_key_padding_mask[0]

In [None]:
import torch.nn as nn
encoder_layers = nn.TransformerEncoderLayer(
    d_model=768, nhead=2, dim_feedforward=1024, dropout=0.1, batch_first=True
)
transformer_encoder = nn.TransformerEncoder(
    encoder_layers, num_layers=2
)

In [None]:
x = transformer_encoder(grouped_pooled_outs, src_key_padding_mask=src_key_padding_mask)

In [None]:
x

In [None]:
x[0]

In [None]:
q = torch.unsqueeze(1-src_key_padding_mask,2)*x

In [None]:
q

In [None]:
x

In [None]:
x = x.sum(dim=1)/(1-src_key_padding_mask).sum(dim=1).unsqueeze(1) 

In [None]:
x

In [None]:
# def get_batch_mask(max_seq_len, book_lens):
#   mask = torch.zeros(len(book_lens),max_seq_len+1) # batch_size, seq_len
#   mask[(torch.arange(len(book_lens)),book_lens)] = 1
#   mask = mask.cumsum(dim=1)[:, :-1]
#   return mask

# def my_collate1(batches):
#   # for some reason, the only dictionary values making it here are 'grouped_pooled_outs', 'success_label', and 'genre'
#   max_seq_len = len(batches[0]['grouped_pooled_outs']) # all sequences were previously padded to the max length
#   src_key_padding_mask = get_batch_mask(max_seq_len, [x['book_lengths'] for x in batches])
#   return {
#       'src_key_padding_mask': src_key_padding_mask,
#       'grouped_pooled_outs': torch.stack([torch.stack(x['grouped_pooled_outs']) for x in batches]),
#       'success_label': torch.LongTensor([x['success_label'] for x in batches])
#   }

# # valset = Dataset.from_dict(convert_to_transformer_dataset_sub(dataset_w_embeddings['validation']))
# # valset.set_format('pytorch', columns=['book_lengths', 'grouped_pooled_outs', 'success_label', 'genre'])
# # valloader = torch.utils.data.DataLoader(valset, batch_size=64, collate_fn=my_collate1)

In [None]:
# for batch_idx, batch in enumerate(valloader):
#   print(batch['success_label'].shape)

### Defining Model

The Embedding layer:
nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
uses a vector of size 768. Implies that TransformerEncoderLayer works with embeddings of length 768.

In [None]:
import torch.nn as nn
import pytorch_lightning as pl
from scipy.special import softmax
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score, roc_curve
import math

In [None]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [None]:
# d_model = 768, nhead = 2, d_hid = 200, dropout = 0.1, nlayers = 2
class LightningToBERT(pl.LightningModule):
    def __init__(
        self,
        d_model=768,
        nhead=2,
        nhid=512,
        num_layers=2,
        dropout=0.1,
        classifier_dropout=0.1,
        # max_len=256,
    ):

        super().__init__()

        # self.d_model = embeddings.size(1)
        assert (
            d_model % nhead == 0
        ), "nheads must divide evenly into d_model"

        # self.emb = nn.Embedding.from_pretrained(embeddings, freeze=False)
        # self.pos_encoder = PositionalEncoding(
        #     self.d_model, dropout=dropout, max_len=embeddings.size(0)
        # )
        self.pos_encoder = PositionalEncoding(
            d_model, dropout=dropout, max_len=200
        )

        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=nhid, dropout=dropout, batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layers, num_layers=num_layers
        )

        self.dropout = nn.Dropout(classifier_dropout)
        self.pre_classifier = nn.Linear(d_model, d_model)
        self.classifier = nn.Linear(d_model, 2)
        # self.classifier = nn.Sequential(
        #     # Other layers to go here if needed once things seem to be working
        #     nn.Linear(d_model, 2),
        # )

        self.softmaxer = nn.Softmax(dim=1)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-6)
        return optimizer

    def cross_entropy_loss(self, logits, labels):
        loss = nn.CrossEntropyLoss()
        return loss(logits, labels)

    def forward(self, x, src_key_padding_mask):
        # x = self.emb(x) * math.sqrt(self.d_model)
        # x = self.pos_encoder(x)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x, src_key_padding_mask=src_key_padding_mask)  # self.src_mask)

        # calculates mean taking into account the padding
        x = torch.unsqueeze(1-src_key_padding_mask,2)*x
        x = x.sum(dim=1)/(1-src_key_padding_mask).sum(dim=1).unsqueeze(1)  

        x = self.pre_classifier(x)
        x = nn.ReLU()(x)
        x = self.dropout(x)
        return self.classifier(x)
        # x = self.dropout(x) RIGHT AFTER x = x.sum(dim=1)/(1-src_key_padding_mask).sum(dim=1).unsqueeze(1) 
        # return self.classifier(x)

    def training_step(self, train_batch, batch_idx):
        grouped_pooled_outs = train_batch['grouped_pooled_outs']
        src_key_padding_mask = train_batch['src_key_padding_mask']
        targets = train_batch['success_label']

        logits = self.forward(grouped_pooled_outs, src_key_padding_mask)
        loss = self.cross_entropy_loss(logits, targets)

        self.log('train_loss', loss, prog_bar=True)
        log_dict = {'loss': loss}
        return {'loss': loss, 'log': log_dict}

    def validation_step(self, val_batch, batch_idx):
        grouped_pooled_outs = val_batch['grouped_pooled_outs']
        src_key_padding_mask = val_batch['src_key_padding_mask']
        targets = val_batch['success_label']

        logits = self.forward(grouped_pooled_outs, src_key_padding_mask)
        y_prob = self.softmaxer(logits)[:, 1]
        y_pred = (y_prob>0.5).float()

        loss = self.cross_entropy_loss(logits, targets)
        return {'val_loss': loss, 'preds': y_pred, 'targets': targets.tolist()}

    def test_step(self, batch, batch_idx, dataloader_idx = None):
        grouped_pooled_outs = batch['grouped_pooled_outs']
        src_key_padding_mask = batch['src_key_padding_mask']
        targets = batch['success_label']

        logits = self.forward(grouped_pooled_outs, src_key_padding_mask)
        y_probs = self.softmaxer(logits)[:, 1]
        return {'class_probs': y_probs, 'targets': targets.tolist()}

    def test_epoch_end(self, test_step_outputs):
        y_probs = []
        y_true = []

        for x in test_step_outputs:
          y_probs.extend(x['class_probs'].tolist())
          y_true.extend(x['targets'])

        f1_res = f1_score(y_true, y_pred, average = 'weighted')
        return {'f1': f1_res}


    def validation_epoch_end(self, val_step_outputs):
        y_pred = []
        y_true = []

        for x in val_step_outputs:
          y_pred.extend(x['preds'].tolist())
          y_true.extend(x['targets'])

        f1_res = f1_score(y_true, y_pred, average = 'weighted')
        avg_val_loss = torch.tensor([x['val_loss'] for x in val_step_outputs]).mean()

        log_dict = {
            'val_loss': avg_val_loss,
            'val_f1': f1_res
        }

        self.log('val_loss', avg_val_loss, prog_bar=True)
        self.log('val_f1', f1_res, prog_bar=True)
        return {'val_loss': avg_val_loss, 'log': log_dict}

hit 0.781 with nhead=2, num_layers=2, lr=1e-5 max_epochs=20 at epoch 19

0.7632 with nhead=8, num_layers=1, lr=1e-5 max_epochs=20 at epoch 50

0.772 with nhead=2, num_layers=1, lr=3e-5 max_epochs=20 at epoch 19

In [None]:
model = LightningToBERT()
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath='/content/version_1_logs', 
    filename='{epoch}-{val_loss:.2f}-{val_f1:.2f}',
    monitor="val_loss", 
    every_n_epochs=1,
    save_top_k=3
)
trainer = pl.Trainer(resume_from_checkpoint="/content/lightning_logs/epoch=5-val_loss=0.45-val_f1=0.79.ckpt", max_epochs=50, callbacks=[checkpoint_callback])

# automatically restores model, epoch, step, LR schedulers, apex, etc...
datamodule = GoodReadsDataModule()
trainer.fit(model, datamodule)

In [None]:
# '/content/lightning_logs/version_' + str(v_num) + '/checkpoints'

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath='/content/lightning_logs', 
    filename='{epoch}-{val_loss:.2f}-{val_f1:.2f}',
    monitor="val_loss", 
    every_n_train_steps=1,
    save_top_k=3
)

trainer = pl.Trainer(log_every_n_steps=1, gpus=1, max_epochs=20, callbacks=[checkpoint_callback], num_sanity_val_steps=0)
model = LightningToBERT(nhead=2, num_layers=2, dropout=0.15)

datamodule = GoodReadsDataModule()
trainer.fit(model, datamodule)
# v_num+=1

#### Test Model

In [None]:
gr_dm = GoodReadsDataModule()
gr_dm.prepare_data()
testloader = gr_dm.test_dataloader()

In [None]:
model  = LightningToBERT.load_from_checkpoint("/content/lightning_logs/epoch=5-val_loss=0.45-val_f1=0.79.ckpt")
model.to('cuda')

y_hats = []
y_trues = []
with torch.no_grad():
  for a, b in enumerate(testloader):
    src_key_padding_mask = b['src_key_padding_mask'].to('cuda')
    grouped_pooled_outs = b['grouped_pooled_outs'].to('cuda')
    targets = b['success_label'].tolist()

    y_hat = model(grouped_pooled_outs, src_key_padding_mask).to('cpu').tolist()
    y_hats.extend(y_hat)
    y_trues.extend(targets)

In [None]:
probabilities_per_book = softmax(y_hats, axis = 1)

In [None]:
y_score = probabilities_per_book[:,1].tolist()

In [None]:
y_pred = [math.floor(input) if input < 0.5 else math.ceil(input) for input in y_score]
f1_res = f1_score(y_trues, y_pred, average = 'weighted')
print(f1_res)

In [None]:
def get_f1_for_validation(y_score, y_true, l_th = 0.4, u_th = 0.8):
  thresholds = np.arange(l_th, u_th, 0.01)
  f1_scores = []
  for th in thresholds:
    y_pred = [math.floor(input) if input < th else math.ceil(input) for input in y_score]
    f1_res = f1_score(y_true, y_pred, average = 'weighted')
    f1_scores.append(f1_res)
  max_f1 = max(f1_scores)
  max_f1_index = f1_scores.index(max_f1)
  # self.validated_threshold = thresholds[max_f1_index]
  f1_scores_and_thresholds = {'thresholds': thresholds, 'f1_scores': f1_scores, 'max_f1_index': max_f1_index}
  return f1_scores_and_thresholds

f1_scores_and_thresholds = get_f1_for_validation(y_score, y_trues)

In [None]:
import matplotlib
matplotlib.pyplot.plot(f1_scores_and_thresholds['thresholds'], f1_scores_and_thresholds['f1_scores'])

In [None]:
y_score

In [None]:
# test_model.eval()
# y_hats = []

# for a, b in enumerate(valloader):
#   src_key_padding_mask = b['src_key_padding_mask']
#   grouped_pooled_outs = b['grouped_pooled_outs']
#   targets = b['success_label']

#   y_hat = test_model(grouped_pooled_outs, src_key_padding_mask)
#   targets.append(targets.tolist())
#   y_hats.append(y_hat)

#### View TensorBoard

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# class ToBERT(nn.Module):

#   def __init__(self, d_model, nhead, dropout, d_hid, nlayers, nclasses):
#       # d_model = 768, nhead = 2, d_hid = 200, dropout = 0.1, nlayers = 2
#       encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
#       self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
#       self.classifier = nn.Linear(d_model, nclasses)

#   def forward(self, src: Tensor, src_mask: Tensor) -> Tensor:
#       """
#       Args:
#           src: Tensor, shape [seq_len, embedding_dim, batch_size]
#           src_mask: Tensor, shape [seq_len, seq_len]

#       seq_len should be the max number of segments a book has in our dataset
#       embedding_dim will be 768 (from BERT)

#       src_mask is necessary because we will need to pad shorter books to have as many segments
#       as the longest book. Obviously we do not want our model to attend to the padded tokens in
#       these cases.
#       """
#       output = self.transformer_encoder(src, src_mask) 
#       output = self.classifier(output)
#       return output

### Training (Pytorch)

In [None]:
from torch.optim import AdamW
import time

def loss_fun(outputs, targets):
    loss = nn.CrossEntropyLoss()
    return loss(outputs, targets)

def transformer_train_fun1(config, checkpoint_dir='/tmp/ToBERTModels'):

  model = ToBERT(768, config["nhead"], config["nhid"], config["num_layers"], config["dropout"], config["dropout"])
  model.train()
  device = "cpu"

  if torch.cuda.is_available():
    device = "cuda:0"
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
  # print(type(model))
  model.to(device)


  criterion = nn.CrossEntropyLoss()
  # optimizer = optim.SGD(model.parameters(), lr=config["lr"], momentum=0.9)
  optimizer=AdamW(model.parameters(), lr=config["lr"])

  trainset, valset = load_data()

  trainloader = torch.utils.data.DataLoader(trainset, batch_size=config["batch_size"], collate_fn=my_collate1)
  valloader = torch.utils.data.DataLoader(valset, batch_size=config["batch_size"], collate_fn=my_collate1)


  for epoch in range(config['num_epochs']):
    running_loss = 0.0
    # epoch_steps = 0
    for tr_batch_idx, tr_batch in enumerate(trainloader):
      grouped_pooled_outs = tr_batch['grouped_pooled_outs'].to(device) # .to(device)
      src_key_padding_mask = tr_batch['src_key_padding_mask'].to(device)
      targets = tr_batch['success_label'].to(device) #.to(device)

      optimizer.zero_grad()

      outputs = model(grouped_pooled_outs, src_key_padding_mask)
      loss = loss_fun(outputs, targets)
      loss.backward()
      # model.float()
      optimizer.step()

      running_loss += loss.item()
      # epoch_steps += 1

      print('[%d, %5d] loss: %.3f' %
            (epoch + 1, tr_batch_idx + 1, running_loss))
      running_loss = 0.0

      val_loss = 0.0
      val_steps = 0

      all_predictions = torch.tensor([], dtype=torch.long, device=device) # np.array([])
      all_labels = torch.tensor([], dtype=torch.long, device=device)# np.array([])

      with torch.no_grad():
          for val_batch_idx, val_batch in enumerate(valloader, 0):

              grouped_pooled_outs = val_batch['grouped_pooled_outs'].to(device) # .to(device)
              src_key_padding_mask = val_batch['src_key_padding_mask'].to(device)
              targets = val_batch['success_label'].to(device) # .to(device)

              outputs = model(grouped_pooled_outs, src_key_padding_mask)
              _, predicted = torch.max(outputs.data, 1)

              # all_predictions = np.append(all_predictions, predicted.numpy())
              # all_labels = np.append(all_labels, targets.numpy())
              all_predictions = torch.cat((all_predictions, predicted), 0)
              all_labels = torch.cat((all_labels, targets), 0)

              loss = criterion(outputs, targets)
              val_loss += loss.cpu().numpy()
              val_steps += 1

      # with tune.checkpoint_dir(epoch) as checkpoint_dir:
      #     print("saving in checkpoint dir")
      #     path = os.path.join(checkpoint_dir, "checkpoint")
      #     torch.save((model.state_dict(), optimizer.state_dict()), path)

      all_labels = all_labels.cpu().numpy()  
      all_predictions = all_predictions.cpu().numpy()

      s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
      print('[%d, %5d] loss: %.4f; f1: %.4f; precision: %.4f; recall: %.4f' % (epoch + 1, tr_batch_idx + 1, val_loss, s_f1, s_precision, s_recall))
      # tune.report(loss=(val_loss / val_steps), f1=s_f1, precision=s_precision, recall=s_recall)

In [None]:
sample_config = {
    "lr": 0.0001, #tune.loguniform(5e-4, 5e-2),
    "nhead": 2,
    "nhid": 200,
    "num_layers": 1,
    "dropout": 0.1,
    "batch_size": 128,
    "num_epochs": 30,
  }

In [None]:
transformer_train_fun1(sample_config)

#### Hyperparameter Search

In [None]:
def test_results(net, device="cpu"):
  testset = load_test_data()
  testloader = torch.utils.data.DataLoader(testset, batch_size=8, collate_fn=my_collate1)

  all_predictions = np.array([])
  all_labels = np.array([])

  net.eval()
  with torch.no_grad():
    for test_batch_idx, test_batch in enumerate(testloader, 0):
        grouped_pooled_outs = test_batch['grouped_pooled_outs'].to(device) # .to(device)
        src_key_padding_mask = test_batch['src_key_padding_mask'].to(device)
        targets = test_batch['success_label'].to(device) # .to(device)

        outputs = net(grouped_pooled_outs, src_key_padding_mask)
        _, predicted = torch.max(outputs.data, 1)

        all_predictions = np.append(all_predictions, predicted.numpy())
        all_labels = np.append(all_labels, targets.numpy())

  s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

  return {
      'precision': s_precision,
      'recall': s_recall,
      'f1': s_f1
  }

In [None]:
def main(num_samples = 6, max_num_epochs = 30):
# config["nhead"], config["nhid"], config["num_layers"], config["dropout"], config["dropout"]
  tune_config = {
    "lr": tune.choice([0.00001]), #tune.loguniform(5e-4, 5e-2),
    "nhead": tune.choice([2]),
    "nhid": tune.choice([200]),
    "num_layers": tune.choice([1]),
    "dropout": tune.choice([0.1]),
    "batch_size": tune.choice([128]),
    "num_epochs": tune.choice([30]),
    "wandb": {
      "project": "ToBERTClassifier",
      "api_key": config['WandB']['api_key'],
      "log_config": True
    }
  }

  scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2)

  result = tune.run(
    partial(transformer_train_fun1, checkpoint_dir='/tmp/ToBERTModels'),
    config = tune_config,
    resources_per_trial={'gpu': 1},
    metric = 'loss',
    mode = 'min',
    num_samples = num_samples,
    scheduler = scheduler,
    callbacks=[WandbLoggerCallback(
        project="ToBERTClassifier",
        group='raytune_hpsearch',
        api_key=config['WandB']['api_key'],
        log_config=True
    )])

  
  best_trial = result.get_best_trial(metric="f1", mode="max", scope="last")
  print("Best trial config: {}".format(best_trial.config))
  print("Best trial final validation loss: {}".format(
      best_trial.last_result["loss"]))
  print("Best trial final validation accuracy: {}".format(
      best_trial.last_result["f1"]))
  
  best_trained_model = ToBERT(768, best_trial.config["nhead"], 
                        best_trial.config["nhid"], best_trial.config["num_layers"], 
                        best_trial.config["dropout"], best_trial.config["dropout"])
  device = "cpu"
  if torch.cuda.is_available():
      device = "cuda:0"

  best_trained_model.to(device)
                        
  best_checkpoint_dir = best_trial.checkpoint.value
  model_state, optimizer_state = torch.load(os.path.join(
      best_checkpoint_dir, "checkpoint"))
  best_trained_model.load_state_dict(model_state)

  # model_save_name = "yungclassifier.pt"
  path = F"/content/drive/MyDrive/Thesis/Models/LSTMModels/yungclassifier1.pt"
  torch.save(best_trained_model.state_dict(), path)
  return test_results(best_trained_model, device)

In [None]:
test_results = main(num_samples = 1)

### Playground

we can see that the values of the 1st and 2nd tensors didnt change when we applied the masking properly

In [None]:
def get_batch_mask(max_seq_len, book_lens):
  mask = torch.zeros(len(book_lens),max_seq_len+1) # batch_size, seq_len
  mask[(torch.arange(len(book_lens)),book_lens)] = 1
  mask = mask.cumsum(dim=1)[:, :-1]
  return mask

In [None]:
book_lens = torch.LongTensor([2,4,5])
max_seq_len = 6
src_key_padding_mask = get_batch_mask(max_seq_len, book_lens)

In [None]:
src_key_padding_mask.shape

In [None]:
import torch, torch.nn as nn
q = torch.randn(3, 6, 10) # batch size 3, source sequence length 6, embedding size 10
attn = nn.MultiheadAttention(10, 1, batch_first=True) # embedding size 10, one head

ay = attn(q, q, q, key_padding_mask=src_key_padding_mask) # self attention

In [None]:
src_key_padding_mask

In [None]:
y = torch.unsqueeze(1-src_key_padding_mask,2)*ay[0]
y.sum(dim=1)/(1-src_key_padding_mask).sum(dim=1).unsqueeze(1)

# MultiModal

### Defining the Model

In [None]:
import torch.nn as nn
import torch

Our model will be composed of three separate modules:

1. (Normalizer) Responsible for taking all the inputs of various dimensions and feeding them each through their own linear layer to project them into a space with all the same dimensions

In essence, it is responsible for eq (1) in the paper $h_i=selu(W_{h_i} x_i + b_h)$


2. (GenreAwareAttention) This is where most of the meat of the model is. It is responsible for performing these 3 equations. 

$score(h_i, g) = v^T selu(W_a h_i + W_g g + b_a)$

$\alpha_i = \frac{exp(score(h_i,g))}{\sum_{i'}exp(score(h_{i'},g)}$

$r=\sum_i \alpha_i h_i$

3. (ClassOutput) The last layer is simply responsible for projecting the book representation to class probabilities.

$\hat{p}=\sigma(W_c r + b_c)$

In [None]:
class Normalizer(nn.Module):
  def __init__(self, c5g_size, bf_size, std_dims):
    super(Normalizer, self).__init__()

    self.c5g_linear = nn.Linear(c5g_size, std_dims)
    self.bf_linear = nn.Linear(bf_size, std_dims)

  def forward(self, x_c5g, x_bf):
    # x_c5g ~ (BATCH_SIZE, C5G_FEATURE_SIZE)
    # x_bf ~ (BATCH_SIZE, BF_FEATURE_SIZE)
    # # split features into char_5_gram and bert_features
    # char_5_grams = None
    # bert_features = None

    c5g_normed = self.c5g_linear(x_c5g)
    bf_normed = self.bf_linear(x_bf)

    # concatenate c5g_normed and bf_normed
    return torch.stack([c5g_normed, bf_normed], 1) # (BATCH_SIZE, NUM_MODALITIES, EMBED_SIZE)

In [None]:
class GenreAwareAttention(nn.Module):
  def __init__(self, std_dims, num_units, do_rate):
    super(GenreAwareAttention, self).__init__()
    self.activation = nn.SELU()
    self.nn_softmax = nn.Softmax(dim=1)

    self.v = nn.parameter.Parameter(
        nn.init.xavier_uniform_(torch.empty(num_units,1)),
        requires_grad=True
    )

    self.Wa = nn.parameter.Parameter(
        nn.init.xavier_uniform_(torch.empty(std_dims,num_units)), 
        requires_grad=True
    )

    self.b = nn.parameter.Parameter(
        nn.init.ones_(torch.empty(num_units,)),
        requires_grad=True
    )

    self.Wg = nn.parameter.Parameter(
        nn.init.xavier_uniform_(torch.empty(8, num_units)), 
        requires_grad=True
    )

    self.dropout = nn.Dropout(p=do_rate)

  def forward(self, x, g):
    # x ~ (BATCH_SIZE, NUM_MODALITIES, EMBED_SIZE)
    # g ~ (BATCH_SIZE, 1, GENRE_EMBED_SIZE)
    
    # calculate scores
    atten_g = torch.mm(g, self.Wg).unsqueeze(dim=1)
    et = self.activation(torch.matmul(x, self.Wa) + atten_g + self.b)
    et = self.dropout(et)
    
    et = torch.matmul(et, self.v)

    at = self.nn_softmax(et)

    # at = torch.unsqueeze(at, axis=-1)

    # print('at:', at.size())
    # print('x:', x.size())
    ot = at * x # canot multiply at: torch.Size([4, 2, 1, 1]) x: torch.Size([4, 2, 100])

    return torch.sum(ot, axis=1) # BATCH_SIZE, EMBED_SIZE

In [None]:
class ClassifierOut(nn.Module):
  def __init__(self, std_dims):
    super(ClassifierOut, self).__init__()
    self.classifier = nn.Linear(std_dims, 2)
  
  def forward(self, r): # r ~ BATCH_SIZE, EMBED_SIZE
    r_out = self.classifier(r) # BATCH_SIZE, 2
    return torch.sigmoid(r_out)

In [None]:
# class FullModel(nn.Module): # may want to consider also adding a dropout layer before classification
#   def __init__(self, c5g_size, bf_size, std_dims, num_units, do_rate):
#     super(FullModel,self).__init__()
#     self.normalizer = Normalizer(c5g_size, bf_size, std_dims)
#     self.genre_aware_attention = GenreAwareAttention(std_dims, num_units, do_rate)
#     self.classifier_out = ClassifierOut(std_dims)

#   def forward(self, x_c5g, x_bf, genre):
#     x_normed = self.normalizer(x_c5g, x_bf)
#     g_a_a = self.genre_aware_attention(x_normed, genre)
#     return self.classifier_out(g_a_a)

In [None]:
# d_model = 768, nhead = 2, d_hid = 200, dropout = 0.1, nlayers = 2
class FullModel(pl.LightningModule):
    def __init__(self, c5g_size, bf_size, std_dims, num_units, do_rate):
      super(FullModel,self).__init__()
      self.normalizer = Normalizer(c5g_size, bf_size, std_dims)
      self.genre_aware_attention = GenreAwareAttention(std_dims, num_units, do_rate)
      self.classifier_out = ClassifierOut(std_dims)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=5e-4)
        return optimizer

    def cross_entropy_loss(self, logits, labels):
        loss = nn.CrossEntropyLoss()
        return loss(logits, labels)

    def forward(self, x_c5g, x_bf, genre):
      x_normed = self.normalizer(x_c5g, x_bf)
      g_a_a = self.genre_aware_attention(x_normed, genre)
      return self.classifier_out(g_a_a)

    def training_step(self, train_batch, batch_idx):
        c5g_f = batch['c5g_f']
        bert_f = batch['bert_f']
        genre = batch['genre']
        targets = batch['label']

        outputs = self.forward(c5g_f, bert_f, genre)
        loss = self.cross_entropy_loss(outputs, targets)

        self.log('train_loss', loss, prog_bar=True)
        log_dict = {'loss': loss}
        return {'loss': loss, 'log': log_dict}

    def validation_step(self, val_batch, batch_idx):
        c5g_f = batch['c5g_f']
        bert_f = batch['bert_f']
        genre = batch['genre']
        targets = batch['label']

        logits = self.forward(c5g_f, bert_f, genre)
        y_prob = self.softmaxer(logits)[:, 1]
        y_pred = (y_prob>0.5).float()

        loss = self.cross_entropy_loss(logits, targets)
        return {'val_loss': loss, 'preds': y_pred, 'targets': targets.tolist()}

    def validation_epoch_end(self, val_step_outputs):
        y_pred = []
        y_true = []

        for x in val_step_outputs:
          y_pred.extend(x['preds'].tolist())
          y_true.extend(x['targets'])

        f1_res = f1_score(y_true, y_pred, average = 'weighted')
        avg_val_loss = torch.tensor([x['val_loss'] for x in val_step_outputs]).mean()

        log_dict = {
            'val_loss': avg_val_loss,
            'val_f1': f1_res
        }

        self.log('val_loss', avg_val_loss, prog_bar=True)
        self.log('val_f1', f1_res, prog_bar=True)
        return {'val_loss': avg_val_loss, 'log': log_dict}

    def test_step(self, batch, batch_idx, dataloader_idx = None):
        c5g_f = batch['c5g_f']
        bert_f = batch['bert_f']
        genre = batch['genre']
        targets = batch['label']

        logits = self.forward(grouped_pooled_outs, src_key_padding_mask)
        y_probs = self.softmaxer(logits)[:, 1]
        y_pred = (y_prob>0.5).float()

        return {'preds': y_pred, 'targets': targets.tolist()}

    def test_epoch_end(self, test_step_outputs):
        y_preds = []
        y_true = []

        for x in test_step_outputs:
          y_preds.extend(x['preds'].tolist())
          y_true.extend(x['targets'])

        f1_res = f1_score(y_true, y_pred, average = 'weighted')
        return {'f1': f1_res}

### Getting the Data

In [None]:
from MultimodalGoodreadsDataset import MultimodalGoodreadsDataset

In [None]:
dataset_base_dir = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_preprocessed/goodreads_maharjan_trimmed'
cached_features_dir = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/MultiModal/dataset_loader/cached_features'

ds = MultimodalGoodreadsDataset(dataset_base_dir, cached_features_dir)

def my_collate_fn(batches, f1_len, f2_len):
    return {
        'c5g_f': torch.tensor([x['text_features'].toarray()[0][0:f1_len] for x in batches]), # dtype = Float?
        'bert_f': torch.tensor([x['text_features'].toarray()[0][f1_len:f1_len+f2_len] for x in batches]), 
        'genre': torch.tensor([x['genre'] for x in batches]),
        'label': torch.tensor([x['label'] for x in batches])
    }

# c5g_len = ds.f_lengths[0]
# bf_len = ds.f_lengths[1]

# train_dataloader = DataLoader(ds.train, batch_size=64, shuffle=True, collate_fn=partial(my_collate_fn, f1_len=c5g_len, f2_len=bf_len))
# val_dataloader = DataLoader(ds.val, batch_size=64, shuffle=True, collate_fn=partial(my_collate_fn, f1_len=c5g_len, f2_len=bf_len))
# test_dataloader = DataLoader(ds.test, batch_size=64, shuffle=True, collate_fn=partial(my_collate_fn, f1_len=c5g_len, f2_len=bf_len))

Path to yaml: /content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_preprocessed/goodreads_maharjan_trimmed/train_test_val_80_20_split_goodreads.yaml


  data = yaml.load(stream)


Total test instances: 290, validation instances: 139, and Training instances: 555
Total unique books: 984
Training instances (555,), Val instances (139,), Test instances (290,)
extracting feature: char_5_gram
Using cached features
extracting feature: bert_features
Using cached features


In [None]:
# from pydrive.auth import GoogleAuth
# from pydrive.drive import GoogleDrive
# from google.colab import auth
# from oauth2client.client import GoogleCredentials

# import pickle

# # 1. Authenticate and create the PyDrive client.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)  

# with open('train_dataset.pkl', 'wb') as output_file:
#   pickle.dump(ds.train, output_file)

# with open('val_dataset.pkl', 'wb') as output_file:
#   pickle.dump(ds.val, output_file)

# with open('test_dataset.pkl', 'wb') as output_file:
#   pickle.dump(ds.test, output_file)

# folder_id = '1q2IGZrQ9oNwP-CqttWUuiYcenb8vmWUg'
# # get the folder id where you want to save your file
# file = drive.CreateFile({'parents':[{u'id': folder_id}]})
# file.SetContentFile('train_dataset.pkl')
# file.Upload() 

# file = drive.CreateFile({'parents':[{u'id': folder_id}]})
# file.SetContentFile('val_dataset.pkl')
# file.Upload() 

# # get the folder id where you want to save your file
# file = drive.CreateFile({'parents':[{u'id': folder_id}]})
# file.SetContentFile('test_dataset.pkl')
# file.Upload() 

In [None]:
from datasets import Dataset
class MultimodalGoodreadsDatasetSplit(Dataset):

    def __init__(self, X, genres, Y):
        self.X = X
        self.genres = genres
        self.Y = Y

    def __len__(self):
        return len(self.Y)

    def __getitem__(self, idx):
        return {'text_features': self.X[idx], 'genre': self.genres[idx], 'label': self.Y[idx]}

mmgrds_train = MultimodalGoodreadsDatasetSplit(ds.train.X, ds.train.genres, ds.train.Y)
mmgrds_val = MultimodalGoodreadsDatasetSplit(ds.val.X, ds.val.genres, ds.val.Y)
mmgrds_test = MultimodalGoodreadsDatasetSplit(ds.test.X, ds.test.genres, ds.test.Y)

In [None]:
from torch.utils.data import DataLoader
import torch
from functools import partial

def my_collate_fn(batches, f1_len, f2_len):
    return {
        'c5g_f': torch.tensor([x['text_features'].toarray()[0][0:f1_len] for x in batches]), # dtype = Float?
        'bert_f': torch.tensor([x['text_features'].toarray()[0][f1_len:f1_len+f2_len] for x in batches]), 
        'genre': torch.tensor([x['genre'] for x in batches]),
        'label': torch.tensor([x['label'] for x in batches])
    }

def load_data():
  return mmgrds_train, mmgrds_val

def load_test_data():
  return mmgrds_test
# def load_data():
#   dataset_base_dir = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/raw_preprocessed/goodreads_maharjan_trimmed'
#   cached_features_dir = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/datasets/goodreads_maharjan_super/MultiModal/dataset_loader/cached_features'

#   ds = MultimodalGoodreadsDataset(dataset_base_dir, cached_features_dir)

#   return ds

### Training

In [None]:
from torch.optim import AdamW
import numpy as np
from sklearn.metrics import precision_recall_fscore_support 

def mm_train_fun1(config, checkpoint_dir='/tmp/MultiModalModels'):

  train_dataset, val_dataset = load_data()
  model = FullModel(311595, 768, config['std_dims'], config['num_units'], config['do_rate']).to('cuda')
  model.train()

  criterion = nn.CrossEntropyLoss()
  optimizer=AdamW(model.parameters(), lr=config["lr"])
  # [311595, 768]
  train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, collate_fn=partial(my_collate_fn, f1_len=311595, f2_len=768))
  val_dataloader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=True, collate_fn=partial(my_collate_fn, f1_len=311595, f2_len=768))
  print("len(train_dataloader)",len(train_dataloader))
  for epoch in range(config['num_epochs']):
    running_loss = 0.0
    epoch_steps = 0

    for batch_idx, batch in enumerate(train_dataloader):


      c5g_f = batch['c5g_f'].to('cuda')
      bert_f = batch['bert_f'].to('cuda')
      genre = batch['genre'].to('cuda')
      targets = batch['label'].to('cuda')

      optimizer.zero_grad()
      outputs = model(c5g_f.float(), bert_f.float(), genre.float())
      loss = criterion(outputs, targets)
      loss.backward()
      model.float()
      optimizer.step()

      val_loss = 0.0
      val_steps = 0
      total = 0
      correct = 0

      all_predictions = np.array([])
      all_labels = np.array([])

      model.eval()
      with torch.no_grad():
          for i, batch_v in enumerate(val_dataloader, 0):

              c5g_f = batch_v['c5g_f'].to('cuda')
              bert_f = batch_v['bert_f'].to('cuda')
              genre = batch_v['genre'].to('cuda')
              targets = batch_v['label'].to('cuda')

              outputs = model(c5g_f.float(), bert_f.float(), genre.float())
              _, predicted = torch.max(outputs.data, 1)

              all_predictions = np.append(all_predictions, predicted.cpu().numpy())
              all_labels = np.append(all_labels, targets.cpu().numpy())

              loss = criterion(outputs, targets)
              val_loss += loss.cpu().numpy()
              val_steps += 1

      model.train()
      with tune.checkpoint_dir(epoch) as checkpoint_dir:
          print("saving in checkpoint dir")
          path = os.path.join(checkpoint_dir, "checkpoint")
          torch.save((model.state_dict(), optimizer.state_dict()), path)

      s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')
      print('s_precision:', s_precision, 's_recall:', s_recall, 's_f1:', s_f1)
      tune.report(loss = loss.item(), epoch = epoch + batch_idx / len(train_dataloader), eval_loss=(val_loss / val_steps), eval_f1=s_f1, eval_precision=s_precision, eval_recall=s_recall)

In [None]:
def test_results(net, test_dataloader, device="cpu"):
  all_predictions = np.array([])
  all_labels = np.array([])

  net.to(device)
  net.eval()
  with torch.no_grad():
    for i, batch_test in enumerate(test_dataloader, 0):
        c5g_f = batch_test['c5g_f']
        bert_f = batch_test['bert_f']
        genre = batch_test['genre']
        targets = batch_test['label']

        outputs = net(c5g_f.float(), bert_f.float(), genre.float())
        _, predicted = torch.max(outputs.data, 1)

        all_predictions = np.append(all_predictions, predicted.numpy())
        all_labels = np.append(all_labels, targets.numpy())

  s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(all_labels, all_predictions, average='weighted')

  return {
      'precision': s_precision,
      'recall': s_recall,
      'f1': s_f1
  }

In [None]:
from ray import tune
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLogger, WandbLoggerCallback
from ray.tune.schedulers import ASHAScheduler
from functools import partial
import os

def main(num_samples = 6, max_num_epochs = 15):

  tune_config = {
    "lr": tune.loguniform(5e-4, 5e-2),
    "batch_size": tune.choice([32,64,128]),
    "num_epochs": tune.choice([1]),#,3,5,7,9]),
    "std_dims": tune.sample_from(lambda _: np.random.randint(50,300)),
    "num_units": tune.sample_from(lambda spec: np.random.randint(25,spec.config.std_dims)),
    "do_rate": tune.uniform(0, 0.5),
  }

  scheduler = ASHAScheduler(
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2)

  result = tune.run(
    partial(mm_train_fun1, checkpoint_dir='/tmp/MMModels'),
    config = tune_config,
    resources_per_trial={'gpu': 1},
    metric = 'eval_loss',
    mode = 'min',
    num_samples = num_samples,
    scheduler = scheduler,
    callbacks=[WandbLoggerCallback(
        project="MultiModalClassifier",
        group='raytune_hpsearch',
        api_key=config['WandB']['api_key'],
        log_config=True
    )])

  best_trial = result.get_best_trial(metric="eval_f1", mode="max", scope="last")
  print("Best trial config: {}".format(best_trial.config))
  print("Best trial final validation loss: {}".format(
      best_trial.last_result["eval_loss"]))
  print("Best trial final validation weighted f1: {}".format(
      best_trial.last_result["eval_f1"]))
  
  best_trained_model = FullModel(311595, 768, best_trial.config['std_dims'], best_trial.config['num_units'], best_trial.config['do_rate'])
  device = "cpu"

  best_trained_model.to(device)
                        
  best_checkpoint_dir = best_trial.checkpoint.value
  model_state, optimizer_state = torch.load(os.path.join(
      best_checkpoint_dir, "checkpoint"))
  best_trained_model.load_state_dict(model_state)

  # model_save_name = "yungclassifier.pt"
  path = F"/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/classifier1.pt"
  torch.save(best_trained_model.state_dict(), path)

  test_ds = load_test_data()
  test_dataloader = DataLoader(test_ds, batch_size=best_trial.config["batch_size"], shuffle=True, collate_fn=partial(my_collate_fn, f1_len=311595, f2_len=768))
  return test_results(best_trained_model, test_dataloader, device)

In [None]:
test_scores = main(num_samples = 1)

2021-08-27 04:07:20,293	INFO services.py:1247 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2021-08-27 04:07:21,491	INFO registry.py:67 -- Detected unknown callable for trainable. Converting to class.


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims
DEFAULT_47f9e_00000,RUNNING,,64,0.463209,0.00525874,1,55,86


[2m[36m(pid=1118)[0m len(train_dataloader) 9
[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-07-44
  done: false
  epoch: 0.0
  eval_f1: 0.508937827903616
  eval_loss: 0.6705350081125895
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 1
  loss: 0.6717736721038818
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 14.890439748764038
  time_this_iter_s: 14.890439748764038
  time_total_s: 14.890439748764038
  timestamp: 1630037264
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 47f9e_00000
  
[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616


[2m[36m(pid=1118)[0m   _warn_prf(average, modifier, msg_start, len(result))


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,1,14.8904,0.671774,0,0.670535


[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-07-53
  done: false
  epoch: 0.1111111111111111
  eval_f1: 0.508937827903616
  eval_loss: 0.6434538960456848
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 2
  loss: 0.6600186824798584
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 24.534753799438477
  time_this_iter_s: 9.644314050674438
  time_total_s: 24.534753799438477
  timestamp: 1630037273
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,2,24.5348,0.660019,0.111111,0.643454


[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616
[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-02
  done: false
  epoch: 0.2222222222222222
  eval_f1: 0.508937827903616
  eval_loss: 0.6265470385551453
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 3
  loss: 0.596293568611145
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 33.513720989227295
  time_this_iter_s: 8.978967189788818
  time_total_s: 33.513720989227295
  timestamp: 1630037282
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,3,33.5137,0.596294,0.222222,0.626547


[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616
[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-12
  done: false
  epoch: 0.3333333333333333
  eval_f1: 0.508937827903616
  eval_loss: 0.609357754389445
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 4
  loss: 0.5180160403251648
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 42.739635705947876
  time_this_iter_s: 9.225914716720581
  time_total_s: 42.739635705947876
  timestamp: 1630037292
  timesteps_since_restore: 0
  training_iteration: 4
  trial_id: 47f9e_00000
  
[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,4,42.7396,0.518016,0.333333,0.609358


[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-21
  done: false
  epoch: 0.4444444444444444
  eval_f1: 0.508937827903616
  eval_loss: 0.6256746848424276
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 5
  loss: 0.5705971121788025
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 51.93508315086365
  time_this_iter_s: 9.195447444915771
  time_total_s: 51.93508315086365
  timestamp: 1630037301
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,5,51.9351,0.570597,0.444444,0.625675


[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616




[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-30
  done: false
  epoch: 0.5555555555555556
  eval_f1: 0.508937827903616
  eval_loss: 0.6256352265675863
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 6
  loss: 0.5698477625846863
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 61.139683961868286
  time_this_iter_s: 9.204600811004639
  time_total_s: 61.139683961868286
  timestamp: 1630037310
  timesteps_since_restore: 0
  training_iteration: 6
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,6,61.1397,0.569848,0.555556,0.625635


[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616




[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-39
  done: false
  epoch: 0.6666666666666666
  eval_f1: 0.508937827903616
  eval_loss: 0.6286422610282898
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 7
  loss: 0.5830590724945068
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 70.21444392204285
  time_this_iter_s: 9.07475996017456
  time_total_s: 70.21444392204285
  timestamp: 1630037319
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,7,70.2144,0.583059,0.666667,0.628642


[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616




[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-48
  done: false
  epoch: 0.7777777777777778
  eval_f1: 0.508937827903616
  eval_loss: 0.6708032290140787
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 8
  loss: 0.7416834235191345
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 79.05911254882812
  time_this_iter_s: 8.844668626785278
  time_total_s: 79.05911254882812
  timestamp: 1630037328
  timesteps_since_restore: 0
  training_iteration: 8
  trial_id: 47f9e_00000
  
[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,8,79.0591,0.741683,0.777778,0.670803




[2m[36m(pid=1118)[0m saving in checkpoint dir
Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-56
  done: false
  epoch: 0.8888888888888888
  eval_f1: 0.508937827903616
  eval_loss: 0.6199080546696981
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  hostname: c391a45958ff
  iterations_since_restore: 9
  loss: 0.5679579973220825
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 87.24576449394226
  time_this_iter_s: 8.186651945114136
  time_total_s: 87.24576449394226
  timestamp: 1630037336
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,9,87.2458,0.567958,0.888889,0.619908


[2m[36m(pid=1118)[0m s_precision: 0.4192329589565757 s_recall: 0.6474820143884892 s_f1: 0.508937827903616




Result for DEFAULT_47f9e_00000:
  date: 2021-08-27_04-08-56
  done: true
  epoch: 0.8888888888888888
  eval_f1: 0.508937827903616
  eval_loss: 0.6199080546696981
  eval_precision: 0.4192329589565757
  eval_recall: 0.6474820143884892
  experiment_id: c615d98a797d49db9ff8dc72d56ca556
  experiment_tag: 0_batch_size=64,do_rate=0.46321,lr=0.0052587,num_epochs=1,num_units=55,std_dims=86
  hostname: c391a45958ff
  iterations_since_restore: 9
  loss: 0.5679579973220825
  node_ip: 172.28.0.2
  pid: 1118
  should_checkpoint: true
  time_since_restore: 87.24576449394226
  time_this_iter_s: 8.186651945114136
  time_total_s: 87.24576449394226
  timestamp: 1630037336
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: 47f9e_00000
  


Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,RUNNING,172.28.0.2:1118,64,0.463209,0.00525874,1,55,86,9,87.2458,0.567958,0.888889,0.619908




Trial name,status,loc,batch_size,do_rate,lr,num_epochs,num_units,std_dims,iter,total time (s),loss,epoch,eval_loss
DEFAULT_47f9e_00000,TERMINATED,,64,0.463209,0.00525874,1,55,86,9,87.2458,0.567958,0.888889,0.619908


2021-08-27 04:09:08,977	INFO tune.py:550 -- Total run time: 107.49 seconds (105.69 seconds for the tuning loop).


Best trial config: {'lr': 0.005258735065638649, 'batch_size': 64, 'num_epochs': 1, 'std_dims': 86, 'num_units': 55, 'do_rate': 0.463208753763992}
Best trial final validation loss: 0.6199080546696981
Best trial final validation weighted f1: 0.508937827903616


  _warn_prf(average, modifier, msg_start, len(result))


Learning the genre vectors from Wg, try to understand if some genres are near each other are not using some distance metric (euclidean or manhattan). Can also do PCA.

In [None]:
from sklearn.metrics import precision_recall_fscore_support 

In [None]:
s_f1

# Archived Code

In [None]:
# def getAveragePooledOutputs(model, encoded_dataset):
#   book_embeddings_dataset = {'meaned_pooled_output': [], 'book_title': [], 'genre': [], 'labels': []}

#   book_changes = get_book_changes_idx(encoded_dataset['book_title'])

#   for i in range(len(book_changes)):
#     print(i)
#     start = book_changes[i]
#     end = None
#     if i != len(book_changes) - 1:
#       end = book_changes[i+1]
#     else:
#       end = len(encoded_dataset['input_ids'])

#     input_ids = th.LongTensor(encoded_dataset['input_ids'][start:end])
#     attention_mask = th.BoolTensor(encoded_dataset['attention_mask'][start:end])

#     with torch.no_grad():
#       embeddings = transformer_model.distilbert(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)[0][:,0] # Pooled output
#       book_embeddings = th.mean(embeddings, dim=0) # Takes the mean of the pooled output
#     book_embeddings_dataset['meaned_pooled_output'].append(book_embeddings)
#     book_embeddings_dataset['book_title'].append(encoded_dataset['book_title'][start])
#     book_embeddings_dataset['genre'].append(encoded_dataset['genre'][start])
#     book_embeddings_dataset['labels'].append(encoded_dataset['labels'][start])
  
#   return book_embeddings_dataset