# Installs, Imports, Drive Connection, WandB Connection

##### Installs

In [None]:
!pip install transformers==4.10.0
# !pip install git+https://github.com/huggingface/transformers.git
!pip install datasets==1.9.0
!pip install -U PyYAML
!pip install "ray[default]"
!pip install wandb
!pip install tensorboardX

##### Imports

In [None]:
import sys
import os
import numpy as np
# from transformers import pipeline
from datasets import concatenate_datasets, load_dataset
import torch
from pathlib import Path
import pickle
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments
from scipy.special import softmax
from sklearn.metrics import f1_score, precision_recall_curve, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import math
from transformers import AutoModelForSequenceClassification

In [None]:
import os
import pickle
import numpy as np

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining, ASHAScheduler
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLoggerCallback, WandbLogger

from transformers import DistilBertConfig

from ray.tune.integration.wandb import WandbLoggerCallback
# from MultiTaskExtensions import DistilBERTForMultipleSequenceClassification

##### Drive Connection

In [None]:
from google.colab import drive
drive.mount('/content/drive')

##### Get Configuration

In [None]:
import configparser
import sys
from pathlib import Path

# Load the project configuration from Drive.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list and fail here with a clear
# message instead of with a KeyError on the first config[...] lookup below.
config = configparser.ConfigParser()
config_ini_path = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/config.ini'
if not config.read(config_ini_path):
  raise FileNotFoundError(
      "Could not read configuration file: " + config_ini_path + " (is Drive mounted?)")

drive_base_path = Path(config['Drive']['drive_base_path'])

# Put the project's `_utils` directory on the import path.
# Guarded so that re-running this cell does not append duplicate entries.
utils_dir = str(drive_base_path / 'BookSuccessPredictor' / '_utils')
if utils_dir not in sys.path:
  sys.path.append(utils_dir)

In [None]:
# Echo the configuration switches that the rest of the notebook branches on.
for summary_label, summary_section, summary_option in (
    ("Using Model:", 'Model', 'name'),
    ("With NERed Dataset:", 'Model', 'use_ner'),
    ("In multi-task setting:", 'Model', 'multi_task'),
    ("Using overlap tokenizer:", 'Tokenizer', 'overlap'),
    ("Using pretrained model:", 'WandB', 'use_WandB_pretrained'),
):
  print(summary_label, config[summary_section][summary_option])

##### WandB Connection

In [None]:
# saves our models to artifacts in WandB
import wandb
%env WANDB_LOG_MODEL=true
%env WANDB_PROJECT=goodreads_success_predictor_80_20

In [None]:
wandb.login(key = config['WandB']['api_key'])

# Dataset Generator

### Load Text Data

#### goodreads_maharjan

In [None]:
base_path = Path(config['Datasets']['nered_goodreads_maharjan_path'])
dataset = load_dataset(str(base_path / 'goodreadsnered.py'))

#### goodreads_guarro

In [None]:
# Load the goodreads_guarro variant; the configured path is handed to
# load_dataset() unchanged (no Path wrapping, no loading-script file name).
# NOTE(review): this rebinds `base_path` and `dataset`, replacing whatever the
# goodreads_maharjan cell assigned. Under "Run All" only this dataset survives;
# run just the cell for the dataset you want.
base_path = config['Datasets']['nered_goodreads_guarro_path']
dataset = load_dataset(base_path)

### Custom Tokenization Process

##### Get Tokenizer

In [None]:
from transformers import AutoTokenizer

In [None]:
# Build the tokenizer for the configured checkpoint. When the NER-processed
# dataset is in use, '[CHARACTER]' is registered as an additional special token.
# NOTE(review): eval() executes whatever text the config value contains; it is
# kept here to preserve behaviour, but a boolean parser would be safer.
tokenizer_checkpoint = config['Model']['name']
print("Using tokenizer of ", tokenizer_checkpoint)

tokenizer_extra_kwargs = {}
if eval(config['Model']['use_ner']):
  print("adding special token for [CHARACTER]")
  tokenizer_extra_kwargs['additional_special_tokens'] = ['[CHARACTER]']

tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint, **tokenizer_extra_kwargs)

##### Do actual tokenizing and uploading

In [None]:
from tokenization_algos import chunk_and_encode_examples_w_complete_sentences, chunk_and_encode_examples_w_overlap

In [None]:
from functools import partial
# Pre-bind the tokenizer and a stride of 384 so that `encode_algo` can be passed
# to Dataset.map() as a one-argument function.
# NOTE(review): the overlap chunker is selected unconditionally here;
# config['Tokenizer']['overlap'] is not consulted and the complete-sentences
# variant imported above is unused in this cell.
encode_algo = partial(chunk_and_encode_examples_w_overlap, tokenizer=tokenizer, stride=384)

In [None]:
chunked_encoded_dataset = dataset.map(encode_algo, remove_columns=dataset.column_names['train'], batched = True)

When uploading the tokenized datasets to Drive, we may need to split them into as many pieces as necessary; otherwise, serialization and uploading fail. In my case, I had to split each subset (train, validation, test) into two parts. When tokenizing with complete sentences, this workaround is usually unnecessary. With overlapping chunks, however, the dataset will most likely be too large, and splitting may be required.

In [None]:
chunked_encoded_dataset

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which the "Drive Connection" cell imported
# as the google.colab `drive` module; re-running that cell's drive.mount() after
# this one requires re-importing the module first.
drive = GoogleDrive(gauth)

# ID of the Drive folder that receives the pickles.
folder_id = '1vQKkl_9SeshGXavfz4fcgiPtmfks5vx9'

# 2. Pickle every split in two halves (<prefix>_dataset1.pkl / <prefix>_dataset2.pkl).
#    Slicing a split yields the plain column data for that row range, which is
#    what gets pickled.
pickle_names = []
for split_name, file_prefix in (('train', 'train'), ('validation', 'val'), ('test', 'test')):
  split = chunked_encoded_dataset[split_name]
  midpoint = split.num_rows // 2
  for part_no, (start, stop) in enumerate(((0, midpoint), (midpoint, split.num_rows)), start=1):
    pickle_name = '{}_dataset{}.pkl'.format(file_prefix, part_no)
    with open(pickle_name, 'wb') as output_file:
      pickle.dump(split[start:stop], output_file)
    pickle_names.append(pickle_name)

# 3. Upload the pickles into the target folder, in the order they were written.
for pickle_name in pickle_names:
  file = drive.CreateFile({'parents':[{u'id': folder_id}]})
  file.SetContentFile(pickle_name)
  file.Upload()

#### Loading

In [None]:
load_path = Path(config['Drive']['drive_base_path']) / 'BookSuccessPredictor' / 'datasets' / 'goodreads_maharjan_super' / 'already_tokenized'

In [None]:
if config['Datasets']['split'] == '80_20':
  load_path = load_path / '80_20'
else:
  load_path = load_path / '60_40'

In [None]:
# Append the sub-directory holding the pre-tokenized pickles for the configured
# model. A model name (or BERT max_len) that is not listed leaves `load_path`
# unchanged, i.e. pointing at the split directory.
# NOTE(review): this cell extends `load_path` in place, so running it twice
# appends the sub-directory twice.

# Models whose directory depends on the model name only.
_tokenized_dir_by_model = {
  'albert-base-v2': 'ALBERT_NER_512',
  'microsoft/deberta-base': 'DeBERTa',
  'roberta-base': 'ROBERTA_NER_512',
  'google/bigbird-roberta-base': 'BIGBIRD_NER_4096',
}
_configured_model = config['Model']['name']

if _configured_model in _tokenized_dir_by_model:
  load_path = load_path / _tokenized_dir_by_model[_configured_model]

elif _configured_model == 'bert-base-uncased':
  # BERT has one directory per maximum sequence length.
  _configured_max_len = config['Tokenizer']['max_len']
  if _configured_max_len == '512':
    load_path = load_path / 'BERT_UNCASED_NER_512'
  elif _configured_max_len == '256':
    load_path = load_path / 'BERT_UNCASED_NER_256'

elif _configured_model == 'distilbert-base-uncased':
  # DistilBERT has separate directories for the overlap and non-overlap tokenization.
  # NOTE(review): eval() executes the config text; kept to preserve behaviour.
  _distilbert_dir = ('DistilBERT_UNCASED_NER_512_w50overlap'
                     if eval(config['Tokenizer']['overlap'])
                     else 'DistilBERT_UNCASED_NER_512')
  load_path = load_path / _distilbert_dir

In [None]:
load_path

In [None]:
from datasets import DatasetDict, Dataset, concatenate_datasets
def _list_split_parts(directory, prefix):
  """Return the file names in `directory` that start with `prefix`, in part order.

  os.listdir() returns names in arbitrary order. The parts are sorted with a
  natural (numeric-aware) key so that `..._dataset1.pkl` precedes
  `..._dataset2.pkl` (and `..._dataset10.pkl` follows `..._dataset9.pkl`) on
  every run. The exploration cells group *consecutive* rows by book title, so
  the concatenation order matters.

  Raises:
    FileNotFoundError: if no file in `directory` starts with `prefix`.
  """
  import re

  def natural_key(name):
    # re.split with one capturing group alternates text / digit tokens, so
    # tokens at the same position always have comparable types.
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', name)]

  part_names = sorted((f for f in os.listdir(directory) if f.startswith(prefix)), key=natural_key)
  if not part_names:
    raise FileNotFoundError(
        "No '{}*' files found in {}".format(prefix, directory))
  return part_names


def _load_split_parts(directory, part_names):
  """Unpickle every part, wrap each in a Dataset and concatenate them in order."""
  parts = []
  for part_name in part_names:
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were written by this project.
    with open(directory / part_name, "rb") as input_file:
      parts.append(Dataset.from_dict(pickle.load(input_file)))
  return concatenate_datasets(parts)


train_paths = _list_split_parts(load_path, 'train')
val_paths = _list_split_parts(load_path, 'val')
test_paths = _list_split_parts(load_path, 'test')

train_dataset = _load_split_parts(load_path, train_paths)
val_dataset = _load_split_parts(load_path, val_paths)
test_dataset = _load_split_parts(load_path, test_paths)

chunked_encoded_dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset, 'test': test_dataset})

In [None]:
# Produce the `labels` column expected by the Trainer.
# NOTE(review): eval() executes the config text; kept to preserve behaviour.
_multi_task_enabled = eval(config['Model']['multi_task'])

if _multi_task_enabled:
  print('multitask')

  def group_success_and_genre(examples):
    """Batched map function: pair each row's success label with its genre.

    `examples` holds whole columns (lists) for a batch of rows. The two label
    columns are stacked and transposed so that row i of `labels` is
    [success_label_i, genre_i].
    """
    stacked = np.vstack((examples['success_label'], examples['genre']))
    examples['labels'] = stacked.T
    return examples

  chunked_encoded_dataset = chunked_encoded_dataset.map(
      group_success_and_genre,
      batched = True,
      remove_columns=['genre', 'success_label'])
else:
  print('single task')
  # Single task: the success label alone becomes `labels`.
  chunked_encoded_dataset = chunked_encoded_dataset.rename_column('success_label', 'labels')

# Dataset Exploration

In [None]:
chunked_encoded_dataset

In [None]:
import matplotlib.pyplot as plt
import itertools

# Number of segments per book in the train split: the length of each run of
# consecutive rows that share the same book title.
train_book_titles = chunked_encoded_dataset['train']['book_title']
num_segments_per_book = [
    sum(1 for _ in title_run)
    for _, title_run in itertools.groupby(train_book_titles)
]

# Normalised histogram (density=True); density=False would plot raw counts.
plt.hist(num_segments_per_book, density=True, bins=30)
plt.ylabel('Probability')
plt.xlabel('Data');

In [None]:
# [(g[0], len(list(g[1]))) for g in itertools.groupby(chunked_encoded_dataset['train']['book_title'])]
# Map each book title to the row index at which its run of consecutive segments
# starts in the train split. If a title occurs in more than one run, the start
# of the last run is the one that is kept.
start_of_segmented_book = {}
last_idx = 0
for run_title, run_rows in itertools.groupby(chunked_encoded_dataset['train']['book_title']):
  start_of_segmented_book[run_title] = last_idx
  last_idx += sum(1 for _ in run_rows)

In [None]:
def filter_segments(example, idx, max_segments=25):
  """Keep only the first `max_segments` segments of each book.

  Intended for `Dataset.filter(..., with_indices=True)` in its default
  per-example mode: it receives one row and that row's index, not a batch.

  Args:
    example: a single dataset row; only `example['book_title']` is read.
    idx: index of the row within the dataset being filtered.
    max_segments: number of leading segments to keep per book (default 25).

  Returns:
    True when the row lies fewer than `max_segments` rows after the first
    segment of its book, as recorded in the module-level
    `start_of_segmented_book` mapping; False otherwise.
  """
  return idx - start_of_segmented_book[example['book_title']] < max_segments

test = chunked_encoded_dataset['train'].filter(filter_segments, with_indices = True)

In [None]:
num_segments_per_book = [len(list(g[1])) for g in itertools.groupby(test['book_title'])]

plt.hist(num_segments_per_book, density=True, bins=30)  # density=False would make counts
plt.ylabel('Probability')
plt.xlabel('Data');

In [None]:
# Replace the train split with its filtered version (filter_segments keeps the
# leading segments of each book).
# NOTE(review): not idempotent. `start_of_segmented_book` holds row offsets
# computed on the unfiltered split, so running this cell again without first
# recomputing those offsets compares the new row indices against stale offsets.
chunked_encoded_dataset['train'] = chunked_encoded_dataset['train'].filter(filter_segments, with_indices = True)

# GoodReads Success Prediction

## Transformer --> Classification

In [None]:
# db_config_base = DistilBertConfig.from_pretrained('/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/DistilBertPretrained')
# # db_config_base.update({'_name_or_path': '/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/DistilBertPretrained', 'alpha': 0.2, 'dropout': 0.8})
# return DistilBERTForMultipleSequenceClassification.from_pretrained(pretrained_model_name_or_path = db_config_base._name_or_path, config = db_config_base)

In [None]:
from MultiTaskExtensions import DistilBERTForMultipleSequenceClassification
from transformers import DistilBertConfig
# pretrained_model_name_or_path = config['Model']['name']
# pretrained_model_name_or_path = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/DistilBertPretrained'
pretrained_model_name_or_path = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/3vvi0uoqDistilBertModel'

# if (eval(config['WandB']['use_WandB_pretrained'])):
#   run = wandb.init()
#   artifact = run.use_artifact('lucaguarro/goodreads_success_predictor/model-nlpbosie:v0', type='model')
#   # artifact = run.use_artifact('lucaguarro/goodreads_success_predictor/model-3d16pz7v:v0', type='model')
#   pretrained_model_name_or_path = artifact.download()

# Instantiate the model to fine-tune and record which evaluation metric
# identifies the best checkpoint for that setting.
# NOTE(review): eval() executes the config text; a boolean parser would be safer.
if (eval(config['Model']['multi_task'])):
  # Multi-task setting: compute_metrics_multi reports the success-head F1 as 's_f1'.
  metric_for_best_model = 'eval_s_f1'
  if (config['Model']['name'] == 'distilbert-base-uncased'):
    from MultiTaskExtensions import DistilBERTForMultipleSequenceClassification
    # Start from the checkpoint's own config and override the loss weight
    # (`alpha`) and the two dropout rates before loading the weights.
    db_config = DistilBertConfig.from_pretrained(pretrained_model_name_or_path)
    db_config.update({'_name_or_path': pretrained_model_name_or_path, 'alpha': 0.5928, 'attention_dropout': 0.2436, 'dropout': 0.3}) # 'dropout': 0.3877
    print(db_config)
    model = DistilBERTForMultipleSequenceClassification.from_pretrained(pretrained_model_name_or_path = db_config._name_or_path, config = db_config)
  else:
    # Any other model name falls back to the BERT multi-task class.
    from MultiTaskExtensions import BertForMultipleSequenceClassification
    model = BertForMultipleSequenceClassification.from_pretrained(pretrained_model_name_or_path)

else:
  print("standard model", pretrained_model_name_or_path)
  metric_for_best_model = 'eval_f1'
  model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)
  # Resize the embedding matrix to the tokenizer's vocabulary size (the
  # tokenizer may carry the extra '[CHARACTER]' token).
  # NOTE(review): only this single-task branch resizes the embeddings.
  model.resize_token_embeddings(len(tokenizer))

In [None]:
model

In [None]:
# from scores import compute_metrics_multi, compute_metrics_single
# if eval(config['Model']['multi_task']):
#   print('multi')
#   # from scores import compute_metrics_multi
#   compute_metrics = compute_metrics_multi
# else:
#   print('single')
#   # from scores import compute_metrics_single
#   compute_metrics = compute_metrics_single

In [None]:
%env WANDB_PROJECT=goodreads_success_predictor_80_20

# Fine-tune `model` for one epoch, evaluating every `eval_steps` steps and
# reloading the best checkpoint (by `metric_for_best_model`) at the end.
# NOTE(review): `compute_metrics` is only assigned in the "Hyperparameter
# Tuning" section further down the notebook, so on a fresh kernel that cell has
# to be run before this one.
training_args = TrainingArguments(
    'gsp_80_20_DistilBERT_25segs_wtfwillthiswork',  # output directory / run name
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    learning_rate=0.00003969,
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0.1,                # strength of weight decay
    # logging_dir='./logs',          # directory for storing logs
    logging_steps=5,
    # gradient_accumulation_steps=2,
    evaluation_strategy = "steps",
    eval_steps = 5, # prob better if set to 601 that way it evenly divides into the epochs
    load_best_model_at_end = True,
    # Use the metric chosen by the model-construction cell ('eval_s_f1' for the
    # multi-task model, 'eval_f1' for the single-task model) instead of
    # hard-coding the multi-task name, which the single-task metrics never report.
    metric_for_best_model = metric_for_best_model,
    greater_is_better = True,
    # report_to = "wandb",
    save_total_limit = 5
)

trainer = Trainer(
    model=model,                                         # the instantiated Transformers model to be trained
    args=training_args,                                  # training arguments, defined above
    train_dataset=chunked_encoded_dataset['train'],      # training dataset
    eval_dataset=chunked_encoded_dataset['validation'],  # evaluation dataset
    compute_metrics = compute_metrics
)

trainer.train()

In [None]:
chunked_encoded_dataset['train']['labels'][0]

## Scorer

In [None]:
from MultiTaskExtensions import DistilBERTForMultipleSequenceClassification

In [None]:
run = wandb.init()
# artifact = run.use_artifact('lucaguarro/DistilbertMultitaskHPSearch/model-3igwy2id:v0', type='model')
artifact = run.use_artifact('lucaguarro/DistilbertMultitaskHPSearch/model-3vvi0uoq:v0', type='model')
artifact_dir = artifact.download()

In [None]:
model = DistilBERTForMultipleSequenceClassification.from_pretrained(artifact_dir)
trainer = Trainer(
    model=model
)

In [None]:
from scores import ModelScorer

In [None]:
m_scorer = ModelScorer(trainer, chunked_encoded_dataset, for_multitask=True)

In [None]:
m_scorer.get_segmented_f1_scores()

In [None]:
m_scorer.get_book_f1_scores()

# Hyperparameter Tuning

multitask hyperparameter tuning

In [None]:
# saves our models to artifacts in WandB
%env WANDB_LOG_MODEL=true
wandb.login()
%env WANDB_PROJECT=DistilbertMultitaskHPSearch

In [None]:
chunked_encoded_dataset

In [None]:
def compute_metrics_single(pred):
    """Single-task evaluation metrics for the Trainer.

    Args:
        pred: prediction object exposing `label_ids` (true classes) and
            `predictions` (per-class scores; the predicted class is the argmax
            over the last axis).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'; the last three
        are support-weighted averages over the classes.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=weighted[2],
        precision=weighted[0],
        recall=weighted[1],
    )

def compute_metrics_multi(pred):
    """Evaluation metrics for the two-head (success + genre) model.

    Args:
        pred: prediction object exposing
            `label_ids`: array of shape (n, 2); column 0 is the success label
                and column 1 the genre label.
            `predictions`: array of shape (n, 2 + n_genres); columns 0-1 are
                the success logits and every remaining column is a genre logit.

    Returns:
        dict with accuracy / F1 / precision / recall for the success head
        ('s_*' keys) and for the genre head ('g_*' keys). F1, precision and
        recall are support-weighted averages.
    """
    preds = pred.predictions
    label_ids = pred.label_ids

    success_labels = label_ids[:, 0]
    genre_labels = label_ids[:, 1]

    success_preds = preds[:, 0:2].argmax(1)
    # Take every column after the two success logits, so the slice stays
    # correct for any number of genre classes (it used to be fixed at four).
    genre_preds = preds[:, 2:].argmax(1)

    s_precision, s_recall, s_f1, _ = precision_recall_fscore_support(success_labels, success_preds, average='weighted')
    s_acc = accuracy_score(success_labels, success_preds)

    g_precision, g_recall, g_f1, _ = precision_recall_fscore_support(genre_labels, genre_preds, average='weighted')
    # Genre accuracy must be computed from the genre labels and predictions
    # (it was previously computed from the success ones).
    g_acc = accuracy_score(genre_labels, genre_preds)

    return {
        's_accuracy': s_acc,
        's_f1': s_f1,
        's_precision': s_precision,
        's_recall': s_recall,
        'g_accuracy': g_acc,
        'g_f1': g_f1,
        'g_precision': g_precision,
        'g_recall': g_recall
    }

# Bind `compute_metrics` to the metric function matching the configured setting.
# NOTE(review): eval() executes the config text; kept to preserve behaviour.
_metrics_multi_task = eval(config['Model']['multi_task'])
print('multi' if _metrics_multi_task else 'single')
compute_metrics = compute_metrics_multi if _metrics_multi_task else compute_metrics_single

def my_objective(metrics):
    """Objective for hyperparameter search: the F1 score of the success task.

    Reads 'eval_s_f1' from `metrics` in the multi-task setting and 'eval_f1'
    otherwise; the setting is taken from config['Model']['multi_task'].
    """
    # NOTE(review): eval() executes the config text; kept to preserve behaviour.
    objective_key = 'eval_s_f1' if eval(config['Model']['multi_task']) else 'eval_f1'
    return metrics[objective_key]

In [None]:
from transformers import DistilBertConfig
db_config_base = DistilBertConfig.from_pretrained('/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/DistilBertPretrained')
db_config_base.update({'_name_or_path': '/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/DistilBertPretrained', 'alpha': 0.2, 'dropout': 0.8})

In [None]:
db_config_base

In [None]:
test = DistilBERTForMultipleSequenceClassification.from_pretrained(pretrained_model_name_or_path = db_config_base._name_or_path, config = db_config_base)

In [None]:
# tune_config_pop_based = {
#     "per_device_train_batch_size": 16,
#     "per_device_eval_batch_size": 32,
#     "num_train_epochs": 1,
#     "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
#     "wandb": {
#         "project": "DistilbertMultitaskHPSearch",
#         "group": "Search1",
#         "api_key": "XXXXXXXX",
#         "log_config": True
#     }
# }

# scheduler = PopulationBasedTraining(
#     time_attr="training_iteration",
#     metric="eval_s_f1",
#     mode="max",
#     perturbation_interval=60,
#     hyperparam_mutations={
#         "weight_decay": tune.uniform(0.0, 0.3),
#         "learning_rate": tune.uniform(1e-5, 5e-5),
#         "per_device_train_batch_size": [16],
#     })

In [None]:
from transformers.modeling_outputs import SequenceClassifierOutput
from torch import nn
import torch
from torch.nn import CrossEntropyLoss, MSELoss

from transformers import DistilBertPreTrainedModel, DistilBertModel
class DistilBERTForMultipleSequenceClassification(DistilBertPreTrainedModel):
    """DistilBERT encoder with two classification heads on one pooled representation.

    The hidden state of the first token goes through `pre_classifier`, ReLU and
    dropout, and is then fed to two independent linear heads: `classifier1`
    (`num_labels1` outputs) and `classifier2` (`num_labels2` outputs). The
    returned logits are the two heads concatenated along the last axis, head 1
    first.

    When labels are given, the loss is
    `alpha * loss1 + (1 - alpha) * loss2`, with `alpha` read from
    `config.alpha` (the config must define it).
    """

    def __init__(self, config, num_labels1 = 2, num_labels2 = 8):
        """Build the encoder and both heads.

        Args:
            config: DistilBERT config; must additionally carry `alpha`.
            num_labels1: output size of the first head (1 selects MSE loss).
            num_labels2: output size of the second head (1 selects MSE loss).
        """
        super().__init__(config)
        self.num_labels1 = num_labels1
        self.num_labels2 = num_labels2
        print(self.num_labels1, self.num_labels2)
        self.alpha = config.alpha
        self.config = config

        self.distilbert = DistilBertModel(config)
        self.pre_classifier = nn.Linear(config.dim, config.dim)
        self.classifier1 = nn.Linear(config.dim, self.num_labels1)
        self.classifier2 = nn.Linear(config.dim, self.num_labels2)
        self.dropout = nn.Dropout(config.dropout)

        self.init_weights()


    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
            Column 0 holds the target for the first head and column 1 the target for the second head. For a head
            with more than one output the target is a class index and a Cross-Entropy loss is computed; for a head
            with exactly one output a regression (Mean-Square) loss is computed.

        Returns a :obj:`SequenceClassifierOutput` whose ``logits`` have shape
        :obj:`(batch_size, num_labels1 + num_labels2)`, or, when ``return_dict`` is false, a tuple
        ``(loss?, logits, *extra encoder outputs)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        distilbert_output = self.distilbert(
              input_ids=input_ids,
              attention_mask=attention_mask,
              head_mask=head_mask,
              inputs_embeds=inputs_embeds,
              output_attentions=output_attentions,
              output_hidden_states=output_hidden_states,
              return_dict=return_dict,
          )
        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        logits1 = self.classifier1(pooled_output)
        logits2 = self.classifier2(pooled_output)
        logits = torch.cat([logits1, logits2], 1)

        loss = None
        if labels is not None:
            if self.num_labels1 > 1:
                loss_fct1 = CrossEntropyLoss()
                loss1 = loss_fct1(logits1.view(-1, self.num_labels1), labels[:, 0].view(-1))
            else:
                # Regression head: MSELoss needs targets of the logits' floating dtype.
                loss_fct1 = MSELoss()
                loss1 = loss_fct1(logits1.view(-1), labels[:, 0].view(-1).to(logits1.dtype))

            if self.num_labels2 > 1:
                loss_fct2 = CrossEntropyLoss()
                loss2 = loss_fct2(logits2.view(-1, self.num_labels2), labels[:, 1].view(-1))
            else:
                loss_fct2 = MSELoss()
                loss2 = loss_fct2(logits2.view(-1), labels[:, 1].view(-1).to(logits2.dtype))
            # Weighted sum of the two task losses.
            loss = self.alpha*loss1 + (1-self.alpha)*loss2

        if not return_dict:
            # Tuple output: logits followed by whatever the encoder returned
            # after its last hidden state (hidden states / attentions, if requested).
            output = (logits,) + distilbert_output[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=distilbert_output.hidden_states,
            attentions=distilbert_output.attentions,
        )

In [None]:
def tune_transformer(num_samples=8, gpus_per_trial=0, smoke_test=False):
    """Run a Ray-backed hyperparameter search for the multi-task DistilBERT model.

    Reads the module-level `chunked_encoded_dataset`, `db_config_base`,
    `compute_metrics` and `config`.

    Args:
        num_samples: number of trials (passed as `n_trials`).
        gpus_per_trial: GPUs requested per trial; 0 also sets `no_cuda=True`.
        smoke_test: when true, uses the "./test_data" directory and stops each
            trial after one training iteration.
    """
    # Create the local data directory if it is missing.
    # NOTE(review): `data_dir` is not used anywhere else in this function.
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    if not os.path.exists(data_dir):
        os.mkdir(data_dir, 0o755)

    def get_model(params):
        # `model_init` callback: builds a fresh model, applying the trial's
        # `alpha`, `attention_dropout` and `dropout` when `params` is given.
        # NOTE(review): `db_config` is an alias of the module-level
        # `db_config_base`, not a copy, so the update below mutates the shared
        # config object (which is why passing `db_config_base` afterwards still
        # picks up the trial values).
        db_config = db_config_base
        print("printing params", params)
        if params is not None:
          db_config.update({'_name_or_path': '/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/DistilBertPretrained', 'alpha': params['alpha'], 'attention_dropout': params['attention_dropout'], 'dropout': params['dropout']})
        return DistilBERTForMultipleSequenceClassification.from_pretrained(pretrained_model_name_or_path = db_config_base._name_or_path, config = db_config_base)

    train_dataset = chunked_encoded_dataset['train']
    eval_dataset = chunked_encoded_dataset['validation']

    # Baseline training arguments. Lines marked "# config" share their name
    # with an entry of the search space below, except `weight_decay`, which is
    # not part of the search space.
    # NOTE(review): presumably the sampled values override these per trial; confirm.
    training_args = TrainingArguments(
        output_dir="DistilBertMultitask_HPsearch",
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="steps",
        save_total_limit = 5,
        logging_strategy="steps",
        logging_steps=5,
        eval_steps=5,
        load_best_model_at_end=True,
        # metric_for_best_model='eval_s_f1',
        # greater_is_better=True,
        num_train_epochs=0.9,  # config
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
        logging_dir="./logs",
        skip_memory_metrics=True)

    # Search space: the two dropout rates, the loss weight `alpha` and the
    # learning rate are sampled; batch size and epoch count are single-choice.
    # The "wandb" entry carries the logger settings (API key from the config file).
    tune_config_ASHA = {
        "attention_dropout": tune.uniform(0.15,0.4),
        "dropout": tune.uniform(0.15, 0.4),
        "alpha": tune.uniform(0.3,0.7),
        "learning_rate": tune.loguniform(1e-5, 1e-4),
        "per_device_train_batch_size": tune.choice([16]),
        "num_train_epochs": tune.choice([0.9]),
        "wandb": {
            "project": "DistilbertMultitaskHPSearch",
            "group": "Search1",
            "api_key": config['WandB']['api_key'],
            "log_config": True
        }
    }

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics)
    

    # Both ASHA scheduler variants below are disabled; despite the name of
    # `tune_config_ASHA`, no scheduler is passed to the search.
    # scheduler = ASHAScheduler(
    #     metric="eval_s_f1",
    #     mode="max",
    #     max_t=1000,
    #     grace_period=30,
    #     reduction_factor=1.5)

    # scheduler = ASHAScheduler(
    #     metric="eval_s_f1",
    #     mode="max",
    #     max_t=1,
    #     grace_period=1,
    #     reduction_factor=2)

    # Columns of the command-line progress table.
    # NOTE(review): "weight_decay" is listed here but is not in the search space.
    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "dropout": "dropout",
            "alpha": "alpha",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs"
        },
        metric_columns=[
            "eval_s_accuracy", "eval_loss", "eval_s_f1", "steps", "training_iteration"
        ])

    # NOTE(review): `compute_objective` is commented out, so the Trainer's
    # default objective is maximised rather than `my_objective`; confirm that
    # this is the intended quantity.
    trainer.hyperparameter_search(
        hp_space=lambda _: tune_config_ASHA,
        backend="ray",
        # compute_objective=my_objective,
        direction="maximize",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 2,
            "gpu": gpus_per_trial
        },
        # scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer",
        loggers=DEFAULT_LOGGERS + (WandbLogger,))
        # time_budget_s=60*60*10) # 10 hours

In [None]:
tune_transformer(num_samples=3, gpus_per_trial=1, smoke_test=False)
# tune_transformer(num_samples=1, gpus_per_trial=0, smoke_test=True)

standard hyperparameter tuning

In [None]:
# saves our models to artifacts in WandB
import wandb
%env WANDB_LOG_MODEL=true
wandb.login()

In [None]:
import os
import pickle

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import PopulationBasedTraining
from transformers import AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from ray.tune.logger import DEFAULT_LOGGERS
from ray.tune.integration.wandb import WandbLoggerCallback, WandbLogger
from transformers import ElectraTokenizerFast, ElectraForSequenceClassification, BertTokenizer, BertForSequenceClassification

In [None]:
model_name = 'bert-base-uncased'

In [None]:
project_name = "BERT-base-uncased-HP-Tuning"
%env WANDB_PROJECT=BERT-base-uncased-HP-Tuning

In [None]:
from datasets import DatasetDict

# Load the pre-tokenized BERT (uncased, NER, 512) splits, one pickle per split,
# and rebuild `chunked_encoded_dataset` for single-task training.
# NOTE(review): these absolute Drive paths bypass the `config` / `load_path`
# mechanism used by the "Loading" section, and the unpickled objects are put
# into the DatasetDict directly (the earlier loader wraps them with
# Dataset.from_dict). Presumably these pickles already hold Dataset objects; confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(r"/content/drive/MyDrive/Thesis/Datasets/book_preprocessing/PreTokenized/BERT_UNCASED_NER_512/train_dataset.pkl", "rb") as input_file:
  train_dataset = pickle.load(input_file)

with open(r"/content/drive/MyDrive/Thesis/Datasets/book_preprocessing/PreTokenized/BERT_UNCASED_NER_512/val_dataset.pkl", "rb") as input_file:
  val_dataset = pickle.load(input_file)

with open(r"/content/drive/MyDrive/Thesis/Datasets/book_preprocessing/PreTokenized/BERT_UNCASED_NER_512/test_dataset.pkl", "rb") as input_file:
  test_dataset = pickle.load(input_file)

# This rebinds the module-level `chunked_encoded_dataset` used by earlier sections.
chunked_encoded_dataset = DatasetDict({'train': train_dataset, 'validation': val_dataset, 'test': test_dataset})
# The Trainer expects the target column to be called `labels`.
chunked_encoded_dataset = chunked_encoded_dataset.rename_column('success_label', 'labels')

In [None]:
def compute_metrics(pred):
    """Accuracy and support-weighted precision / recall / F1 for one evaluation pass.

    `pred.label_ids` holds the true classes; the predicted class is the argmax
    of `pred.predictions` over its last axis.
    """
    true_classes = pred.label_ids
    predicted_classes = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(true_classes, predicted_classes, average='weighted')
    scores = {'accuracy': accuracy_score(true_classes, predicted_classes)}
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

def tune_transformer(num_samples=8,
                     gpus_per_trial=0,
                     smoke_test=False,
                     ray_address=None):
    """Run a Population Based Training search for single-task BERT fine-tuning.

    Reads the module-level `model_name`, `project_name`,
    `chunked_encoded_dataset`, `compute_metrics` and `config`.

    NOTE(review): this definition has the same name as the multi-task
    `tune_transformer` defined earlier in the notebook and replaces it once
    this cell has run.

    Args:
        num_samples: number of trials (passed as `n_trials`).
        gpus_per_trial: GPUs requested per trial; 0 also sets `no_cuda=True`.
        smoke_test: when true, uses "./test_data", caps training at one step
            and stops each trial after one training iteration.
        ray_address: address handed to `ray.init` (None starts a local Ray).
    """
    # Ray is usually already running in this notebook (after the multi-task
    # search above, or after a previous run of this cell that failed before
    # ray.shutdown()). Without ignore_reinit_error a second ray.init() raises.
    # With it, the existing session is reused; in that case `ray_address` has
    # no effect.
    ray.init(ray_address, log_to_driver=True, ignore_reinit_error=True)

    # Create the local data directory; exist_ok avoids the check-then-create race.
    data_dir_name = "./data" if not smoke_test else "./test_data"
    data_dir = os.path.abspath(os.path.join(os.getcwd(), data_dir_name))
    os.makedirs(data_dir, mode=0o755, exist_ok=True)

    # Download and cache the tokenizer; '[CHARACTER]' is registered as an
    # additional special token, so the vocabulary grows.
    print("Downloading and caching Tokenizer")
    tokenizer = BertTokenizer.from_pretrained(model_name, additional_special_tokens = ['[CHARACTER]'])

    def get_model():
        # `model_init` callback: a fresh two-class BERT whose embedding matrix
        # is resized to the enlarged tokenizer vocabulary.
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
        model.resize_token_embeddings(len(tokenizer))
        return model

    train_dataset = chunked_encoded_dataset['train']
    eval_dataset = chunked_encoded_dataset['validation']

    # Baseline arguments; values marked "# config" are the ones the search
    # space / PBT mutations below are meant to vary.
    training_args = TrainingArguments(
        project_name,
        learning_rate=1e-5,  # config
        do_train=True,
        do_eval=True,
        no_cuda=gpus_per_trial <= 0,
        evaluation_strategy="epoch",
        load_best_model_at_end=True,
        num_train_epochs=2,  # config
        max_steps=-1,
        per_device_train_batch_size=16,  # config
        per_device_eval_batch_size=16,  # config
        warmup_steps=0,
        weight_decay=0.1,  # config
    )

    # NOTE(review): writes a private attribute of TrainingArguments to force
    # the GPU count seen by the Trainer; confirm it is still honoured when the
    # transformers version changes.
    training_args._n_gpu = gpus_per_trial

    trainer = Trainer(
        model_init=get_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics)

    # Initial search space; the "wandb" entry carries the logger settings
    # (API key from the config file).
    tune_config = {
        "per_device_eval_batch_size": 32,
        "num_train_epochs": tune.choice([2, 3, 4, 5]),
        "max_steps": 1 if smoke_test else -1,  # Used for smoke test.
        "wandb": {
            "project": project_name,
            "api_key": config['WandB']['api_key'],
            "log_config": True
        }
    }

    # PBT maximises eval_f1 and perturbs these hyperparameters after every
    # training iteration.
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="eval_f1",
        mode="max",
        perturbation_interval=1,
        hyperparam_mutations={
            "weight_decay": tune.uniform(0.0, 0.3),
            "warmup_steps": tune.choice([0, 50, 100, 500, 1000]),
            "learning_rate": tune.uniform(1e-5, 4e-5),
            "per_device_train_batch_size": [8, 16],
        })

    # Columns of the command-line progress table.
    reporter = CLIReporter(
        parameter_columns={
            "weight_decay": "w_decay",
            "learning_rate": "lr",
            "per_device_train_batch_size": "train_bs/gpu",
            "num_train_epochs": "num_epochs"
        })

    trainer.hyperparameter_search(
        hp_space=lambda _: tune_config,
        backend="ray",
        n_trials=num_samples,
        resources_per_trial={
            "cpu": 1,
            "gpu": gpus_per_trial
        },
        scheduler=scheduler,
        keep_checkpoints_num=1,
        checkpoint_score_attr="training_iteration",
        stop={"training_iteration": 1} if smoke_test else None,
        progress_reporter=reporter,
        local_dir="~/ray_results/",
        name="tune_transformer_pbt",
        loggers=DEFAULT_LOGGERS + (WandbLogger, ),
        time_budget_s=60*15  # stop the whole search after 15 minutes
    )

In [None]:
import argparse

# Flags for the hyperparameter search. parse_known_args() is used so that any
# extra arguments present in sys.argv are ignored instead of raising an error.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--smoke-test",
    action="store_true",
    help="Finish quickly for testing")
parser.add_argument(
    "--ray-address",
    type=str,
    default=None,
    help='Address to use for Ray. Use "auto" for cluster. Defaults to None for local.')
args, _ = parser.parse_known_args()

if not args.smoke_test:
    # You can change the number of GPUs here:
    search_kwargs = dict(num_samples=15, gpus_per_trial=1)
else:
    # Smoke test: a single CPU-only trial.
    search_kwargs = dict(num_samples=1, gpus_per_trial=0, smoke_test=True)

tune_transformer(ray_address=args.ray_address, **search_kwargs)

In [None]:
# Shut down the Ray runtime once the search above has finished.
ray.shutdown()

# UNNEEDED CODE

In [None]:
def tokenize_w_overlap(example, tokenizer, chunk_len = 512, overlap_len = 50):
    """Tokenize one book and split it into overlapping chunks of ``chunk_len`` tokens.

    The tokenizer is called once with ``max_length=chunk_len``. Its (truncated)
    output is kept unchanged as the first chunk; the overflowing tokens are then
    cut into further chunks. Every further chunk is wrapped in start/end tokens,
    begins with the last ``overlap_len`` tokens of the preceding chunk, and is
    padded up to ``chunk_len`` if it is the final, shorter one.

    Args:
        example: mapping with the keys ``'text'``, ``'success_label'`` and ``'genre'``.
        tokenizer: callable returning a mapping with ``"input_ids"``,
            ``"attention_mask"``, ``"token_type_ids"`` and ``"overflowing_tokens"``
            as numpy arrays. If it exposes ``cls_token_id`` / ``sep_token_id`` /
            ``pad_token_id`` those are used; otherwise 101 / 102 / 0 are assumed.
            NOTE(review): presumably a slow (Python) Hugging Face tokenizer, since
            the ``"overflowing_tokens"`` key is read directly -- confirm.
        chunk_len: total length of each chunk, special tokens included.
        overlap_len: number of tokens shared between consecutive chunks.

    Returns:
        dict with the lists ``'input_ids'``, ``'attention_mask'`` and
        ``'token_type_ids'`` (one array per chunk) and the arrays
        ``'success_label'`` and ``'genre'`` repeated once per chunk.

    Raises:
        ValueError: if ``overlap_len`` is negative or leaves no room for new
            tokens in a chunk.
    """
    # Number of *new* tokens each additional chunk contributes: the chunk minus
    # its overlap prefix and the two special tokens.
    stride = chunk_len - overlap_len - 2
    if overlap_len < 0 or stride <= 0:
        raise ValueError(
            "overlap_len must satisfy 0 <= overlap_len < chunk_len - 2 "
            "(got chunk_len=%r, overlap_len=%r)" % (chunk_len, overlap_len))

    data_tokenize = tokenizer(example['text'],
                    max_length = chunk_len,
                    add_special_tokens=True,
                    return_attention_mask=True,
                    return_token_type_ids=True,
                    return_overflowing_tokens = True,
                    return_tensors = 'np')

    previous_input_ids = np.asarray(data_tokenize["input_ids"]).reshape(-1)
    previous_attention_mask = np.asarray(data_tokenize["attention_mask"]).reshape(-1)
    previous_token_type_ids = np.asarray(data_tokenize["token_type_ids"]).reshape(-1)
    remain = np.asarray(data_tokenize["overflowing_tokens"]).reshape(-1)

    # Keep every generated array integer-typed; mixing in float padding/masks
    # would silently promote the token ids to float64.
    if np.issubdtype(previous_input_ids.dtype, np.integer):
        id_dtype = previous_input_ids.dtype
    else:
        id_dtype = np.int64

    def _special_id(attr, default):
        # Prefer the tokenizer's own id; fall back to the BERT vocabulary value.
        value = getattr(tokenizer, attr, None)
        return default if value is None else value

    start_token = _special_id('cls_token_id', 101)
    end_token = _special_id('sep_token_id', 102)
    pad_token = _special_id('pad_token_id', 0)

    # The first chunk is exactly what the tokenizer produced.
    input_ids_list = [previous_input_ids]
    attention_mask_list = [previous_attention_mask]
    token_type_ids_list = [previous_token_type_ids]

    # Only build further chunks when something actually overflowed; otherwise
    # the text fits in the first chunk and no extra chunk must be emitted.
    if remain.size > 0:
        remain = remain.astype(id_dtype, copy=False)
        # Last `overlap_len` real tokens of the first chunk (its end token excluded).
        input_ids_first_overlap = previous_input_ids[-(overlap_len + 1):-1]

        for start in range(0, len(remain), stride):
            end = start + stride
            if start == 0:
                # Second chunk: overlap comes from the first chunk itself.
                body = np.concatenate((input_ids_first_overlap, remain[:end]))
            else:
                # Later chunks: overlap comes from the overflow already consumed.
                body = remain[start - overlap_len:end]

            nb_token = len(body) + 2  # + start and end token

            attention_mask = np.zeros(chunk_len, dtype=id_dtype)
            attention_mask[:nb_token] = 1  # zeros remain only on the padded tail
            token_type_ids = np.zeros(chunk_len, dtype=id_dtype)

            # Padding is non-empty only for the final, shorter chunk.
            padding = np.full(chunk_len - nb_token, pad_token, dtype=id_dtype)
            input_ids = np.concatenate(
                ([start_token], body, [end_token], padding)).astype(id_dtype, copy=False)

            input_ids_list.append(input_ids)
            attention_mask_list.append(attention_mask)
            token_type_ids_list.append(token_type_ids)

    n_chunks = len(input_ids_list)
    return {
        'input_ids': input_ids_list,
        'attention_mask': attention_mask_list,
        'token_type_ids': token_type_ids_list,
        'success_label': np.array([example['success_label']] * n_chunks),
        'genre': np.array([example['genre']] * n_chunks)
    }

In [None]:
# When batched = True, we take in multiple examples
def chunk_and_encode_examples_w_overlap(examples):
  """Chunk every book of a batch and flatten the chunks into one column dict.

  Each book in ``examples`` is passed to ``tokenize_w_overlap`` together with
  the notebook-level ``tokenizer``; the per-book chunk lists are concatenated
  column by column.

  Args:
    examples: batch mapping with parallel lists under ``'text'``, ``'genre'``,
      ``'success_label'`` and ``'book_title'``.

  Returns:
    dict of lists with one entry per chunk for the keys ``'attention_mask'``,
    ``'genre'``, ``'input_ids'``, ``'success_label'``, ``'token_type_ids'``
    and ``'book_title'``.
  """
  mega_dict = {'attention_mask': [], 'genre': [], 'input_ids': [], 'success_label': [], 'token_type_ids': [], 'book_title': []}
  for i in range(len(examples['text'])):
    book_sample = {'text': examples['text'][i], 'genre': examples['genre'][i], 'success_label': examples['success_label'][i], 'book_title':examples['book_title'][i]}
    dictOfTokenizedChunks = tokenize_w_overlap(book_sample, tokenizer)
    for key, value in dictOfTokenizedChunks.items():
      mega_dict[key].extend(value)
    # The per-book dict may not carry the title; repeat it once per chunk so
    # the 'book_title' column stays the same length as the other columns.
    if 'book_title' not in dictOfTokenizedChunks:
      n_chunks = len(dictOfTokenizedChunks['input_ids'])
      mega_dict['book_title'].extend([book_sample['book_title']] * n_chunks)
  return mega_dict

In [None]:
# Shell out to the transformers CLI and show its `env` report for this runtime.
!transformers-cli env

In [None]:
from transformers import DistilBertModel
import torch

# Fetch a model artifact from Weights & Biases, load it as a bare
# DistilBertModel and save a copy to Drive.
WANDB_MODEL_ARTIFACT = 'lucaguarro/DistilbertMultitaskHPSearch/model-3vvi0uoq:v0'
SAVED_MODEL_DIR = '/content/drive/MyDrive/Thesis/BookSuccessPredictor/saved_models/3vvi0uoqDistilBertModel'

run = wandb.init()
artifact = run.use_artifact(WANDB_MODEL_ARTIFACT, type='model')
artifact_dir = artifact.download()  # local directory holding the artifact files

db_model = DistilBertModel.from_pretrained(artifact_dir)
db_model.save_pretrained(SAVED_MODEL_DIR)