In [None]:
!pip uninstall -y eonacs && pip install  git+https://github.com/njnmco/smc #--log /dev/stderr
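# Ray Tune fails with "module 'pickle' has no attribute 'PickleBuffer'"; see
# https://stackoverflow.com/questions/67798070/raytune-is-throwing-error-module-pickle-has-no-attribute-picklebuffer-whe
# Workaround: downgrade pickle5 to 0.0.10 (pinned in the install line below).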
!pip install datasets ray[tune] pickle5==0.0.10
# NB: restart the kernel after running this setup cell.

In [None]:
# Number of hyperparameter-search trials (passed as n_trials to trainer.hyperparameter_search).
NUM_TRIALS = 30
# Google Drive directory the best model is written to via save_pretrained.
OUTPUT = "/content/drive/MyDrive/smc/models/dbert_tasks_pac_refactor"

In [None]:
import eonacs.common.util as util

# Mount Google Drive in Colab; the recorded output reports the mount point /content/drive.
util.colab_map_drive()


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
pd.__version__

'1.3.5'

In [None]:
# Load the pickled (tasks, hyper_e, pac_e) triple from Drive.
# SECURITY: unpickling can execute arbitrary code — only load this file from a trusted source.
# NOTE(review): hyper_e is not referenced in the visible cells of this notebook — confirm it is still needed.
tasks, hyper_e, pac_e = pd.read_pickle("/content/drive/MyDrive/smc/data/tasks_pac_e.pkl.gz")

# Absorb into dbert

In [None]:
import eonacs.common.dbert as dbert
# dbert.dbert() yields a (tokenizer, model) pair; the recorded output shows
# distilbert-base-uncased weights being downloaded.
# Only `tokenizer` is used by the visible cells — `model` is rebound further down by
# AutoModelForSequenceClassification.from_pretrained.
tokenizer, model = dbert.dbert()

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from datasets import Dataset, DatasetDict

# Attach one row of pac_e per task as the "labels" column (this mutates `tasks` in place).
# The shape check later in the notebook reports 76 values per label row.
tasks["labels"] = pac_e.tolist()
# Disabled experiment kept for reference (note: all three lines read column 0 of Y).
# tasks["Y1"] = Y[:,0]
# tasks["Y2"] = Y[:,0]
# tasks["Y3"] = Y[:,0]


# Convert the pandas object into a Hugging Face Dataset.
tasks_d = Dataset.from_pandas(tasks)
#test_d = Dataset.from_pandas(test)

In [None]:
# Hold out a test split (the recorded run shows 13482 train / 4494 test rows).
# The shuffle seed is fixed so the split is identical across kernel restarts; without it,
# a resumed or repeated hyperparameter search would be scored on different held-out rows.
tasks_split_d = tasks_d.train_test_split(seed=42)

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the notebook-level `tokenizer`.

    When the tokenizer reports `is_fast`, a "word_ids" entry is added that holds,
    for each row of the batch, the value of `word_ids(row)` from the encoding.
    The (possibly extended) encoding object is returned.
    """
    encoded = tokenizer(examples["text"])
    # Slow tokenizers are returned untouched; only fast ones get the extra column.
    if not tokenizer.is_fast:
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# Tokenize both splits batch-wise with tokenize_function and drop the raw "text" column.
# Use batched=True to activate fast multithreading!
tokenized_datasets = tasks_split_d.map(
    tokenize_function, batched=True, remove_columns=["text"]#, "label"]
)
tokenized_datasets

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 13482
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 4494
    })
})

In [None]:
# chunk_size = 128


In [None]:
# Row count of the training split. len() on the Dataset itself avoids materializing the
# entire "input_ids" column as a Python list just to count its rows.
len(tokenized_datasets["train"])

13482

In [None]:
import numpy.random as random

def padd(examples, max_len=50, pad_id=0):
    """Pad or truncate every token sequence in a batch to exactly `max_len` ids.

    Parameters
    ----------
    examples : mapping
        Batch exposing ``.copy()`` and an "input_ids" entry that holds one
        list of token ids per row (as passed by ``Dataset.map(..., batched=True)``
        in this notebook).
    max_len : int, default 50
        Target length. Longer rows are cut on the right, shorter rows are
        right-padded with `pad_id`.
    pad_id : int, default 0
        Token id used for padding. Positions equal to `pad_id` get attention 0.

    Returns
    -------
    Shallow copy of `examples` with "input_ids" replaced by the fixed-length
    rows and "attention_mask" rebuilt from them (1 where id != pad_id, else 0).

    Raises
    ------
    ValueError
        If `max_len` is negative.

    Notes
    -----
    The caller's token lists are never modified; new lists are built instead.
    NOTE(review): plain right-truncation may drop a trailing special token added
    by the tokenizer — confirm this is acceptable for the model.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    results = examples.copy()
    padded_rows = []
    for ids in results["input_ids"]:
        # Slice first so the original list is left untouched, then right-pad.
        row = list(ids[:max_len])
        row.extend([pad_id] * (max_len - len(row)))
        padded_rows.append(row)

    results["input_ids"] = padded_rows
    # The mask is derived from the ids, so padding already present in the input is masked too.
    results["attention_mask"] = [
        [1 if token != pad_id else 0 for token in row] for row in padded_rows
    ]
    return results

In [None]:
# Apply padd batch-wise to both splits so every row gets fixed-length
# input_ids and a matching attention_mask.
lm_datasets = tokenized_datasets.map(padd, batched=True)
lm_datasets

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 13482
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 4494
    })
})

In [None]:
## Check padding is correct
import collections
# Histogram of input_ids lengths for (train, test); a single bucket means every row
# has the same length (50 in the recorded output).
#list(i for i,x in enumerate(map(len, lm_datasets["train"]["input_ids"])) if x == 48)
collections.Counter(map(len, lm_datasets["train"]["input_ids"])), collections.Counter(map(len, lm_datasets["test"]["input_ids"]))

(Counter({50: 13482}), Counter({50: 4494}))

In [None]:
# check shape of Y is correct
# Histogram of label-vector lengths for (train, test); all rows should share one
# length (76 in the recorded output).
collections.Counter(map(len, lm_datasets["train"]["labels"])), collections.Counter(map(len, lm_datasets["test"]["labels"]))

(Counter({76: 13482}), Counter({76: 4494}))

In [None]:
from transformers import AutoConfig, AutoModelForSequenceClassification

# Checkpoint directory on Drive that both the config and the weights are loaded from.
hyper_checkpoint_dir = "/content/drive/MyDrive/smc/models/dbert_tasks_hyper_refactor/"

# Regression setup with one output per column of pac_e.
config = AutoConfig.from_pretrained(
    hyper_checkpoint_dir,
    problem_type="regression",
    num_labels=pac_e.shape[1],
)

model = AutoModelForSequenceClassification.from_pretrained(hyper_checkpoint_dir, config=config)

Some weights of the model checkpoint at /content/drive/MyDrive/smc/models/dbert_tasks_hyper_refactor/ were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/smc/models/dbert_tasks_hyper_refactor/ and are

In [None]:
# Sanity check: the loaded model's config should carry the problem_type requested above.
model.config.problem_type

'regression'

In [None]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss roughly once per epoch (steps per epoch = rows // batch size).
# Clamp to at least 1: a train split smaller than one batch would otherwise give
# logging_steps=0, which TrainingArguments rejects under step-based logging.
logging_steps = max(1, len(lm_datasets["train"]) // batch_size)
model_name = "distilbert-base-uncased"

training_args = TrainingArguments(
    # Relative output directory; checkpoints for each run are written underneath it.
    output_dir=f"{model_name}-finetuned-pacmap-tasks-pred",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=False,
    logging_steps=logging_steps,
    # Save a checkpoint each epoch (keeping only the latest) so that the best model
    # can be retrieved directly without retraining.
    save_strategy="epoch",
    save_total_limit=1,
)

In [None]:
from transformers import Trainer


def _fresh_model():
    """Load a new model from the Drive checkpoint; handed to Trainer as model_init."""
    return AutoModelForSequenceClassification.from_pretrained(
        "/content/drive/MyDrive/smc/models/dbert_tasks_hyper_refactor/", config=config
    )


# model_init (rather than a fixed model instance) lets the Trainer build a model on demand.
trainer = Trainer(
    args=training_args,
    model_init=_fresh_model,
    eval_dataset=lm_datasets["test"],
    train_dataset=lm_datasets["train"],
)

loading weights file /content/drive/MyDrive/smc/models/dbert_tasks_hyper_refactor/pytorch_model.bin
Some weights of the model checkpoint at /content/drive/MyDrive/smc/models/dbert_tasks_hyper_refactor/ were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialize

In [None]:
# Run NUM_TRIALS hyperparameter-search trials on the Ray backend, minimizing the objective.
# The result is stored in `best`; the recorded output shows it printed as a BestRun with
# run_id, objective and hyperparameters.
# NOTE(review): resume='AUTO' is presumably forwarded to Ray Tune so an interrupted
# search can continue — confirm against the installed transformers/ray versions.
best = trainer.hyperparameter_search(
    direction="minimize", 
    backend="ray", 
    n_trials=NUM_TRIALS,
    resume = 'AUTO'
)

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Reload the best trial's model from its checkpoint and persist it to OUTPUT.
from pathlib import Path

print(best)


def _checkpoint_step(checkpoint_dir):
    """Return the trailing step number of a `checkpoint-<step>` directory, or -1 if absent."""
    suffix = checkpoint_dir.name.rpartition("-")[2]
    return int(suffix) if suffix.isdigit() else -1


# Locate the checkpoint directories in pure Python. The former `! find ... -path *run-<id>/checkpoint*`
# relied on the shell leaving an unquoted glob unexpanded, and `path[0]` raised a bare
# IndexError when nothing matched.
_checkpoint_dirs = sorted(
    (
        candidate
        for candidate in Path("/root/ray_results/").glob(f"**/run-{best.run_id}/checkpoint-*")
        if candidate.is_dir()
    ),
    key=_checkpoint_step,
    reverse=True,  # highest training step first
)
if not _checkpoint_dirs:
    raise FileNotFoundError(
        f"No checkpoint directory found for run {best.run_id} under /root/ray_results/"
    )

# `path` stays a list of strings so that `path[0]` remains the checkpoint that is loaded.
path = [str(candidate) for candidate in _checkpoint_dirs]
best_model = AutoModelForSequenceClassification.from_pretrained(path[0])
best_model.save_pretrained(OUTPUT)

BestRun(run_id='a7b3c_00028', objective=0.0047509875148534775, hyperparameters={'learning_rate': 6.53330522022775e-05, 'num_train_epochs': 5, 'seed': 22.053802560765252, 'per_device_train_batch_size': 64})


loading configuration file /root/ray_results/_objective_2022-06-02_21-26-24/_objective_a7b3c_00028_28_learning_rate=6.5333e-05,num_train_epochs=5,per_device_train_batch_size=64,seed=22.054_2022-06-02_23-06-55/distilbert-base-uncased-finetuned-pacmap-tasks-pred/run-a7b3c_00028/checkpoint-1055/config.json
Model config DistilBertConfig {
  "_name_or_path": "/root/ray_results/_objective_2022-06-02_21-26-24/_objective_a7b3c_00028_28_learning_rate=6.5333e-05,num_train_epochs=5,per_device_train_batch_size=64,seed=22.054_2022-06-02_23-06-55/distilbert-base-uncased-finetuned-pacmap-tasks-pred/run-a7b3c_00028/checkpoint-1055",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "