### VM setup

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd "/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017/"

/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017


In [3]:
!pip install transformers==4.18

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.18
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 15.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 71.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 57.8 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=c72e8672ee7183b31cfd80ebc7554d

In [4]:
!pip install datasets==2.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==2.0.0
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 29.9 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 49.8 MB/s 
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 57.2 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 74.8 MB/s 
Installing collected packages: urllib3, xxhash, responses, multiprocess, datasets
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urlli

## Download git repo from github

In [None]:
!bash ../clone_repo.sh

In [None]:
!bash ../pull_repo.sh
%cd "/content/gdrive/MyDrive/Master Thesis/Data/Fuetal2017/"

## Imports

In [None]:
import sys
sys.path.insert(0, "/content/experiments_chd/data_preparation/")
sys.path.insert(0, "/content/gdrive/MyDrive/Master Thesis/ModelFineTuning/")

In [None]:
import datasets
import numpy as np
import torch
from torch import cuda
from transformers import RobertaModel, PreTrainedModel, EarlyStoppingCallback, IntervalStrategy, TrainingArguments, AutoModelForSequenceClassification
import json
from collections import ChainMap

In [None]:
from transformers import RobertaTokenizerFast

In [None]:
from hub_token import HUB_TOKEN

## Data loading

In [None]:
dataset = datasets.load_from_disk("fu2017_highlight_detection_dataset")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 10580401
    })
    val: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 3305229
    })
    test: Dataset({
        features: ['messages', 'highlights', 'match_name'],
        num_rows: 3984081
    })
})

## Tokenization

In [None]:
# === TOKENIZATION ===
def tokenize(ds, tokenizer):
    ds_messages_split = ds.map(lambda example: split_add_bos_eos(example, tokenizer), remove_columns=["messages"])
    return ds_messages_split.map(lambda examples: tokenizer(examples['messages_split']), batched=True)


def split_add_bos_eos(example, tok):
    return {"messages_split": f"{tok.eos_token}{tok.bos_token}".join(example["messages"].rstrip("\n").split("\n"))}


In [None]:
tok = RobertaTokenizerFast.from_pretrained("/content/gdrive/MyDrive/Master Thesis/Language Model Training/TwitchLeagueBert")

In [None]:
dataset_tokenized = tokenize(dataset, tok)



  0%|          | 0/10580401 [00:00<?, ?ex/s]

  0%|          | 0/3305229 [00:00<?, ?ex/s]

  0%|          | 0/3984081 [00:00<?, ?ex/s]

  0%|          | 0/10581 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/3306 [00:00<?, ?ba/s]

  0%|          | 0/3985 [00:00<?, ?ba/s]

In [None]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['highlights', 'match_name', 'messages_split', 'input_ids', 'attention_mask'],
        num_rows: 10580401
    })
    val: Dataset({
        features: ['highlights', 'match_name', 'messages_split', 'input_ids', 'attention_mask'],
        num_rows: 3305229
    })
    test: Dataset({
        features: ['highlights', 'match_name', 'messages_split', 'input_ids', 'attention_mask'],
        num_rows: 3984081
    })
})

In [None]:
dataset_tokenized.save_to_disk("fu2017_TwitchLeagueBert_highlight_detection_dataset_tokenized")

## Dataset windowing

Window: 7 seconds (210 frames)

step: 1 second (30 frames)

batch: 210*30

In [None]:
dataset_tokenized = datasets.load_from_disk("fu2017_TwitchLeagueBert_highlight_detection_dataset_tokenized")

In [None]:
210*30

6300

In [None]:
def group_dataset(ds_batch, window_len=210, step=30):
  window_inds = list()
  for i in range(0, len(ds_batch["attention_mask"]) - window_len + step, step):
    tmp_w_inds = (i, i+window_len)
    mn = ds_batch["match_name"][tmp_w_inds[0]: tmp_w_inds[1]]
    if len(set(mn)) > 1:
      # stop at earlier index
      np.argmax(np.asarray(mn) == mn[-1])
    else:
      window_inds.append(tmp_w_inds)

  ret = dict()
  for key, val in ds_batch.items():
    ret[key] = list()
    for i_start, i_end in window_inds:
      if key == "input_ids" or key == "attention_mask" :
        ret[key].append(np.concatenate(val[i_start: i_end]))
      elif key == "highlights":
        ret[key].append(val[i_start])
      else:
        pass
    if len(ret[key]) == 0:
      del ret[key]

  return ret


In [None]:

grouping_params = {
    "window_len": 210,
    "step": 30
    }

In [None]:
ds_test = datasets.DatasetDict({"train": datasets.Dataset.from_dict(dataset_tokenized["train"][:12600])})

In [None]:
ds_test_grouped = ds_test.map(group_dataset, batch_size=grouping_params["window_len"] * grouping_params["step"], batched=True, fn_kwargs=grouping_params, remove_columns=["messages_split", "match_name"])

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
ds_test_grouped

DatasetDict({
    train: Dataset({
        features: ['highlights', 'input_ids', 'attention_mask'],
        num_rows: 408
    })
})

In [None]:
dataset_tokenized_grouped = dataset_tokenized.map(group_dataset, batch_size=grouping_params["window_len"] * grouping_params["step"], batched=True, fn_kwargs=grouping_params, remove_columns=["messages_split", "match_name"])

  0%|          | 0/1680 [00:00<?, ?ba/s]

  0%|          | 0/525 [00:00<?, ?ba/s]

  0%|          | 0/633 [00:00<?, ?ba/s]

In [None]:
dataset_tokenized_grouped.save_to_disk("fu2017_TwitchLeagueBert_highlight_detection_dataset_tokenized_grouped")

In [None]:
train_sequences_lens = [len(elm) for elm in dataset_tokenized_grouped["train"]["input_ids"]]

In [None]:
np.mean(train_sequences_lens), np.std(train_sequences_lens), np.min(train_sequences_lens), np.max(train_sequences_lens)

(554.4566186797753, 126.82168102565672, 379, 6741)

looks like using 7 seconds for context for each frame is too much to fit into transformer model. Maybe use 6 seconds.

Or following along the lines of fu et al.'s last %25 approach, pad from left

## Truncate long sequences

In [None]:
dataset_tokenized_grouped = datasets.load_from_disk("fu2017_TwitchLeagueBert_highlight_detection_dataset_tokenized_grouped")

In [None]:
[1,2,3,4,5][-(3-1):]

[4, 5]

In [None]:
def pad_truncate_to_max_sequence_length(ex, pad_token_id, pad_to):
    assert len(ex["input_ids"]) == len(ex["attention_mask"])

    if len(ex["input_ids"]) >= pad_to:
        # truncate
        return {
            # adds an additional sequence beginning token
            "input_ids": np.concatenate([[0], np.asarray(ex["input_ids"][-(pad_to-1):])], dtype=float),
            "attention_mask": np.concatenate([[1], np.asarray(ex["attention_mask"][-(pad_to-1):])], dtype=float)
        }
    else:
        # pad
        return {
            "input_ids": np.concatenate([ex["input_ids"], np.full((pad_to - len(ex["input_ids"])), pad_token_id)],
                                        axis=-1).astype(np.float),
            "attention_mask": np.concatenate([ex["attention_mask"], np.full((pad_to - len(ex["attention_mask"])), 0)],
                                             axis=-1).astype(np.float)
        }

In [None]:
dataset_tokenized_grouped_padded = dataset_tokenized_grouped.map(pad_truncate_to_max_sequence_length, fn_kwargs={"pad_token_id": tok.pad_token_id, "pad_to": 512}, batched=False)

  0%|          | 0/341760 [00:00<?, ?ex/s]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  from ipykernel import kernelapp as app
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


  0%|          | 0/106757 [00:00<?, ?ex/s]

  0%|          | 0/128683 [00:00<?, ?ex/s]

In [None]:
dataset_tokenized_grouped_padded.save_to_disk("fu2017_TwitchLeagueBert_highlight_detection_dataset_tokenized_grouped_padded")

In [None]:
first_entry = dataset_tokenized_grouped_padded["train"][100]

In [None]:
len(first_entry["input_ids"]),len(first_entry["attention_mask"]), first_entry["highlights"]

(512, 512, 0)

## restructure prediction

In [None]:
dataset_tokenized_grouped_padded = datasets.load_from_disk("fu2017_TwitchLeagueBert_highlight_detection_dataset_tokenized_grouped_padded")

In [None]:
def restructure_prediction(ds_batch):
  ret = list()
  for ex in ds_batch["highlights"]:
    #ret_vals = [0,0]
    #ret_vals[ex] = 1
    ret.append([float(ex)])
  return {"labels": ret}

In [None]:
dataset_tokenized_grouped_padded_labels = dataset_tokenized_grouped_padded.map(restructure_prediction, batched=True, remove_columns=["highlights"])

  0%|          | 0/342 [00:00<?, ?ba/s]

  0%|          | 0/107 [00:00<?, ?ba/s]

  0%|          | 0/129 [00:00<?, ?ba/s]

## Oversampling

In [None]:
type(dataset_tokenized_grouped_padded_labels["train"][[23,432,12]])

dict

In [None]:
def over_sample_binary(ds):
  label = np.asarray(ds["labels"])
  class_counts = (abs(label.size - label.sum()).astype(int), label.sum().astype(int))
  smaller_class = np.argmin(class_counts)

  print(class_counts, smaller_class)
  
  ratio = abs((len(label) - class_counts[smaller_class])/(class_counts[smaller_class]) - 1)
  print(ratio)
  smlclss_inds, _ = np.where(label == smaller_class)
  print(smlclss_inds.dtype)
  target = round(class_counts[smaller_class] * ratio)



  new_data = datasets.Dataset.from_dict({k: np.repeat(v, ratio, axis=0) for k, v in ds[smlclss_inds].items()})
  new_data_remainder = datasets.Dataset.from_dict({k: np.asarray(v) for k, v in ds[smlclss_inds[:target - len(new_data["labels"])]].items()})

  return datasets.concatenate_datasets([ds, new_data, new_data_remainder])

In [None]:
ds_oversample = over_sample_binary(dataset_tokenized_grouped_padded_labels["train"])

(306531, 35229) 1
7.701098526781912
int64


In [None]:
ds_oversample

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 613062
})

In [None]:
np.asarray(ds_oversample["label"]).sum()

306531.0

In [None]:
dataset_tokenized_grouped_padded_labels["train"] = ds_oversample

In [None]:
dataset_tokenized_grouped_padded_labels_shuffled = dataset_tokenized_grouped_padded_labels.shuffle(seed=42069)

In [None]:
dataset_tokenized_grouped_padded_labels_shuffled

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 613062
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 106757
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 128683
    })
})

In [None]:
dataset_tokenized_grouped_padded_labels_shuffled.push_to_hub("Epidot/private_fuetal2017_TwitchLeagueBert_highlights_preprocessed_oversampled", private=True, token=HUB_TOKEN)

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/4 [00:00<?, ?it/s]

Pushing split val to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.
The repository already exists: the `private` keyword argument will be ignored.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

## Model training

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
cuda.current_device()

0

In [None]:
tok = RobertaTokenizerFast.from_pretrained("/content/gdrive/MyDrive/Master Thesis/Language Model Training/TwitchLeagueBert")

In [None]:
dataset_tokenized_grouped_padded_labels = datasets.load_dataset("Epidot/private_fuetal2017_TwitchLeagueBert_highlights_preprocessed_oversampled", use_auth_token=HUB_TOKEN)

In [None]:
dataset_tokenized_grouped_padded_labels

DatasetDict({
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 106757
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 613062
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 128683
    })
})

In [None]:
dataset_tokenized_grouped_padded_labels["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 613062
})

In [None]:
dataset_tokenized_grouped_padded_labels["train"][:10]["labels"]

[[0.0], [0.0], [0.0], [0.0], [1.0], [0.0], [0.0], [0.0], [0.0], [0.0]]

In [None]:
dataset = dataset_tokenized_grouped_padded_labels

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("", num_labels=1)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [None]:
# === eval metrics ===
def define_metrics():
    f1_metric = datasets.load_metric("f1")
    p_metric = datasets.load_metric("precision")
    r_metric = datasets.load_metric("recall")
    metrics = [f1_metric, p_metric, r_metric]
    return metrics


def compute_metrics(p):
    # https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    refs = p.label_ids


    preds = torch.Tensor(np.ravel(preds))
    refs = torch.tensor(np.ravel(refs))

    metrics = define_metrics()
    results = dict(ChainMap(*[m.compute(predictions=preds, references=refs) for m in metrics]))
    return results


def preprocess_logits_for_metrics(logits, labels):
    return torch.round(logits)

In [None]:
compute_metrics

<function __main__.compute_metrics>

In [None]:
output_dir = "/content/gdrive/MyDrive/Master Thesis/ModelFineTuning/TwitchLeagueBert-finetuned-fu/"

In [None]:
training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        save_steps=1000,
        save_total_limit=4,
        evaluation_strategy=IntervalStrategy("steps"),
        eval_steps=100,
        logging_steps=50, # 500
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        label_names=["labels"]
    )

trainer = Trainer(
        model=model,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=datasets.Dataset.from_dict(dataset["val"][:100]),
        tokenizer=tok,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

In [None]:
trainer.train()

***** Running training *****
  Num examples = 613062
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 76633


Step,Training Loss,Validation Loss,Recall,Precision,F1
100,0.2267,0.304528,0.8,0.210526,0.333333
200,0.2241,0.169482,0.3,0.230769,0.26087


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


[[1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]] [[0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

{'recall': 0.8, 'precision': 0.21052631578947367, 'f1': 0.3333333333333333}


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


[[0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]] [[0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0

KeyboardInterrupt: ignored

In [None]:
# https://stackoverflow.com/questions/68806265/huggingface-trainer-logging-train-data
try:
  with open(f"log_history.json", "w") as out_file:
    json.dump(trainer.state.log_history, out_file, indent=4) # might have to change this to copy back to storage
except Exception as e:
  print("cannot create log_history")
  print(e)

trainer.save_model()

Saving model checkpoint to /content/results
Configuration saved in /content/results/config.json
Model weights saved in /content/results/pytorch_model.bin
tokenizer config file saved in /content/results/tokenizer_config.json
Special tokens file saved in /content/results/special_tokens_map.json


In [None]:
class WeightedLossTrainer(Trainer):
  def __init__(self, *args, **kwargs):
    if "train_pos_weight" in kwargs:
      self.train_pos_weight = kwargs["train_pos_weight"]
    del kwargs["train_pos_weight"]

    super().__init__(*args, **kwargs)


  def compute_loss(self, model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    # forward pass
    outputs = model(**inputs)
    logits = outputs.get("logits")
    print(logits)
    try:
      dev = f"cuda:{logits.get_device()}"
    except RuntimeError:
      dev = "cpu"
    loss_fct = torch.nn.BCEWithLogitsLoss(logits, pos_weight=torch.Tensor([1, self.train_pos_weight]).to(dev))
    loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

In [None]:
first_eval = [[0.7616355], [0.07616421], [0.0771743 ], [0.07641821], [0.07628755], [0.07666548], [0.0764187 ], [0.07639287], [0.07627223], [0.07615493], [0.0760907 ], [0.07617793], [0.07617479], [0.0761435 ], [0.07696553], [0.07639503], [0.0764244 ], [0.07631045], [0.0761729 ], [0.07616054], [0.07621111], [0.07616856], [0.07624303], [0.07621307], [0.07621069], [0.07620402], [0.07615732], [0.07615297], [0.07623408], [0.07630043], [0.0763114 ], [0.07620067], [0.07636991], [0.07615241], [0.07611664], [0.07632208], [0.07621147], [0.07619925], [0.07629716], [0.07615304], [0.07615057], [0.07619432], [0.07628324], [0.07615976], [0.07614424], [0.076182  ], [0.07624801], [0.07617931], [0.07613923], [0.07611581], [0.07629696], [0.07630761], [0.07617912], [0.07618748], [0.07644451], [0.07612307], [0.07660869], [0.0760907 ], [0.07644423], [0.07621685], [0.07621557], [0.07623279], [0.07618893], [0.07618178], [0.07611062], [0.07634888], [0.07621358], [0.07629807], [0.07682315], [0.07638013], [0.07619923], [0.0761966 ], [0.07623847], [0.07618963], [0.07619892], [0.07616763], [0.07633369], [0.07626062], [0.07614679], [0.07689293], [0.076107  ], [0.07635631], [0.07619481], [0.07617366], [0.07645369], [0.076164  ], [0.07637326], [0.07646842], [0.07624317], [0.07615667], [0.07619595], [0.07628874], [0.07611088], [0.07619626], [0.07643629], [0.07619534], [0.07629963], [0.07616585], [0.07622565], [0.0761814 ]], [[0.], [0.], [1.], [0.], [1.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [1.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [1.], [0.], [0.], [0.], [0.], [0.], [1.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [1.], [0.], [0.], [0.], [0.], [1.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [1.], [1.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [1.]]