# Model Training 

The following trial data will be collected: 

| Dataset | Curriculum | Metric Outputs (Loss, Accuracy) | Sentence Examples | 
|---------|------------|---------------|-------------------|
| Fiction (HP) | Curriculum Learning | | | 
| Fiction (HP) | Anti-Curriculum Learning | | | 
| Fiction (HP) | Randomized | | | 
| Non-Fiction (MIND) | Curriculum Learning | | | 
| Non-Fiction (MIND) | Anti-Curriculum Learning | | | 
| Non-Fiction (MIND) | Randomized | | | 

In [1]:
import wandb

wandb.init(project="fine-tuneing-cl", entity="nak-develops")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnataliekraft5426[0m ([33mnak-develops[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [98]:
import pandas as pd 
from sklearn.utils import shuffle

hp = pd.read_csv("../data/harrypotter_2.csv")
hp.rename(columns={'sentence': 'text'}, inplace=True)
hp['label'] = 0
hp = hp[pd.notna(hp['text'])]
hp['ave_syllable'] = hp['syllables'] / hp['words']

# curriculum learning 
hp_cl = hp.sort_values(by=['flesch', 'ave_syllable', 'words'], ascending = True)
hp_cl_fine = hp_cl[['label', 'text']].reset_index().rename(columns={'index' : 'input_ids'})

# # anti-curriculum learning 
# hp_a = hp.sort_values(by=['flesch', 'ave_syllable', 'words'], ascending = False)
# # random 
# hp_r = shuffle(hp)

In [99]:
from datasets import Dataset

dataset = Dataset.from_pandas(hp_cl_fine)

from torch.utils.data import random_split
import math

small_train_dataset, small_eval_dataset = random_split(dataset, [math.floor(len(dataset) * .8), len(dataset) - math.floor(len(dataset) * .8)])

In [100]:
from torch.utils.data import DataLoader

dataloader_train = DataLoader(
   dataset=small_train_dataset,
   batch_size=5, 
   shuffle=False
)

dataloader_eval = DataLoader(
   dataset=small_eval_dataset,
   batch_size=5, 
   shuffle=False
)


In [110]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

loading configuration file config.json from cache at /Users/nakraft/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/nakraft/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loa

In [102]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

## Train Data

In [103]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

loading configuration file config.json from cache at /Users/nakraft/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "

In [105]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [106]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch", num_train_epochs=3)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [107]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [108]:
trainer.train()

***** Running training *****
  Num examples = 49542
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18579
  Number of trainable parameters = 108314117
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
  0%|          | 0/18579 [00:00<?, ?it/s]The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


TypeError: 'int' object is not iterable