In [2]:
!pip install datasets evaluate sacrebleu git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to c:\users\markv\appdata\local\temp\pip-req-build-mu8ah7df
  Resolved https://github.com/openai/whisper.git to commit 5979f03701209bb035a0a466f14131aeb1116cbb
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git 'C:\Users\markv\AppData\Local\Temp\pip-req-build-mu8ah7df'


In [1]:
!pip install transformers==4.45.2





In [7]:
!pip install protobuf==3.20.0

Collecting protobuf==3.20.0
  Downloading protobuf-3.20.0-cp310-cp310-win_amd64.whl.metadata (698 bytes)
Downloading protobuf-3.20.0-cp310-cp310-win_amd64.whl (903 kB)
   ---------------------------------------- 0.0/903.8 kB ? eta -:--:--
   ----------------------- ---------------- 524.3/903.8 kB 2.8 MB/s eta 0:00:01
   ---------------------------------------- 903.8/903.8 kB 2.9 MB/s eta 0:00:00
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.1
    Uninstalling protobuf-3.20.1:
      Successfully uninstalled protobuf-3.20.1
Successfully installed protobuf-3.20.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.0 which is incompatible.


In [1]:
# Required imports
import pandas as pd
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, DatasetDict
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the Tydi XOR dataset from Hugging Face and filter for languages
dataset = load_dataset("coastalcph/tydi_xor_rc")

# Filter the dataset for Finnish, Japanese, and Russian languages
languages = ['fi', 'ja', 'ru']
train_data = dataset["train"].filter(lambda x: x['lang'] in languages)
valid_data = dataset["validation"].filter(lambda x: x['lang'] in languages)

In [4]:
# Instantiate tokenizer and model for mBART (multilingual)
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

In [3]:
# If you want to load model from file
model_path = "./resultsGpu/checkpoint-500"
model = MBartForConditionalGeneration.from_pretrained(model_path)
tokenizer = MBart50TokenizerFast.from_pretrained(model_path)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model.to(device)

Using device: cuda


MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [5]:
# Preprocessing function for tokenization
def preprocess_data(batch):
    inputs = []
    targets = []

    # For each question-context-answer pair
    for question, context, lang, answer, answerable in zip(
        batch["question"], batch["context"], batch["lang"], batch["answer"], batch["answerable"]
    ):
        # Construct input as "question: <question> context: <context>"
        input_text = f"question: {question} context: {context}"
        inputs.append(input_text)

        # For answerable cases, we use the actual answer; otherwise, we use "No answer"
        target_text = answer if answerable else "No answer"
        targets.append(target_text)

    # Tokenize input and target pairs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing to the train and validation sets
tokenized_train = train_data.map(preprocess_data, batched=True)
tokenized_valid = valid_data.map(preprocess_data, batched=True)

Map: 100%|██████████| 1380/1380 [00:00<00:00, 2192.67 examples/s]


In [6]:
# Define metric for evaluation
bleu_metric = evaluate.load("sacrebleu")

# Custom evaluation function to compute BLEU scores
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BLEU score
    bleu = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    return {"bleu": bleu["score"]}

In [36]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsGpu",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select(range(1000)),
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

  0%|          | 2/1125 [03:34<33:28:53, 107.33s/it]
  1%|▏         | 10/750 [05:20<5:36:20, 27.27s/it]
  1%|▏         | 10/750 [05:36<5:36:20, 27.27s/it]

{'loss': 9.9309, 'grad_norm': 99.59172821044922, 'learning_rate': 2.976e-05, 'epoch': 0.04}


  3%|▎         | 20/750 [13:25<7:40:53, 37.88s/it] 
  3%|▎         | 20/750 [13:29<7:40:53, 37.88s/it]

{'loss': 8.162, 'grad_norm': 104.07878112792969, 'learning_rate': 2.936e-05, 'epoch': 0.08}


  4%|▍         | 30/750 [18:18<5:28:01, 27.33s/it]
  4%|▍         | 30/750 [18:23<5:28:01, 27.33s/it]

{'loss': 6.3541, 'grad_norm': 106.49015045166016, 'learning_rate': 2.896e-05, 'epoch': 0.12}


  5%|▌         | 40/750 [22:25<4:15:09, 21.56s/it]
  5%|▌         | 40/750 [22:29<4:15:09, 21.56s/it]

{'loss': 4.3822, 'grad_norm': 105.35734558105469, 'learning_rate': 2.856e-05, 'epoch': 0.16}


  7%|▋         | 50/750 [25:46<3:54:25, 20.09s/it]
  7%|▋         | 50/750 [25:50<3:54:25, 20.09s/it]

{'loss': 2.4782, 'grad_norm': 88.43077850341797, 'learning_rate': 2.816e-05, 'epoch': 0.2}


  8%|▊         | 60/750 [29:06<3:50:34, 20.05s/it]
  8%|▊         | 60/750 [29:11<3:50:34, 20.05s/it]

{'loss': 0.9577, 'grad_norm': 41.389957427978516, 'learning_rate': 2.7760000000000002e-05, 'epoch': 0.24}


  9%|▉         | 70/750 [32:54<4:52:06, 25.77s/it]
  9%|▉         | 70/750 [32:58<4:52:06, 25.77s/it]

{'loss': 0.2456, 'grad_norm': 8.504852294921875, 'learning_rate': 2.7360000000000002e-05, 'epoch': 0.28}


 11%|█         | 80/750 [36:59<4:00:38, 21.55s/it]
 11%|█         | 80/750 [37:04<4:00:38, 21.55s/it]

{'loss': 0.082, 'grad_norm': 1.460890769958496, 'learning_rate': 2.696e-05, 'epoch': 0.32}


 12%|█▏        | 90/750 [40:24<3:45:05, 20.46s/it]
 12%|█▏        | 90/750 [40:28<3:45:05, 20.46s/it]

{'loss': 0.0599, 'grad_norm': 0.4415079951286316, 'learning_rate': 2.656e-05, 'epoch': 0.36}


 13%|█▎        | 100/750 [43:47<3:40:49, 20.38s/it]
 13%|█▎        | 100/750 [43:52<3:40:49, 20.38s/it]

{'loss': 0.0537, 'grad_norm': 0.6643622517585754, 'learning_rate': 2.616e-05, 'epoch': 0.4}


 15%|█▍        | 110/750 [47:46<4:50:15, 27.21s/it]
 15%|█▍        | 110/750 [47:50<4:50:15, 27.21s/it]

{'loss': 0.0452, 'grad_norm': 0.43421679735183716, 'learning_rate': 2.576e-05, 'epoch': 0.44}


 16%|█▌        | 120/750 [52:45<5:13:04, 29.82s/it]
 16%|█▌        | 120/750 [52:49<5:13:04, 29.82s/it]

{'loss': 0.0448, 'grad_norm': 0.5135655999183655, 'learning_rate': 2.536e-05, 'epoch': 0.48}


 17%|█▋        | 130/750 [57:44<5:09:05, 29.91s/it]
 17%|█▋        | 130/750 [57:48<5:09:05, 29.91s/it]

{'loss': 0.0405, 'grad_norm': 0.48099538683891296, 'learning_rate': 2.4959999999999998e-05, 'epoch': 0.52}


 19%|█▊        | 140/750 [1:02:43<5:04:11, 29.92s/it]
 19%|█▊        | 140/750 [1:02:47<5:04:11, 29.92s/it]

{'loss': 0.0448, 'grad_norm': 0.4931408762931824, 'learning_rate': 2.456e-05, 'epoch': 0.56}


 20%|██        | 150/750 [1:07:42<4:59:01, 29.90s/it]
 20%|██        | 150/750 [1:07:46<4:59:01, 29.90s/it]

{'loss': 0.0433, 'grad_norm': 0.3529857099056244, 'learning_rate': 2.4160000000000002e-05, 'epoch': 0.6}


 21%|██▏       | 160/750 [1:12:45<4:57:44, 30.28s/it]
 21%|██▏       | 160/750 [1:12:49<4:57:44, 30.28s/it]

{'loss': 0.0408, 'grad_norm': 0.4690313935279846, 'learning_rate': 2.3760000000000003e-05, 'epoch': 0.64}


 23%|██▎       | 170/750 [1:17:47<4:52:30, 30.26s/it]
 23%|██▎       | 170/750 [1:17:52<4:52:30, 30.26s/it]

{'loss': 0.0416, 'grad_norm': 0.35360974073410034, 'learning_rate': 2.336e-05, 'epoch': 0.68}


 24%|██▍       | 180/750 [1:22:50<4:47:27, 30.26s/it]
 24%|██▍       | 180/750 [1:22:54<4:47:27, 30.26s/it]

{'loss': 0.0414, 'grad_norm': 0.33611828088760376, 'learning_rate': 2.296e-05, 'epoch': 0.72}


 25%|██▌       | 190/750 [1:27:51<4:42:04, 30.22s/it]
 25%|██▌       | 190/750 [1:27:55<4:42:04, 30.22s/it]

{'loss': 0.0343, 'grad_norm': 0.3943593502044678, 'learning_rate': 2.256e-05, 'epoch': 0.76}


 27%|██▋       | 200/750 [1:32:54<4:37:31, 30.28s/it]
 27%|██▋       | 200/750 [1:32:58<4:37:31, 30.28s/it]

{'loss': 0.0423, 'grad_norm': 0.41464170813560486, 'learning_rate': 2.216e-05, 'epoch': 0.8}


 28%|██▊       | 210/750 [1:37:56<4:32:06, 30.23s/it]
 28%|██▊       | 210/750 [1:38:01<4:32:06, 30.23s/it]

{'loss': 0.0433, 'grad_norm': 0.4163757860660553, 'learning_rate': 2.1760000000000002e-05, 'epoch': 0.84}


 29%|██▉       | 220/750 [1:42:59<4:27:15, 30.26s/it]
 29%|██▉       | 220/750 [1:43:03<4:27:15, 30.26s/it]

{'loss': 0.0348, 'grad_norm': 0.45092901587486267, 'learning_rate': 2.136e-05, 'epoch': 0.88}


 31%|███       | 230/750 [1:48:02<4:22:17, 30.26s/it]
 31%|███       | 230/750 [1:48:06<4:22:17, 30.26s/it]

{'loss': 0.0406, 'grad_norm': 0.5332738161087036, 'learning_rate': 2.096e-05, 'epoch': 0.92}


 32%|███▏      | 240/750 [1:53:05<4:17:26, 30.29s/it]
 32%|███▏      | 240/750 [1:53:09<4:17:26, 30.29s/it]

{'loss': 0.0312, 'grad_norm': 0.46768811345100403, 'learning_rate': 2.056e-05, 'epoch': 0.96}


 33%|███▎      | 250/750 [1:58:08<4:12:25, 30.29s/it]
 33%|███▎      | 250/750 [1:58:12<4:12:25, 30.29s/it]

{'loss': 0.0319, 'grad_norm': 0.3419967591762543, 'learning_rate': 2.016e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 0.04103313758969307, 'eval_bleu': 13.668218968497257, 'eval_runtime': 11412.1512, 'eval_samples_per_second': 0.121, 'eval_steps_per_second': 0.03, 'epoch': 1.0}


 35%|███▍      | 260/750 [5:13:23<22:55:26, 168.42s/it]  
 35%|███▍      | 260/750 [5:13:27<22:55:26, 168.42s/it]

{'loss': 0.0116, 'grad_norm': 0.11194858700037003, 'learning_rate': 1.976e-05, 'epoch': 1.04}


 36%|███▌      | 270/750 [5:18:25<4:33:33, 34.19s/it]  
 36%|███▌      | 270/750 [5:18:30<4:33:33, 34.19s/it]

{'loss': 0.0138, 'grad_norm': 0.28062352538108826, 'learning_rate': 1.936e-05, 'epoch': 1.08}


 37%|███▋      | 280/750 [5:23:28<3:57:53, 30.37s/it]
 37%|███▋      | 280/750 [5:23:32<3:57:53, 30.37s/it]

{'loss': 0.0141, 'grad_norm': 0.19261829555034637, 'learning_rate': 1.896e-05, 'epoch': 1.12}


 39%|███▊      | 290/750 [5:28:31<3:51:59, 30.26s/it]
 39%|███▊      | 290/750 [5:28:35<3:51:59, 30.26s/it]

{'loss': 0.0102, 'grad_norm': 0.2051917165517807, 'learning_rate': 1.8560000000000002e-05, 'epoch': 1.16}


 40%|████      | 300/750 [5:33:34<3:47:09, 30.29s/it]
 40%|████      | 300/750 [5:33:38<3:47:09, 30.29s/it]

{'loss': 0.0125, 'grad_norm': 0.11711887270212173, 'learning_rate': 1.816e-05, 'epoch': 1.2}


 41%|████▏     | 310/750 [5:38:36<3:41:49, 30.25s/it]
 41%|████▏     | 310/750 [5:38:40<3:41:49, 30.25s/it]

{'loss': 0.0167, 'grad_norm': 1.179735541343689, 'learning_rate': 1.776e-05, 'epoch': 1.24}


 43%|████▎     | 320/750 [5:43:39<3:36:56, 30.27s/it]
 43%|████▎     | 320/750 [5:43:43<3:36:56, 30.27s/it]

{'loss': 0.0128, 'grad_norm': 0.2754683494567871, 'learning_rate': 1.736e-05, 'epoch': 1.28}


 44%|████▍     | 330/750 [5:48:42<3:32:02, 30.29s/it]
 44%|████▍     | 330/750 [5:48:46<3:32:02, 30.29s/it]

{'loss': 0.0143, 'grad_norm': 0.2735796570777893, 'learning_rate': 1.696e-05, 'epoch': 1.32}


 45%|████▌     | 340/750 [5:53:44<3:26:33, 30.23s/it]
 45%|████▌     | 340/750 [5:53:48<3:26:33, 30.23s/it]

{'loss': 0.0117, 'grad_norm': 0.4491020739078522, 'learning_rate': 1.656e-05, 'epoch': 1.36}


 47%|████▋     | 350/750 [5:58:47<3:21:46, 30.27s/it]
 47%|████▋     | 350/750 [5:58:51<3:21:46, 30.27s/it]

{'loss': 0.0115, 'grad_norm': 0.3666364848613739, 'learning_rate': 1.6159999999999998e-05, 'epoch': 1.4}


 48%|████▊     | 360/750 [6:03:49<3:16:39, 30.26s/it]
 48%|████▊     | 360/750 [6:03:54<3:16:39, 30.26s/it]

{'loss': 0.0211, 'grad_norm': 0.10381457209587097, 'learning_rate': 1.576e-05, 'epoch': 1.44}


 49%|████▉     | 370/750 [6:08:52<3:11:50, 30.29s/it]
 49%|████▉     | 370/750 [6:08:57<3:11:50, 30.29s/it]

{'loss': 0.0151, 'grad_norm': 1.4526993036270142, 'learning_rate': 1.5360000000000002e-05, 'epoch': 1.48}


 51%|█████     | 380/750 [6:13:55<3:06:43, 30.28s/it]
 51%|█████     | 380/750 [6:13:59<3:06:43, 30.28s/it]

{'loss': 0.0117, 'grad_norm': 0.2290980964899063, 'learning_rate': 1.4959999999999999e-05, 'epoch': 1.52}


 52%|█████▏    | 390/750 [6:18:58<3:01:34, 30.26s/it]
 52%|█████▏    | 390/750 [6:19:02<3:01:34, 30.26s/it]

{'loss': 0.018, 'grad_norm': 0.3683541715145111, 'learning_rate': 1.4560000000000001e-05, 'epoch': 1.56}


 53%|█████▎    | 400/750 [6:24:00<2:56:27, 30.25s/it]
 53%|█████▎    | 400/750 [6:24:05<2:56:27, 30.25s/it]

{'loss': 0.0068, 'grad_norm': 0.2935950458049774, 'learning_rate': 1.416e-05, 'epoch': 1.6}


 55%|█████▍    | 410/750 [6:29:03<2:51:33, 30.27s/it]
 55%|█████▍    | 410/750 [6:29:07<2:51:33, 30.27s/it]

{'loss': 0.0181, 'grad_norm': 0.3772999346256256, 'learning_rate': 1.376e-05, 'epoch': 1.64}


 56%|█████▌    | 420/750 [6:34:06<2:46:21, 30.25s/it]
 56%|█████▌    | 420/750 [6:34:10<2:46:21, 30.25s/it]

{'loss': 0.0241, 'grad_norm': 0.15266336500644684, 'learning_rate': 1.336e-05, 'epoch': 1.68}


 57%|█████▋    | 430/750 [6:39:09<2:41:26, 30.27s/it]
 57%|█████▋    | 430/750 [6:39:13<2:41:26, 30.27s/it]

{'loss': 0.017, 'grad_norm': 0.29921942949295044, 'learning_rate': 1.296e-05, 'epoch': 1.72}


 59%|█████▊    | 440/750 [6:44:11<2:36:22, 30.27s/it]
 59%|█████▊    | 440/750 [6:44:15<2:36:22, 30.27s/it]

{'loss': 0.0116, 'grad_norm': 0.283763587474823, 'learning_rate': 1.2560000000000002e-05, 'epoch': 1.76}


 60%|██████    | 450/750 [6:49:14<2:31:14, 30.25s/it]
 60%|██████    | 450/750 [6:49:18<2:31:14, 30.25s/it]

{'loss': 0.0124, 'grad_norm': 0.39458954334259033, 'learning_rate': 1.216e-05, 'epoch': 1.8}


 61%|██████▏   | 460/750 [6:54:16<2:26:18, 30.27s/it]
 61%|██████▏   | 460/750 [6:54:21<2:26:18, 30.27s/it]

{'loss': 0.0119, 'grad_norm': 0.1506733000278473, 'learning_rate': 1.1760000000000001e-05, 'epoch': 1.84}


 63%|██████▎   | 470/750 [6:59:19<2:21:06, 30.24s/it]
 63%|██████▎   | 470/750 [6:59:23<2:21:06, 30.24s/it]

{'loss': 0.0137, 'grad_norm': 0.41681745648384094, 'learning_rate': 1.136e-05, 'epoch': 1.88}


 64%|██████▍   | 480/750 [7:04:22<2:16:12, 30.27s/it]
 64%|██████▍   | 480/750 [7:04:26<2:16:12, 30.27s/it]

{'loss': 0.0105, 'grad_norm': 0.21773815155029297, 'learning_rate': 1.096e-05, 'epoch': 1.92}


 65%|██████▌   | 490/750 [7:09:24<2:11:00, 30.23s/it]
 65%|██████▌   | 490/750 [7:09:28<2:11:00, 30.23s/it]

{'loss': 0.0112, 'grad_norm': 0.5933569073677063, 'learning_rate': 1.0559999999999999e-05, 'epoch': 1.96}


 67%|██████▋   | 500/750 [7:14:26<2:06:00, 30.24s/it]
 67%|██████▋   | 500/750 [7:14:31<2:06:00, 30.24s/it]

{'loss': 0.0138, 'grad_norm': 0.32597747445106506, 'learning_rate': 1.0160000000000001e-05, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

{'eval_loss': 0.03962969407439232, 'eval_bleu': 14.202996380696376, 'eval_runtime': 10315.6946, 'eval_samples_per_second': 0.134, 'eval_steps_per_second': 0.033, 'epoch': 2.0}


 68%|██████▊   | 510/750 [10:09:57<9:43:09, 145.79s/it]   
 68%|██████▊   | 510/750 [10:10:01<9:43:09, 145.79s/it]

{'loss': 0.004, 'grad_norm': 0.22426196932792664, 'learning_rate': 9.76e-06, 'epoch': 2.04}


 69%|██████▉   | 520/750 [10:13:17<1:29:08, 23.26s/it] 
 69%|██████▉   | 520/750 [10:13:21<1:29:08, 23.26s/it]

{'loss': 0.0045, 'grad_norm': 0.037455473095178604, 'learning_rate': 9.36e-06, 'epoch': 2.08}


 71%|███████   | 530/750 [10:16:36<1:13:01, 19.92s/it]
 71%|███████   | 530/750 [10:16:40<1:13:01, 19.92s/it]

{'loss': 0.0038, 'grad_norm': 0.27546486258506775, 'learning_rate': 8.96e-06, 'epoch': 2.12}


 72%|███████▏  | 540/750 [10:20:00<1:11:39, 20.47s/it]
 72%|███████▏  | 540/750 [10:20:05<1:11:39, 20.47s/it]

{'loss': 0.0075, 'grad_norm': 0.054392874240875244, 'learning_rate': 8.56e-06, 'epoch': 2.16}


 73%|███████▎  | 550/750 [10:23:18<1:05:51, 19.76s/it]
 73%|███████▎  | 550/750 [10:23:23<1:05:51, 19.76s/it]

{'loss': 0.0042, 'grad_norm': 0.14412793517112732, 'learning_rate': 8.160000000000001e-06, 'epoch': 2.2}


 75%|███████▍  | 560/750 [10:26:39<1:03:22, 20.01s/it]
 75%|███████▍  | 560/750 [10:26:43<1:03:22, 20.01s/it]

{'loss': 0.0032, 'grad_norm': 0.12132888287305832, 'learning_rate': 7.76e-06, 'epoch': 2.24}


 76%|███████▌  | 570/750 [10:29:59<1:00:22, 20.12s/it]
 76%|███████▌  | 570/750 [10:30:03<1:00:22, 20.12s/it]

{'loss': 0.005, 'grad_norm': 0.12188597768545151, 'learning_rate': 7.36e-06, 'epoch': 2.28}


 77%|███████▋  | 580/750 [10:33:23<57:41, 20.36s/it]  
 77%|███████▋  | 580/750 [10:33:27<57:41, 20.36s/it]

{'loss': 0.0035, 'grad_norm': 0.14447736740112305, 'learning_rate': 6.96e-06, 'epoch': 2.32}


 79%|███████▊  | 590/750 [10:37:31<55:41, 20.88s/it]  
 79%|███████▊  | 590/750 [10:37:35<55:41, 20.88s/it]

{'loss': 0.0041, 'grad_norm': 0.32089829444885254, 'learning_rate': 6.560000000000001e-06, 'epoch': 2.36}


 80%|████████  | 600/750 [10:40:54<51:16, 20.51s/it]
 80%|████████  | 600/750 [10:40:59<51:16, 20.51s/it]

{'loss': 0.0036, 'grad_norm': 0.049665238708257675, 'learning_rate': 6.16e-06, 'epoch': 2.4}


 81%|████████▏ | 610/750 [10:44:19<48:13, 20.67s/it]
 81%|████████▏ | 610/750 [10:44:23<48:13, 20.67s/it]

{'loss': 0.004, 'grad_norm': 0.047843948006629944, 'learning_rate': 5.76e-06, 'epoch': 2.44}


 83%|████████▎ | 620/750 [10:47:53<45:02, 20.79s/it]
 83%|████████▎ | 620/750 [10:47:58<45:02, 20.79s/it]

{'loss': 0.0032, 'grad_norm': 0.3601582944393158, 'learning_rate': 5.36e-06, 'epoch': 2.48}


 84%|████████▍ | 630/750 [10:51:21<41:37, 20.81s/it]
 84%|████████▍ | 630/750 [10:51:26<41:37, 20.81s/it]

{'loss': 0.0037, 'grad_norm': 0.35988935828208923, 'learning_rate': 4.96e-06, 'epoch': 2.52}


 84%|████████▍ | 632/750 [10:52:03<40:54, 20.80s/it]

KeyboardInterrupt: 

In [7]:
# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./resultsGpu",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select(range(4)),
    eval_dataset=tokenized_valid.select(range(4)),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

                                             
 33%|███▎      | 1/3 [01:20<00:59, 29.87s/it]

{'eval_loss': 0.04223838075995445, 'eval_bleu': 0.0, 'eval_runtime': 50.2783, 'eval_samples_per_second': 0.08, 'eval_steps_per_second': 0.02, 'epoch': 1.0}


                                             
 67%|██████▋   | 2/3 [02:57<01:06, 66.27s/it]

{'eval_loss': 0.04363738372921944, 'eval_bleu': 0.0, 'eval_runtime': 55.5094, 'eval_samples_per_second': 0.072, 'eval_steps_per_second': 0.018, 'epoch': 2.0}


                                             
100%|██████████| 3/3 [04:21<00:00, 87.18s/it]

{'eval_loss': 0.044128358364105225, 'eval_bleu': 0.0, 'eval_runtime': 28.613, 'eval_samples_per_second': 0.14, 'eval_steps_per_second': 0.035, 'epoch': 3.0}
{'train_runtime': 261.5623, 'train_samples_per_second': 0.046, 'train_steps_per_second': 0.011, 'train_loss': 0.0005929085503642758, 'epoch': 3.0}





TrainOutput(global_step=3, training_loss=0.0005929085503642758, metrics={'train_runtime': 261.5623, 'train_samples_per_second': 0.046, 'train_steps_per_second': 0.011, 'total_flos': 13002778607616.0, 'train_loss': 0.0005929085503642758, 'epoch': 3.0})

In [47]:
# Evaluate and collect results on validation set
eval_results = trainer.evaluate(tokenized_valid.select(range(8)))

# Separate the validation dataset into answerable and unanswerable subsets
answerable_data = tokenized_valid.filter(lambda x: x["answerable"]).select(range(8))
unanswerable_data = tokenized_valid.filter(lambda x: not x["answerable"]).select(range(8))

# Evaluate on answerable examples
answerable_results = trainer.evaluate(answerable_data)

# Evaluate on unanswerable examples
unanswerable_results = trainer.evaluate(unanswerable_data)

# Print overall results and answerable/unanswerable breakdown
print("Overall BLEU:", eval_results["eval_bleu"])
print("Answerable BLEU:", answerable_results["eval_bleu"])
print("Unanswerable BLEU:", unanswerable_results["eval_bleu"])

                                                    
 84%|████████▍ | 632/750 [11:07:37<40:54, 20.80s/it]

{'eval_loss': 0.05551483482122421, 'eval_bleu': 4.159883186137868, 'eval_runtime': 59.0177, 'eval_samples_per_second': 0.136, 'eval_steps_per_second': 0.034, 'epoch': 2.53}


Filter: 100%|██████████| 1380/1380 [00:00<00:00, 1976.55 examples/s]
Filter: 100%|██████████| 1380/1380 [00:00<00:00, 2036.29 examples/s]
                                                    
 84%|████████▍ | 632/750 [11:08:38<40:54, 20.80s/it]

{'eval_loss': 0.05551483482122421, 'eval_bleu': 4.159883186137868, 'eval_runtime': 58.9688, 'eval_samples_per_second': 0.136, 'eval_steps_per_second': 0.034, 'epoch': 2.53}


                                                    
 84%|████████▍ | 632/750 [11:09:40<40:54, 20.80s/it]

{'eval_loss': 0.0026778103783726692, 'eval_bleu': 0.0, 'eval_runtime': 62.0308, 'eval_samples_per_second': 0.129, 'eval_steps_per_second': 0.032, 'epoch': 2.53}
Overall BLEU: 4.159883186137868
Answerable BLEU: 4.159883186137868
Unanswerable BLEU: 0.0


In [39]:
# Sample input for prediction
sample_question = tokenized_valid.select(range(2,3))['question'][0]
sample_context = tokenized_valid.select(range(2,3))['context'][0]

# Construct the input text
input_text = f"question: {sample_question} context: {sample_context}"

# Tokenize the input
input_ids = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)

# Generate the answer
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)

# Decode the generated output
generated_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated answer
print(f"Question: {sample_question}")
print(f"Context: {sample_context}")
print(f"Generated Answer: {generated_answer}")


Question: Kuka oli Glee sarjan pääosassa?
Context: Rachel Barbra Berry (Lea Michele) is the lead character and is a "strong, driven" member of the glee club, who is misunderstood by her peers. Michele took the role in "Glee" because of Rachel's characterization, explaining: "Not only is she a singer, but she has so much heart—I think it's what we need on TV." Michele described the first thirteen episodes of the series as: "Rachel's journey of finding herself within the glee club", explaining that: "She's learning how to be a team player and work within this group."
Generated Answer: Rachel Barbra Berry


In [30]:
print(tokenized_train.select(range(1,2))['question'][0])
print(tokenized_train.select(range(1,2))['context'])
print(tokenized_train.select(range(1,2))['answer'])
print(tokenized_train.select(range(1,2))['answerable'])

ビスカヤ県で初めて進出した大規模鉱業会社は何？
['Another consequence of the Carlist defeat and ensuing abolition of the Basque institutional system was the Liberalization of the industries on the Basque Provinces, especially in Biscay. The liberalization of the mines, industries and ports attracted many companies, specially British Mining Companies, that established in Biscay along with small local societies, such as Ybarra-Mier y Compañía, creating a big industrial society, based on iron mining and industry. These expansion created very big mining companies, such as Orconera Iron Ore Company Limited and Societé Franco-Belge des Mines de Somorrostro.']
['Ybarra-Mier y Compañía']
[True]


In [12]:
# Sample input for prediction
sample_question = tokenized_train.select(range(1,2))['question'][0]
sample_context = tokenized_train.select(range(1,2))['context'][0]

# Construct the input text
input_text = f"question: {sample_question} context: {sample_context}"

# Tokenize the input
input_ids = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)

# Generate the answer
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=128, num_beams=5, early_stopping=True)

# Decode the generated output
generated_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated answer
print(f"Question: {sample_question}")
print(f"Context: {sample_context}")
print(f"Generated Answer: {generated_answer}")


Question: ビスカヤ県で初めて進出した大規模鉱業会社は何？
Context: Another consequence of the Carlist defeat and ensuing abolition of the Basque institutional system was the Liberalization of the industries on the Basque Provinces, especially in Biscay. The liberalization of the mines, industries and ports attracted many companies, specially British Mining Companies, that established in Biscay along with small local societies, such as Ybarra-Mier y Compañía, creating a big industrial society, based on iron mining and industry. These expansion created very big mining companies, such as Orconera Iron Ore Company Limited and Societé Franco-Belge des Mines de Somorrostro.
Generated Answer: Ybarra-Mier y Compañía


In [43]:
model.save_pretrained('./resultsGpu/2.5epochs')