<a href="https://colab.research.google.com/github/mark-torres10/datasetArtifacts/blob/main/notebooks/train_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



```
# This is formatted as code
```

## Setup

In [1]:
# setup
!wget https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/helpers.py
!wget https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/requirements.txt
!wget https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/data/samples_to_train.jsonl
!wget https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/data/consolidated/spanish_consolidated_backtranslations.jsonl
!wget https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/data/consolidated/japanese_consolidated_backtranslations.jsonl
!wget https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/data/samples_to_translate.jsonl
!pip install -r requirements.txt

--2022-12-04 16:55:51--  https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/helpers.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14620 (14K) [text/plain]
Saving to: ‘helpers.py’


2022-12-04 16:55:51 (96.6 MB/s) - ‘helpers.py’ saved [14620/14620]

--2022-12-04 16:55:51--  https://raw.githubusercontent.com/mark-torres10/datasetArtifacts/main/requirements.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 48 [text/plain]
Saving to: ‘requirements.txt’


2022-12-04 16:55:52 (2.91 MB/s) - ‘requirements.txt’ sa

In [2]:
# Mount Google Drive at /content/drive (Colab only). The training `output_dir`
# configured later in this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [153]:
import json
import os
import sys

import datasets
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser

from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy

In [None]:
# Run configuration. These values are echoed into an ".args" file below.
output_dir = "/content/drive/MyDrive/grad_school/2022_fall/nlp"
do_train   = True
do_eval = True
task    = "qa"
#dataset = "squad"
# NOTE(review): the name `dataset` is rebound to a DatasetDict in the
# "Select the appropriate training data" cell further down.
dataset = "samples_to_train.jsonl"
per_device_train_batch_size = 32
num_train_epochs = 5


#
# DON'T CHANGE - this creates the .args file for the parser below.
#
# Don't worry about the 'No such file or directory' message the first
# time you run this either.
#
# NOTE(review): a near-identical cell appears again under "Create args object
# used by Huggingface" (num_train_epochs = 8, no --dataset line). Both write
# the same arg_file, so whichever cell ran last decides its contents.
#
from pathlib import Path
import sys
# The file sits next to sys.argv[0], with its suffix replaced by ".args".
arg_file = Path(sys.argv[0]).with_suffix(".args")

# Start from a clean file, then append one "--flag value" line per setting.
!rm {arg_file}
!echo "--output_dir {output_dir}" >> {arg_file}
!echo "--do_train {do_train}" >> {arg_file}
!echo "--do_eval {do_eval}" >> {arg_file}
!echo "--task {task}" >> {arg_file}
!echo "--dataset {dataset}" >> {arg_file}
!echo "--per_device_train_batch_size {per_device_train_batch_size}" >> {arg_file}
!echo "--num_train_epochs {num_train_epochs}" >> {arg_file}
# Print the resulting file for a visual check.
!tail {arg_file}

## Load training sets

#### Load core datasets

In [17]:
# Full SQuAD dataset. Its "train" split is sub-sampled further down and its
# "validation" split is reused as the evaluation set for every experiment.
master_dataset = datasets.load_dataset("squad")



  0%|          | 0/2 [00:00<?, ?it/s]

In [467]:
# dataset 1: baseline, 10,000 samples
# (local JSON Lines file fetched by the setup cell; rows are read from its
# "train" split further down)
dataset_base = datasets.load_dataset('json', data_files="samples_to_train.jsonl")



  0%|          | 0/1 [00:00<?, ?it/s]

In [61]:
# dataset 2: 1,000 samples that were backtranslated
# (these are the original English samples, i.e. the "seed phrases" referred
# to in the sections below)
dataset_to_translate = datasets.load_dataset('json', data_files="samples_to_translate.jsonl")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-48604e7193057f1f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-48604e7193057f1f/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [148]:
# dataset 3: backtranslated Spanish
# (still carries the extra translation fields; they are stripped by
# `update_training_samples` below)
dataset_es = datasets.load_dataset('json', data_files="spanish_consolidated_backtranslations.jsonl")



  0%|          | 0/1 [00:00<?, ?it/s]

In [149]:
# dataset 4: backtranslated Japanese
# (the file is fetched by the setup cell at the top of the notebook; this cell
# previously re-loaded the Spanish file, which made every "Japanese" dataset
# built below a copy of the Spanish one)
dataset_ja = datasets.load_dataset('json', data_files="japanese_consolidated_backtranslations.jsonl")



  0%|          | 0/1 [00:00<?, ?it/s]

### Update and clean backtranslated samples

In [150]:
# For the es and ja backtranslations: replace "question" with the
# backtranslation and drop the translation bookkeeping fields.
def update_training_samples(dataset):
  """Return the "train" samples of `dataset` as a list of cleaned dicts.

  For every sample in `dataset["train"]`:
    * if it has a "backtranslation" field, that text replaces "question";
    * the fields "source_lang", "pivot_lang", "translation" and
      "backtranslation" are removed when present.

  Samples without any of those fields come back unchanged, so the function
  can also be used just to turn a dataset split into a list of dicts.

  Args:
    dataset: mapping with a "train" entry that iterates over dict-like samples.

  Returns:
    A new list of new dicts. The input samples are not modified.
  """
  fields_to_remove = (
      "source_lang", "pivot_lang", "translation", "backtranslation"
  )
  updated_train_list = []
  for sample in dataset["train"]:
    # Shallow copy so the caller's sample is left untouched.
    updated_sample = dict(sample)
    if "backtranslation" in updated_sample:
      updated_sample["question"] = updated_sample["backtranslation"]
    for field in fields_to_remove:
      updated_sample.pop(field, None)
    updated_train_list.append(updated_sample)
  return updated_train_list
    

In [202]:
# Backtranslated samples: "question" now holds the backtranslated text and the
# translation bookkeeping fields are gone.
updated_samples_es = update_training_samples(dataset_es)
updated_samples_ja = update_training_samples(dataset_ja)

# The 1,000 English samples that were sent for translation. Their content is
# not expected to change; the call mainly converts the Dataset into a list.
updated_samples_to_translate = update_training_samples(dataset_to_translate)

# Backtranslations (es + ja) followed by the 1,000 English seed samples.
updated_combined_with_seed_phrases = (
    updated_samples_es + updated_samples_ja + updated_samples_to_translate
)

# First 2,000 samples of the SQuAD train split. The rows are selected *before*
# the conversion so that only 2,000 samples are turned into dicts instead of
# the whole train split.
n_baseline_samples = min(2000, len(master_dataset["train"]))
updated_training_2000_subset = update_training_samples(
    {"train": master_dataset["train"].select(range(n_baseline_samples))}
)

# The 10,000 baseline samples loaded from samples_to_train.jsonl
# (`dataset_base`), as a list so backtranslated samples can be appended to it.
updated_training_10000_subset = (
    update_training_samples(dataset_base)
)

In [203]:
# Output file names for the cleaned sample lists; the lists are written to
# disk as JSON Lines files a few cells below.
updated_es_filename = "updated_es.jsonl"
updated_ja_filename = "updated_ja.jsonl"
updated_combined_translations_filename = "updated_combined_translations.jsonl"
updated_combined_with_seed_phrases_filename = "updated_combined_with_seed.jsonl"
updated_training_2000_subset_filename = "updated_training_2000_subset.jsonl"

In [206]:
# Parallel lists: lst_objs[i] is written to lst_filenames[i] by the next cell,
# so the two lists must stay in the same order.
lst_filenames = [
    updated_es_filename, updated_ja_filename,
    updated_combined_translations_filename,
    updated_combined_with_seed_phrases_filename,
    updated_training_2000_subset_filename
]
lst_objs = [
    updated_samples_es, updated_samples_ja,
    # Spanish + Japanese backtranslations. This slot previously held the
    # English seed samples, so "updated_combined_translations.jsonl" contained
    # no backtranslations at all.
    updated_samples_es + updated_samples_ja,
    updated_combined_with_seed_phrases, updated_training_2000_subset
]

In [207]:
# Persist every sample list as a JSON Lines file (one record per line).
for out_path, records in zip(lst_filenames, lst_objs):
  pd.DataFrame(records).to_json(out_path, orient='records', lines=True)


### Generating alternative datasets

We will need several other datasets as well; let's generate them.

#### Does having the original seed phrases along with the backtranslations improve performance?

For this, we'll need 3 datasets:

- 2,000 training samples (baseline)
- 1,000 English seed phrases + 1,000 Spanish backtranslations
- 1,000 English seed phrases + 1,000 Japanese backtranslations

In [208]:
# The 2,000-sample baseline is already available above (see
# `updated_training_2000_subset_filename`), so only the two mixed datasets
# need to be built here.
eng_plus_es_2000_fp = "eng_plus_es_2000.jsonl"
eng_plus_ja_2000_fp = "eng_plus_ja_2000.jsonl"

# Backtranslated samples followed by the English seed samples.
eng_plus_es_2000 = updated_samples_es + updated_samples_to_translate
eng_plus_ja_2000 = updated_samples_ja + updated_samples_to_translate

In [209]:
# Write the two "seed + backtranslation" sample lists as JSON Lines files.
seed_mix_paths = [eng_plus_es_2000_fp, eng_plus_ja_2000_fp]
seed_mix_records = [eng_plus_es_2000, eng_plus_ja_2000]
for out_path, records in zip(seed_mix_paths, seed_mix_records):
  pd.DataFrame(records).to_json(out_path, orient='records', lines=True)


In [210]:
# Reload the three 2,000-sample files as DatasetDicts, in the same order as
# the file paths listed below.
(
    training_2000_subset_dataset,
    eng_plus_es_2000_dataset,
    eng_plus_ja_2000_dataset,
) = (
    datasets.load_dataset('json', data_files=jsonl_path)
    for jsonl_path in (
        updated_training_2000_subset_filename,
        eng_plus_es_2000_fp,
        eng_plus_ja_2000_fp,
    )
)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-6fe5eb6b3681d774/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-6fe5eb6b3681d774/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-234b621db7c978bc/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-234b621db7c978bc/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-3a33e35a38fa051e/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-3a33e35a38fa051e/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

#### Variations in proportion of original vs. backtranslation (for a fixed sample size)

For this, we'll need 9 datasets:

- Training samples 100%, backtranslation 0%:
    - 1,000 training samples
- Training samples 75%, backtranslation 25%:
    - 750 training samples, 250 Spanish
    - 750 training samples, 250 Japanese
- Training samples 50%, backtranslation 50%:
    - 500 training samples, 500 Spanish
    - 500 training samples, 500 Japanese
- Training samples 25%, backtranslation 75%:
    - 250 training samples, 750 Spanish
    - 250 training samples, 750 Japanese
- Training samples 0%, backtranslation 100%:
    - 1,000 Spanish
    - 1,000 Japanese

In [419]:
# Output file names for the mixed-proportion datasets. The number in each name
# is the percentage of original training samples; the language code names the
# backtranslation source that fills the remainder.
train_75_es_data_fp = "train_75_es_data.jsonl"
train_75_ja_data_fp = "train_75_ja_data.jsonl"

train_50_es_data_fp = "train_50_es_data.jsonl"
train_50_ja_data_fp = "train_50_ja_data.jsonl"

train_25_es_data_fp = "train_25_es_data.jsonl"
train_25_ja_data_fp = "train_25_ja_data.jsonl"

In [420]:
# The 100%-original and 100%-backtranslated datasets are generated elsewhere,
# so only the intermediate mixes are assembled here.
def _mix_with_originals(backtranslated, n_original, total=1000):
  """Take the first `n_original` baseline samples and top up to `total`
  with the first samples of `backtranslated`."""
  originals = updated_training_2000_subset[:n_original]
  return originals + backtranslated[:total - n_original]


train_75_es_data = _mix_with_originals(updated_samples_es, 750)
train_75_ja_data = _mix_with_originals(updated_samples_ja, 750)

train_50_es_data = _mix_with_originals(updated_samples_es, 500)
train_50_ja_data = _mix_with_originals(updated_samples_ja, 500)

train_25_es_data = _mix_with_originals(updated_samples_es, 250)
train_25_ja_data = _mix_with_originals(updated_samples_ja, 250)


In [421]:
# Parallel lists: proportion_data[i] is written to proportion_data_filenames[i]
# by the next cell, so both must keep the same order.
proportion_data_filenames = [
    train_75_es_data_fp, train_75_ja_data_fp, train_50_es_data_fp,
    train_50_ja_data_fp, train_25_es_data_fp, train_25_ja_data_fp
]

proportion_data = [
    train_75_es_data, train_75_ja_data, train_50_es_data, train_50_ja_data,
    train_25_es_data, train_25_ja_data
]

In [422]:
# Write each mixed-proportion sample list as a JSON Lines file.
for out_path, records in zip(proportion_data_filenames, proportion_data):
  pd.DataFrame(records).to_json(out_path, orient='records', lines=True)


In [423]:
# Reload the six mixed-proportion files as DatasetDicts, in the same order as
# the file paths listed below.
(
    train_75_es_dataset,
    train_75_ja_dataset,
    train_50_es_dataset,
    train_50_ja_dataset,
    train_25_es_dataset,
    train_25_ja_dataset,
) = (
    datasets.load_dataset('json', data_files=jsonl_path)
    for jsonl_path in (
        train_75_es_data_fp,
        train_75_ja_data_fp,
        train_50_es_data_fp,
        train_50_ja_data_fp,
        train_25_es_data_fp,
        train_25_ja_data_fp,
    )
)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-47d2f9f6546f32d5/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-47d2f9f6546f32d5/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-66be2b6b666e7b96/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-66be2b6b666e7b96/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-a7056f3e69d9a91a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a7056f3e69d9a91a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-0fa0c74587b12747/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-0fa0c74587b12747/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-3f7489fb531a91b4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-3f7489fb531a91b4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-7bb46240fec13e6b/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-7bb46240fec13e6b/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

#### Impact of backtranslations when increasing sample size

For this one we need 5 datasets:
- 10,000 training samples
- 11,000 original English context (10,000 training samples + 1,000 English seed phrases that were backtranslated)
- 11,000 (10,000 training samples + 1,000 Spanish backtranslations)
- 11,000 (10,000 training samples + 1,000 Japanese backtranslations)
- 11,000 (9,000 training samples + 1,000 Spanish backtranslations + 1,000 Japanese backtranslations)

In [219]:
# Output file names for the 11,000-sample datasets described above.
eng_only_11000_fp = "eng_only_11000_data.jsonl"
eng_es_11000_fp = "eng_es_11000_data.jsonl"
eng_ja_11000_fp = "eng_ja_11000_data.jsonl"
eng_es_ja_11000_fp = "eng_es_ja_11000_data.jsonl"

In [470]:
# The 10,000 baseline samples are already available as
# `updated_training_10000_subset` (built from samples_to_train.jsonl above).
# Each dataset appends 1,000 extra samples to them: the English seed samples,
# or one set of backtranslations.
eng_only_11000_data = updated_training_10000_subset + updated_samples_to_translate
eng_es_11000_data = updated_training_10000_subset + updated_samples_es
eng_ja_11000_data = updated_training_10000_subset + updated_samples_ja
# Both backtranslation sets: only the first 9,000 baseline samples are kept so
# that the total stays comparable to the other datasets.
eng_es_ja_11000_data = (
    updated_training_10000_subset[:9000]
    + updated_samples_es
    + updated_samples_ja
)

In [472]:
# Parallel lists: mixed_data_lst[i] is written to mixed_data_fps[i] by the
# next cell, so both must keep the same order.
mixed_data_fps = [
  eng_only_11000_fp, eng_es_11000_fp, eng_ja_11000_fp, eng_es_ja_11000_fp
]
mixed_data_lst = [
    eng_only_11000_data, eng_es_11000_data, eng_ja_11000_data,
    eng_es_ja_11000_data
]


In [473]:
# Write each 11,000-sample list as a JSON Lines file.
for out_path, records in zip(mixed_data_fps, mixed_data_lst):
  pd.DataFrame(records).to_json(out_path, orient='records', lines=True)


In [474]:
# Reload the 11,000-sample files as DatasetDicts.
eng_only_11000_dataset = datasets.load_dataset(
    'json', data_files=eng_only_11000_fp
)
# Backward-compatible alias: the variable was originally misspelled and the
# dataset-selection cell further down still refers to it by that name.
eng_only_11000_dataet = eng_only_11000_dataset
eng_es_11000_dataset = datasets.load_dataset(
    'json', data_files=eng_es_11000_fp
)
eng_ja_11000_dataset = datasets.load_dataset(
    'json', data_files=eng_ja_11000_fp
)
eng_es_ja_11000_dataset = datasets.load_dataset(
    'json', data_files=eng_es_ja_11000_fp
)



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-383425ed55bb834d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-383425ed55bb834d/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-8e4ffbbd2acbb183/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-8e4ffbbd2acbb183/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-9cb1d59dd9f04673/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-9cb1d59dd9f04673/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-a6378c57bbe70a15/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a6378c57bbe70a15/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

### Load datasets

Now that we've finished our preprocessing, let's load all the .jsonl files

In [159]:
# Reload the cleaned backtranslation files as DatasetDicts, in the same order
# as the file paths listed below.
(
    updated_dataset_es,
    updated_dataset_ja,
    # combining Spanish and Japanese backtranslations
    updated_dataset_combined_translations,
    # combining Spanish and Japanese backtranslations with the 1,000 samples
    # used for the backtranslation
    updated_dataset_combined_with_seed_phrases,
) = (
    datasets.load_dataset('json', data_files=jsonl_path)
    for jsonl_path in (
        updated_es_filename,
        updated_ja_filename,
        updated_combined_translations_filename,
        updated_combined_with_seed_phrases_filename,
    )
)




  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

### Select the appropriate training data for the current batch of training

In [477]:
# NOTE: this sets the dataset var to be whatever is here, since everything below uses the "dataset" var
# Exactly one assignment should be active; uncomment the experiment to run and
# comment out the others. Each candidate is a DatasetDict built in the
# sections above, so the corresponding cell must have been run first.
#dataset = dataset_to_translate
#dataset = updated_dataset_es
#dataset = updated_dataset_ja
#dataset = updated_dataset_combined_translations
#dataset = updated_dataset_combined_with_seed_phrases
#dataset = training_2000_subset_dataset
#dataset = eng_plus_es_2000_dataset
#dataset = eng_plus_ja_2000_dataset
#dataset = train_75_es_dataset
#dataset = train_75_ja_dataset
#dataset = train_50_es_dataset
#dataset = train_50_ja_dataset
#dataset = train_25_es_dataset
#dataset = train_25_ja_dataset
# NOTE(review): "dataet" is a typo carried over from the cell that loads the
# 11,000-sample datasets; the name has to match that cell.
dataset = eng_only_11000_dataet

In [478]:
# Display the selected DatasetDict to check its splits, features and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11000
    })
})

In [479]:
# Sanity check before training: confirm the train split has the expected
# num_rows and features, and eyeball its first record.
train_split = dataset["train"]
print(train_split)
print(train_split[0])


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11000
})
{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bern

In [480]:
# set the validation set of the original SQuAD dataset as the validation set of our dataset
# NOTE: this assigns into the DatasetDict object selected above, so that
# object (not a copy) gains the "validation" split.
dataset["validation"] = master_dataset["validation"]

## Set up configuration

In [481]:
# Extra keyword arguments forwarded to `from_pretrained`; none are used here.
task_kwargs = {}
# Model class and checkpoint that are loaded in the next section.
model_class = AutoModelForQuestionAnswering
pretrained_model = "google/electra-small-discriminator"

In [482]:
# Switches that gate the featurization, training and evaluation cells below.
run_training_bool = True
run_evaluation_bool = True

## Load ELECTRA model and tokenizer

In [483]:
# Instantiate the model and its matching tokenizer from the same checkpoint.
model = model_class.from_pretrained(pretrained_model, **task_kwargs)
# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--electra-small-discriminator/snapshots/153f486d928bcfc213932f8fc91fc2e3c41af769/config.json
Model config ElectraConfig {
  "_name_or_path": "google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.23.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weight

In [484]:
def prepare_train_dataset(exs):
    """Featurize a batch of training examples with the notebook's tokenizer."""
    return prepare_train_dataset_qa(exs, tokenizer)


def prepare_eval_dataset(exs):
    """Featurize a batch of evaluation examples with the notebook's tokenizer."""
    return prepare_validation_dataset_qa(exs, tokenizer)

## Set up datasets for training and evaluation

In [485]:
# Placeholders; the next cell fills them in depending on `run_training_bool`
# and `run_evaluation_bool`.
train_dataset = None
eval_dataset = None
train_dataset_featurized = None
eval_dataset_featurized = None
# Key of the split in `dataset` that is used for evaluation.
eval_split = "validation"

In [486]:
# Number of worker processes used by `Dataset.map` for featurization. This
# name was referenced below without being defined anywhere in the notebook,
# which raises a NameError on a fresh kernel; 2 matches the two progress bars
# (#0 and #1) in this cell's recorded output.
NUM_PREPROCESSING_WORKERS = 2

if run_training_bool:
    train_dataset = dataset['train']
    # Replace the raw columns by the model inputs produced by the featurizer.
    train_dataset_featurized = train_dataset.map(
        prepare_train_dataset,
        batched=True,
        num_proc=NUM_PREPROCESSING_WORKERS,
        remove_columns=train_dataset.column_names
    )
if run_evaluation_bool:
    # The un-featurized `eval_dataset` is kept as well: it is passed to the
    # trainer as `eval_examples` and used when dumping predictions.
    eval_dataset = dataset[eval_split]
    eval_dataset_featurized = eval_dataset.map(
        prepare_eval_dataset,
        batched=True,
        num_proc=NUM_PREPROCESSING_WORKERS,
        remove_columns=eval_dataset.column_names
    )

    

#0:   0%|          | 0/6 [00:00<?, ?ba/s]

#1:   0%|          | 0/6 [00:00<?, ?ba/s]

   



 



## Configure training

In [487]:
# Generic defaults; the next cell overrides all three for question answering.
trainer_class = Trainer
eval_kwargs = {}
compute_metrics = None

In [488]:
# Question answering uses the tweaked Trainer defined in helpers.py, which
# enables the QA-specific evaluation metrics. It is handed the raw
# (un-featurized) evaluation examples through `eval_kwargs`.
trainer_class = QuestionAnsweringTrainer
eval_kwargs['eval_examples'] = eval_dataset
metric = datasets.load_metric('squad')


def compute_metrics(eval_preds):
    """Score the predictions against the references with the SQuAD metric."""
    return metric.compute(
        predictions=eval_preds.predictions, references=eval_preds.label_ids
    )

In [489]:
# This function wraps the compute_metrics function, storing the model's predictions
# so that they can be dumped along with the computed metrics
# Holds the most recent predictions object; stays None until the wrapper below
# has been called at least once.
eval_predictions = None
def compute_metrics_and_store_predictions(eval_preds):
    """Remember `eval_preds` in the global `eval_predictions`, then score it.

    Args:
        eval_preds: the predictions object handed over by the trainer.

    Returns:
        Whatever the notebook-level `compute_metrics` returns for `eval_preds`.
    """
    global eval_predictions
    eval_predictions = eval_preds
    return compute_metrics(eval_preds)

### Create args object used by Huggingface

In [490]:
# Training configuration. The values are written to an ".args" file that the
# argument parser below picks up.
output_dir = "/content/drive/MyDrive/grad_school/2022_fall/nlp"
do_train   = True
do_eval = True
task    = "qa"
# NOTE: no `--dataset` entry here. In this part of the notebook `dataset` is
# the DatasetDict selected above, not a dataset name.
per_device_train_batch_size = 32
num_train_epochs = 8


#
# DON'T CHANGE - this creates the .args file for the parser below.
#
from pathlib import Path
import sys

# The file sits next to sys.argv[0], with its suffix replaced by ".args".
arg_file = Path(sys.argv[0]).with_suffix(".args")

cli_args = {
    "output_dir": output_dir,
    "do_train": do_train,
    "do_eval": do_eval,
    "task": task,
    "per_device_train_batch_size": per_device_train_batch_size,
    "num_train_epochs": num_train_epochs,
}

# Overwrite the file in one go, one "--flag value" line per setting. Writing
# from Python avoids the "No such file or directory" message that `rm` printed
# on the first run and does not depend on shell quoting of the values.
arg_file.write_text(
    "".join(f"--{name} {value}\n" for name, value in cli_args.items())
)

# Show the resulting file for a visual check.
print(arg_file.read_text())

--output_dir /content/drive/MyDrive/grad_school/2022_fall/nlp
--do_train True
--do_eval True
--task qa
--per_device_train_batch_size 32
--num_train_epochs 8


In [491]:
# Argument parser built from the TrainingArguments dataclass; the next cell
# adds the notebook-specific flags to it.
argp = HfArgumentParser(TrainingArguments)

In [492]:
# Notebook-specific flags, added on top of the TrainingArguments fields.
argp.add_argument('--model', type=str,
                  default='google/electra-small-discriminator',
                  help="""This argument specifies the base model to fine-tune.
    This should either be a HuggingFace model ID (see https://huggingface.co/models)
    or a path to a saved model checkpoint (a folder containing config.json and pytorch_model.bin).""")
# `--task` is required, so it takes no default: argparse never falls back to a
# default for a required option, and the old `default='nli'` was misleading.
argp.add_argument('--task', type=str, choices=['nli', 'qa'], required=True,
                  help="""This argument specifies which task to train/evaluate on.
    Pass "nli" for natural language inference or "qa" for question answering.
    By default, "nli" will use the SNLI dataset, and "qa" will use the SQuAD dataset.""")
# No `--dataset` flag is registered: the training data is chosen in the
# "Select the appropriate training data" cell instead.
argp.add_argument('--max_length', type=int, default=128,
                  help="""This argument limits the maximum sequence length used during training/evaluation.
    Shorter sequence lengths need less memory and computation time, but some examples may end up getting truncated.""")
argp.add_argument('--max_train_samples', type=int, default=None,
                  help='Limit the number of examples to train on.')
# The trailing semicolon keeps the returned argparse action from being echoed
# as this cell's output.
argp.add_argument('--max_eval_samples', type=int, default=None,
                  help='Limit the number of examples to evaluate on.');

_StoreAction(option_strings=['--max_eval_samples'], dest='max_eval_samples', nargs=None, const=None, default=None, type=<class 'int'>, choices=None, help='Limit the number of examples to evaluate on.', metavar=None)

In [493]:
# Parse into: the TrainingArguments instance, a namespace holding the extra
# flags added above (e.g. `args.task`), and the list of leftover strings.
# NOTE(review): no argument list is passed, so the values presumably come from
# the ".args" file written above — confirm against the installed
# transformers version.
training_args, args, _ = argp.parse_args_into_dataclasses(return_remaining_strings=True)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [494]:
# Initialize the Trainer object with the specified arguments and the model and dataset we loaded above
# `trainer_class` was chosen in the "Configure training" section. The
# featurized datasets are None when the corresponding run_*_bool switch is off.
trainer = trainer_class(
    model=model,
    args=training_args,
    train_dataset=train_dataset_featurized,
    eval_dataset=eval_dataset_featurized,
    tokenizer=tokenizer,
    # Wrapper that also keeps the raw predictions for the dump at the end.
    compute_metrics=compute_metrics_and_store_predictions
)

In [495]:
# from: https://github.com/nlp-with-transformers/notebooks/issues/31
# Keep the trainer's original collator and wrap it so that every batch is
# handed on as a plain dict.
old_collator = trainer.data_collator


def _collate_to_plain_dict(data):
    """Run the original collator on `data` and convert the result to a dict."""
    return dict(old_collator(data))


trainer.data_collator = _collate_to_plain_dict

## Run training

In [None]:
# Train the model (skipped when run_training_bool is False).
if run_training_bool:
    # To resume from a checkpoint instead of starting over, use the commented
    # call below; see https://piazza.com/class/l5wwouzxsk54qb/post/767
    #trainer.train(resume_from_checkpoint = True)
    trainer.train()
    # Uncomment to save the final model explicitly:
    #trainer.save_model()
    # If you want to customize the way the loss is computed, you should subclass Trainer and override the "compute_loss"
    # method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.compute_loss).
    #
    # You can also add training hooks using Trainer.add_callback:
    #   See https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer.add_callback
    #   and https://huggingface.co/transformers/main_classes/callback.html#transformers.TrainerCallback


***** Running training *****
  Num examples = 11009
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2760
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.4246
1000,1.2638
1500,0.9092
2000,0.72


Saving model checkpoint to /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-500
Configuration saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-1000
Configuration saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/grad_school/2022_fall/nlp/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/gra

## Run evaluation

In [None]:
if run_evaluation_bool:
    # Evaluate; `eval_kwargs` carries the raw evaluation examples for QA.
    results = trainer.evaluate(**eval_kwargs)

    # To add custom metrics, you should replace the "compute_metrics" function (see comments above).
    #
    # If you want to change how predictions are computed, you should subclass Trainer and override the "prediction_step"
    # method (see https://huggingface.co/transformers/_modules/transformers/trainer.html#Trainer.prediction_step).
    # If you do this your custom prediction_step should probably start by calling super().prediction_step and modifying the
    # values that it returns.

    print('Evaluation results:')
    print(results)

    os.makedirs(training_args.output_dir, exist_ok=True)

    # Metrics go into a single JSON document.
    metrics_path = os.path.join(training_args.output_dir, 'eval_metrics.json')
    with open(metrics_path, encoding='utf-8', mode='w') as metrics_file:
        json.dump(results, metrics_file)

    # Predictions go into a JSON Lines file: one evaluation example per line,
    # extended with what the model predicted for it.
    predictions_path = os.path.join(training_args.output_dir, 'eval_predictions.jsonl')
    with open(predictions_path, encoding='utf-8', mode='w') as predictions_file:
        if args.task == 'qa':
            # Look predictions up by example id rather than by position.
            answer_by_id = {
                entry['id']: entry['prediction_text']
                for entry in eval_predictions.predictions
            }
            for example in eval_dataset:
                record = dict(example)
                record['predicted_answer'] = answer_by_id[example['id']]
                predictions_file.write(json.dumps(record))
                predictions_file.write('\n')
        else:
            # Non-QA task: predictions are per-example score arrays, aligned
            # with the evaluation examples by position.
            for index, example in enumerate(eval_dataset):
                scores = eval_predictions.predictions[index]
                record = dict(example)
                record['predicted_scores'] = scores.tolist()
                record['predicted_label'] = int(scores.argmax())
                predictions_file.write(json.dumps(record))
                predictions_file.write('\n')