In [None]:
# Work around CUDA out-of-memory errors caused by allocator fragmentation.
# This runs in the first cell, ahead of the cell that imports torch.
import os
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"})

In [None]:
!pip install transformers
!pip install datasets
!pip install sentencepiece
!pip install evaluate
!pip install --upgrade accelerate
!pip install wandb
!pip install sacremoses
!pip install huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m127.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [None]:
# login to HF to upload dataset, model, and tokenizer to hub
# (the push_to_hub calls in later cells need an authenticated session)
from huggingface_hub import notebook_login
notebook_login()

# Read in data

Original Data can be found at: https://huggingface.co/datasets/juancavallotti/multilingual-gec

Note that we only use the original train split: the original test split is too small a share of the data (~1%), so we carve our own dev and test splits out of the train split below.

In [None]:
# Load only the 'train' split of the multilingual GEC dataset from the HF hub.
from datasets import load_dataset, DatasetDict
raw_dataset = load_dataset('juancavallotti/multilingual-gec',split='train')

In [None]:
# keep only the rows whose language tag is French
def is_french(example):
  """Return True when the example's 'lang' field is 'fr'."""
  return example["lang"] == 'fr'

fr_dataset = raw_dataset.filter(is_french)

In [None]:
# inspect the columns and row count of the French-only subset
print(fr_dataset)

In [None]:
# split into train (90% train), dev (5%), test (5%)
# A fixed seed makes both random splits reproducible across kernel restarts;
# without it every run produces different train/dev/test sets.
train_dataset = fr_dataset.train_test_split(test_size=0.1, seed=42)

# halve the 10% hold-out into equally sized dev and test sets
dev_and_test = train_dataset['test'].train_test_split(test_size=0.5, seed=42)

In [None]:
# gather the three splits under their final names, then wrap them in a DatasetDict
splits = {
  'train': train_dataset['train'],
  'dev': dev_and_test['train'],
  'test': dev_and_test['test'],
}
dataset = DatasetDict(splits)

In [None]:
# save split on HF hub
# Authentication comes from the notebook_login() call earlier in the notebook.
# SECURITY: never embed an access token in notebook source; any token that was
# ever committed here must be revoked on the Hugging Face account.
dataset.push_to_hub('fr-gec-dataset')

In [None]:
# show the train/dev/test splits that were just assembled
print(dataset)

# Load in dataset, model and tokenizer

In [None]:
# Reload the pre-split French GEC dataset from the HF hub; this rebinds `dataset`,
# so the notebook can be started from here without redoing the split above.
from datasets import load_dataset
dataset = load_dataset('akufeldt/fr-gec-dataset')

Downloading readme:   0%|          | 0.00/729 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/akufeldt___parquet/akufeldt--fr-gec-dataset-dc2539dd430be97d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.61M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/486k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/484k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/59850 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3325 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/3325 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/akufeldt___parquet/akufeldt--fr-gec-dataset-dc2539dd430be97d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import torch
from transformers import MT5TokenizerFast, MT5ForConditionalGeneration

# run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Using multilingual T5 model
# Model card can be found here: https://huggingface.co/google/mt5-base

# Move the model to `device` right away so the zero-shot generation further down
# runs on the same device as its inputs (the fine-tuned model is loaded the same way).
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-base').to(device)
tokenizer = MT5TokenizerFast.from_pretrained('google/mt5-base')

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]



# Preprocessing

In [None]:
# token budget for both the encoder input and the target sequence
max_input_length = max_target_length = 200

In [None]:
# sanity check for preprocess_inputs: dropping the first 13 characters
# removes exactly the "fix grammar: " prefix
st = "fix grammar: Il est très importante de parler une langue étrangère."
print(st[len("fix grammar: "):])

Il est très importante de parler une langue étrangère.


In [None]:
# strip the 13-character "fix grammar: " prefix from every input, then tokenize
def preprocess_inputs(data):
  """Tokenize the 'modified' column of a batch after dropping its prefix.

  Each text loses its first 13 characters (the "fix grammar: " prefix) and is
  then padded/truncated to `max_input_length` tokens.
  Returns the tokenizer output for the whole batch.
  """
  stripped = [text[13:] for text in data['modified']]
  return tokenizer(
    stripped,
    max_length=max_input_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
  )

# tokenize the reference sentences and mask the padding out of the loss
def preprocess_labels(data):
  """Build the 'labels' column from the 'sentence' column of a batch.

  Sentences are padded/truncated to `max_target_length` tokens. Padding
  positions are then replaced by -100, the label value the loss ignores;
  leaving the pad token id in place would train the model on padding.
  Returns a dict with a single 'labels' key (one list of ids per sentence).
  """
  encoding = tokenizer(data['sentence'], padding='max_length', max_length=max_target_length, truncation=True)
  pad_id = tokenizer.pad_token_id
  labels = [
    [token_id if token_id != pad_id else -100 for token_id in sequence]
    for sequence in encoding['input_ids']
  ]
  return {'labels': labels}

In [None]:
# Tokenize in two passes over every split: first the inputs, then the labels.
# batched=True hands each function a batch of rows rather than a single row.
dataset = dataset.map(preprocess_inputs,batched=True)
dataset = dataset.map(preprocess_labels,batched=True)

Map:   0%|          | 0/59850 [00:00<?, ? examples/s]

Map:   0%|          | 0/3325 [00:00<?, ? examples/s]

Map:   0%|          | 0/3325 [00:00<?, ? examples/s]

Map:   0%|          | 0/59850 [00:00<?, ? examples/s]

Map:   0%|          | 0/3325 [00:00<?, ? examples/s]

Map:   0%|          | 0/3325 [00:00<?, ? examples/s]

In [None]:
# check which columns the tokenization passes added to each split
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['lang', 'sentence', 'modified', 'transformation', 'sec_transformation', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 59850
    })
    dev: Dataset({
        features: ['lang', 'sentence', 'modified', 'transformation', 'sec_transformation', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3325
    })
    test: Dataset({
        features: ['lang', 'sentence', 'modified', 'transformation', 'sec_transformation', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3325
    })
})


In [None]:
# pull out the train, dev and test splits; each one is shuffled with the same
# seed and stripped of the metadata columns listed below
unused_columns = ['lang','transformation', 'sec_transformation', '__index_level_0__']
train_dataset, dev_dataset, test_dataset = (
  dataset[split_name].shuffle(seed=42).remove_columns(unused_columns)
  for split_name in ('train', 'dev', 'test')
)

In [None]:
# confirm the columns that remain in the training split
print(train_dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 59850
})


# Get baseline scores: zero-shot evaluate mT5 on our data

In [None]:
# for the given dataset, get model predictions and return a list of inputs, a list of preds, and a list of references for calculating metrics.
def get_model_preds(model, tokenizer, dataset):
  """Generate one prediction per example of `dataset`.

  Args:
    model: model exposing `generate(...)` and a `device` attribute.
    tokenizer: callable tokenizer that also provides `decode(...)`.
    dataset: iterable of examples with a 'modified' field (input text carrying
      the 13-character "fix grammar: " prefix) and a 'sentence' field (reference).

  Returns:
    A `(src, preds, refs)` tuple of equally long lists: the inputs without
    their prefix, the decoded predictions, and the reference sentences.
  """
  src = []
  preds = []
  refs = []
  for x in dataset:
    # drop the "fix grammar: " prefix once and reuse the result below
    source_text = x['modified'][13:]

    # send the inputs to whichever device the model lives on, so the two always match
    tokenized_text = tokenizer(source_text, padding=False, truncation=True, return_tensors='pt')['input_ids'].to(model.device)

    # get model output (do_sample=True: decoding is sampled, so repeated runs may differ)
    output = model.generate(tokenized_text,
                            do_sample=True,
                            max_length=200,
                            top_k=50,
                            top_p=0.95,
                            num_return_sequences=1,
                            no_repeat_ngram_size=3,
                            early_stopping=True,
                            temperature=0.2)

    pred = tokenizer.decode(output[0], skip_special_tokens=True)

    src.append(source_text)
    preds.append(pred)
    refs.append(x['sentence'])

  return src, preds, refs

In [None]:
# zero-shot baseline on the dev split, using the pretrained model before any fine-tuning
zeroshot_dev_src, zeroshot_dev_preds, zeroshot_dev_refs = get_model_preds(model, tokenizer, dev_dataset)

In [None]:
# zero-shot baseline on the test split; the split variable is named `test_dataset`
zeroshot_tst_src, zeroshot_tst_preds, zeroshot_tst_refs = get_model_preds(model, tokenizer, test_dataset)

In [None]:
# write a list of sentences (either src, preds, or refs) to a .txt file, one per line, for computing GLEU
def dump_to_txt(name,lst):
  """Write every string of `lst` to the file `name`, each followed by a newline.

  The file is opened in write mode (existing content is replaced) as UTF-8.
  """
  with open(name, 'w', encoding='utf-8') as out_file:
    out_file.writelines(sentence + '\n' for sentence in lst)

In [None]:
# write the zero-shot sources, predictions and references for dev and test to disk
for txt_name, sentences in (
  ('zeroshot_dev_src.txt', zeroshot_dev_src),
  ('zeroshot_dev_preds.txt', zeroshot_dev_preds),
  ('zeroshot_dev_refs.txt', zeroshot_dev_refs),
  ('zeroshot_tst_src.txt', zeroshot_tst_src),
  ('zeroshot_tst_preds.txt', zeroshot_tst_preds),
  ('zeroshot_tst_refs.txt', zeroshot_tst_refs),
):
  dump_to_txt(txt_name, sentences)

In [None]:
# `bleu` is not defined by any earlier cell, so load the BLEU metric here from
# the `evaluate` package installed in the setup cell.
import evaluate
bleu = evaluate.load('bleu')

# compute BLEU on dev zeroshot results
dev_zs_results = bleu.compute(predictions=zeroshot_dev_preds, references=zeroshot_dev_refs)
print(dev_zs_results)

In [None]:
# BLEU of the zero-shot predictions against the test references
tst_zs_results = bleu.compute(
  predictions=zeroshot_tst_preds,
  references=zeroshot_tst_refs,
)
print(tst_zs_results)

{'bleu': 0.004375680359103682, 'precisions': [0.098064675694058, 0.014265531856242032, 0.004319738480697385, 0.0010281167583031604], 'brevity_penalty': 0.49285648467887505, 'length_ratio': 0.5856387601390498, 'translation_length': 32346, 'reference_length': 55232}


# Finetune model

In [None]:
# per-device batch sizes for training and evaluation
train_batch_size, eval_batch_size = 8, 8

In [None]:
# collator handed to the Trainer below to assemble training/evaluation batches
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # --- checkpoints and model selection ---
    output_dir='gec_chkpts',
    save_steps=1000,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="loss", # loss to eval models
    greater_is_better=False,      # a lower loss is the better model
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=0.001,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    # --- evaluation ---
    evaluation_strategy='steps',
    eval_steps=500,
    eval_accumulation_steps=1,
    prediction_loss_only=False,
    # --- data handling ---
    remove_unused_columns=True,
    # --- logging ---
    run_name='run_3', # Wandb run name
    logging_steps=500,
    logging_first_step=False,
)

In [None]:
# Wire the model, the training configuration, the train/dev splits and the
# batch collator into a single Trainer; the dev split is used for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
)

In [None]:
# Finetune!
# Runs the training loop configured by `training_args` above.
trainer.train()

In [None]:
# persist the fine-tuned model and its tokenizer side by side in one directory
best_model_dir = 'best_model_gec'
trainer.save_model(best_model_dir)
tokenizer.save_pretrained(best_model_dir)

# from here on, `model` refers to the model instance held by the trainer
model = trainer.model

In [None]:
# push finetuned model to HF hub
# (saves to the local 'finetuned_gec' directory and uploads in the same call)
model.save_pretrained('finetuned_gec', push_to_hub=True)

In [None]:
# push finetuned tokenizer to HF hub
# (uploaded under its own repo name, separate from the model repo)
tokenizer.push_to_hub("finetuned_gec_tokenizer")

# Evaluate model

In [None]:
from transformers import MT5TokenizerFast, MT5ForConditionalGeneration

# Load the fine-tuned model (moved to `device`) and its tokenizer back from the
# HF hub; the two live in separate hub repos.
model_finetuned = MT5ForConditionalGeneration.from_pretrained('akufeldt/finetuned_gec').to(device)
tokenizer_finetuned = MT5TokenizerFast.from_pretrained('akufeldt/finetuned_gec_tokenizer')

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [None]:
# predictions of the fine-tuned model on the dev split
finetune_dev_src, finetune_dev_preds, finetune_dev_refs = get_model_preds(model_finetuned, tokenizer_finetuned, dev_dataset)

In [None]:
# predictions of the fine-tuned model on the test split; the split variable is named `test_dataset`
finetune_tst_src, finetune_tst_preds, finetune_tst_refs = get_model_preds(model_finetuned, tokenizer_finetuned, test_dataset)

In [None]:
# write the fine-tuned sources, predictions and references for dev and test to disk
for txt_name, sentences in (
  ('finetune_dev_src.txt', finetune_dev_src),
  ('finetune_dev_preds.txt', finetune_dev_preds),
  ('finetune_dev_refs.txt', finetune_dev_refs),
  ('finetune_tst_src.txt', finetune_tst_src),
  ('finetune_tst_preds.txt', finetune_tst_preds),
  ('finetune_tst_refs.txt', finetune_tst_refs),
):
  dump_to_txt(txt_name, sentences)

In [None]:
# BLEU of the fine-tuned predictions against the dev references
dev_ft_results = bleu.compute(
  predictions=finetune_dev_preds,
  references=finetune_dev_refs,
)
print(dev_ft_results)

In [None]:
# BLEU of the fine-tuned predictions against the test references
tst_ft_results = bleu.compute(
  predictions=finetune_tst_preds,
  references=finetune_tst_refs,
)
print(tst_ft_results)

Test how the model behaves when given an incorrect sentence.

In [None]:
# a short, deliberately misspelled input to try the fine-tuned model on
ex_incorrect_sent = "bonjour monisserie"

# tokenize (padded/truncated to 200 tokens), then keep only the input ids on `device`
ex_encoding = tokenizer_finetuned(
  ex_incorrect_sent,
  max_length=200,
  padding='max_length',
  truncation=True,
  return_tensors='pt',
)
ex_input = ex_encoding['input_ids'].to(device)

In [None]:
# sampling settings for the single-example generation below
generation_kwargs = dict(
  do_sample=True,
  max_length=300,
  top_k=50,
  top_p=0.95,
  num_return_sequences=1,
  no_repeat_ngram_size=3,
  early_stopping=True,
  temperature=0.2,
)
ex_pred = model_finetuned.generate(ex_input, **generation_kwargs)

In [None]:
# decode the first (and only) generated sequence back to text, dropping special tokens
ex_output = tokenizer_finetuned.decode(ex_pred[0], skip_special_tokens=True)
print(ex_output)

Bonjour, mon-série.
