## 1. Set-up

In [1]:
# Mount Drive files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
MODEL_VERSION = "english"

In [3]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m56.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, https://us

In [4]:
import numpy as np
import pandas as pd

import transformers
from datasets import Dataset,load_dataset,load_from_disk, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertForSequenceClassification, BertTokenizer, Trainer, VisualBertModel
from scipy.stats import pearsonr
import sklearn.metrics
from datasets import load_metric, load_dataset
from sklearn.metrics import mean_squared_error

## 2. Arrange datasets and tokenize them

In this section, I'll use the dataset provided from

In [5]:
column_names = ["sentence1", "sentence2", "label"]
test_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/data/en-test.txt", names=column_names, sep="\t")
validation_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/data/en-val.txt", names=column_names, sep="\t")
train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/data/en-train.txt", names=column_names, sep="\t")

In [6]:
train_df

Unnamed: 0,sentence1,sentence2,label
0,umm the california was a cargo ship i dont car...,the california was a cargo ship .,4.00
1,how can i prepare this old exterior wall for p...,how do i prepare this exterior concrete wall f...,4.00
2,the man is playing the guitar .,a man is playing guitar .,5.00
3,`` it was a final test before delivering the m...,state radio said it was the last test before t...,4.00
4,"this does not fully answer your question , but...","i am a phd student in computational science , ...",0.00
...,...,...,...
13360,"pictures of the day : pakistan , syria , indon...",pictures of the day : pakistan and elsewhere,3.20
13361,claims about trayvon 's character and when did...,slanted blogs and when did i do that ?,2.60
13362,msn messenger 6 will be available for download...,the msn messenger 6 software will be available...,3.25
13363,and they never wondr why ! ! u.s gets it wrong...,"opinion : u.s gets it wrong on egypt , again",4.40


In [7]:
#dataset["test"]["sentence1"][:5]

In [8]:
dataset_train = Dataset.from_pandas(train_df,preserve_index=False) 
dataset_dev = Dataset.from_pandas(validation_df,preserve_index=False) 
dataset_test = Dataset.from_pandas(test_df,preserve_index=False) 
dataset = DatasetDict({"train": dataset_train,
                     "test": dataset_test,
                     "validation": dataset_dev})
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 13365
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 250
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 1500
    })
})

Tokenize datasets

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=1)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", model_max_length=512)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [10]:
# examples = [("monitos en la cama", "estamos en la cama"), ("tres tristes tigres", "los tigres son lindos")]

# tokenizer(examples, padding="max_length", truncation=True)

In [11]:
# print(dataset['train']['sentence1'][20])
# print(dataset['train']['sentence2'][20])

# a = tokenizer(dataset['train']['sentence1'][20], dataset['train']['sentence2'][20], padding=True, truncation=True)
# tokenizer.decode(a["input_ids"])

In [12]:
def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/13365 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [13]:
#tokenized_datasets

In [14]:
for input_idd in tokenized_datasets["train"]["input_ids"]:
  if len(input_idd) != 512:
    print(len(input_idd))

We use rmse as a metric since we are in a regression scenario
 (the lower this value, the better the model performs)

In [15]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

## 3. Train the model

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=f"/content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/models/{MODEL_VERSION}",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  #evaluation_strategy="steps",
                                  eval_steps = 10,
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs = 5,
                                  save_total_limit = 2,
                                  #save_strategy = 'steps',
                                  save_strategy = 'epoch',
                                  #load_best_model_at_end=False
                                  load_best_model_at_end=True)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, sentence2. If sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 13365
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4180
  Number of trainable parameters = 108311041


Epoch,Training Loss,Validation Loss,Rmse
1,0.792,0.52712,0.72603
2,0.3421,0.54097,0.735507
3,0.1987,0.531158,0.728806
4,0.129,0.516701,0.718819
5,0.0842,0.49599,0.704265


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, sentence2. If sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/models/english/checkpoint-836
Configuration saved in /content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/models/english/checkpoint-836/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/dl4nlp_labs/final_project/sentence_similarity/models/english/checkpoint-836/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, sentence2. If sentence1, sentence2 

TrainOutput(global_step=4180, training_loss=0.3092104149777353, metrics={'train_runtime': 6178.2974, 'train_samples_per_second': 10.816, 'train_steps_per_second': 0.677, 'total_flos': 1.75822384091904e+16, 'train_loss': 0.3092104149777353, 'epoch': 5.0})

## 4. Save model

In [17]:
# model.save_pretrained(f"/content/drive/MyDrive/Colab Notebooks/MT-model/models_tokenizers/model_{MODEL_VERSION}")
# tokenizer.save_pretrained(f"/content/drive/MyDrive/Colab Notebooks/MT-model/models_tokenizers/tokenizer_{MODEL_VERSION}")

In [18]:

# # load the model/tokenizer
# model = AutoModelForSequenceClassification.from_pretrained(f"/content/drive/MyDrive/Colab Notebooks/MT-model/models_tokenizers/model_{MODEL_VERSION}")
# tokenizer = AutoTokenizer.from_pretrained(f"/content/drive/MyDrive/Colab Notebooks/MT-model/models_tokenizers/tokenizer_{MODEL_VERSION}")



## 5. New predictions

In [19]:
tokenized_test_dataset = tokenized_datasets["test"]

In [20]:
trainer = Trainer(model=model)

# def tokenize_function(examples):
#     return tokenizer(examples["text"], padding="max_length", truncation=True)

def prediction(tokenized_datasets):
    # df=pd.DataFrame({'text':texts})
    # dataset = Dataset.from_pandas(df,preserve_index=False) 
    # tokenized_datasets = dataset.map(tokenize_function)
    raw_pred, _, _ = trainer.predict(tokenized_datasets) 
    return(raw_pred)

def evaluate(preds, gold):
    # Compute pearson:
    pearson = pearsonr(preds, gold)[0]
      
    # Compute rmse
    rmse = mean_squared_error(preds, gold, squared=False)

    print(f"RMSE = {rmse:.6f}")
    print(f"Pearson: {pearson:.6f}")

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [21]:
final_preds = prediction(tokenized_test_dataset)
pred_list = [pred[0] for pred in final_preds]
gold = tokenized_test_dataset['label']
evaluate(pred_list, gold)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: sentence1, sentence2. If sentence1, sentence2 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 250
  Batch size = 8


RMSE = 0.704265
Pearson: 0.881002
