# Aula 4 do curso da Fast AI

Aluno: Adriano Ferreira Lopes

Matrícula: 201802671

## Importando bibliotecas

In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Value of the KAGGLE_KERNEL_RUN_TYPE environment variable, or '' when it is not set.
# Used below as a truthy flag to decide whether the notebook is running on Kaggle.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [2]:
! pip install -q -U git+https://github.com/huggingface/transformers.git
! pip install -q -U git+https://github.com/huggingface/accelerate.git

## Obtendo dataset no Kaggle

Caso não esteja usando o Kaggle, é necessário baixar o dataset manualmente. Mais detalhes aqui:
https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners

In [3]:
if iskaggle:
    path = Path('../input/us-patent-phrase-to-phrase-matching')
    !pip install -q datasets

## Visualizando dados

In [4]:
# List the files in the dataset directory; {path} is interpolated from the Python variable.
!ls {path}

sample_submission.csv  test.csv  train.csv


In [5]:
# Load the training set and preview its first rows.
df = pd.read_csv(path/'train.csv')
df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [6]:
# Summarise the string (object) columns: count, number of unique values, most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [7]:
# Build the single text field fed to the model by joining context, target and anchor
# with fixed textual markers.
df['input'] = (
    'TEXT1: ' + df['context']
    + '; TEXT2: ' + df['target']
    + '; ANC1: ' + df['anchor']
)

In [8]:
# Preview the first few concatenated input strings.
df.input.head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

## Transformando as entradas em Tokens

In [9]:
from datasets import Dataset,DatasetDict

# Convert the pandas DataFrame into a Hugging Face `datasets.Dataset`.
ds = Dataset.from_pandas(df)

In [10]:
# Display the dataset summary (feature names and number of rows).
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

Utilizando o tokenizador de um modelo pré-treinado (DeBERTa v3 small) para converter os textos em tokens

In [11]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
model_nm = 'microsoft/deberta-v3-small'

In [12]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer
# Load the tokenizer that matches the pretrained checkpoint named in `model_nm`.
tokz = AutoTokenizer.from_pretrained(model_nm)


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def get_tok(x):
    """Tokenize the ``input`` field of ``x`` using the global tokenizer ``tokz``.

    ``x`` is indexed with the key ``'input'``; the tokenizer's result is
    returned unchanged.
    """
    text = x['input']
    return tokz(text)

In [14]:
# Tokenize the whole dataset; with batched=True, get_tok is called on batches of rows
# instead of one row at a time.
tok_ds = ds.map(get_tok, batched=True)

  0%|          | 0/37 [00:00<?, ?ba/s]

In [15]:
# Inspect the first example: the raw input text next to its token ids.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

In [16]:
# Display the tokenized dataset summary, including the columns added by the tokenizer.
tok_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [17]:
# Rename the target column from `score` to `labels`, the name used for the
# regression target from here on.
# The guard keeps the cell idempotent: re-running it after the rename is a no-op
# instead of an error about a missing `score` column.
if 'score' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'score': 'labels'})

## Obtendo os dados de teste e validação

In [18]:
# Load the competition test set (no score column) and summarise its columns.
eval_df = pd.read_csv(path/'test.csv')
eval_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,el display,inorganic photoconductor drum,G02
freq,1,2,1,3


In [19]:
# Hold out 25% of the labelled data with a fixed seed for a reproducible split.
# The resulting 'test' split is the one passed to the Trainer as eval_dataset.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [20]:
# Build the same concatenated input text for the test set as was built for training.
eval_df['input'] = (
    'TEXT1: ' + eval_df['context']
    + '; TEXT2: ' + eval_df['target']
    + '; ANC1: ' + eval_df['anchor']
)
# Convert to a Dataset and tokenize it in batches with the shared get_tok helper.
eval_ds = Dataset.from_pandas(eval_df)
eval_ds = eval_ds.map(get_tok, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

## Treinando o modelo

In [21]:
from transformers import TrainingArguments,Trainer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [22]:
# Training hyperparameters consumed by TrainingArguments below.
# Per-device training batch size (evaluation uses bs*2).
bs = 128
# Number of training epochs.
epochs = 4
# Learning rate passed to the Trainer.
lr = 8e-5

In [23]:
def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    The value is read from the off-diagonal entry of the correlation matrix
    computed by ``np.corrcoef``.
    """
    matrix = np.corrcoef(x, y)
    return matrix[0][1]


def corr_d(eval_pred):
    """Metric callback: unpack ``eval_pred`` into ``corr`` and report the result.

    Returns a dict with the single key ``'pearson'``.
    """
    score = corr(*eval_pred)
    return dict(pearson=score)

In [24]:
# Training configuration, one setting per line.
args = TrainingArguments(
    output_dir='outputs',
    # Optimisation schedule: warm up over the first 10% of training, then cosine decay.
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    num_train_epochs=epochs,
    # Mixed-precision training.
    fp16=True,
    # Evaluate once per epoch; the evaluation batch is twice the training batch.
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    # Disable external experiment-tracking integrations.
    report_to='none',
)

In [25]:
# num_labels=1 gives the classification head a single output.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# Wire the model, the configuration, both data splits and the Pearson metric together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz,
    compute_metrics=corr_d,
)

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Run the training loop; the trailing semicolon suppresses display of the return value.
trainer.train();

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.028279,0.774084
2,No log,0.023152,0.808292
3,No log,0.028225,0.817607
4,No log,0.025673,0.818411


In [27]:
# Predict scores for the tokenized test set and cast the predictions to Python float (float64).
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

array([[ 0.61623317],
       [ 0.72855383],
       [ 0.51941592],
       [ 0.31880563],
       [-0.06260909],
       [ 0.52930957],
       [ 0.55608904],
       [ 0.04247641],
       [ 0.23692061],
       [ 1.06963146],
       [ 0.23642482],
       [ 0.28830203],
       [ 0.78981918],
       [ 0.89056182],
       [ 0.77093303],
       [ 0.47908986],
       [ 0.19071339],
       [-0.01915332],
       [ 0.63804108],
       [ 0.38826233],
       [ 0.44855487],
       [ 0.21917124],
       [ 0.16463418],
       [ 0.24154554],
       [ 0.58495772],
       [-0.04215502],
       [-0.04737101],
       [-0.03182377],
       [-0.06195626],
       [ 0.62019444],
       [ 0.40181005],
       [ 0.06641801],
       [ 0.70426631],
       [ 0.51751512],
       [ 0.4524942 ],
       [ 0.17301825]])

In [28]:
# Limit the predictions to the [0, 1] interval; some raw outputs fall slightly outside it.
preds = preds.clip(0, 1)
preds

array([[0.61623317],
       [0.72855383],
       [0.51941592],
       [0.31880563],
       [0.        ],
       [0.52930957],
       [0.55608904],
       [0.04247641],
       [0.23692061],
       [1.        ],
       [0.23642482],
       [0.28830203],
       [0.78981918],
       [0.89056182],
       [0.77093303],
       [0.47908986],
       [0.19071339],
       [0.        ],
       [0.63804108],
       [0.38826233],
       [0.44855487],
       [0.21917124],
       [0.16463418],
       [0.24154554],
       [0.58495772],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.62019444],
       [0.40181005],
       [0.06641801],
       [0.70426631],
       [0.51751512],
       [0.4524942 ],
       [0.17301825]])