In [90]:
from pathlib import Path
import zipfile

import numpy as np
import kaggle
import pandas as pd

# Huggingface
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments,Trainer

### Getting the data

In [32]:
# Kaggle competition slug; also used as the name of the local data directory
competition = 'us-patent-phrase-to-phrase-matching'
path = Path('/root') / competition

In [34]:
# Download the competition archive; it is opened below via a path relative to
# the current working directory.
kaggle.api.competition_download_cli(competition)

# Extract inside a context manager so the archive handle is closed before the
# file is deleted, and remove it with pathlib instead of a shell `rm` whose
# file name had to be kept in sync with `competition` by hand.
archive = Path(f'{competition}.zip')
with zipfile.ZipFile(archive) as zf:
    zf.extractall(path)
archive.unlink()

Downloading us-patent-phrase-to-phrase-matching.zip to /notebooks/fastai_course


100%|██████████| 682k/682k [00:00<00:00, 76.3MB/s]







### Reading the data

In [35]:
!ls {path}

sample_submission.csv  test.csv  train.csv


In [36]:
df = pd.read_csv(path/'train.csv')

In [37]:
df.head(2)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75


In [38]:
df.describe(include='object')

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [39]:
# Fold context, target and anchor into the single text field that gets tokenized
df['input'] = (
    'TEXT1: ' + df['context']
    + '; TEXT2: ' + df['target']
    + '; ANC1: ' + df['anchor']
)

In [40]:
ds = Dataset.from_pandas(df)

In [29]:
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

### Tokenizer: Tokenization and Numericalisation

In [41]:
# From https://huggingface.co/models?sort=downloads&search=deberta
model_nm = 'microsoft/deberta-v3-small'

In [63]:
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [65]:
len(tokz.vocab)

128001

In [66]:
# Reload with use_fast=False. This rebinds `tokz`, so every later cell
# (tok_func, the Trainer) uses this tokenizer, not the default one loaded above.
tokz = AutoTokenizer.from_pretrained(model_nm, use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [69]:
len(tokz.vocab)

128000

In [70]:
tokz.tokenize("Hello there, my name is Lucas")

['▁Hello', '▁there', ',', '▁my', '▁name', '▁is', '▁Lucas']

In [71]:
tokz.tokenize(df['input'].values[0])

['▁TEXT',
 '1',
 ':',
 '▁A',
 '47',
 ';',
 '▁TEXT',
 '2',
 ':',
 '▁abatement',
 '▁of',
 '▁pollution',
 ';',
 '▁ANC',
 '1',
 ':',
 '▁abatement']

In [72]:
def tok_func(x):
    '''
    Tokenize the "input" field of x with the notebook-level tokenizer `tokz`.

    parameters:
        x: mapping with an "input" entry (a row, or a batch when used with
           Dataset.map(..., batched=True))
    '''
    text = x["input"]
    return tokz(text)

In [73]:
tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/37 [00:00<?, ?ba/s]

In [74]:
## Or in pandas
# tok_ds = df.apply(tok_func, axis=1)

In [75]:
tok_ds[0]

{'id': '37d61fd2272659b1',
 'anchor': 'abatement',
 'target': 'abatement of pollution',
 'context': 'A47',
 'score': 0.5,
 'input': 'TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 'input_ids': [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [76]:
# Rename the score to labels
tok_ds = tok_ds.rename_columns({'score':'labels'})

In [78]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

### Creating test set

In [79]:
# Load the competition's test.csv and build the same 'input' text used for training
eval_df = pd.read_csv(path / 'test.csv')
eval_df['input'] = (
    'TEXT1: ' + eval_df['context']
    + '; TEXT2: ' + eval_df['target']
    + '; ANC1: ' + eval_df['anchor']
)

# Tokenize in batches with the same tok_func applied to the training data
eval_ds = Dataset.from_pandas(eval_df)
eval_ds = eval_ds.map(tok_func, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

### Create the model and the trainer

In [82]:
# Hyperparameters consumed by TrainingArguments below
bs = 128  # per-device training batch size (evaluation uses bs*2)
epochs = 4  # number of training epochs
lr = 8e-5  # learning rate handed to the Trainer

In [87]:
# Training configuration for the HF Trainer
args = TrainingArguments(
    output_dir='outputs', # where checkpoints are written (e.g. outputs/checkpoint-500)
    learning_rate=lr, # learning rate (`lr` from the hyperparameter cell)
    warmup_ratio=0.1, # warm the learning rate up over the first 10% of training steps ...
    lr_scheduler_type='cosine', # ... then follow a cosine schedule; together these approximate a one_cycle policy
    fp16=True, # use mixed precision
    evaluation_strategy="epoch", # evaluate at the end of each epoch
    per_device_train_batch_size=bs, # train batch size
    per_device_eval_batch_size=bs*2, # eval batch size: twice the training batch size
    num_train_epochs=epochs, # train for number of epochs
    weight_decay=0.01,  # weight decay coefficient
    report_to='none') # do not send logs to any external reporting integration

In [101]:
def corr(x, y):
    '''
    Returns the Pearson correlation coefficient between x and y.

    Both inputs are flattened to 1-D before the computation, so a column
    vector of shape (n, 1) (e.g. raw regression outputs) can be compared
    with a flat array of shape (n,). For inputs that are already 1-D the
    result is unchanged.

    parameters:
        x: array-like of numbers
        y: array-like with the same number of elements as x

    returns:
        the correlation coefficient; nan when it is undefined
        (e.g. a single data point)
    '''
    # np.corrcoef treats each row of a 2-D input as a separate variable, so an
    # (n, 1) input next to an (n,) input cannot be stacked; flatten both first.
    x = np.ravel(x)
    y = np.ravel(y)
    return np.corrcoef(x, y)[0][1]

corr([2],[3]), corr([2,3],[3,5])

(nan, 0.9999999999999999)

In [102]:
def compute_metrics(eval_pred):
    '''
    Metric callback passed to the HF Trainer through `compute_metrics=`.

    parameters:
        eval_pred: pair that unpacks into (logits, labels)

    returns:
        dict with one entry, 'pearson', holding corr(logits, labels)
    '''
    predictions, targets = eval_pred
    pearson = corr(predictions, targets)
    return {'pearson': pearson}

In [92]:
# Load the pretrained checkpoint named by model_nm as a sequence-classification model
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm, 
    num_labels=1 #num_labels=1 makes this a regression problem https://stackoverflow.com/a/72510500
)

# Train on dds['train'] and evaluate on dds['test']; both come from the
# train_test_split of train.csv, not from the competition's test.csv.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz, 
    compute_metrics=compute_metrics # adds the 'pearson' metric to each evaluation
)

Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [105]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, context, id, target, anchor. If input, context, id, target, anchor are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27354
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 856


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.02434,0.796798
2,No log,0.025846,0.820384
3,0.030800,0.02241,0.831468
4,0.030800,0.022446,0.832298


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, context, id, target, anchor. If input, context, id, target, anchor are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, context, id, target, anchor. If input, context, id, target, anchor are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved 

TrainOutput(global_step=856, training_loss=0.02362887380279113, metrics={'train_runtime': 152.3207, 'train_samples_per_second': 718.326, 'train_steps_per_second': 5.62, 'total_flos': 716605488222960.0, 'train_loss': 0.02362887380279113, 'epoch': 4.0})

In [109]:
# Predict on the tokenized test.csv rows, keeping only the raw model outputs cast to float
preds = trainer.predict(eval_ds).predictions.astype(float)

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, context, id, target, anchor. If input, context, id, target, anchor are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 36
  Batch size = 256


In [110]:
preds

array([[ 0.50439453],
       [ 0.61572266],
       [ 0.55029297],
       [ 0.29833984],
       [-0.04330444],
       [ 0.50634766],
       [ 0.50390625],
       [-0.01585388],
       [ 0.24023438],
       [ 1.10546875],
       [ 0.25      ],
       [ 0.23168945],
       [ 0.73388672],
       [ 0.90869141],
       [ 0.78222656],
       [ 0.44238281],
       [ 0.27001953],
       [-0.0479126 ],
       [ 0.57763672],
       [ 0.36694336],
       [ 0.46313477],
       [ 0.22668457],
       [ 0.1541748 ],
       [ 0.23083496],
       [ 0.55566406],
       [-0.03042603],
       [-0.04290771],
       [-0.03814697],
       [-0.03372192],
       [ 0.63330078],
       [ 0.38012695],
       [ 0.01074219],
       [ 0.70654297],
       [ 0.54589844],
       [ 0.49487305],
       [ 0.22790527]])

In [111]:
preds = np.clip(preds, 0, 1)

In [113]:
import datasets

# `preds` is a 2-D array with one column (shape (n, 1)). Passing it as-is makes
# every `score` a one-element list, which is then written to the CSV in list
# form instead of as a plain number. Flatten it so each row gets one float.
submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': np.ravel(preds)
})

In [115]:
submission.to_csv('submission.csv')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1117

In [116]:
!ls {path}

sample_submission.csv  test.csv  train.csv


In [117]:
ss = pd.read_csv(path/'sample_submission.csv')

In [118]:
ss

Unnamed: 0,id,score
0,4112d61851461f60,0
1,09e418c93a776564,0
2,36baf228038e314b,0
3,1f37ead645e7f0c8,0
4,71a5b6ad068d531f,0
5,474c874d0c07bd21,0
6,442c114ed5c4e3c9,0
7,b8ae62ea5e1d8bdb,0
8,faaddaf8fcba8a3f,0
9,ae0262c02566d2ce,0
