In [1]:
from pathlib import Path
import zipfile

import numpy as np
import kaggle
import pandas as pd

# Huggingface
from datasets import Dataset, DatasetDict
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments,Trainer

### Getting the data

In [2]:
competition = 'us-patent-phrase-to-phrase-matching'
path = Path(f'/root/{competition}')

In [3]:
kaggle.api.competition_download_cli(competition)

zipfile.ZipFile(f'{competition}.zip').extractall(path)
! rm us-patent-phrase-to-phrase-matching.zip

Downloading us-patent-phrase-to-phrase-matching.zip to /notebooks/fastai_course


100%|██████████| 682k/682k [00:00<00:00, 18.9MB/s]







### Reading the data

In [3]:
!ls {path}

sample_submission.csv  test.csv  train.csv


In [4]:
df = pd.read_csv(path/'train.csv')

In [5]:
df.head(2)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75


In [6]:
df = df.join(df.groupby('anchor').target.agg(list).rename('ref'), on='anchor')

In [7]:
df['ref2'] = df.apply(lambda x:[i for i in x['ref'] if i != x['target']], axis=1)

In [8]:
df['ref'] = df['ref'].apply(",".join)

In [9]:
df['ref2'] = df['ref2'].apply(",".join)

In [10]:
df['input'] = df.context + '[SEP]' + df.target + '[SEP]' + df.anchor + '[SEP]' + df['ref2']

In [11]:
ds = Dataset.from_pandas(df)

In [12]:
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'ref', 'ref2', 'input'],
    num_rows: 36473
})

### Tokenize

In [13]:
# From https://huggingface.co/models?sort=downloads&search=deberta
model_nm = 'microsoft/deberta-v3-small'

In [14]:
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
tokz = AutoTokenizer.from_pretrained(model_nm, use_fast=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
def tok_func(x): return tokz(x["input"])

In [17]:
tok_ds = ds.map(tok_func, batched=True)



  0%|          | 0/37 [00:00<?, ?ba/s]

In [18]:
## Or in pandas
# tok_ds = df.apply(tok_func, axis=1)

In [19]:
# Rename the score to labels
tok_ds = tok_ds.rename_columns({'score':'labels'})

In [20]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'ref', 'ref2', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'ref', 'ref2', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

### Creating test set

In [28]:
eval_df = pd.read_csv(path/'test.csv')

eval_df['input'] = eval_df.context + '[SEP]' + eval_df.target + '[SEP]' + eval_df.anchor + '[SEP]' + eval_df['ref2']
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

KeyError: 'ref2'

### Create the model and the trainer

In [None]:
bs = 16
epochs = 4
lr = 8e-5

In [None]:
args = TrainingArguments(
    output_dir='outputs', #where to store outputs
    learning_rate=lr, #learning rate
    warmup_ratio=0.1, #make sure to do something similar to one_cycle policy
    lr_scheduler_type='cosine', #make sure to do something similar to one_cycle policy
    fp16=True, #use mixed precision
    evaluation_strategy="epoch", #evaluate at the end of each epoch
    per_device_train_batch_size=bs, #train batch size
    per_device_eval_batch_size=bs*2, #eval batch size
    num_train_epochs=epochs, #train for number of epochs
    weight_decay=0.01,  
    report_to='none')

In [29]:
def corr(x,y): 
    '''
    Returns the correlation coefficient between x and y (arrays)
    '''
    return np.corrcoef(x,y)[0][1] 

corr([2,3],[3,5])

0.9999999999999999

In [30]:
def compute_metrics(eval_pred):
    '''
    Wrapper function to be passed into the HF Trainer
    
    parameters:
        eval_preds: tuple of logits and labels
    '''
    logits, labels = eval_pred
    return {'pearson': corr(logits, labels)}

In [31]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm, 
    num_labels=1 #num_labels=1 makes this a regression problem https://stackoverflow.com/a/72510500
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds['train'],
    eval_dataset=dds['test'],
    tokenizer=tokz, 
    compute_metrics=compute_metrics
)

loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input"

In [32]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: id, anchor, input, context, target, ref2, ref. If id, anchor, input, context, target, ref2, ref are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27354
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6840


Epoch,Training Loss,Validation Loss,Pearson
1,0.0338,0.025089,0.78937


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
added tokens file saved in outputs/checkpoint-500/added_tokens.json
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
added tokens file saved in outputs/checkpoint-1000/added_tokens.json
Saving model checkpoint to outputs/checkpoint-1500
Configuration saved in outputs/checkpoint-1500/config.json
Model weights saved in outputs/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in outp

KeyboardInterrupt: 

In [None]:
preds = trainer.predict(eval_ds)
preds = preds.predictions.astype(float)

In [None]:
preds

In [None]:
preds = np.clip(preds, 0, 1)

In [None]:
import datasets

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': preds
})

In [None]:
submission.to_csv('submission.csv')

In [116]:
!ls {path}

sample_submission.csv  test.csv  train.csv


In [117]:
ss = pd.read_csv(path/'sample_submission.csv')

In [118]:
ss

Unnamed: 0,id,score
0,4112d61851461f60,0
1,09e418c93a776564,0
2,36baf228038e314b,0
3,1f37ead645e7f0c8,0
4,71a5b6ad068d531f,0
5,474c874d0c07bd21,0
6,442c114ed5c4e3c9,0
7,b8ae62ea5e1d8bdb,0
8,faaddaf8fcba8a3f,0
9,ae0262c02566d2ce,0
