### This is a copy of the Kaggle notebook which submits to the us-patent-phrase-to-phrase matching competition

link: https://www.kaggle.com/code/lucasvw/patent-phrase-01/

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
path = Path('/kaggle/input/us-patent-phrase-to-phrase-matching')
!ls {path}

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
sample_submission.csv  test.csv  train.csv


In [3]:
train_csv = pd.read_csv(path/'train.csv')
train_csv = train_csv[['id', 'context', 'anchor', 'target', 'score']]
train_csv.head(2)

Unnamed: 0,id,context,anchor,target,score
0,37d61fd2272659b1,A47,abatement,abatement of pollution,0.5
1,7b9652b17b68b7a4,A47,abatement,act of abating,0.75


In [4]:
train_csv.shape

(36473, 5)

In [5]:
def create_input(row):
    return row['context'] + ' [SEP] ' + row['anchor'] + ' [SEP] ' + row['target']

train_csv['input'] = train_csv.apply(create_input, axis=1)

In [6]:
# Dataset is a huggingface wrapper around our data
from datasets import Dataset

t = train_csv[['input', 'score']].rename(
    {'score': 'labels'} # needed for transformers to work 
)

ds = Dataset.from_pandas(t)

In [7]:
ds

Dataset({
    features: ['input', 'score', '__index_level_0__'],
    num_rows: 36473
})

### Tokenizer: Tokenization and Numericalisation

In [8]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/deberta-v3-small/deberta-v3-small/spm.model
/kaggle/input/deberta-v3-small/deberta-v3-small/config.json
/kaggle/input/deberta-v3-small/deberta-v3-small/README.md
/kaggle/input/deberta-v3-small/deberta-v3-small/tokenizer_config.json
/kaggle/input/deberta-v3-small/deberta-v3-small/pytorch_model.bin
/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv


In [9]:
from transformers import AutoTokenizer

model_nm = '/kaggle/input/deberta-v3-small/deberta-v3-small'

#Tokenizers come from pre-trained models, as the vocab ofcourse depends on the data the model was trained on
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
tokz.vocab_size, tokz.special_tokens_map

(128000,
 {'bos_token': '[CLS]',
  'eos_token': '[SEP]',
  'unk_token': '[UNK]',
  'sep_token': '[SEP]',
  'pad_token': '[PAD]',
  'cls_token': '[CLS]',
  'mask_token': '[MASK]'})

In [11]:
# You can tokenize some input string
tokz.tokenize("Hello, my name is Lucas")

['▁Hello', ',', '▁my', '▁name', '▁is', '▁Lucas']

In [12]:
# Or you can tokenize + numericalise some input string
tokz.encode("Hello, my name is Lucas")

[1, 5365, 261, 312, 601, 269, 10876, 2]

In [13]:
# Or we can do it in two steps
tokz.convert_tokens_to_ids(tokz.tokenize("Hello, my name is Lucas"))

[5365, 261, 312, 601, 269, 10876]

In [14]:
# Or just call the tokenizer, which gives encodings, as well some other information
tokz("Hello, my name is Lucas")

{'input_ids': [1, 5365, 261, 312, 601, 269, 10876, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
ds[0]

{'input': 'A47 [SEP] abatement [SEP] abatement of pollution',
 'score': 0.5,
 '__index_level_0__': 0}

In [16]:
def tokenize(ds_row): return tokz(ds_row['input'])

ds = ds.map(tokenize, batched=True)

# Now we have added some fields to the ds
ds

  0%|          | 0/37 [00:00<?, ?ba/s]

Dataset({
    features: ['input', 'score', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [17]:
# Make splits
dss = ds.train_test_split(test_size=0.2, seed=42)
dss

DatasetDict({
    train: Dataset({
        features: ['input', 'score', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 29178
    })
    test: Dataset({
        features: ['input', 'score', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7295
    })
})

### Same for the test set

In [18]:
def create_dataset(full_path,
                   tokz,
                   train_set=True,
                   test_size=0.25):

    df = pd.read_csv(full_path)
    
    def create_input(row):
        return row['context'] + ' [SEP] ' + row['anchor'] + ' [SEP] ' + row['target']
    df['input'] = df.apply(create_input, axis=1)
    
    def tokenize(ds_row): return tokz(ds_row['input'])
    
    if train_set:
        df = df.rename(columns={'score': 'labels'})
        ds = Dataset.from_pandas(df[['id', 'input', 'labels']])
        ds = ds.map(tokenize)
        ds = ds.train_test_split(test_size, seed=42)
    else:
        ds = Dataset.from_pandas(df[['id', 'input']])
        ds = ds.map(tokenize)
    
    return ds

In [19]:
ds = create_dataset(path/'train.csv', tokz)
dst = create_dataset(path/'test.csv', tokz, train_set=False)

  0%|          | 0/36473 [00:00<?, ?ex/s]

  0%|          | 0/36 [00:00<?, ?ex/s]

### Create the model

In [20]:
from transformers import TrainingArguments

bs = 128
epochs = 4
lr = 8e-5

args = TrainingArguments(
    output_dir='outputs', #where to store outputs
    learning_rate=lr, #learning rate
    warmup_ratio=0.1, #make sure to do something similar to one_cycle policy
    lr_scheduler_type='cosine', #make sure to do something similar to one_cycle policy
    fp16=True, #use mixed precision
    evaluation_strategy="epoch", #evaluate at the end of each epoch
    per_device_train_batch_size=bs, #train batch size
    per_device_eval_batch_size=bs*2, #eval batch size
    num_train_epochs=epochs, #train for number of epochs
    weight_decay=0.01,  
    report_to='none')

In [21]:
# There are many AutoModelFor... imports, we need the SequenceClassification
from transformers import AutoModelForSequenceClassification


model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at /kaggle/input/deberta-v3-small/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifer.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifer.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassif

In [22]:
def corr(x,y): 
    '''
    Returns the correlation coefficient between x and y (arrays)
    '''
    return np.corrcoef(x,y)[0][1] 


def compute_metrics(eval_pred):
    '''
    Wrapper function to be passed into the HF Trainer
    
    parameters:
        eval_preds: tuple of logits and labels
    '''
    logits, labels = eval_pred
    print(logits.shape)
    return {'pearson': corr(logits, labels)}

In [23]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    tokenizer=tokz,
    compute_metrics=compute_metrics
)

Using cuda_amp half precision backend


In [24]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, id. If input, id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27354
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 856


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.027766,0.788748
2,No log,0.02584,0.819725
3,0.036300,0.023256,0.828241
4,0.036300,0.022667,0.829955


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, id. If input, id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256


(9119,)


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, id. If input, id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256


(9119,)


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, id. If input, id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256


(9119,)


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, id. If input, id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256


(9119,)




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=856, training_loss=0.026902480660197892, metrics={'train_runtime': 240.0141, 'train_samples_per_second': 455.873, 'train_steps_per_second': 3.566, 'total_flos': 461822682950400.0, 'train_loss': 0.026902480660197892, 'epoch': 4.0})

In [25]:
preds = trainer.predict(dst)

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: input, id. If input, id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 36
  Batch size = 256


In [26]:
predictions = preds.predictions.squeeze().astype(float).clip(0,1)

In [27]:
predictions

array([0.52734375, 0.63476562, 0.55859375, 0.25708008, 0.        ,
       0.54150391, 0.49780273, 0.        , 0.23571777, 1.        ,
       0.18200684, 0.2722168 , 0.79638672, 0.90673828, 0.73681641,
       0.44750977, 0.22021484, 0.        , 0.59033203, 0.32617188,
       0.39013672, 0.25415039, 0.02389526, 0.23010254, 0.58056641,
       0.        , 0.        , 0.        , 0.        , 0.58154297,
       0.36279297, 0.01560974, 0.71630859, 0.54638672, 0.4140625 ,
       0.20947266])

In [28]:
submission = pd.DataFrame.from_dict({
    'id': dst['id'],
    'score': predictions
})

In [29]:
submission.to_csv('submission.csv',index=False)