In [1]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

try:
  import datasets
except:
    !pip install -q datasets
    try:
        import datasets
    except:
        print("Can't import datasets.")

In [2]:
class CFG():
    PATH = "../input/us-patent-phrase-to-phrase-matching"
    MODEL = 'microsoft/deberta-v3-small'
    NUM_FOLDS = 4
    LR = 8e-5
    BATCH_SIZE = 128
    WEIGHT_DECAY = 0.01
    EPOCHS = 4
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
def lowercase_df(df): # lowercase for better score
    df['context'] = df['context'].str.lower()
    df['anchor'] = df['anchor'].str.lower()
    df['target'] = df['target'].str.lower()
    return df
train_df = pd.read_csv(f"{CFG.PATH}/train.csv")
train_df = lowercase_df(train_df)

In [4]:
# credits https://www.kaggle.com/code/hannes82/pppm-deberta-v3-large-closing-the-cv-lb-gap/notebook

!pip install -q iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

def k_fold(train_df):
    dfx = pd.get_dummies(train_df, columns=["score"]).groupby(["anchor"], as_index=False).sum()
    cols = [c for c in dfx.columns if c.startswith("score_") or c == "anchor"]
    dfx = dfx[cols]

    mskf = MultilabelStratifiedKFold(n_splits=CFG.NUM_FOLDS, shuffle=True, random_state=42)
    labels = [c for c in dfx.columns if c != "anchor"]
    dfx_labels = dfx[labels]
    dfx["fold"] = -1

    for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
        dfx.loc[val_, "fold"] = fold

    train_df = train_df.merge(dfx[["anchor", "fold"]], on="anchor", how="left")
    return train_df

train_df = k_fold(train_df)



In [5]:
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL)
def sep(df):
    df['input'] = df['context'] + tokenizer.sep_token + df['anchor'] + tokenizer.sep_token + df['target']
    return df

train_df = sep(train_df)

def convert(df, isTest=False):
    if isTest:
        ds = datasets.Dataset.from_pandas(df)
    else:
        ds = datasets.Dataset.from_pandas(df).rename_column('score', 'label')
    return ds

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def tokenize(example):
    return tokenizer(example["input"])

def create_tokenized_ds_from_df(df, isTest=False):
    ds = convert(df, isTest=True if isTest == True else False)
    inps = "anchor","target","context"
    if isTest:
        tokenized_ds = ds.map(tokenize, batched=True, remove_columns=inps+('input','id'))
    else:
        tokenized_ds = ds.map(tokenize, batched=True, remove_columns=inps+('input','id', '__index_level_0__', 'fold'))
        
    tokenized_ds = ds.map(tokenize, batched=True, remove_columns=inps+('input','id'))
    return tokenized_ds

# Column to remove ['__index_level_0__', 'fold'] not in the dataset. Current columns in the dataset: ['id', 'anchor', 'target', 'context', 'input']

In [7]:
def compute_metrics(eval_pred):
    return {'pearson': np.corrcoef(*eval_pred)[0][1]}

def get_trainer(train_dataset, eval_dataset):
    args = TrainingArguments('outputs', learning_rate=CFG.LR, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True if torch.cuda.is_available() else False,
                             evaluation_strategy="epoch", per_device_train_batch_size=CFG.BATCH_SIZE, per_device_eval_batch_size=CFG.BATCH_SIZE*2, optim="adamw_torch", 
                             num_train_epochs=CFG.EPOCHS, weight_decay=CFG.WEIGHT_DECAY, report_to='none')
    model = AutoModelForSequenceClassification.from_pretrained(CFG.MODEL, num_labels=1)
    return Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset,
                   tokenizer=tokenizer, compute_metrics=compute_metrics)

In [8]:
for fold in range(CFG.NUM_FOLDS):
    train_folds = train_df[train_df.fold!=fold]
    eval_folds = train_df[train_df.fold==fold]
    
    train_ds, eval_ds = create_tokenized_ds_from_df(train_folds), create_tokenized_ds_from_df(eval_folds)
    
    trainer = get_trainer(train_ds, eval_ds)
    trainer.train()

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.029058,0.785483
2,No log,0.025029,0.798968
3,0.030000,0.02685,0.800561
4,0.030000,0.026207,0.802342


The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 9379
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 9379
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
added tokens file saved in outputs/checkpoint-500/added_tokens.json
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClass

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.027832,0.77669
2,No log,0.025851,0.784703
3,0.032500,0.027912,0.79183
4,0.032500,0.026684,0.791982


The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 8860
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 8860
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
added tokens file saved in outputs/checkpoint-500/added_tokens.json
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClass

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/28 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.031545,0.786899
2,No log,0.02511,0.806835
3,0.032700,0.02526,0.808383
4,0.032700,0.02538,0.80881


  args.max_grad_norm,
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 8612
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 8612
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
added tokens file saved in outputs/checkpoint-500/added_tokens.json
The following columns in the evaluation set  don't have a corresponding argument in `Deb

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

PyTorch: setting up devices
loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.036177,0.75581
2,No log,0.028505,0.772538
3,0.032300,0.028348,0.774707
4,0.032300,0.029204,0.773494


The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 9622
  Batch size = 256
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: fold, __index_level_0__.
***** Running Evaluation *****
  Num examples = 9622
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
added tokens file saved in outputs/checkpoint-500/added_tokens.json
The following columns in the evaluation set  don't have a corresponding argument in `DebertaV2ForSequenceClass

In [9]:
test_df = pd.read_csv(f"{CFG.PATH}/test.csv")

In [10]:
def create_test_ds():
    test_df = pd.read_csv(f"{CFG.PATH}/test.csv")
    test_df = lowercase_df(test_df)
    test_df = sep(test_df)
    test_ds = create_tokenized_ds_from_df(test_df, isTest=True)
    return test_ds
test_ds = create_test_ds()

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
pred = trainer.predict(test_ds)
pred_arr = np.array(pred[0])

submissions =  pd.DataFrame(pred_arr, columns=['score'])
submissions['id'] = train_df['id']
submissions = submissions.reindex(columns=['id', 'score'])

***** Running Prediction *****
  Num examples = 36
  Batch size = 256


In [12]:
submissions.to_csv('submission.csv')