In [1]:
import os

import pandas as pd
import numpy as np
from model.USPPM_kfold_datamodule import USPPPM_kf_datamodule
from model.USPPM_dataset import set_max_len
from model.USPPM_model import USPPPM_model
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar 

In [2]:
model_name = "microsoft/deberta-v3-large"
batch_size = 8
feature = "anchor_target_CPCdescription"
feature_id = 1
gpu_id = "0"
out_dir_prefix = "predictions_"

In [3]:
# Defining a search space!
config_dict = {
    "debug_samples": 1500,
    "n_fold" : 4,
    "DEBUG": False,
    "target_size" : 1,
    "num_workers" : 8,
    # Training parameters
    "batch_size" : batch_size,
    "epochs" : 8,
    "warmup_steps" : 0,
    "min_lr" : 1e-6,
    "encoder_lr" : 2e-5,
    "decoder_lr" : 2e-5,
    "eps" : 1e-6,
    "betas" : (0.9, 0.999),
    "weight_decay" : 0.01,
    "fc_dropout" : 0.2,
    "seed" : 42,
    "train_test_split": 0.9,
    "loss": "bce",
    "stratify_on" : 'stratification_index',
    "features" : feature,
    "model" : model_name,
    "save_configs": False,
    "training_steps" : 0,
    }

INPUT_DIR = '../dataset/us-patent-phrase-to-phrase-matching/'

In [4]:
visible_devices = gpu_id
os.environ["CUDA_VISIBLE_DEVICES"]=visible_devices
num_gpus = len(visible_devices.split(","))

In [16]:
train_df = pd.read_csv("/storagenfs/m.petix/hlt_usppm/src/data/train_dataframe_with_features.csv")
test_df = pd.read_csv("/storagenfs/m.petix/hlt_usppm/src/data/test_dataframe_with_features.csv")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36473 entries, 0 to 36472
Data columns (total 15 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   id                                                  36473 non-null  object 
 1   anchor                                              36473 non-null  object 
 2   target                                              36473 non-null  object 
 3   context                                             36473 non-null  object 
 4   score                                               36473 non-null  float64
 5   context_text                                        36473 non-null  object 
 6   score_map                                           36473 non-null  int64  
 7   anchor_target_CPCdescription                        36473 non-null  object 
 8   same_anchor_similar_targets                         36473 non-null  object 


In [17]:
train_df = pd.read_csv("/storagenfs/m.petix/hlt_usppm/src/data/train_dataframe_with_features.csv")
test_df = pd.read_csv("/storagenfs/m.petix/hlt_usppm/src/data/test_dataframe_with_features.csv")
if config_dict["DEBUG"]:
    train_df = train_df.iloc[:config_dict["debug_samples"],:]

# datamodule = USPPPM_kf_datamodule(config_dict, train_df, test_df, 0.9)
datamodule = USPPPM_kf_datamodule(config_dict, train_df)
datamodule.setup()
datamodule.setup_folds(config_dict['n_fold'])

pred_df = pd.DataFrame(columns=["anchor", "target", "context", "prediction", "label"])

for fold in range(0,config_dict['n_fold']):

    model = USPPPM_model.load_from_checkpoint(f"/storagenfs/m.petix/ray_results/AUTO_BATCH_SIZEmicrosoft/deberta-v3-large/trainable_12fdb_00001_1_features=anchor_target_CPCdescription,model=microsoft_deberta-v3-large,stratify_on=stratification_index,wa_2023-03-15_10-20-05/ensemble_checkpoints/model.{fold}.pt", config_dict=config_dict)
    
    set_max_len(config_dict, train_df)  
    
    trainer = pl.Trainer(
                    num_sanity_val_steps=0,
                    check_val_every_n_epoch=1,
                    max_epochs=config_dict['epochs'],
                    min_epochs=2,
                    devices=[0], # lightning sees only the gpu that is being assigned to this instance of trainable, so it will be always 0 even if it's using gpu 1,2 or 3
                    accelerator="gpu",
                    
                    )

    datamodule.setup_fold_index(fold)
    model.current_fold = fold
    predictions = trainer.predict(model, datamodule.val_dataloader(), return_predictions=True)

    # test_df['score'] = predictions[0][1].numpy()
    # test_df[['id','score']].to_csv(f"val_predictions/{model_name}_{feature}.csv", index=None)
    for batch in predictions:
        for i in range(len(batch["anchors"])):
            df = pd.DataFrame({'anchor' : batch['anchors'][i], 'target' : batch['targets'][i], 'context' : batch['cpc_codes'][i], 'prediction' : batch['predictions'][i], 'label' : float(batch['labels'][i]), 'fold' : fold})
            pred_df = pred_df.append(df, ignore_index=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS e

  0%|          | 0/36473 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TRAIN FOLD_ 0 24618
VALID FOLD_ 0 8207


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have 

  0%|          | 0/36473 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TRAIN FOLD_ 1 24619
VALID FOLD_ 1 8206


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have 

  0%|          | 0/36473 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TRAIN FOLD_ 2 24619
VALID FOLD_ 2 8206


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have 

  0%|          | 0/36473 [00:00<?, ?it/s]

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


TRAIN FOLD_ 3 24619
VALID FOLD_ 3 8206


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [23]:
len(pred_df)

32825

In [43]:
10100 % (8000)

2100

In [45]:
pred_df['fold'] = (pred_df.index / (1026*batch_size)).astype(int)
pred_df

Unnamed: 0,anchor,target,context,prediction,label,fold
0,ammonia recovery,recovery of water,C01,0.442739,0.25,0
1,inner closed,cylindrical inner member,E04,0.497942,0.50,0
2,produce thin layers,produce layers,G01,0.497206,0.50,0
3,split into flows,tunnel,F16,0.233675,0.50,0
4,antiatherosclerotic,cholesterol lowering,C07,0.564063,0.50,0
...,...,...,...,...,...,...
32820,generate control signal,safety,B60,0.253755,0.25,3
32821,relational formula,value,B61,0.233974,0.25,3
32822,component composite coating,carbon coated,C09,0.467336,0.50,3
32823,adjust gas flow,pressurized supply,C23,0.292459,0.25,3


In [46]:
from scipy.stats import pearsonr

pred_df.groupby('fold').apply(lambda x : pearsonr(x.prediction, x.label))
# pearsonr(pred_df.prediction, pred_df.label)

fold
0    (0.9504467629226465, 0.0)
1    (0.9515799264733218, 0.0)
2    (0.9429439661911927, 0.0)
3    (0.9485738822494294, 0.0)
dtype: object

In [33]:
misclassifications_df = pred_df[np.abs(pred_df.prediction - pred_df.label) >= 0.5]
len(misclassifications_df)

50

In [34]:
misclassifications_df.anchor.value_counts(ascending=False).reset_index(name='count')

Unnamed: 0,index,count
0,display object,2
1,axial extension,2
2,pulverulent material,2
3,multiplexed data,2
4,polls,2
5,reduction factor,2
6,container section,1
7,trommel screen,1
8,noncollinear,1
9,stepped pin,1


In [31]:
misclassifications_df.context.apply(lambda x : x[0]).value_counts(ascending=False).reset_index(name='count')

Unnamed: 0,index,count
0,G,182
1,H,159
2,B,154
3,F,106
4,A,91
5,C,91
6,E,61
7,D,31


In [47]:
misclassifications_df[['anchor', 'context']].value_counts(ascending=False).reset_index(name='count')

Unnamed: 0,anchor,context,count
0,axial extension,B05,2
1,multiplexed data,H01,2
2,polls,B21,2
3,type parameter,H04,1
4,display object,A63,1
5,her2 targeted,A61,1
6,gripping layer,D05,1
7,glycitin,B01,1
8,gas leak,F16,1
9,form trench isolation,H01,1
