In [3]:
import os
# General Libraries
import pandas as pd
import numpy as np
# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, TQDMProgressBar 
from pytorch_lightning.loggers import TensorBoardLogger
import torch
# Scikit-learn
from sklearn.model_selection import train_test_split

#our code 
from USPPM_model import USPPPM_model
from USPPM_dataset import set_max_len
from USPPM_datamodule import USPPPM_datamodule

from pynvml import *
import argparse

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
from pytorch_lightning.callbacks.callback import Callback


In [19]:
# Training batch size to use for each supported pretrained model name.
batch_sizes = dict([
    ("distilbert-base-uncased", 32),
    ("bert-base-uncased", 64),
    ("Yanhao/simcse-bert-for-patent", 64),
    ("ahotrod/electra_large_discriminator_squad2_512", 32),
    ("microsoft/deberta-v3-large", 8),
])

# Candidate values for the "features" config entry (one is picked by index
# when the run configuration is built).
# 'same_anchor_similar_targets' is deliberately left out: do not add it.
features = [
    'CPCdescription_same_anchor_context_similar_targets',
    'anchor_target_CPCdescription',
    'same_anchor_context_targets',
    'same_anchor_context_similar_targets',
]


In [20]:
# Select the first model listed in `batch_sizes` together with its batch size.
model_name = next(iter(batch_sizes))
batch_size = batch_sizes[model_name]

out_dir_prefix = "final_train_"
gpu_id = "1"

print(f"Model: {model_name}")
print(f"GPU: {gpu_id}")
print(f"Batch size: {batch_size}")


# Run configuration shared by the datamodule, the model and the trainer set-up.
config_dict = {
    # Debug mode: when "DEBUG" is true only the first "debug_samples"
    # training rows are kept after loading.
    "debug_samples": 1500,
    "DEBUG": True,
    "target_size": 1,
    "num_workers": 8,
    # Training parameters
    "batch_size": batch_size,
    "epochs": 8,
    # Multiplied by the total number of training steps in the training cell,
    # i.e. this acts as a fraction of the run rather than an absolute count.
    "warmup_steps": 0,
    "min_lr": 1e-6,
    "encoder_lr": 2e-5,
    "decoder_lr": 2e-5,
    "eps": 1e-6,
    "betas": (0.9, 0.999),
    "weight_decay": 0.01,
    "fc_dropout": 0.2,
    "seed": 42,
    # Multiplier applied to len(train_df) when estimating steps per epoch.
    "train_test_split": 1,
    "loss": "pearson",
    "stratify_on": 'stratification_index',
    "features": features[2],
    "model": model_name,
}

INPUT_DIR = '../dataset/us-patent-phrase-to-phrase-matching/'

# Restrict CUDA to the chosen GPU(s); `gpu_id` may be a comma-separated list.
visible_devices = gpu_id
os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
# One more device than there are commas in the list.
num_gpus = visible_devices.count(",") + 1


Model: distilbert-base-uncased
GPU: 1
Batch size: 32


In [17]:
# Load the train/test tables from CSV files located next to the notebook.
train_df = pd.read_csv("./train_dataframe_with_features.csv")
test_df = pd.read_csv("./test_dataframe_with_features.csv")
if config_dict["DEBUG"]:
    # Debug runs keep only the first `debug_samples` training rows.
    train_df = train_df.iloc[:config_dict["debug_samples"], :]

# Identity mapping of metric names; only referenced by the commented-out
# TuneReportCallback in the training cell.
metrics = {"train_loss" : "train_loss", "val_loss":"val_loss", "val_score":"val_score","train_score":"train_score", "batch_size":"batch_size","fold":"fold", "epoch":"epoch"}

#trial_id = ray.air.session.get_trial_id()
OUTPUT_DIR = './'
logging_dir = "USPPPM"

# Create the output and TensorBoard log directories if they are missing.
# `exist_ok=True` tolerates an existing directory but still raises when the
# path exists as a regular file, which the previous try/except hid.
for d in [OUTPUT_DIR, "lightning_logs/"+logging_dir]:
    os.makedirs(d, exist_ok=True)

logger = TensorBoardLogger("lightning_logs", name=logging_dir)

In [23]:
done_pre_evaluation = False

pl.seed_everything(config_dict["seed"])

# Derive the schedule length from the data size and batch size.
steps_per_epoch = len(train_df) * config_dict['train_test_split'] // config_dict['batch_size']
config_dict['training_steps'] = steps_per_epoch * config_dict['epochs']
# "warmup_steps" starts out as a fraction of the run and is replaced by an
# absolute step count. Keep the original fraction under its own key so that
# re-running this cell does not multiply an already-converted value again.
warmup_ratio = config_dict.setdefault('warmup_ratio', config_dict['warmup_steps'])
config_dict['warmup_steps'] = int(config_dict['training_steps'] * warmup_ratio)

set_max_len(config_dict, train_df)

callbacks = [
            #TuneReportCallback(metrics, on="epoch_end"),
            # Keep only the single checkpoint with the lowest training loss.
            ModelCheckpoint(
                dirpath="checkpoints/",
                filename="best_checkpoint",
                save_top_k=1,
                verbose=True,
                monitor='train_loss',
                mode='min'
            ),
            # Validation is skipped below, so early stopping watches the training score.
            EarlyStopping(monitor='train_score', patience=2,mode='max'),
            TQDMProgressBar(refresh_rate=100)
            ]

os.environ["TOKENIZERS_PARALLELISM"] = "true"
datamodule = USPPPM_datamodule(config_dict, 0.9, train_df, test_df)
model = USPPPM_model(config_dict)

trainer = pl.Trainer(
        logger=logger,
        num_sanity_val_steps=0,
        check_val_every_n_epoch=1,
        callbacks=callbacks,
        max_epochs=config_dict['epochs'],
        min_epochs=2,
        # CUDA_VISIBLE_DEVICES already restricts the process to the selected
        # GPU(s), which are re-indexed from 0. Request them by count instead of
        # by physical id: a hard-coded index of 1 is out of range when a single
        # GPU is visible.
        devices=num_gpus,
        accelerator="gpu",
        limit_val_batches = 0.0 # needed to skip validation
        )

datamodule.setup()
# Marks the pre-training pass; the captured log reports it as "Epoch -1".
model.epoch=-1
trainer.validate(model, datamodule)
done_pre_evaluation = True

trainer.fit(model, datamodule)

Global seed set to 42
Global seed set to 42


  0%|          | 0/1500 [00:00<?, ?it/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True (cuda), used: True
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
H

Training: 0it [00:00, ?it/s]

Epoch -1, global step 43: 'train_loss' reached 0.52956 (best 0.52956), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1
Epoch -1, global step 43: 'train_loss' reached 0.52956 (best 0.52956), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 0, global step 86: 'train_loss' reached 0.38812 (best 0.38812), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1
Epoch 0, global step 86: 'train_loss' reached 0.38812 (best 0.38812), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 1, global step 129: 'train_loss' reached 0.19204 (best 0.19204), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1
Epoch 1, global step 129: 'train_loss' reached 0.19204 (best 0.19204), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 2, global step 172: 'train_loss' was not in top 1
Epoch 2, global step 172: 'train_loss' was not in top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 3, global step 215: 'train_loss' reached 0.14754 (best 0.14754), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1
Epoch 3, global step 215: 'train_loss' reached 0.14754 (best 0.14754), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 4, global step 258: 'train_loss' was not in top 1
Epoch 4, global step 258: 'train_loss' was not in top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 5, global step 301: 'train_loss' reached 0.10894 (best 0.10894), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1
Epoch 5, global step 301: 'train_loss' reached 0.10894 (best 0.10894), saving model to '/storagenfs/m.petix/hlt_usppm/src/checkpoints/best_checkpoint.ckpt' as top 1


Number of NaNs: 0
Number of Infs: 0


Epoch 6, global step 344: 'train_loss' was not in top 1
Epoch 6, global step 344: 'train_loss' was not in top 1
`Trainer.fit` stopped: `max_epochs=8` reached.
`Trainer.fit` stopped: `max_epochs=8` reached.


Number of NaNs: 0
Number of Infs: 0


In [37]:
# Rebind `model` to weights restored from a checkpoint stored under the user's
# ray_results directory; this replaces whatever `model` held before this cell.
checkpoint_path = "~/ray_results/final_train_distilbert-base-uncased/trainable_e42a4_00000_0_2023-04-16_19-48-27/checkpoints/best_checkpoint.ckpt"
model = USPPPM_model.load_from_checkpoint(
    checkpoint_path,
    config_dict=config_dict,
)

KeyboardInterrupt: 

In [None]:
# Run inference over the training dataloader and keep the per-batch outputs.
train_loader = datamodule.train_dataloader()
predictions = trainer.predict(model, train_loader, return_predictions=True)

In [24]:
# Run inference over the test dataloader and keep the per-batch outputs.
test_loader = datamodule.test_dataloader()
predictions = trainer.predict(model, test_loader, return_predictions=True)

# Inspect the structure of the result using its first entry.
first_batch = predictions[0]
print(type(predictions), type(first_batch), type(first_batch[0]))
# Element [1] of the first entry, converted to a NumPy array.
print(first_batch[1].numpy())


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Predicting: 43it [00:00, ?it/s]

<class 'list'> <class 'tuple'> <class 'int'>
[[0.4135774 ]
 [0.5905266 ]
 [0.33691028]
 [0.4095657 ]
 [0.35680413]
 [0.39533228]
 [0.4374867 ]
 [0.26101255]
 [0.5007539 ]
 [0.6555991 ]
 [0.49835995]
 [0.5257712 ]
 [0.41701564]
 [0.48323244]
 [0.42927295]
 [0.47450602]
 [0.31133038]
 [0.39408654]
 [0.62861735]
 [0.52045375]
 [0.4139055 ]
 [0.3575145 ]
 [0.36369085]
 [0.3474173 ]
 [0.5636541 ]
 [0.29699004]
 [0.24356504]
 [0.38855627]
 [0.39017656]
 [0.44193944]
 [0.3332249 ]
 [0.21477272]]


In [32]:
# Display element [1] of the first prediction entry (first batch only).
first_batch = predictions[0]
first_batch[1]

tensor([[0.4136],
        [0.5905],
        [0.3369],
        [0.4096],
        [0.3568],
        [0.3953],
        [0.4375],
        [0.2610],
        [0.5008],
        [0.6556],
        [0.4984],
        [0.5258],
        [0.4170],
        [0.4832],
        [0.4293],
        [0.4745],
        [0.3113],
        [0.3941],
        [0.6286],
        [0.5205],
        [0.4139],
        [0.3575],
        [0.3637],
        [0.3474],
        [0.5637],
        [0.2970],
        [0.2436],
        [0.3886],
        [0.3902],
        [0.4419],
        [0.3332],
        [0.2148]])

In [None]:
# `predictions` holds one entry per batch, and element [1] of an entry is that
# batch's (batch_size, 1) score tensor. Gather the scores of ALL batches:
# using only predictions[0][1] covers just the first batch of the test set.
batch_scores = [batch[1].detach().cpu().numpy().reshape(-1) for batch in predictions]
scores = np.concatenate(batch_scores)

# Fail loudly instead of exporting misaligned scores, e.g. when `predictions`
# was produced from a different dataloader than the test one.
if len(scores) != len(test_df):
    raise ValueError(
        f"Got {len(scores)} predictions for {len(test_df)} test rows; "
        "re-run the test prediction cell before exporting."
    )

# NOTE(review): assumes the test dataloader yields rows in `test_df` order
# (no shuffling) — confirm in USPPPM_datamodule.
test_df['score'] = scores
test_df[['id','score']].to_csv("test_predictions.csv", index=False)