In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, walking that directory with os.walk (run the cell by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import traceback
import torch
import transformers
from datasets import load_dataset
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import LambdaLR

In [None]:
# Load the two labelled splits from CSV files in the current working directory.
# Both frames are later wrapped in LegalLensDataset, which reads their
# `premise`, `hypothesis` and `label` columns.
TRAIN_CSV = 'full_train.csv'
TEST_CSV = 'test.csv'

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single source of truth for the pretrained checkpoint, so the tokenizer and
# the classification model can never drift apart.
MODEL_CHECKPOINT = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)

In [None]:
class LegalLensDataset(Dataset):
    """Map-style dataset that tokenizes (premise, hypothesis) pairs for NLI training.

    Each row of ``data`` must expose ``premise``, ``hypothesis`` and ``label``
    fields; ``label`` must be one of ``'Contradict'``, ``'Entailed'`` or
    ``'Neutral'``.

    Args:
        data: pandas DataFrame holding the examples (indexed positionally
            through ``.iloc``).
        tokenizer: callable invoked as ``tokenizer(premise, hypothesis, ...)``
            that returns a mapping of tensors with a leading batch axis.
        max_len: length every encoded pair is padded / truncated to.
        num_labels: with ``3`` the classes are Contradict=0, Entailed=1,
            Neutral=2; with ``2`` ``'Neutral'`` is folded into class 0, so it
            shares an id with ``'Contradict'``.

    NOTE(review): the id order used here is independent of whatever
    ``id2label`` mapping the pretrained checkpoint ships with -- confirm the
    two agree if the pretrained classification head is meant to be reused.
    """

    def __init__(self, data, tokenizer, max_len = 512, num_labels = 3):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.encoded_label = {'Contradict': 0, 'Entailed': 1, 'Neutral': 2}
        if num_labels == 2:
            # Binary setup: 'Neutral' collapses onto class 0.
            self.encoded_label['Neutral'] = 0

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and (when the tokenizer
            produces it) ``token_type_ids``, each with the batch axis removed,
            plus ``labels`` as a scalar ``torch.long`` tensor.

        Raises:
            IndexError: ``idx`` is outside the frame (raised by ``.iloc``).
            KeyError: the row's label is not a known class name.
        """
        # Errors are deliberately NOT swallowed: returning None for a bad row
        # only moves the failure into batch collation, and hiding IndexError
        # makes plain iteration over the dataset never terminate.
        item = self.data.iloc[idx]

        raw_label = item.label
        if raw_label not in self.encoded_label:
            raise KeyError(
                f"Unknown label {raw_label!r} at position {idx}; "
                f"expected one of {sorted(self.encoded_label)}"
            )
        label = self.encoded_label[raw_label]

        encoded_input = self.tokenizer(item.premise, item.hypothesis,
                                       padding = 'max_length',
                                       truncation = True,
                                       max_length = self.max_len,
                                       return_tensors = 'pt')

        # squeeze(0) drops only the batch axis; a bare squeeze() would also
        # collapse the sequence axis whenever max_len == 1.
        # token_type_ids is optional because not every tokenizer emits it.
        features = {
            key: encoded_input[key].squeeze(0)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
            if key in encoded_input
        }
        features['labels'] = torch.tensor(label, dtype=torch.long)
        return features

    def __len__(self):
        """Number of examples, i.e. rows in the wrapped frame."""
        return len(self.data)
    
# Wrap both frames in the dataset class, keeping all three label classes.
train, test = (
    LegalLensDataset(frame, tokenizer, num_labels = 3)
    for frame in (train_df, test_df)
)

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score


def compute_metrics(eval_pred):
    """Compute macro-averaged F1, precision and recall for one evaluation pass.

    Args:
        eval_pred: pair that unpacks into ``(logits, labels)``; the predicted
            class is the argmax of ``logits`` along the last axis.

    Returns:
        dict with the keys ``'f1_macro'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)

    # Report key -> scikit-learn scorer; all three use macro averaging.
    scorers = {
        'f1_macro': f1_score,
        'precision': precision_score,
        'recall': recall_score,
    }
    return {
        metric_name: scorer(labels, predicted_classes, average = 'macro')
        for metric_name, scorer in scorers.items()
    }

In [None]:
from transformers import get_cosine_schedule_with_warmup
# Hand-built Adam optimizer plus a cosine learning-rate schedule whose first
# 5% of steps are warmup.
# NOTE(review): no visible cell passes `optimizer` or `scheduler` to the
# Trainer (its constructor call has no `optimizers=` argument), so these
# objects look unused -- confirm whether they should be wired in or dropped.
# NOTE(review): the step budget is 15 x the number of training examples and
# the learning rate is 3e-5, while TrainingArguments below uses 8 epochs and
# 5e-06 -- confirm which values are intended.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

num_training_steps = len(train) * 15
num_warmup_steps = int(num_training_steps * 0.05)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters for the Trainer run, grouped by concern.
training_args = TrainingArguments(
    output_dir='./results',
    # --- optimisation ---
    num_train_epochs=8,
    learning_rate=5e-06,
    warmup_ratio=0.06,
    weight_decay=0.01,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=1,
    # --- evaluate, log and checkpoint once per epoch ---
    evaluation_strategy="epoch",
    logging_strategy='epoch',
    save_strategy="epoch",
    save_total_limit=1,
    # --- best-checkpoint selection on the 'f1_macro' value from compute_metrics ---
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
)

# NOTE(review): `eval_dataset` is the `test` split, so the data that drives
# best-checkpoint selection is the same data scored afterwards -- confirm this
# is intended, or carve a validation split out of the training frame.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


In [None]:
# Score the model currently held by the trainer on its eval_dataset and show
# the metrics dict.
# NOTE(review): with load_best_model_at_end=True this is presumably the best
# checkpoint rather than the last epoch -- confirm.
results = trainer.evaluate()
print(results)

# Persist that model under ./best_deberta_large.
trainer.save_model('./best_deberta_large')