In [1]:
import torch
import torch.nn as nn
import random

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
MODEL_NAME = 'roberta-base'
random.seed(a=42)

In [4]:
from transformers import RobertaModel, AutoTokenizer, get_scheduler, AdamW

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
from torch.utils.data import DataLoader, Dataset

Class to handle the datasets and split them into columns and process them to the right size and add paddings

In [7]:
class NLIDataset(Dataset):
    def __init__(self, data):
        self.encodings = tokenizer(list(data['premise']), list(data['hypothesis']), padding=True, truncation=True, return_tensors="pt", max_length=MAX_LENGTH)
        self.labels = torch.tensor(data['label'].values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self,idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

Model to learn predictions

In [8]:
class DNNTransformerModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super(DNNTransformerModel, self).__init__()
        self.transformer = RobertaModel.from_pretrained(model_name)
        self.dnn = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size, 512),
            nn.LayerNorm(512),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = outputs.last_hidden_state[:, 0, :]
        logits = self.dnn(hidden_state)
        return logits

In [9]:
import pandas as pd

In [10]:
train_data = pd.read_csv('/kaggle/input/trained/train.csv') 
val_data = pd.read_csv('/kaggle/input/trained/dev.csv') 

In [11]:
import spacy

Extracing Nouns and Verbs from the sets of premise and hypothesis

In [12]:
nlp = spacy.load("en_core_web_sm")

def extract_nouns_verbs(textP, textH):
    nouns = []
    verbs = []
    for sent in textP:
        doc = nlp(sent)
        nouns.extend(token.text for token in doc if token.pos_ == "NOUN")
        verbs.extend(token.text for token in doc if token.pos_ == "VERB")
    for sent in textH:
        doc = nlp(sent)
        nouns.extend(token.text for token in doc if token.pos_ == "NOUN")
        verbs.extend(token.text for token in doc if token.pos_ == "VERB")
    return nouns, verbs

Adding additional data based on research papers to improve accuracy (explained in README.md)

In [13]:
premise = train_data['premise']
hypothesis = train_data['hypothesis']
print("Extracting")
nouns, verbs = extract_nouns_verbs(premise, hypothesis)
print("Extracted")
new_training_data = pd.DataFrame(columns=["premise", "hypothesis", "label"])
print("adding new data")
for i in range(1000):
    n1, n2 = random.sample(nouns, 2)
    v = random.choice(verbs)
    sent1 = f"The {n1} {v} the {n2}"
    sent2 = f"The {n1} does not {v} the {n2}"
    num = random.randint(1, 10)
    if num <= 3:
        new_training_data.loc[len(new_training_data)] = [sent1, sent1, 1]
    new_training_data.loc[len(new_training_data)] = [sent2, sent1, 0]

train_data = pd.concat([train_data, new_training_data], ignore_index=True)
print("new data added")

Extracting
Extracted
adding new data
new data added


In [14]:
BATCH_SIZE = 16
MAX_LENGTH = 256

In [15]:
train_dataset = NLIDataset(train_data)
val_dataset = NLIDataset(val_data)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [16]:
model = DNNTransformerModel(MODEL_NAME, num_labels=2)
model = model.to(DEVICE)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.NAdam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

In [19]:
WARMUP_RATIO = 0.1
EPOCHS = 1
NUM_TRAINING_STEPS = len(train_loader) * EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=WARMUP_RATIO, num_training_steps=NUM_TRAINING_STEPS)

In [20]:
from tqdm import tqdm

In [21]:
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    loop.set_description(f"Epoch {epoch + 1}/{EPOCHS}")
    
    for batch in loop:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Training Loss: {avg_train_loss:.4f}")

Epoch 1/1: 100%|██████████| 1607/1607 [10:25<00:00,  2.57it/s]

Epoch 1 - Training Loss: 0.4032





In [22]:
torch.save(model.state_dict(), "nli_dnn_transformer.pth")