In [1]:
pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [13]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import json
import torch.nn as nn
from transformers import BertModel, BertPreTrainedModel
from transformers import BertTokenizer, AdamW, get_linear_schedule_with_warmup


In [6]:
def read_corpus(file_name, tokenizer, max_length=128, word_sense_dict=None):
    data = []  # List to store processed examples
    is_dict_provided = word_sense_dict is not None  # Check if word_sense_dict is provided
    word_sense_dict = word_sense_dict or {}  # Initialize word_sense_dict if not provided

    with open(file_name, 'r') as file:
        json_list = json.load(file)[:5000]  # Load the first 5000 items from the JSON file

    for item in json_list:
        word = item['tokens'][item['acronym']]  # Extract the word (acronym)
        sense = item['expansion']  # Extract the sense (expansion)
        sentence = ' '.join(item['tokens'])  # Construct the sentence

        # Populate word_sense_dict with words and their senses
        if word not in word_sense_dict:
            word_sense_dict[word] = set()
        word_sense_dict[word].add(sense)

        # Tokenize for the positive example
        pos_input = tokenizer(sentence + ' [SEP] ' + word + ' [SEP] ' + sense,
                              padding='max_length', max_length=max_length,
                              truncation=True, return_tensors='pt')
        data.append((pos_input['input_ids'], pos_input['attention_mask'], 1))  # Add positive example

        # Create negative examples
        for word_sense in word_sense_dict[word]:
            if word_sense != sense:
                neg_input = tokenizer(sentence + ' [SEP] ' + word + ' [SEP] ' + word_sense,
                                      padding='max_length', max_length=max_length,
                                      truncation=True, return_tensors='pt')
                data.append((neg_input['input_ids'], neg_input['attention_mask'], 0))  # Add negative example

    return data



In [7]:
# Create DataLoader
def create_dataloader(data, batch_size):
    input_ids = [item[0] for item in data]
    attention_masks = [item[1] for item in data]
    labels = [item[2] for item in data]

    dataset = TensorDataset(torch.cat(input_ids), torch.cat(attention_masks), torch.tensor(labels))
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

In [8]:
class BertForWSD(BertPreTrainedModel):
    def __init__(self, args):
        super().__init__(BertModel.from_pretrained(args.model_name).config)
        self.bert = BertModel.from_pretrained(args.model_name)
        self.dropout = nn.Dropout(args.dropout_rate)
        self.classifier = nn.Linear(768, args.num_labels)  # Using 768 directly for bert-base-uncased
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        cls_output = outputs[1]  # CLS token output
        cls_output = self.dropout(cls_output)
        logits = self.classifier(cls_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return loss, logits


In [9]:
def train(model, dataloader, optimizer, scheduler, epochs, device):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            optimizer.zero_grad()

            loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_loss = total_loss / len(dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}')


In [10]:
def evaluate(model, dataloader, device):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in dataloader:
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)

        with torch.no_grad():
            loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

        total_eval_loss += loss.item()

        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        total_eval_accuracy += accuracy

    avg_accuracy = total_eval_accuracy / len(dataloader)
    avg_loss = total_eval_loss / len(dataloader)

    print(f'Accuracy: {avg_accuracy}')
    print(f'Loss: {avg_loss}')

In [11]:
# training_args

class TrainingArguments:
    def __init__(self):
        # File paths
        self.train_file = '/content/drive/MyDrive/NLP_Final/train.json'
        self.val_file = '/content/drive/MyDrive/NLP_Final/dev.json'
        self.model_save_path = '/content/drive/MyDrive/NLP_Final'
        self.tokenizer_save_path = '/content/drive/MyDrive/NLP_Final'

        # Model and tokenizer
        self.model_name = 'bert-base-uncased'
        self.num_labels = 2  # Number of labels for classification

        # Training parameters
        self.batch_size = 32
        self.learning_rate = 2e-5
        self.epsilon = 1e-8
        self.num_epochs = 3
        self.warmup_proportion = 0.1  # Warm-up for 10% of the total steps
        self.max_length = 512  # Max length for tokenization
        self.dropout_rate = 0.1  # Dropout rate for the classifier layer

        # Device configuration
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'



In [14]:
args = TrainingArguments()

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained(args.model_name)

# Read data
train_data = read_corpus(args.train_file, tokenizer, args.max_length)
val_data = read_corpus(args.val_file, tokenizer, args.max_length)

# Create DataLoaders
train_dataloader = create_dataloader(train_data, args.batch_size)
val_dataloader = create_dataloader(val_data, args.batch_size)

# Initialize your custom model with training arguments
model = BertForWSD(args)
model.to(args.device)

# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.epsilon)
total_steps = len(train_dataloader) * args.num_epochs
warmup_steps = int(args.warmup_proportion * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



In [16]:
# Initialize the pre-trained BERT model

print("Calculating Zero-shot accuracy:\n")
evaluate(model, val_dataloader, args.device)


Calculating Zero-shot accuracy:

Accuracy: 54.99856321839081
Loss: 0.6975185598792701


In [17]:
train(model, train_dataloader, optimizer, scheduler, args.num_epochs, args.device)

Epoch 1/3, Loss: 0.6092667316638979
Epoch 2/3, Loss: 0.4385087294530732
Epoch 3/3, Loss: 0.3492355962793602


In [18]:
# Evaluate on validation set
evaluate(model, val_dataloader, args.device)

# Save model and tokenizer
model.save_pretrained(args.model_save_path)
tokenizer.save_pretrained(args.tokenizer_save_path)

Accuracy: 80.31752873563218
Loss: 0.44535953281768437


('/content/drive/MyDrive/NLP_Final/tokenizer_config.json',
 '/content/drive/MyDrive/NLP_Final/special_tokens_map.json',
 '/content/drive/MyDrive/NLP_Final/vocab.txt',
 '/content/drive/MyDrive/NLP_Final/added_tokens.json')