# Task 1: Sentence Transformer Implementation

In [1]:
from transformers import BertModel, BertTokenizer
import torch

'''     
    For task1, we have to implement a sentence transformer, I chose pre-trained BERT model (works well for most the NLP tasks)
	The model processes input text, extracts token-level embeddings from BERT, and applies an adaptive average pooling layer to obtain fixed-size        sentence representations. The tokenizer converts input sentences into tensors with padding and truncation for uniform input processing. 
    Finally, the model generates sentence embeddings, which can be used for NLP tasks like similarity detection and classification.

'''
class SentenceTransformer(torch.nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super(SentenceTransformer, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.pooling = torch.nn.AdaptiveAvgPool1d(1) #torch.nn.AdaptiveAvgPool1d(1) is used to convert variable-length token embeddings into a fixed-size sentence embedding.

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  
        embeddings = self.pooling(last_hidden_state.permute(0, 2, 1)).squeeze(2)  
        return embeddings

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = SentenceTransformer()

sentences = ["AI is transforming every Industry", "AI Agents are going to be the next big thing"]
inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
embeddings = model(inputs['input_ids'], inputs['attention_mask'])

print("Sentence Embeddings:", embeddings)



Sentence Embeddings: tensor([[ 0.0559,  0.0093,  0.0912,  ..., -0.0607,  0.2878, -0.1385],
        [ 0.0922, -0.2819,  0.3255,  ..., -0.1049,  0.2217, -0.3235]],
       grad_fn=<SqueezeBackward1>)


In [3]:
print("Sentence Embeddings:", embeddings.shape)

Sentence Embeddings: torch.Size([2, 768])


# Model Choice: I chose BERT as it does a pretty good job in various NLP tasks

In [2]:
class MultiTaskLearningTransformer(torch.nn.Module):
    def __init__(self, model_name='bert-base-uncased', num_sentence_classes=3, num_sentiment_classes=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.sentence_classifier = torch.nn.Linear(768, num_sentence_classes)
        self.sentiment_analysis = torch.nn.Linear(768, num_sentiment_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embeddings = outputs.last_hidden_state[:, 0, :]
        return self.sentence_classifier(sentence_embeddings), self.sentiment_analysis(outputs.last_hidden_state)

    
model = MultiTaskLearningTransformer()
class_logits, ner_logits = model(inputs['input_ids'], inputs['attention_mask'])

print("Class Logits:", class_logits)
print("NER Logits:", ner_logits)

Class Logits: tensor([[ 0.2969,  0.2258, -0.3647],
        [ 0.1419,  0.1071, -0.3310]], grad_fn=<AddmmBackward0>)
NER Logits: tensor([[[-0.4228,  0.4767, -0.3502],
         [-0.0603,  0.5656,  0.1865],
         [-0.2978, -0.0703, -0.2870],
         [ 0.0303,  0.1853, -0.2249],
         [ 0.1140,  0.2757, -0.4664],
         [ 0.0982, -0.0459,  0.0544],
         [ 0.2256,  0.2150,  0.0583],
         [ 0.1169,  0.0808, -0.0017],
         [ 0.0717,  0.0349,  0.0260],
         [ 0.1206,  0.1475,  0.0597],
         [ 0.0673, -0.0122, -0.0017],
         [ 0.1185, -0.0009,  0.0132]],

        [[-0.3958,  0.3844, -0.2965],
         [-0.1042,  0.5323,  0.1365],
         [ 0.1411,  0.0420, -0.3621],
         [ 0.0907, -0.2787,  0.0215],
         [ 0.2288, -0.3820,  0.0359],
         [-0.1557, -0.3091,  0.1784],
         [ 0.0955, -0.1547,  0.2409],
         [ 0.2524, -0.3988, -0.0172],
         [ 0.3267, -0.0236,  0.0009],
         [ 0.2136, -0.1029, -0.0035],
         [ 0.0673, -0.1791,  0.0398

# Task 3: Training Considerations

In [174]:
from datasets import Dataset
from transformers import BertTokenizerFast
from sklearn.model_selection import train_test_split

data = {
    "sentence": [
        "Machine learning is fascinating.",
        "Deep learning is a subset of machine learning.",
        "Transformers are powerful models for NLP.",
        "Social media is fake.",
        "AI is revolutionizing healthcare.",
        "Self-driving cars are the future.",
        "AI is impacting almost every industry.",
        "Machine learning algorithms are improving.",
        "Artificial intelligence helps in decision making.",
        "Social media addiction is harmful.",
        "The internet is full of misinformation.",
        "The future of technology is AI.",
        "Natural language processing is transforming communication.",
        "AI-driven tools are changing the way we work.",
        "Automation through AI can help increase productivity.",
        "Machine learning can help in analyzing big data.",
        "AI can predict stock market trends.",
        "Neural networks can be used in voice recognition.",
        "Robots powered by AI are becoming common.",
        "The AI market is growing rapidly.",
        "Deep learning is a key part of modern AI.",
        "Machine learning can solve complex problems.",
        "AI is a driving force for innovation.",
        "Social media platforms are increasing their influence.",
        "Machine learning models can be used for fraud detection.",
        "AI is used in facial recognition technologies.",
        "The development of AI is accelerating.",
        "AI can help in personalized healthcare.",
        "Social media platforms are sources of fake news."
    ],
    "task_a_label": [1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0],
    "task_b_labels": [1, 0, 2, 2, 0, 0, 1, 0, 2, 0, 1, 2, 1, 0, 2, 1, 0, 1, 2, 1, 0, 0, 1, 0, 2, 2, 0, 2, 1]
}

dataset = Dataset.from_dict(data)


train_data, test_data = train_test_split(dataset, test_size=0.2)

train_dataset = Dataset.from_dict(train_data)
test_dataset = Dataset.from_dict(test_data)

# Load tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


#Tokenization
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["sentence"],
        truncation=True,
        padding=True,
        is_split_into_words=False,  
    )
    
    labels_task_b = []
    for i, sentence in enumerate(examples["sentence"]):
        task_b_label = examples["task_b_labels"][i]  

        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  
            elif word_idx != previous_word_idx:
                label_ids.append(task_b_label)  
            else:
                label_ids.append(-100)  
            previous_word_idx = word_idx
        
        labels_task_b.append(label_ids)

    tokenized_inputs["task_a_label"] = examples["task_a_label"]
    tokenized_inputs["task_b_labels"] = labels_task_b
    
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

print(tokenized_dataset)

Map:   0%|          | 0/29 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'task_a_label', 'task_b_labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 29
})


In [175]:
import torch
import torch.optim as optim
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from torch.utils.data import Dataset as TorchDataset
from sklearn.model_selection import train_test_split
import torch
from transformers import BertModel
from datasets import Dataset

train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

class DatasetWrapper(TorchDataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.columns = dataset.column_names  

    def __getitem__(self, idx):
        item = {}
        for key in self.columns:
            if isinstance(self.dataset[key][idx], str):  
                item[key] = self.dataset[key][idx]  
            else:
                item[key] = torch.tensor(self.dataset[key][idx])  
        return item

    def __len__(self):
        return len(self.dataset)
    
train_dataloader = DataLoader(DatasetWrapper(train_dataset), batch_size=2, shuffle=True)
test_dataloader = DataLoader(DatasetWrapper(test_dataset), batch_size=2, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
model = MultiTaskTransformer().to(device)  
optimizer = optim.Adam(model.parameters(), lr=1e-5)

classification_loss_fn = torch.nn.CrossEntropyLoss()
ner_loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)  

num_epochs = 3  

for epoch in range(num_epochs):
    model.train()
    total_classification_loss = 0
    total_ner_loss = 0
    correct_classifications = 0
    total_classifications = 0
    all_ner_preds = []
    all_ner_labels = []

    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        task_a_labels = batch["task_a_label"].to(device)
        task_b_labels = batch["task_b_labels"].to(device)
        
        optimizer.zero_grad()
        class_logits, ner_logits = model(input_ids, attention_mask)
        class_loss = classification_loss_fn(class_logits, task_a_labels)
        batch_size, seq_len = ner_logits.size(0), ner_logits.size(1)
        ner_logits_reshaped = ner_logits.view(batch_size * seq_len, -1)
        task_b_labels_reshaped = task_b_labels.view(-1)
        ner_loss = ner_loss_fn(ner_logits_reshaped, task_b_labels_reshaped)
        total_loss = class_loss + ner_loss
        total_loss.backward()
        optimizer.step()
        total_classification_loss += class_loss.item()
        total_ner_loss += ner_loss.item()
        class_preds = torch.argmax(class_logits, dim=-1)
        correct_classifications += (class_preds == task_a_labels).sum().item()
        total_classifications += task_a_labels.size(0)
        ner_preds = torch.argmax(ner_logits, dim=-1)
        ner_preds_flat = ner_preds.view(-1)
        task_b_labels_reshaped_flat = task_b_labels_reshaped.view(-1)
        valid_indices = task_b_labels_reshaped_flat != -100
        ner_preds_valid = ner_preds_flat[valid_indices].cpu().numpy()
        task_b_labels_valid = task_b_labels_reshaped_flat[valid_indices].cpu().numpy()
        all_ner_preds.extend(ner_preds_valid)
        all_ner_labels.extend(task_b_labels_valid)
        
    epoch_classification_loss = total_classification_loss / len(train_dataloader)
    epoch_ner_loss = total_ner_loss / len(train_dataloader)
    classification_accuracy = correct_classifications / total_classifications
    f1 = f1_score(all_ner_labels, all_ner_preds, average='macro')
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Classification Loss: {epoch_classification_loss:.4f}, NER Loss: {epoch_ner_loss:.4f}")
    print(f"Classification Accuracy: {classification_accuracy:.4f}")
    print(f"NER F1-Score: {f1:.4f}")
    model.eval()
    total_test_classification_loss = 0
    total_test_ner_loss = 0
    correct_test_classifications = 0
    total_test_classifications = 0
    all_test_ner_preds = []
    all_test_ner_labels = []
    
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            task_a_labels = batch["task_a_label"].to(device)
            task_b_labels = batch["task_b_labels"].to(device)
            class_logits, ner_logits = model(input_ids, attention_mask)
            class_loss = classification_loss_fn(class_logits, task_a_labels)
            batch_size, seq_len = ner_logits.size(0), ner_logits.size(1)
            ner_logits_reshaped = ner_logits.view(batch_size * seq_len, -1)
            task_b_labels_reshaped = task_b_labels.view(-1)
            ner_loss = ner_loss_fn(ner_logits_reshaped, task_b_labels_reshaped)
            total_test_classification_loss += class_loss.item()
            total_test_ner_loss += ner_loss.item()
            class_preds = torch.argmax(class_logits, dim=-1)
            correct_test_classifications += (class_preds == task_a_labels).sum().item()
            total_test_classifications += task_a_labels.size(0)
            ner_preds = torch.argmax(ner_logits, dim=-1)
            ner_preds_flat = ner_preds.view(-1)
            task_b_labels_reshaped_flat = task_b_labels_reshaped.view(-1)
            valid_indices = task_b_labels_reshaped_flat != -100
            ner_preds_valid = ner_preds_flat[valid_indices].cpu().numpy()
            task_b_labels_valid = task_b_labels_reshaped_flat[valid_indices].cpu().numpy()
            all_test_ner_preds.extend(ner_preds_valid)
            all_test_ner_labels.extend(task_b_labels_valid)
        test_classification_accuracy = correct_test_classifications / total_test_classifications
        test_f1 = f1_score(all_test_ner_labels, all_test_ner_preds, average='macro')
        print(f"Test Classification Accuracy: {test_classification_accuracy:.4f}")
        print(f"Test NER F1-Score: {test_f1:.4f}")


Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Epoch 1/3
Classification Loss: 0.7931, NER Loss: 1.1368
Classification Accuracy: 0.7391
NER F1-Score: 0.2233
Test Classification Accuracy: 0.3333
Test NER F1-Score: 0.1237
Epoch 2/3
Classification Loss: 0.5337, NER Loss: 1.0949
Classification Accuracy: 0.7391
NER F1-Score: 0.3725
Test Classification Accuracy: 0.3333
Test NER F1-Score: 0.0000
Epoch 3/3
Classification Loss: 0.4164, NER Loss: 0.9890
Classification Accuracy: 0.7826
NER F1-Score: 0.3487
Test Classification Accuracy: 0.6667
Test NER F1-Score: 0.0317
