In [None]:
# Requried Installations
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

Collecting unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-ubgnids5/unsloth_e6854c2c46ce4ea3a7b31138abefddd0
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-ubgnids5/unsloth_e6854c2c46ce4ea3a7b31138abefddd0
  Resolved https://github.com/unslothai/unsloth.git to commit 8dc0561ec0776fcc49d8a406c8a0acf295bd561a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tyro (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading tyro-0.8.4-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets>=2.16.0 (from unsloth[colab-ne

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

<h2> Transformer Based Ensembling Model Defination

In [None]:
class SingleHeadAttention(nn.Module):
    def __init__(self, input_dim):
        super(SingleHeadAttention, self).__init__()
        self.input_dim = input_dim

        # Learnable parameters
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)

        # self.out = nn.Linear(input_dim, input_dim)

    def forward(self, x):
        # Calculate query, key, and value
        Q = self.query(x)
        K = self.key(x)
        V = self.value(x)

        # Calculate attention scores
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.input_dim, dtype=torch.float32))

        # Apply softmax to get attention weights
        attention_weights = torch.softmax(attention_scores, dim=-1)

        # Calculate attention output
        attention_output = torch.matmul(attention_weights, V)

        # output = self.out(attention_output)

        return attention_output


In [None]:
class FusionTransformer(nn.Module):
    def __init__ (self,model1,model2,modelSize=32000,device='cuda'):
      super().__init__()

      self.model1 = model1.to(device)
      self.model2 = model2.to(device)
      self.linear1 = nn.Linear(modelSize,4).to(device)
      self.linear2 = nn.Linear(modelSize,4).to(device)
      self.transformer = SingleHeadAttention(input_dim=4*2).to(device)
      # self.transformer = nn.Transformer(d_model=8,nhead=1,num_encoder_layers=1,num_decoder_layers=1).to(device)
      # self.relu = nn.ReLU()
      self.linear = nn.Linear(8, 4).to(device)

    def forward(self,inputIndices, attn_mask):
      y1 = self.model1(inputIndices, attn_mask = attn_mask).logits
      y2 = self.model2(inputIndices, attn_mask = attn_mask).logits

      n,h,w = y1.shape

      y1 = y1[:,h-1,:]
      y2 = y2[:,h-1,:]
      y1 = self.linear1(y1)
      y2 = self.linear2(y2)
      y = torch.cat((y1,y2),dim=1)
      y = self.transformer(y)
      y = self.linear(y)

      return y


<h2> Data Import </h2>

In [None]:
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset,DatasetDict

from transformers import (
    AutoModelForSequenceClassification,
    MistralForSequenceClassification,
    PretrainedConfig,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
dataset_location = "/content/drive/MyDrive/685 Final Project/Datasets/medmcqa-prompts"

In [None]:
# train_dataset = load_from_disk(f"{dataset_location}/train_prompts_micro.hf")
# # test_dataset = load_from_disk(f"{dataset_location}/test_prompts_micro.hf")
# eval_dataset = load_from_disk(f"{dataset_location}/eval_prompts_micro.hf")

train_dataset = load_from_disk(f"{dataset_location}/train_prompts_mini.hf")
# test_dataset = load_from_disk(f"{dataset_location}/test_prompts_mini.hf")
eval_dataset = load_from_disk(f"{dataset_location}/eval_prompts_mini.hf")

In [None]:
train_dataset

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'prompt', 'label_one_hot'],
    num_rows: 20000
})

In [None]:
eval_dataset

Dataset({
    features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name', 'prompt', 'label_one_hot'],
    num_rows: 2000
})

<h2> Base Models </h2>

In [None]:
# Load pre-trained models
from unsloth import FastLanguageModel

max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model_location = "/content/drive/MyDrive/685 Final Project/Models"

model1, tokenizer = FastLanguageModel.from_pretrained(model_location + "/unsloth_domain1",
                                                     max_seq_length=max_seq_length,
                                                     dtype=dtype,
                                                     load_in_4bit=load_in_4bit)

model2, tokenizer = FastLanguageModel.from_pretrained(model_location + "/ai2_arc_instruction_tuned_mistral_7b_1",
                                                     max_seq_length=max_seq_length,
                                                     dtype=dtype,
                                                     load_in_4bit=load_in_4bit)

==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


<h2>Data Preprocessing</h2>

In [None]:
from torch.utils.data import DataLoader, Dataset

class MCQDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # Changed to float for one-hot encoding
        return item

    def __len__(self):
        return len(self.labels)

# Function to encode the data
def encode_data(tokenizer, prompts):
    # encodings = tokenizer(prompts, truncation=True, padding=True, max_length=2048)
    encodings = tokenizer(prompts, truncation=True, padding=True)
    return encodings

# Prepare the data for tokenization
prompts = [item['prompt'] for item in train_dataset]
labels = [item['label_one_hot'] for item in train_dataset]  # one-hot encoded labels

# Tokenize data
encodings = encode_data(tokenizer, prompts)

# Create dataset
train_set = MCQDataset(encodings, labels)

# DataLoader
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)


prompts = [item['prompt'] for item in eval_dataset]
labels = [item['label_one_hot'] for item in eval_dataset]  # one-hot encoded labels

# Tokenize data
encodings = encode_data(tokenizer, prompts)

# Create dataset
eval_set = MCQDataset(encodings, labels)

# DataLoader
val_loader = DataLoader(eval_set, batch_size=32, shuffle=True)

<h2> Model Training and Evaluation </h2>

In [None]:
from torch.cuda.amp import GradScaler, autocast
from tqdm import tqdm
from torch.nn.functional import softmax
def train_and_validate(model, train_loader, val_loader, log_file_path,epochs=3):
    best_val_acc=0
    saved_model_location = "/content/drive/MyDrive/685 Final Project/Models"

    scaler = GradScaler()
    device = torch.device("cuda")
    model = model.to(device)  # Ensures model and all submodules are float32
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = torch.nn.CrossEntropyLoss()

    # for epoch in tqdm(range(epochs)):
    with open(log_file_path, 'a') as log_file:
      log_file.write("Starting training process...\n")
      log_file.flush()
      print("Log File created!")

      for epoch in range(epochs):
          total_train_loss = 0
          total_train_correct = 0
          train_samples = 0
          # correct=list()
          model.train()
          train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} [TRAIN]", unit="batch")
          for i, batch in enumerate(train_pbar):
              input_ids, labels, attn_mask = batch['input_ids'].to(device), batch['labels'].to(device), batch['attention_mask'].to(device)
              train_samples += labels.size(0)
              optimizer.zero_grad()
              with torch.cuda.amp.autocast():
                  output = model(input_ids,attn_mask).float()
                  loss = criterion(output, labels.float())
                  predictions = torch.argmax(softmax(output,dim=1), dim=1)
                  labels_indices = torch.argmax(labels, dim=1)

                  train_correct = (predictions == labels_indices).sum().item()
                  total_train_correct += train_correct
                  # print("\nTotal Correct : ", train_correct)
              log_file.write(f"Batch {i}, Epoch {epoch+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {100 * total_train_correct / train_samples:.2f}%\n")
              log_file.flush()

              scaler.scale(loss).backward()
              scaler.step(optimizer)
              scaler.update()
              total_train_loss += loss.item()

              train_pbar.set_postfix(loss=loss.item(), temp_acc=100 * total_train_correct / train_samples)


              # if i % 1000 == 0:
              #     print(i, loss.item())
              #     print(f"Temp accuracy: ", total_train_correct / train_samples * 100)

              # Releasing the memory
              del input_ids, labels, output, loss, predictions, labels_indices





          avg_train_loss = total_train_loss / len(train_loader)
          train_accuracy = total_train_correct / train_samples * 100
          print(f"Training Accuracy: ", train_accuracy)
          print(f"Epoch {epoch+1}, Loss: {avg_train_loss}")

          model.eval()
          total_val_loss, val_samples, total_val_correct = 0, 0, 0
          eval_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1} [EVAL]", unit="batch")
          with torch.no_grad():
              for i, batch in enumerate(eval_pbar):
                  input_ids, labels, attn_mask = batch['input_ids'].to(device), batch['labels'].to(device), batch['attention_mask'].to(device)
                  with torch.cuda.amp.autocast():
                      outputs = model(input_ids,attn_mask).float()
                      val_loss = criterion(outputs, labels.float())
                      predictions = torch.argmax(softmax(outputs,dim=1), dim=1)
                      labels_indices = torch.argmax(labels, dim=1)
                      total_val_correct += (predictions == labels_indices).sum().item()

                  total_val_loss += val_loss.item()
                  val_samples += labels.size(0)
                  eval_pbar.set_postfix(loss=val_loss.item(), temp_acc=100 * total_val_correct / val_samples)

          avg_val_loss = total_val_loss / len(val_loader)
          val_accuracy = total_val_correct / val_samples * 100

          if val_accuracy > best_val_acc:
            best_val_acc=val_accuracy
            model_save_path = f"{saved_model_location}/TransformerFusionMiniBest.pth"
            torch.save(model.state_dict(), model_save_path)
            print("Best model Saved at", model_save_path)

          print(f"Validation Accuracy: ", val_accuracy)
          print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}")



In [None]:
torch.cuda.empty_cache()

In [None]:
torch.manual_seed(42)

<torch._C.Generator at 0x7e1bec17e990>

In [None]:
ft = FusionTransformer(model1,model2,32000)

In [None]:
log_file_path = '/content/TransformerFusionMiniLogger.txt'

In [None]:
train_and_validate(ft,train_loader,val_loader,log_file_path,epochs=3)

Log File created!


Epoch 1 [TRAIN]: 100%|██████████| 625/625 [1:31:36<00:00,  8.79s/batch, loss=1.4, temp_acc=26.5]


Training Accuracy:  26.495
Epoch 1, Loss: 1.7152837938308716


Epoch 1 [EVAL]: 100%|██████████| 63/63 [01:54<00:00,  1.82s/batch, loss=1.35, temp_acc=31.9]


Best model Saved at /content/drive/MyDrive/685 Final Project/Models/TransformerFusionMiniBest.pth
Validation Accuracy:  31.85
Epoch 1 - Validation Loss: 1.3832


Epoch 2 [TRAIN]: 100%|██████████| 625/625 [1:31:36<00:00,  8.79s/batch, loss=1.45, temp_acc=27]


Training Accuracy:  26.979999999999997
Epoch 2, Loss: 1.43706215133667


Epoch 2 [EVAL]: 100%|██████████| 63/63 [01:54<00:00,  1.82s/batch, loss=1.3, temp_acc=26.2]


Validation Accuracy:  26.25
Epoch 2 - Validation Loss: 1.4317


Epoch 3 [TRAIN]: 100%|██████████| 625/625 [1:31:36<00:00,  8.79s/batch, loss=1.38, temp_acc=26.4]


Training Accuracy:  26.405
Epoch 3, Loss: 1.4524910228729249


Epoch 3 [EVAL]: 100%|██████████| 63/63 [01:54<00:00,  1.82s/batch, loss=1.42, temp_acc=26.2]

Validation Accuracy:  26.25
Epoch 3 - Validation Loss: 1.4846



