In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install transformers datasets peft torch
! pip install -U bitsandbytes


In [None]:
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, get_scheduler
from datasets import load_dataset


# Load the model and tokenizer
model_name = "mistralai/Mistral-7B-v0.1"  # Replace with your desired model

# The Hugging Face access token is read from the environment rather than being
# embedded in the notebook. When HF_TOKEN is unset this is None and the
# library's own credential lookup is used.
# SECURITY: a literal token was previously committed here; treat it as leaked
# and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")

# Quantisation and device placement only concern the model weights, so the
# tokenizer is loaded with the access token alone.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    load_in_4bit=True,
    device_map="cuda",
)

# Prefix tuning parameters
prefix_length = 10  # Number of prefix tokens
hidden_size = model.config.hidden_size

# The PrefixTuning wrapper itself is defined and instantiated in later cells.

# Tokenization and dataset preprocessing
MAX_LENGTH = 512
# Reuse the EOS token for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

def preprocess_function(example):
    """Turn one dataset record into a fixed-length causal-LM training example.

    The prompt (``Body`` and ``Question`` joined by a space) and the ``Answer``
    are tokenised separately and laid out in ONE sequence:

        [BOS] prompt tokens | answer tokens [EOS] | padding

    ``labels`` has the same length as ``input_ids`` and holds the answer tokens
    at the positions where they occur; prompt and padding positions are set to
    -100 (the ignore index of the loss), so only the answer is learned.

    Args:
        example: mapping with ``'Body'``, ``'Question'`` and ``'Answer'`` keys.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of exactly ``MAX_LENGTH`` ints (right-padded).
    """
    prompt_text = f"{example['Body'].strip()} {example['Question'].strip()}"
    answer_text = str(example['Answer']).strip()

    bos = [] if tokenizer.bos_token_id is None else [tokenizer.bos_token_id]
    eos = [] if tokenizer.eos_token_id is None else [tokenizer.eos_token_id]
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0

    # Special tokens are added by hand so the layout above is explicit.
    answer_ids = tokenizer(answer_text, add_special_tokens=False)["input_ids"] + eos
    answer_ids = answer_ids[:MAX_LENGTH]
    prompt_ids = bos + tokenizer(prompt_text, add_special_tokens=False)["input_ids"]
    # The answer is kept whole; an over-long prompt is cut to the room that is left.
    prompt_ids = prompt_ids[:MAX_LENGTH - len(answer_ids)]

    input_ids = prompt_ids + answer_ids
    labels = [-100] * len(prompt_ids) + answer_ids

    pad_len = MAX_LENGTH - len(input_ids)
    attention_mask = [1] * len(input_ids) + [0] * pad_len
    input_ids = input_ids + [pad_id] * pad_len
    labels = labels + [-100] * pad_len  # never train on padding

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# Fetch the SVAMP training split and run every record through the preprocessing step.
dataset = load_dataset("ChilleD/SVAMP", split='train')
preprocessed_data = list(map(preprocess_function, dataset))

from datasets import Dataset
# Pivot the list of per-example mappings into one list per column and wrap it in a Dataset.
train_dataset = Dataset.from_dict(
    dict(
        (column, [row[column] for row in preprocessed_data])
        for column in preprocessed_data[0].keys()
    )
)


In [None]:
class PrefixTuning(nn.Module):
    """Wrap a causal LM and prepend a trainable soft prompt to its input embeddings.

    The learned vectors are injected at the embedding layer only: ``forward``
    concatenates ``prefix_length`` trainable vectors in front of the token
    embeddings and extends the attention mask and labels to match.

    Args:
        model: wrapped language model; it must expose ``get_input_embeddings()``
            and accept ``inputs_embeds``, ``attention_mask`` and ``labels``.
        prefix_length: number of trainable prefix vectors.
        hidden_size: width of each prefix vector (the model's embedding size).
        dtype: storage dtype of the prefix parameter.
        device: device on which the prefix parameter is created.
    """

    def __init__(self, model, prefix_length, hidden_size, dtype=torch.float16, device='cuda'):
        super().__init__()
        self.model = model
        self.prefix_length = prefix_length
        self.hidden_size = hidden_size

        # Small random initialisation of the trainable prefix vectors.
        self.prefix_embedding = nn.Parameter(
            torch.randn(prefix_length, hidden_size, dtype=dtype, device=device) * 0.01
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the wrapped model on ``[prefix ; token embeddings]``.

        Args:
            input_ids: token ids, indexed as (batch, sequence).
            attention_mask: optional mask aligned with ``input_ids``; the prefix
                positions are added as attended (ones).
            labels: optional labels aligned with ``input_ids``; the prefix
                positions are filled with -100 so they are ignored by the loss.

        Returns:
            Whatever the wrapped model returns for the extended inputs.
        """
        # Extract model's embeddings for the input tokens
        input_embeddings = self.model.get_input_embeddings()(input_ids)
        batch_size = input_embeddings.size(0)

        # Align the prefix with the embeddings' dtype/device before concatenating,
        # so a prefix stored in another precision does not silently promote the
        # dtype of the tensor handed to the model.
        prefix = self.prefix_embedding.to(
            device=input_embeddings.device, dtype=input_embeddings.dtype
        )
        prefix_embeddings = prefix.unsqueeze(0).expand(batch_size, -1, -1)

        # Concatenate the prefix with the input embeddings
        extended_embeddings = torch.cat((prefix_embeddings, input_embeddings), dim=1)

        # Adjust attention mask for the prefix
        extended_attention_mask = None
        if attention_mask is not None:
            prefix_attention_mask = attention_mask.new_ones((batch_size, self.prefix_length))
            extended_attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)

        # Pad labels with -100 (ignore index for the loss) over the prefix positions
        extended_labels = labels
        if labels is not None:
            prefix_padding = labels.new_full((labels.size(0), self.prefix_length), -100)
            extended_labels = torch.cat((prefix_padding, labels), dim=1)

        # Forward pass through the model with the modified embeddings and labels
        return self.model(
            inputs_embeds=extended_embeddings,
            attention_mask=extended_attention_mask,
            labels=extended_labels,
        )


In [None]:

# Train the soft prompt: build the wrapper, freeze the base model, then run a
# plain one-example-per-step loop over the training set.
from transformers import get_linear_schedule_with_warmup

torch.manual_seed(42)  # reproducible prefix initialisation

# Initialize the PrefixTuning model
prefix_tuning_model = PrefixTuning(
    model=model,
    prefix_length=prefix_length,
    hidden_size=hidden_size,
    dtype=torch.float16,  # Set to the dtype your model is using (e.g., float16 for mixed precision)
    device='cuda'  # Set the device accordingly
)

# Batches are sent to wherever the trainable prefix lives.
device = prefix_tuning_model.prefix_embedding.device

# Freeze the base model parameters BEFORE building the optimizer so that only
# the prefix is handed to it.
for param in prefix_tuning_model.model.parameters():
    param.requires_grad = False
trainable_params = [p for p in prefix_tuning_model.parameters() if p.requires_grad]

# NOTE(review): `AdamW` here is the class imported from transformers at the top
# of the notebook; torch.optim.AdamW is the maintained equivalent - confirm
# against the installed transformers version.
optimizer = AdamW(trainable_params, lr=1e-3, weight_decay=1e-4)

num_epochs = 3
steps_per_epoch = len(train_dataset)
num_training_steps = num_epochs * steps_per_epoch

# One schedule only (linear warmup, then linear decay). Two schedulers used to
# be attached to this optimizer and both stepped every iteration, which left
# the effective learning rate ambiguous.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=500,  # Adjust the number of warmup steps
    num_training_steps=num_training_steps
)
lr_scheduler = scheduler  # alias kept for code that refers to the old name

LOG_EVERY = 50  # print the running loss every N steps instead of flooding the output

# Custom training loop
for epoch in range(num_epochs):
    prefix_tuning_model.train()
    total_loss = 0.0

    for idx in range(steps_per_epoch):
        batch = train_dataset[idx]

        # Each row is a single example; add a batch dimension of 1.
        input_ids = torch.tensor(batch['input_ids']).unsqueeze(0).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).unsqueeze(0).to(device)
        labels = torch.tensor(batch['labels']).unsqueeze(0).to(device)

        # Forward pass (the mask keeps padded positions out of attention)
        outputs = prefix_tuning_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        total_loss += step_loss
        if idx % LOG_EVERY == 0 or idx == steps_per_epoch - 1:
            print(f"Epoch: {epoch + 1}/{num_epochs}, Step: {idx}, Loss: {step_loss}")

    # Average over the steps actually run (guarded against an empty dataset).
    avg_loss = total_loss / max(steps_per_epoch, 1)
    print(f"Epoch {epoch + 1} finished. Average Loss: {avg_loss}")


In [None]:
def run_inference(prefix_tuning_model, tokenizer, prompt, max_length=100, device='cuda'):
    """Generate a greedy completion for ``prompt`` with the learned prefix prepended.

    Args:
        prefix_tuning_model: wrapper exposing ``model``, ``prefix_embedding`` and
            ``prefix_length``; it is switched to eval mode.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        prompt: input text.
        max_length: total length budget, counting the prefix vectors and the
            prompt tokens as well as the generated tokens. At least one token
            is always generated, even if the prompt alone exceeds the budget.
        device: device the encoded prompt is moved to.

    Returns:
        The decoded text of the first generated sequence (special tokens removed).
    """
    # Tokenize the prompt and use the tokenizer's own attention mask. Comparing
    # ids against pad_token_id is unreliable when the pad token doubles as EOS.
    encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Set the model to evaluation mode
    prefix_tuning_model.eval()
    base_model = prefix_tuning_model.model

    with torch.no_grad():  # No need to compute gradients during inference
        # Extract model's embeddings for the input tokens
        input_embeddings = base_model.get_input_embeddings()(input_ids)
        batch_size = input_embeddings.size(0)

        # Expand the learned prefix over the batch, aligned with the embeddings' dtype/device
        prefix = prefix_tuning_model.prefix_embedding.to(
            device=input_embeddings.device, dtype=input_embeddings.dtype
        )
        prefix_embeddings = prefix.unsqueeze(0).expand(batch_size, -1, -1)

        # Concatenate the prefix with the input embeddings
        extended_embeddings = torch.cat((prefix_embeddings, input_embeddings), dim=1)

        # Extend the attention mask over the prefix (same dtype as the prompt mask)
        prefix_attention_mask = attention_mask.new_ones(
            (batch_size, prefix_tuning_model.prefix_length)
        )
        extended_attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)

        # Turn the total budget into an explicit number of new tokens.
        # NOTE(review): intended to match how generate() treats `max_length`
        # when only `inputs_embeds` is supplied - confirm for the installed
        # transformers version.
        max_new_tokens = max(1, max_length - extended_embeddings.size(1))

        # Perform generation (inference)
        generated_output = base_model.generate(
            inputs_embeds=extended_embeddings,
            attention_mask=extended_attention_mask,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,  # Avoid repeating phrases
            do_sample=False,  # Greedy decoding
            pad_token_id=tokenizer.pad_token_id,
        )

        # Decode the generated tokens to text
        output_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    return output_text


In [None]:
# A small algebra question used to smoke-test the tuned model.
prompt = "If 2x+3=7, what is the value of x?"

# Generate a completion with the trained prefix and show it.
output = run_inference(
    prefix_tuning_model=prefix_tuning_model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_length=100,
    device='cuda',
)

print(output)
