In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
! pip install transformers datasets peft torch
! pip install -U bitsandbytes


In [None]:
import torch
from torch.optim import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
from datasets import load_dataset
from peft import PromptTuningConfig, get_peft_model

# Load the base model and tokenizer. The weights are loaded with 4-bit
# quantization and `device_map="cuda"` places the model on the CUDA device.
import os  # kept local so this cell does not depend on the boilerplate cell

model_name = "meta-llama/Llama-3.1-8B"

# SECURITY: never hard-code an access token in a notebook. A token that was
# previously committed in this cell must be considered leaked and revoked.
# The token is read from the HF_TOKEN environment variable instead; when it is
# unset, `None` is passed and the library falls back to its own credential lookup.
hf_token = os.environ.get("HF_TOKEN")

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,      # `token` replaces the deprecated `use_auth_token`
    load_in_4bit=True,   # Enable 4-bit quantization
    # NOTE(review): newer transformers releases prefer
    # `quantization_config=BitsAndBytesConfig(load_in_4bit=True)` - confirm
    # against the installed version.
    device_map="cuda",   # Put every layer on the GPU (no CPU offloading)
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
)

# Prompt Tuning settings: a causal-language-modeling task with 20 virtual
# tokens to tune.
prompt_tuning_kwargs = dict(
    task_type="CAUSAL_LM",
    num_virtual_tokens=20,
)
peft_config = PromptTuningConfig(**prompt_tuning_kwargs)

# Wrap the loaded model with the prompt-tuning adapter described above.
peft_model = get_peft_model(model, peft_config)

# Fetch the training split of the SVAMP dataset.
dataset = load_dataset(path="ChilleD/SVAMP", split="train")




In [None]:
# Tokenization settings used by the preprocessing step.
# Every tokenized sequence is truncated/padded to this many tokens; adjust it
# to the model's input size limit.
MAX_LENGTH = 512

# Padding needs a pad token: when the tokenizer has none, reuse its EOS token.
if tokenizer.pad_token is None:
    pad_token_spec = {'pad_token': tokenizer.eos_token}
    tokenizer.add_special_tokens(pad_token_spec)

def preprocess_function(example, tok=None, max_length=None):
    """Turn one SVAMP record into a fixed-length causal-LM training example.

    The prompt (Body + " " + Question) and the answer are tokenized and laid
    out in ONE sequence: ``prompt tokens, answer tokens, EOS, padding``.
    The labels are a copy of that sequence in which every prompt and padding
    position is replaced by -100, so that only the answer (and EOS) tokens
    contribute to the loss and the labels stay aligned with ``input_ids``.

    Args:
        example: Mapping with the keys 'Body', 'Question' and 'Answer'.
        tok: Tokenizer to use. Defaults to the notebook-level ``tokenizer``.
        max_length: Length of every returned list. Defaults to the
            notebook-level ``MAX_LENGTH``.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        exactly ``max_length`` ints.
    """
    tok = tokenizer if tok is None else tok
    max_length = MAX_LENGTH if max_length is None else max_length
    ignore_index = -100  # label value skipped by the cross-entropy loss

    # Join body and question with a space (they were previously glued together).
    prompt = f"{str(example['Body']).strip()} {str(example['Question']).strip()}"
    # str() so the answer works whether it is stored as text or as a number.
    answer = f" {example['Answer']}"

    # Special tokens (e.g. BOS) only at the very start of the sequence.
    prompt_ids = list(tok(prompt, add_special_tokens=True)["input_ids"])
    answer_ids = list(tok(answer, add_special_tokens=False)["input_ids"])
    answer_ids.append(tok.eos_token_id)  # teach the model where the answer ends

    # When the pair is too long, keep the answer and shorten the prompt.
    answer_ids = answer_ids[:max_length]
    prompt_ids = prompt_ids[: max_length - len(answer_ids)]

    pad_id = tok.pad_token_id if tok.pad_token_id is not None else tok.eos_token_id
    n_real = len(prompt_ids) + len(answer_ids)
    n_pad = max_length - n_real

    return {
        "input_ids": prompt_ids + answer_ids + [pad_id] * n_pad,
        "attention_mask": [1] * n_real + [0] * n_pad,
        "labels": [ignore_index] * len(prompt_ids) + answer_ids + [ignore_index] * n_pad,
    }

# Run the preprocessing function over every sample of the dataset.
preprocessed_data = [preprocess_function(example) for example in dataset]

# Report how many samples were preprocessed.
print(len(preprocessed_data))
from datasets import Dataset

# Pivot the per-sample mappings into one list per key (the keys are taken
# from the first sample) and build the training dataset from those columns.
column_names = preprocessed_data[0].keys()
columns = {}
for key in column_names:
    columns[key] = [record[key] for record in preprocessed_data]
train_dataset = Dataset.from_dict(columns)

# Now `train_dataset` can be used in training



# Pick the target device and move the model there BEFORE building the
# optimizer, so the optimizer is created over parameters in their final place.
# NOTE(review): the base model was already placed by `device_map`; this call
# presumably matters for the parameters added by the PEFT wrapper - confirm.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
peft_model.to(device)

# Define the optimizer and scheduler.
# Only parameters that require gradients are handed to the optimizer; frozen
# weights never receive a gradient, so tracking them is pure overhead.
trainable_params = [param for param in peft_model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr=5e-5)
num_epochs = 3
# One optimizer step per training example (the training loop uses batch size 1).
num_training_steps = num_epochs * len(train_dataset)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Custom training loop: one example per optimizer step (batch size 1).
LOG_EVERY = 50                        # print the running loss every N steps
steps_per_epoch = len(train_dataset)  # derived from the data, not hard-coded

for epoch in range(num_epochs):
    peft_model.train()  # Set the model to training mode
    total_loss = 0.0

    for idx in range(steps_per_epoch):
        batch = train_dataset[idx]

        # Add a batch dimension and move everything to the training device.
        input_ids = torch.tensor(batch['input_ids']).unsqueeze(0).to(device)
        # The attention mask keeps the model from attending to padding tokens.
        attention_mask = torch.tensor(batch['attention_mask']).unsqueeze(0).to(device)
        labels = torch.tensor(batch['labels']).unsqueeze(0).to(device)

        # Forward pass
        outputs = peft_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        lr_scheduler.step()  # Update learning rate
        optimizer.zero_grad()

        step_loss = loss.item()
        total_loss += step_loss

        # Print progress periodically (and on the last step) instead of every step.
        if idx % LOG_EVERY == 0 or idx == steps_per_epoch - 1:
            print(f"Epoch: {epoch + 1}/{num_epochs}, Step: {idx}, Loss: {step_loss}")

    # Average over the steps actually taken; guard against an empty dataset.
    avg_loss = total_loss / max(steps_per_epoch, 1)
    print(f"Epoch {epoch + 1} finished. Average Loss: {avg_loss}")

In [None]:
# # Function to perform inference on the fine-tuned model
# def generate_response(prompt):
#     # Tokenize the input prompt
#     inputs = tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)
    
#     # Move the inputs to the appropriate device (CPU or GPU)
#     input_ids = inputs['input_ids'].to(device)
#     attention_mask = inputs['attention_mask'].to(device)
    
#     # Put the model in evaluation mode
#     peft_model.eval()
    
#     # Generate a response using the model (greedy decoding for simplicity)
#     with torch.no_grad():
#         outputs = peft_model.generate(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             max_length=MAX_LENGTH,
#             num_return_sequences=1,  # Generate only one sequence
#             do_sample=False          # Greedy decoding
#         )
    
#     # Decode the generated tokens back into text
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
#     return generated_text

# # Sample prompt for testing (you can modify this with any other example)
# sample_prompt = "Solve the following equation: 2x + 3 = 7. What is the value of x?"

# # Generate response from the fine-tuned model
# generated_output = generate_response(sample_prompt)

# # Print the model's generated response
# print("Prompt:", sample_prompt)
# print("Model's Response:", generated_output)


In [None]:
# Set the model to evaluation mode
peft_model.eval()

sample_prompt = "Solve the equation: 2x + 3 = 7."

# Tokenize the input prompt (returns PyTorch tensors, moved to the model's device)
inputs = tokenizer(
    sample_prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,  # optional max_length
).to(device)

# Generate the output.
# `input_ids` is passed BY KEYWORD: for prompt-learning adapters the PEFT
# wrapper forwards only keyword arguments to the base model's `generate`, so a
# positional `input_ids` would not reach it.
generated_output = peft_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # pass attention mask
    max_length=512,  # Upper bound on the total sequence length
    num_return_sequences=1,  # Generate one response
    no_repeat_ngram_size=2,  # Optional: prevent repetition
    do_sample=False,  # Greedy decoding for deterministic output
    pad_token_id=tokenizer.eos_token_id,  # Set pad_token_id to eos_token_id
)

# Decode and print the generated text
print(tokenizer.decode(generated_output[0], skip_special_tokens=True))
