#### **PyTorch**

In [None]:
import torch
import torch.nn.functional as F

# Report the installed PyTorch build and whether a CUDA device is visible.
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA is available: {torch.cuda.is_available()}")

# get_device_name(0) raises when no CUDA device is present, so only query it
# when CUDA is actually available.
if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    print("Device name: n/a (no CUDA device visible)")

#### **Import Other Libraries**

In [1]:
from datasets import load_dataset 
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np
import evaluate
import transformers
from transformers import TrainingArguments
import torch 
import matplotlib.pyplot as plt 
from transformers import DataCollatorWithPadding
import os 
from pathlib import Path
import random 
from datasets import Dataset, DatasetDict
import warnings
from functools import partial
from datasets import concatenate_datasets
from functools import partial 
from tqdm import tqdm 
import textwrap
from IPython.display import display
from IPython.display import Markdown
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model 
from transformers import BitsAndBytesConfig
import os 
import re 
# Set the WANDB_DISABLED environment variable for this process.
os.environ["WANDB_DISABLED"] = "true"
# Ignore one specific warning, matched by the start of its message text.
warnings.filterwarnings(
    action='ignore',
    message='Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.',
)
from llmft.train import evaluate_model, ModelTrainer, EarlyStopping
from llmft.metrics import compute_recall
from llmft.losses import FocalLoss
from llmft.utils import predict
from llmft.generate import generate_dataset
from llmft.paper import generate_recall_table

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {device}")

Device: cuda


#### **Parameters**

In [3]:
# This cell is tagged with `parameters`
# Alternative checkpoint names carried over from the original cell (not used below):
#   "meta-llama/Meta-Llama-3-8B-Instruct", "google/gemma-1.1-7b-it", "microsoft/phi-2",
#   "meta-llama/Llama-2-7b-chat-hf", "distilbert-base-uncased"
model_name = "microsoft/Phi-3-mini-4k-instruct"  # checkpoint loaded for the model and the tokenizer
column = 'text'
epochs = 1
seed = 0          # seed passed to train_test_split
verbose = True
test_size = 0.5   # fraction passed to train_test_split
p = 0.0

#### **QLoRA**

In [4]:
from peft import LoraConfig, get_peft_model 
from transformers import BitsAndBytesConfig

# ----- QUANTIZATION -------# 
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models (the name of a torch dtype attribute)
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

# Resolve the dtype name above to the corresponding torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Enable fp16/bf16 training (set bf16 to True with an A100)
# NOTE(review): neither flag is read by any cell visible here — confirm they are still needed.
fp16 = False
bf16 = True

# Quantization settings handed to from_pretrained() when the base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16.
# The capability query raises when no CUDA device is present, so it is only
# performed when CUDA is available.
if use_4bit and compute_dtype == torch.float16 and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# ----- LORA -------# 

# Rank-8 adapters on the listed projection modules; every other LoRA option
# is left at the library default.
lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

Your GPU supports bfloat16: accelerate training with bf16=True


#### **Instantiate Model**

In [5]:
# Load the base checkpoint using the quantization settings in `bnb_config`.
# NOTE: trust_remote_code=True executes Python code shipped with the model
# repository; only use it with checkpoints you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# So we can do gradient checkpointing
model.config.use_cache = False
model.config.pretraining_tp = 1
# NOTE(review): setting this config attribute may not by itself switch checkpointing on
# in recent transformers releases; `model.gradient_checkpointing_enable()` is the
# documented entry point — confirm which one is intended.
model.config.gradient_checkpointing = True
model.enable_input_require_grads()
print(model.generation_config)

# Wrap the base model with the LoRA adapters described by `lora_config`.
model = get_peft_model(model, lora_config)
# print_trainable_parameters() prints its summary itself and returns None;
# wrapping it in print() only added a stray "None" line to the cell output.
model.print_trainable_parameters()

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.11649212031987152
None


In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def tokenizer_function(example):
    """Apply the tokenizer's chat template to ``example["messages"]`` and tokenize it.

    Returns the dict produced by ``apply_chat_template`` with ``return_dict=True``;
    no generation prompt is appended.
    """
    conversations = example["messages"]
    return tokenizer.apply_chat_template(
        conversations,
        tokenize=True,
        add_generation_prompt=False,
        return_dict=True,
    )

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


#### **Hyperparameters**

In [7]:
lr = 1e-4                 # learning rate handed to the optimizer
warmup_ratio = 0.         # fraction of scheduler steps used for warmup
batch_size = 2            # batch size of both data loaders
epochs = 30               # NOTE: replaces the `epochs` value set in the parameters cell
patience = float('inf')
gamma = 0.0

# Label the run according to the learning rate in use.
if lr == 1e-4:
    training_status = 'standard'
else:
    training_status = 'preferred'

#### **Data set**

In [8]:
# Load the 'train' split of the chat dataset.
dataset = load_dataset("ppower1/chat_instrument")['train']
# One binary label per conversation: 1 when the content of its third message is exactly 'Yes'.
# NOTE(review): labels are computed over the full split, before the 1000-row selection
# below, so they do not line up with the rows actually used — confirm this is intended.
labels = np.array([int(conversation[2]['content'] == 'Yes') for conversation in dataset['messages']])
# Keep only the first 1000 rows.
dataset = dataset.select(range(1000))

# Reshuffle and split the combined dataset with a fixed seed
dataset = dataset.train_test_split(test_size=test_size, seed=seed)  # adjust test_size as needed
# Tokenize both splits in batches, then drop the raw 'messages' column.
tokenized_dataset = dataset.map(tokenizer_function, batched=True).remove_columns(['messages'])

In [9]:
# Training batches are shuffled; both loaders pad each batch through DataCollatorWithPadding.
train_loader = DataLoader(
    tokenized_dataset['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)
# Test batches keep the dataset order.
test_loader = DataLoader(
    tokenized_dataset['test'],
    batch_size=batch_size,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

In [10]:
# AdamW over the model's parameters at the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear warmup/decay sized as one scheduler step per training batch over all epochs.
# NOTE(review): train_decoder steps the scheduler only once per accumulation window,
# so the schedule may never reach its end — confirm the intended step count.
scheduler = transformers.optimization.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_ratio*len(train_loader)*epochs),
    num_training_steps=len(train_loader)*epochs,
)

In [11]:
# Uniform class weights, kept for the optional focal-loss criterion shown below.
class_weights = torch.tensor([1., 1.], device=device) #torch.tensor([1-n_positive/n_samples, n_positive/n_samples], device=device)
# The focal-loss criterion used to be constructed here and was overwritten by the very
# next statement, so it never took effect. It is kept as a commented-out toggle:
# criterion = FocalLoss(alpha=class_weights, gamma=gamma, mode='output', reduction='none')
# Unreduced cross-entropy: train_decoder applies its own mask and averaging.
criterion = torch.nn.CrossEntropyLoss(reduction='none')

In [None]:
# NOTE(review): this ModelTrainer instance is not referenced by any other cell visible
# here (the training cells call train_decoder directly) — confirm it is still needed.
yuri = ModelTrainer()

In [None]:
# Run train_decoder once per epoch and collect the value it returns each time.
# NOTE: train_decoder is defined in a later cell of this notebook; that cell has to be
# executed before this one.
training_losses = list()

for epoch in range(epochs):
    train_loss = train_decoder(
        model=model,
        train_loader=train_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        metric=compute_recall,
        criterion=criterion,
        device=device,
    )
    training_losses.append(train_loss)


In [None]:
def train_decoder(model, train_loader, optimizer, scheduler, metric, criterion, device,
                  accumulation_steps=8):
    """Run one pass over ``train_loader`` with next-token loss and gradient accumulation.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` whose
            output exposes a ``.logits`` tensor with the vocabulary on the last axis.
        train_loader: iterable of batches holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        optimizer: stepped once per ``accumulation_steps`` batches, plus once at the
            end of the pass if a partial window is left over.
        scheduler: stepped together with the optimizer.
        metric: accepted for call compatibility; not used by this function.
        criterion: called as ``criterion(flat_logits, flat_targets)``; must return one
            loss value per target (e.g. ``CrossEntropyLoss(reduction='none')``).
        device: device the batch tensors are moved to.
        accumulation_steps: number of batches whose gradients are accumulated before
            each optimizer step.

    Returns:
        float: mean of the per-batch (unscaled) losses, or ``0.0`` for an empty loader.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    pending_grads = False  # True while gradients wait for an optimizer step

    optimizer.zero_grad()
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Position t predicts token t+1: drop the last input and the first target.
        labels = input_ids[:, 1:]
        inputs = input_ids[:, :-1]
        input_mask = attention_mask[:, :-1]
        # Targets are copied from input_ids and therefore never equal -100; padded
        # targets are instead identified through the shifted attention mask.
        label_mask = attention_mask[:, 1:].bool()

        # Read the logits directly: no labels are passed, so the output's loss is unset.
        logits = model(input_ids=inputs, attention_mask=input_mask).logits
        token_loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        # Average over real (non-padded) targets; clamp avoids 0/0 on an all-padding batch.
        loss = (token_loss * label_mask.reshape(-1)).sum() / label_mask.sum().clamp(min=1)

        # Scale only the gradient contribution; the reported loss stays unscaled.
        (loss / accumulation_steps).backward()
        total_loss += loss.item()
        n_batches += 1
        pending_grads = True

        if (i + 1) % accumulation_steps == 0:  # end of an accumulation window
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            pending_grads = False
            print(f"Batch Loss: {loss.item()}")
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Apply gradients of a trailing partial window instead of silently dropping them.
    if pending_grads:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    if n_batches == 0:
        return 0.0
    return total_loss / n_batches

In [None]:
# One additional pass over train_loader; the returned loss is the cell's displayed value.
train_decoder(
    model, train_loader, optimizer, scheduler,
    metric=compute_recall, criterion=criterion, device=device,
)

In [None]:
# Debug aid: print every batch index of train_loader (0 .. len(train_loader) - 1), one per line.
for i in range(len(train_loader)):
    print(i)