#### **PyTorch**

In [1]:
import torch
import torch.nn.functional as F
print(f"PyTorch Version: {torch.__version__}")

import torch
print(f"Cude is available: {torch.cuda.is_available()}")
print(f"Device name: {torch.cuda.get_device_name(0)}")

PyTorch Version: 2.3.0+cu121
Cude is available: True
Device name: NVIDIA H100 PCIe


#### **Import Other Libraries**

In [2]:
from datasets import load_dataset 
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np
import evaluate
import transformers
from transformers import TrainingArguments
import torch 
import matplotlib.pyplot as plt 
from transformers import DataCollatorWithPadding
import os 
from pathlib import Path
import random 
from datasets import Dataset, DatasetDict
import warnings
from functools import partial
from datasets import concatenate_datasets
from functools import partial 
from tqdm import tqdm 
import textwrap
from IPython.display import display
from IPython.display import Markdown
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model 
from transformers import BitsAndBytesConfig
import os 
import re 
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings('ignore', message='Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.')
from llmft.generate import generate_dataset

#### **Parameters**

In [3]:
# This cell is tagged with `parameters`
"meta-llama/Meta-Llama-3-8B-Instruct" #"google/gemma-1.1-7b-it" #microsoft/phi-2" #"microsoft/phi-2" #"#"meta-llama/Llama-2-7b-chat-hf" # "distilbert-base-uncased" 
model_name = "microsoft/Phi-3-mini-4k-instruct"
column = 'text'
epochs = 1
seed = 0
verbose = True 
test_size = 0.5
p = 0.0

In [4]:
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

#### **Visual Checks**

In [5]:
### ---         Print Markdown
def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))
### ---

### ---         Memory Check
def Memory():
    print("Current memory usage:")
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
### ---

Memory()

Current memory usage:
Allocated: 0.0 GB
Cached:    0.0 GB


#### **Qlora**

In [6]:
from peft import LoraConfig, get_peft_model 
from transformers import BitsAndBytesConfig

# ----- QUANTIZATION -------# 
# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = True

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# ----- LORA -------# 

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

Your GPU supports bfloat16: accelerate training with bf16=True


#### **Instantiate Model**

In [7]:
model = AutoModelForCausalLM.from_pretrained(model_name, 
                                             device_map="auto", 
                                             quantization_config=bnb_config, 
                                             trust_remote_code=True)# So we can do gradient checkpointing
model.config.use_cache = False
model.config.pretraining_tp = 1
model.config.gradient_checkpointing = True
model.enable_input_require_grads()
print(model.generation_config)
Memory()

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}

Current memory usage:
Allocated: 2.1 GB
Cached:    2.3 GB


In [8]:
dataset = load_dataset("ppower1/chat_instrument")['train']
labels = np.array([1 if i[2]['content'] == 'Yes' else 0 for i in dataset['messages']])
dataset['messages'][0]

[{'role': 'system', 'content': 'You are a housing court clerk'},
 {'role': 'user',
  'content': "Task: The following is a description of an eviction case. Predict whether the tenant has legal represenation (yes or no, and then explain your reasoning.)\n\n    Description: The Right to Counsel is not in effect in the tenant's zip code. This is a summary process (eviction) complaint filed in the Superior Court of Connecticut. The plaintiff, who is the landlord, is seeking to terminate the lease agreement with the defendant, who is the tenant, due to the lapse of time. The lease agreement, which was either oral or in writing, was entered into on January 1, 2015, and the defendant agreed to rent the premises located at 1 Davies Court (Basement and Common Area), Ansonia, CT XXXXX. The defendant agreed to pay $400 weekly or monthly on the 1st day of each week or month.\n\nThe plaintiff alleges that the defendant has used and occupied the premises as agreed under the lease and still occupies t

In [9]:


# dataset = generate_dataset(total_entries=1000, flip_rate=0.)

# def get_prompt(desc):
#     return f"""Task: The following is a description of an eviction case. Predict whether the tenant has legal represenation (yes or no, and then explain your reasoning.)

#     Description: {desc}

#     Prediction:"""

# messages = []
# for i, j in zip(dataset['text'], dataset['label']):
#    message = [{"role": "system", "content": "You are a housing court clerk"}, 
#    {"role": "user", "content": get_prompt(i)}, 
#    {"role": "assistant", "content": 'Yes' if j == 1 else 'No'}]
#    messages.append(message)

# dataset = Dataset.from_dict({'messages': messages})
# dataset[0]

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
pipe = pipeline("text-generation", 
                model=model, 
                tokenizer=tokenizer)

generation_args = {
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0,
    "do_sample": False, 
}

output = pipe(dataset[1]['messages'], **generation_args)
print(output[0]['generated_text'])

You are not running the flash-attention implementation, expect numerical differences.


The prediction is that the tenant, Marci Perez, does not have legal representation. The reasoning behind this prediction is that the Right to Counsel is not in effect in the tenant's zip code. The Right to Counsel is a legal provision that provides tenants with the right to have an attorney represent them in housing court, particularly in eviction cases. Since this provision is not in effect in the tenant's zip code, it is likely that Marci Perez does not have legal representation in this case. However, it's important to note that this is a prediction based on the information provided and does not confirm whether Marci Perez actually has or does not have legal representation.


#### **Peft Model**

In [12]:
model = get_peft_model(model, lora_config)
print(model.print_trainable_parameters())
Memory()

trainable params: 4,456,448 || all params: 3,825,536,000 || trainable%: 0.11649212031987152
None
Current memory usage:
Allocated: 2.2 GB
Cached:    2.9 GB


#### **Data set**

In [13]:
# dataset = dataset.select(range(1000))

# Reshuffle and split the combined dataset with a fixed seed
new_splits = dataset.train_test_split(test_size=test_size, seed=seed)  # adjust test_size as needed

# Create a new DatasetDict with the shuffled splits
reshuffled_dataset = DatasetDict({
    'train': new_splits['train'],
    'test': new_splits['test']
})

In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=reshuffled_dataset['train'],
    eval_dataset=reshuffled_dataset['test'],
    args=TrainingArguments(
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        load_best_model_at_end=True,
        gradient_checkpointing=True,
        gradient_accumulation_steps=2,
        max_steps=200,
        evaluation_strategy = "steps",
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
    ),
    peft_config=lora_config,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4898 [00:00<?, ? examples/s]

Map:   0%|          | 0/4899 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [15]:
trainer.train()



Step,Training Loss,Validation Loss
1,2.3325,2.314873
2,2.3358,2.258834
3,2.2998,2.187922
4,2.1999,2.103839
5,2.0895,2.016492
6,2.0554,1.931477
7,1.9224,1.84938
8,1.8345,1.778381
9,1.7981,1.7241
10,1.725,1.694108


In [None]:
steps, train_loss =   [i['step'] for i in trainer.state.log_history if 'loss' in i],  [i['loss'] for i in trainer.state.log_history if 'loss' in i]
eval_loss = [i['eval_loss'] for i in trainer.state.log_history if 'eval_loss' in i]

In [None]:
plt.plot(train_loss, label='Train')
plt.plot(eval_loss, label='Validation')
plt.legend()
plt.show()

In [None]:
labels = np.array([1 if i[2]['content'] == 'Yes' else 0 for i in dataset['messages']])
sum(labels)

In [None]:
selected = [dataset['messages'][i]  for i in range(len(dataset)) if labels[i] == 1]

In [None]:
len(selected)

In [None]:
for counter, message in enumerate(selected):
    tokenized_chat = tokenizer.apply_chat_template(message[:2], tokenize=True, add_generation_prompt=True, return_tensors='pt')
    outputs = trainer.model.generate(tokenized_chat, max_new_tokens=1)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(counter, text.split("Prediction:")[1])