In [1]:
import importlib

# Quietly (-q) install the third-party packages this notebook relies on.
# No versions are pinned, so the resolved versions depend on when the cell is run.
!pip install -q transformers bitsandbytes peft trl accelerate xformers wandb datasets gradio
# NOTE(review): "spicy" looks like a typo for "scipy" — confirm which package is intended.
!pip install -q spicy
!pip install -q torchtext
!pip install -q nltk
!pip install -q tqdm
!pip install -q fire

[0m

In [2]:
from transformers import AutoModelForCausalLM,AutoTokenizer,BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import os
import torch
from datasets import load_dataset
from trl import SFTTrainer
import wandb
import gradio
import platform
from huggingface_hub import notebook_login
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import tqdm
import transformers
import bitsandbytes as bnb
from datasets import load_dataset
from functools import partial
import os

In [3]:
def print_system_specs():
    """Print a short hardware/software summary of the current machine.

    Writes to stdout: whether CUDA is available, the number of CUDA devices,
    and for each device (only when CUDA is available) its name, compute
    capability and total memory in bytes. This is followed by the processor
    string, the OS name and release, and the Python version.

    Returns nothing.
    """
    # Check if CUDA is available
    is_cuda_available = torch.cuda.is_available()
    print("CUDA Available:", is_cuda_available)
    # Get the number of available CUDA devices
    num_cuda_devices = torch.cuda.device_count()
    print("Number of CUDA devices:", num_cuda_devices)
    if is_cuda_available:
        for i in range(num_cuda_devices):
            # Report the properties of CUDA device i
            print(f"--- CUDA Device {i} ---")
            print("Name:", torch.cuda.get_device_name(i))
            print("Compute Capability:", torch.cuda.get_device_capability(i))
            print("Total Memory:", torch.cuda.get_device_properties(i).total_memory, "bytes")
    # Get CPU information
    print("--- CPU Information ---")
    print("Processor:", platform.processor())
    print("System:", platform.system(), platform.release())
    print("Python Version:", platform.python_version())
print_system_specs()

CUDA Available: True
Number of CUDA devices: 1
--- CUDA Device 0 ---
Name: NVIDIA RTX A6000
Compute Capability: (8, 6)
Total Memory: 51040157696 bytes
--- CPU Information ---
Processor: x86_64
System: Linux 5.4.0-163-generic
Python Version: 3.10.13


In [4]:
# Identifier of the pre-trained base model; passed to `from_pretrained` for both
# the model and the tokenizer in the cells below.
model_name = "meta-llama/Llama-2-7b-hf" 

In [5]:
import getpass
import os

import huggingface_hub
import wandb

# Never keep credentials in the notebook source: read them from the environment
# and fall back to an interactive (non-echoing) prompt when they are not set.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face token: ")
huggingface_hub.login(hf_token)

wandb_api_key = os.environ.get("WANDB_API_KEY") or getpass.getpass("W&B API key: ")
wandb.login(key=wandb_api_key)

# Create the cache directory (no error if it already exists) and point
# transformers at it.
# NOTE(review): `transformers` is already imported in an earlier cell; if the
# library resolves its cache location at import time this assignment will not
# take effect for the running kernel — confirm, and move it before the imports
# if needed.
os.makedirs("/workspace/cache", exist_ok=True)
os.environ["TRANSFORMERS_CACHE"] = "/workspace/cache"

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshyboykt[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
def create_bnb_config():
    """
    Build the bitsandbytes quantization settings used when loading a model.

    :return: a BitsAndBytesConfig requesting 4-bit loading with double
        quantization, the "nf4" quantization type and bfloat16 compute dtype
    """
    return BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        load_in_4bit=True,
    )
    
def create_peft_config(modules):
    """
    Build the LoRA configuration used to wrap a causal language model.

    :param modules: Names of the modules to apply Lora to
    :return: a LoraConfig with r=16, lora_alpha=64, lora_dropout=0.1,
        bias="none" and task_type="CAUSAL_LM" targeting ``modules``
    """
    lora_settings = {
        "r": 16,  # dimension of the updated matrices
        "lora_alpha": 64,  # parameter for scaling
        "target_modules": modules,
        "lora_dropout": 0.1,  # dropout probability for layers
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**lora_settings)

In [7]:
def load_model(model_name, bnb_config, model_max_length):
    """
    Load a causal language model together with its tokenizer.

    :param model_name: model identifier or path passed to ``from_pretrained``
    :param bnb_config: quantization config forwarded as ``quantization_config``
    :param model_max_length: value assigned to ``tokenizer.model_max_length``
    :return: tuple ``(model, tokenizer)``
    """
    n_gpus = torch.cuda.device_count()
    # Memory budget applied to every visible GPU when dispatching the model.
    max_memory = f'{40960}MB'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # dispatch efficiently the model on the available resources
        max_memory={i: max_memory for i in range(n_gpus)},
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Needed for LLaMA tokenizer.
    # The pad token was previously set to eos_token and then immediately
    # overwritten with unk_token; only the effective assignment is kept.
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.padding_side = 'right'
    tokenizer.model_max_length = model_max_length

    return model, tokenizer

In [8]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py

def find_all_linear_names(model):
    """
    Collect the short names of all 4-bit linear layers in ``model``.

    Walks ``model.named_modules()`` and, for every module that is an instance
    of ``bnb.nn.Linear4bit``, keeps the last dot-separated component of its
    qualified name. ``lm_head`` is excluded from the result.

    :param model: object exposing ``named_modules()``
    :return: list of unique module names
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, linear_cls)
    }
    # needed for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: object exposing ``named_parameters()``; each parameter must
        provide ``numel()`` and ``requires_grad``
    :param use_4bit: when True, the trainable count is halved before printing
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Floor division keeps the count an int. True division produced a
        # float, which the ",d" format spec below rejects with ValueError.
        trainable_params //= 2
    # Avoid ZeroDivisionError for a model without parameters.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [9]:
# # Get lora module names
# modules = find_all_linear_names(model)

# # Create PEFT config for these modules and wrap the model to PEFT
# peft_config = create_peft_config(modules)
# model = get_peft_model(model, peft_config)

# # Print information about the percentage of trainable parameters
# print_trainable_parameters(model)

In [10]:
# # Clear the memory footprint
# del model, trainer
# torch.cuda.empty_cache()

In [11]:
# fire.Fire(train, command=f'--base_model={model_name}')

In [12]:
import os
import sys
from typing import List

import fire
import torch
import torch.nn as nn
import bitsandbytes as bnb
from datasets import load_dataset
import transformers
import torch
from transformers import AutoTokenizer, GenerationConfig
from datasets import Dataset
import re

from peft import (
    prepare_model_for_int8_training,
    PeftModel,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
)


# Run configuration for this cell: module-level settings that are echoed once
# by the print() below and read by the statements and functions that follow.
# NOTE(review): the banner says "Training", but the visible code in this cell
# only loads a model, merges a checkpoint and defines evaluation helpers — confirm.

# model/data params
# NOTE(review): base_model is left empty; the model loaded further down uses
# `model_name` instead — confirm whether this value should mirror it.
base_model: str = ""  # the only required argument
data_path: str = "./alpaca_data_cleaned.json"
output_dir: str = "./lora-alpaca"
# training hyperparams
batch_size: int = 8
micro_batch_size: int = 2
num_epochs: int = 3
learning_rate: float = 1e-4
cutoff_len: int = 512  # used as tokenizer max length and truncation limit below
val_set_size: int = 10
# lora hyperparams
lora_r: int = 8
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_target_modules: List[str] = [
    "q_proj",
    "v_proj",
]
# llm hyperparams
train_on_inputs: bool = True  # if False, masks out inputs in loss
group_by_length: bool = False  # faster, but produces an odd training loss curve,
resume_from_checkpoint: str = "lora-alpaca/checkpoint-1600"  # either training checkpoint or final adapter
# Echo the effective configuration so it is recorded in the cell output.
print(
    f"Training Alpaca-LoRA model with params:\n"
    f"base_model: {base_model}\n"
    f"data_path: {data_path}\n"
    f"output_dir: {output_dir}\n"
    f"batch_size: {batch_size}\n"
    f"micro_batch_size: {micro_batch_size}\n"
    f"num_epochs: {num_epochs}\n"
    f"learning_rate: {learning_rate}\n"
    f"cutoff_len: {cutoff_len}\n"
    f"val_set_size: {val_set_size}\n"
    f"lora_r: {lora_r}\n"
    f"lora_alpha: {lora_alpha}\n"
    f"lora_dropout: {lora_dropout}\n"
    f"lora_target_modules: {lora_target_modules}\n"
    f"train_on_inputs: {train_on_inputs}\n"
    f"group_by_length: {group_by_length}\n"
    f"resume_from_checkpoint: {resume_from_checkpoint}\n"
)
# Number of micro-batches accumulated to reach the configured batch size.
gradient_accumulation_steps = batch_size // micro_batch_size

# Default placement is "auto"; when WORLD_SIZE indicates more than one process,
# pin the whole model to this process's LOCAL_RANK device and divide the
# accumulation steps by the number of processes.
device_map = "auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    gradient_accumulation_steps = gradient_accumulation_steps // world_size

# Per-GPU memory budget used in the `max_memory` mapping when loading the model.
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

# Load the base model in float16 (quantization is disabled here) and its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    torch_dtype=torch.float16,
    # Use the placement computed above ("auto", or a single device under DDP)
    # instead of a second hard-coded "auto".
    device_map=device_map,
    max_memory={i: max_memory for i in range(n_gpus)},
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

# Needed for LLaMA tokenizer.
# The pad token was previously set to eos_token and then immediately
# overwritten with unk_token; only the effective assignment is kept.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
tokenizer.model_max_length = cutoff_len

# model = prepare_model_for_kbit_training(model)
# # Get lora module names
# # modules = find_all_linear_names(model)
# modules = lora_target_modules
# print(f"{modules=}")

# # Create PEFT config for these modules and wrap the model to PEFT
# peft_config = create_peft_config(modules)
# #peft_config.inference_mode = True
# model = get_peft_model(model, peft_config)


# Print information about the percentage of trainable parameters
# print_trainable_parameters(model)

def tokenize(prompt, add_eos_token=True):
    """
    Tokenize ``prompt`` and attach labels equal to the input ids.

    The text is truncated to ``cutoff_len`` tokens without padding. When
    ``add_eos_token`` is true, the sequence does not already end with the EOS
    id, and it is shorter than ``cutoff_len``, an EOS id is appended (with a
    matching attention-mask entry of 1).

    :param prompt: text passed to the module-level ``tokenizer``
    :param add_eos_token: whether a trailing EOS id may be appended
    :return: the tokenizer result with an added ``"labels"`` entry that is a
        copy of ``"input_ids"``
    """
    encoded = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    token_ids = encoded["input_ids"]
    eos_id = tokenizer.eos_token_id

    should_append_eos = (
        token_ids[-1] != eos_id
        and len(token_ids) < cutoff_len
        and add_eos_token
    )
    if should_append_eos:
        token_ids.append(eos_id)
        encoded["attention_mask"].append(1)

    encoded["labels"] = token_ids.copy()
    return encoded

def generate_prompt(data_point):
    """Return ``data_point`` with every underscore replaced by a space."""
    return " ".join(data_point.split("_"))

def generate_and_tokenize_prompt(item):
    """
    Build the prompt for one dataset record and tokenize it.

    :param item: mapping with a ``"content"`` entry holding the raw text
    :return: the dict produced by ``tokenize`` for the generated prompt
    :raises NotImplementedError: if the module-level ``train_on_inputs`` is False
    """
    if not train_on_inputs:
        # The previous masking branch unpacked the "content" string as a
        # mapping ({**data_point, "output": ""}), which always raised
        # "TypeError: 'str' object is not a mapping". A record with a single
        # text field has no prompt/response boundary to mask, so fail with an
        # explicit message instead.
        raise NotImplementedError(
            "train_on_inputs=False is not supported: records only provide a "
            "single 'content' text field, so there is no input part to mask."
        )
    full_prompt = generate_prompt(item["content"])
    return tokenize(full_prompt)

# If a checkpoint path is configured and present on disk, load the adapter on
# top of the base model and merge its weights in; otherwise keep the base model.
if resume_from_checkpoint:
    if os.path.exists(resume_from_checkpoint):
        print(f"{resume_from_checkpoint=}")
        model = PeftModel.from_pretrained(model, resume_from_checkpoint, torch_dtype=torch.float16,
                    device_map={"": "cpu"})
        model = model.merge_and_unload()
        #model.save_pretrained("merge-model")
    else:
        print(f"Checkpoint {resume_from_checkpoint} not found")

print("start testing")
print(type(model))
model.eval()
# Compare the numeric major version: the previous string comparison
# (torch.__version__ >= "2") is lexicographic and would mis-order e.g. "10.0".
if int(torch.__version__.split(".")[0]) >= 2:
    model = torch.compile(model)


# Device that input tensors are moved to before generation.
device = "cuda" if torch.cuda.is_available() else "cpu"


def evaluate(
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=10,
    num_beams=2,
    max_new_tokens=512,
    **kwargs,
):
    """
    Generate text for ``input`` with the module-level model and print it.

    The input is passed through ``generate_prompt``, tokenized, moved to
    ``device`` and handed to ``model.generate``. The first returned sequence is
    decoded and printed prefixed with "output: ". Nothing is returned.

    :param input: raw text to generate from
    :param temperature: forwarded to ``GenerationConfig``
    :param top_p: forwarded to ``GenerationConfig``
    :param top_k: forwarded to ``GenerationConfig``
    :param num_beams: forwarded to ``GenerationConfig``
    :param max_new_tokens: forwarded to ``model.generate``
    :param kwargs: extra fields forwarded to ``GenerationConfig``

    NOTE(review): ``do_sample`` is not set here, so whether temperature/top_p/
    top_k influence decoding depends on what callers pass via ``kwargs`` — confirm.
    """
    encoded = tokenizer(generate_prompt(input), return_tensors="pt")
    prompt_ids = encoded["input_ids"].to(device)

    gen_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model.generate(
            input_ids=prompt_ids,
            generation_config=gen_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )

    decoded = tokenizer.decode(result.sequences[0])
    print("output: ", decoded)



Training Alpaca-LoRA model with params:
base_model: 
data_path: ./alpaca_data_cleaned.json
output_dir: ./lora-alpaca
batch_size: 8
micro_batch_size: 2
num_epochs: 3
learning_rate: 0.0001
cutoff_len: 512
val_set_size: 10
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: ['q_proj', 'v_proj']
train_on_inputs: True
group_by_length: False
resume_from_checkpoint: lora-alpaca/checkpoint-1600



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



resume_from_checkpoint='lora-alpaca/checkpoint-1600'
start testing
<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>


Process ForkProcess-4:
Process ForkProcess-19:
Process ForkProcess-13:
Process ForkProcess-16:
Process ForkProcess-23:
Process ForkProcess-8:
Process ForkProcess-14:
Process ForkProcess-5:
Process ForkProcess-7:
Process ForkProcess-15:
Process ForkProcess-12:
Process ForkProcess-21:
Process ForkProcess-11:
Process ForkProcess-24:
Process ForkProcess-22:
Process ForkProcess-9:
Process ForkProcess-17:
Process ForkProcess-10:
Process ForkProcess-20:
Process ForkProcess-18:
Process ForkProcess-6:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (m

In [21]:
from threading import Thread

import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer


# Device that tokenized inputs are moved to in run_generation.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
# Host and port for the Gradio server.
# NOTE(review): the launch call at the end of this cell repeats these values as
# literals rather than using these names — confirm they are meant to match.
server_name = "0.0.0.0"
server_port = 5000


def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    """
    Stream a sampled continuation of ``user_text`` from the module-level model.

    Generation runs on a background thread and pushes decoded text into a
    ``TextIteratorStreamer`` (10 s timeout, prompt and special tokens skipped).
    This generator yields the accumulated text, starting from
    ``user_text + " "``, after each new piece arrives.

    :param user_text: prompt text
    :param top_p: forwarded to ``model.generate``
    :param temperature: converted to float and forwarded to ``model.generate``
    :param top_k: forwarded to ``model.generate``
    :param max_new_tokens: forwarded to ``model.generate``
    """
    # Tokenize the user text and move the tensors to the target device.
    encoded_prompt = tokenizer([user_text], return_tensors="pt").to(torch_device)

    # The streamer is filled by the worker thread and drained here, so the
    # caller is not blocked while tokens are being produced.
    token_stream = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    worker = Thread(
        target=model.generate,
        kwargs=dict(
            encoded_prompt,
            streamer=token_stream,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=top_p,
            temperature=float(temperature),
            top_k=top_k,
        ),
    )
    worker.start()

    # Accumulate streamed pieces and emit the running text after each one.
    model_output = user_text + " "
    for piece in token_stream:
        model_output = model_output + piece
        yield model_output
    return model_output


def reset_textbox():
    """Return a Gradio update that sets a component's value to the empty string."""
    cleared = gr.update(value="")
    return cleared


# Build the demo UI: a prompt box, a read-only output box, a submit button and
# sliders for the sampling settings passed to run_generation.
with gr.Blocks() as demo:

    gr.Markdown(
        "# Demo Natural Language Generation - group 5:\n"
    )

    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                placeholder="Input text here...",
                label="User input"
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")

        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1, maximum=512, value=512, step=1, interactive=True, label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05, maximum=1.0, value=0.95, step=0.05, interactive=True, label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=1, maximum=50, value=50, step=1, interactive=True, label="Top-k",
            )
            temperature = gr.Slider(
                minimum=0.1, maximum=5.0, value=0.8, step=0.1, interactive=True, label="Temperature",
            )

    # Both pressing Enter in the textbox and clicking the button start generation.
    user_text.submit(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)
    button_submit.click(run_generation, [user_text, top_p, temperature, top_k, max_new_tokens], model_output)

# Launch once the layout context has closed. `queue()` already enables queuing,
# so the deprecated `enable_queue` launch argument is dropped, and the host and
# port come from the settings defined at the top of this cell.
demo.queue(max_size=32).launch(server_name=server_name, server_port=server_port)

  demo.queue(max_size=32).launch(enable_queue=True, server_name="0.0.0.0", server_port=5000)


Running on local URL:  http://0.0.0.0:5000

To create a public link, set `share=True` in `launch()`.


In [20]:
#demo.close()

Closing server running on port: 5000
