<a href="https://colab.research.google.com/github/pamelag/level-up-ai/blob/main/Falcon_FT(4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell out to nvidia-smi to report the GPU(s) and driver visible to this runtime.
!nvidia-smi

In [None]:
# Install the notebook's dependencies into the runtime, one package per command.
# NOTE(review): none of these are version-pinned, so a fresh run picks up whatever
# release is current at that time; `!pip install pip` carries no -U flag, so it
# does not request an upgrade of pip itself.
!pip install pip
!pip install bitsandbytes
!pip install torch
!pip install transformers
!pip install peft
!pip install accelerate
!pip install datasets
!pip install loralib
!pip install einops

In [None]:
# Pin pyarrow to exactly 15.0.0.
# NOTE(review): the reason for the pin is not recorded in this notebook —
# presumably a compatibility workaround for `datasets`; confirm before changing.
!pip install pyarrow==15.0.0

In [None]:
import json
import os
from pprint import pprint

import bitsandbytes as bnb
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

# Restrict CUDA to device 0 for this process.
# NOTE(review): this assignment runs after torch and bitsandbytes have already
# been imported; presumably it still takes effect because CUDA is initialised
# lazily — confirm, or move it above the imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Interactive Hugging Face Hub login; no token is hard-coded in the notebook.
notebook_login()

In [None]:
# Download a file from Google Drive by its file id.
# NOTE(review): presumably this produces delivery-plan-faq-new.json, the file
# the notebook opens next — confirm the downloaded file name.
!gdown 1u85RQZdRTmpjGKcCc5anCMAHZ-um4DUC

In [None]:
# Parse the FAQ file into a plain Python object bound to `data`.
# The encoding is given explicitly because JSON text is UTF-8, whereas open()
# without an encoding falls back to the platform's locale setting.
with open("delivery-plan-faq-new.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

In [None]:
# Display the parsed JSON object.
# NOTE(review): this renders the whole object rather than a preview.
data

In [None]:
# Serialise `data` back to the same path it was loaded from, overwriting the file.
# NOTE(review): the purpose of this round trip is not evident from the notebook —
# presumably it normalises the file before load_dataset reads it; confirm.
with open("delivery-plan-faq-new.json", "w") as f:
    json.dump(data, f)

In [None]:
# Preview the first rows of the records as a DataFrame (display only; nothing is kept).
pd.DataFrame(data).head()

In [None]:
# Hub id of the base checkpoint to fine-tune.
MODEL_NAME = "tiiuae/falcon-7b"

# bitsandbytes quantisation settings: 4-bit NF4 weights with double quantisation,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model with the quantisation config above; device placement is
# delegated to device_map="auto".
# NOTE(review): trust_remote_code=True allows model code shipped with the
# checkpoint repository to run — confirm that is acceptable for this source.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over every parameter
    and, separately, over those with ``requires_grad`` set, then prints both
    totals and the trainable share as a percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs.

    Returns:
        None. The summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model with no parameters would otherwise raise ZeroDivisionError here.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# Turn on gradient checkpointing, then hand the quantised model to PEFT's
# k-bit training preparation helper; `model` is rebound to the helper's result.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings: rank 16, alpha 32, 5% dropout, no bias terms, applied
# only to modules named "query_key_value".
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared model with the adapters and report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

In [None]:
# Prompt used to probe the model before fine-tuning: a line starting with ": "
# that carries the question text, followed by a bare ":" line left for the model
# to continue. The f-prefix is unused here (the literal has no placeholders).
prompt = f"""
: Strategy & Innovation team is working on Data Insights and Actions under Digital and Customer Identity strategic Program and Embrace Innovation strategic Focus Area. The initiative lead of Digital Identity is Mimi, Director, Strategy & Innovation. High Level Actions under this initiative is to Implement the capture, use & storage of Tax File Number (TFN) as a unique identifier for customers in property transactions. Major Milestones - Commonwealth legislative amendments commenced to enable Revenue NSW's use of TFNs to administer property related taxes, grants and schemes. Developed collateral for the Interjurisdictional Working Group to implement the proposal with a consistent and considered approach Completed artefacts supporting TFN proposal for NSW and commenced implementation preparations.
:
""".strip()
print(prompt)

In [None]:
# Generation settings used by the model.generate call that follows.
# This is the model's own generation config object, mutated in place (no copy).
generation_config = model.generation_config
generation_config.max_new_tokens = 200
# temperature / top_p only apply in sampling mode; unless do_sample is enabled
# generate() decodes greedily and ignores both values.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# Pad and stop on the tokenizer's end-of-sequence token.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
# Display the generation settings configured so far.
generation_config

In [None]:
%%time
# Baseline generation with the not-yet-fine-tuned model, for later comparison.
device = "cuda:0"

# Tokenise the prompt and move the resulting tensors to the GPU.
encoding = tokenizer(prompt, return_tensors="pt").to(device)
# NOTE(review): inference_mode is left disabled for this call; the reason is not
# recorded in the notebook — confirm whether it can be restored.
# with torch.inference_mode():
outputs = model.generate(
    input_ids=encoding.input_ids,
    attention_mask=encoding.attention_mask,
    generation_config=generation_config,
)
# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Load the FAQ file through `datasets`' JSON loader.
# Note: this rebinds `data`, which until now held the object parsed by json.load.
data = load_dataset("json", data_files="delivery-plan-faq-new.json")

In [None]:
# Display the object returned by load_dataset.
data

In [None]:
# Inspect the first record of the "train" split.
data["train"][0]

In [None]:
def generate_prompt(data_point):
    """Render one record as the two-line training prompt.

    The first line is ": " followed by the record's "question" value and the
    second is ": " followed by its "answer" value; surrounding whitespace of
    the combined text is stripped.
    """
    question = data_point["question"]
    answer = data_point["answer"]
    lines = [f": {question}", f": {answer}"]
    return "\n".join(lines).strip()


def generate_and_tokenize_prompt(data_point):
    """Build the prompt text for one record and return its tokenizer output.

    The text comes from ``generate_prompt``; it is passed to the notebook-level
    ``tokenizer`` with padding and truncation switched on.
    """
    return tokenizer(generate_prompt(data_point), padding=True, truncation=True)

In [None]:
# Shuffle the "train" split and tokenise every record.
# NOTE(review): no seed is passed to shuffle(), so the order differs between runs.
# NOTE(review): `data` is rebound to the mapped split, so running this cell a
# second time no longer sees the object that had a "train" entry.
data = data["train"].shuffle().map(generate_and_tokenize_prompt)

In [None]:
# Display the tokenised dataset.
data

In [None]:
# Directory passed to TrainingArguments as output_dir.
OUTPUT_DIR = "experiments"

In [None]:
# Trainer hyper-parameters. With a per-device batch size of 1 and 4 gradient
# accumulation steps, each optimiser step covers 1 * 4 examples per device.
# NOTE(review): both num_train_epochs and max_steps are set; per the
# TrainingArguments documentation a positive max_steps takes precedence —
# confirm that 100 steps is the intended training length.
# NOTE(review): fp16=True is requested here while bnb_config asks for a
# bfloat16 compute dtype — confirm the mix is intended.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,
    save_total_limit=3,
    logging_steps=1,
    output_dir=OUTPUT_DIR,
    max_steps=100,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

# mlm=False selects the collator's non-masked (causal language modelling) mode.
trainer = transformers.Trainer(
    model=model,
    train_dataset=data,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# Turn the model's cache off before training.
# NOTE(review): presumably because it conflicts with the gradient checkpointing
# enabled earlier — confirm; it is not switched back on in this cell.
model.config.use_cache = False
trainer.train()

In [None]:
# Save the PEFT-wrapped model to the local "trained-model" directory.
model.save_pretrained("trained-model")

In [None]:
# Reload for inference: read the saved PEFT config, load the base checkpoint it
# names (with the same bnb_config as before), then attach the saved adapter.
# Note: `config`, `model` and `tokenizer` are all rebound by this cell.
config = PeftConfig.from_pretrained("trained-model")
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model, "trained-model")

In [None]:
# Generation settings for the reloaded, fine-tuned model.
# This is the model's own generation config object, mutated in place (no copy).
generation_config = model.generation_config
generation_config.max_new_tokens = 200
# temperature / top_p only apply in sampling mode; unless do_sample is enabled
# generate() decodes greedily and ignores both values.
generation_config.do_sample = True
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
# Pad and stop on the tokenizer's end-of-sequence token.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [None]:
# Device that tokenised inputs are moved to before generation.
DEVICE = "cuda:0"

In [None]:
%%time
# Same probe prompt as the pre-training run, now sent to the fine-tuned model.
# The f-prefix is unused here (the literal has no placeholders).
prompt = f"""
: Strategy & Innovation team is working on Data Insights and Actions under Digital and Customer Identity strategic Program and Embrace Innovation strategic Focus Area. The initiative lead of Digital Identity is Mimi, Director, Strategy & Innovation. High Level Actions under this initiative is to Implement the capture, use & storage of Tax File Number (TFN) as a unique identifier for customers in property transactions. Major Milestones - Commonwealth legislative amendments commenced to enable Revenue NSW's use of TFNs to administer property related taxes, grants and schemes. Developed collateral for the Interjurisdictional Working Group to implement the proposal with a consistent and considered approach Completed artefacts supporting TFN proposal for NSW and commenced implementation preparations.
:
""".strip()

# Tokenise, move the tensors to DEVICE, and generate inside inference mode.
encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )
# Decode the first (only) returned sequence, dropping special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
def generate_response(question: str) -> str:
    """Generate the model's answer to ``question``.

    The question is wrapped in the same two-line layout used for training
    (a ": " line with the question, then a bare ":" line for the model to
    continue), tokenised, and passed to ``model.generate`` with the
    notebook-level ``generation_config``.

    Args:
        question: The text to ask the model.

    Returns:
        The generated continuation only, with special tokens removed and
        surrounding whitespace stripped. The echoed prompt is not included.
    """
    prompt = f"""
: {question}
:
""".strip()
    encoding = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config,
        )

    # The returned sequence starts with the prompt tokens. Searching the decoded
    # text for ":" would match the prompt's own leading colon and hand back the
    # question as well, so slice off the prompt by token count instead.
    prompt_length = encoding.input_ids.shape[-1]
    answer_ids = outputs[0][prompt_length:]
    return tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

In [None]:
# Sample query 1: the Digital Identity initiative text, sent through generate_response.
prompt = "Strategy & Innovation team is working on Data Insights and Actions under Digital and Customer Identity strategic Program and Embrace Innovation strategic Focus Area. The initiative lead of Digital Identity is Jocelyn Yem, Director, Strategy & Innovation. High Level Actions under this initiative is to Implement the capture, use & storage of Tax File Number (TFN) as a unique identifier for customers in property transactions. Major Milestones - Commonwealth legislative amendments commenced to enable Revenue NSW's use of TFNs to administer property related taxes, grants and schemes. Developed collateral for the Interjurisdictional Working Group to implement the proposal with a consistent and considered approach Completed artefacts supporting TFN proposal for NSW and commenced implementation preparations."
print(generate_response(prompt))

In [None]:
# Sample query 2: the Improved Accessibility initiative text, sent through generate_response.
prompt = "Optimisation and Project Delivery team is working on Improved Accessibility as part of Being Responsive to our Customers Aspire Initiative under Supporting our diverse customers Program and passionate customer focus strategic Focus Area. The initiative lead of Being Responsive to our Customers is Sharon Bicknell, Director, Optimisation and Project Delivery. High Level Actions under this initiative is Expand & improve accessibility to the Work and Development Order program. Major Milestones - Completed discovery and design.,Confirmed acceptance of discovery and design recommendations, Developed implementation plan, Implemented recommendations."
print(generate_response(prompt))

In [None]:
# Sample query 3: the prosecutions & valuations initiative text, sent through generate_response.
prompt = "Technical Advisory Services team is working on Implement new structure for prosecutions & valuations program as part of Improved Integrity Aspire Initiative under Future Fit Workforce strategic Program and Collect, Protect, Enable Focus Area. The initiative lead of Transparent options to support customers is Cullen Smythe, Executive Director, Technical Advisory Services. High Level Action for this initiative is to Implement new structure for prosecutions & valuations. Major Milestones - Delivered an uplift in staff capability in prosecutions and valuations through delivery of training and cascading of new skill set, Confirmed procurement of an expert valuer"

print(generate_response(prompt))