## Installing libraries

In [1]:
!pip install "transformers==4.35" "datasets==2.13.0" "peft==0.4.0" "accelerate==0.21.0" "bitsandbytes==0.40.2" "trl==0.4.7" "safetensors>=0.3.1" "tiktoken"

Collecting transformers==4.35
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.13.0
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from random import randrange
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


# Load dataset from local

In [4]:
!ls -ltr *.csv

-rw-r--r-- 1 root root 6264940 May 28 09:59 train_chunch_12_books.csv


In [5]:
df = pd.read_csv("train_chunch_12_books.csv")

In [6]:
#convert to Huggingface Datasets format
train = Dataset.from_pandas(df)

In [7]:
train

Dataset({
    features: ['text'],
    num_rows: 13951
})

# Load the dataset from Huggingface

In [None]:
# !huggingface-cli login

In [None]:
# from datasets import load_dataset, Dataset
# dataset = load_dataset("HuggingFaceH4/no_robots")

In [None]:
# dataset

# Fine-Tuning

In [8]:
model_id = "TheBloke/Llama-2-7B-Chat-fp16"

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root

In [9]:
# Get the type
compute_dtype = getattr(torch, "float16")

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)


In [10]:
%%time
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Downloading tokenizer_config.json:   0%|          | 0.00/770 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

CPU times: user 363 ms, sys: 41.1 ms, total: 404 ms
Wall time: 2.02 s


In [11]:
%%time
# Load the pretrained model
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map="auto")

Downloading config.json:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

CPU times: user 24.4 s, sys: 27.4 s, total: 51.8 s
Wall time: 4min 50s




In [12]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM"
)

In [13]:
# Define the training arguments. For full list of arguments, check
#https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
args = TrainingArguments(
    output_dir='llama2-7b',
    num_train_epochs=10, # adjust based on the data size
    per_device_train_batch_size=2, # use 4 if you have more GPU RAM
    save_strategy="epoch", #steps
    # evaluation_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    seed=42
)

In [14]:
# Create the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    # eval_dataset=test,
    dataset_text_field='text',
    peft_config=peft_config,
    max_seq_length=1042,
    tokenizer=tokenizer,
    args=args,
    packing=True,
)



In [16]:
# train
trainer.train()

KeyboardInterrupt: 

In [15]:
# save model in local
trainer.save_model()

# Merge the base model and adapters and save it

Clean the memory

In [33]:
# Empty VRAM
del model
del trainer
import gc
gc.collect()
gc.collect()

NameError: name 'merged_model' is not defined

In [31]:
torch.cuda.empty_cache()

In [18]:
gc.collect()

0

Reload the saved model and merge it then we can save the whole model

In [19]:
%%time
from peft import AutoPeftModelForCausalLM

new_model = AutoPeftModelForCausalLM.from_pretrained(
    'llama2-7b',
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



CPU times: user 24.6 s, sys: 8.4 s, total: 33 s
Wall time: 1min 37s


In [20]:
# Merge LoRA and base model
merged_model = new_model.merge_and_unload()

In [21]:
# Save the merged model
merged_model.save_pretrained("metallama2-7b-tuned-merged", safe_serialization=True)
tokenizer.save_pretrained("metallama2-7b-tuned-merged")


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


('metallama2-7b-tuned-merged/tokenizer_config.json',
 'metallama2-7b-tuned-merged/special_tokens_map.json',
 'metallama2-7b-tuned-merged/tokenizer.model',
 'metallama2-7b-tuned-merged/added_tokens.json',
 'metallama2-7b-tuned-merged/tokenizer.json')

# Test the model

In [22]:
prompt = "What is Subdomain Discovery?"

In [23]:
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [24]:
outputs = merged_model.generate(input_ids=input_ids,
                         max_new_tokens=200,
                        #  do_sample=True,
                        #  top_p=0.9,
                         temperature=0.6)



In [25]:
result = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

In [26]:
print(result)

What is Subdomain Discovery in Thp subdomain discovery is a feature that allows you to search for subdomains of a target domain this is useful when you are trying to find all the subdomains of a large domain and not just the ones that are publicly accessible”  subdomain discovery is a feature that is only available in thp 22 since it is a feature that is used to search for subdomains of a target domain and not just the ones that are publicly accessible we can use this feature to search for all the subdomains of cyberspacekittenscom cyberspacekittenscom domain cyberspacekittenscom subdomain discovery cyberspacekittenscom subdomain discovery 


In [28]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [29]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [30]:
# push merged model to the hub
%%time
hf_model_repo = "alexrai12/TheBloke-llama-2-7b-domain-tuned"
merged_model.push_to_hub(hf_model_repo)
tokenizer.push_to_hub(hf_model_repo)


Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CPU times: user 1min 15s, sys: 58.7 s, total: 2min 14s
Wall time: 11min 38s


CommitInfo(commit_url='https://huggingface.co/Kalishasur/TheBloke-llama-2-7b-domain-tuned/commit/9bd95252d80b1e774cf9609c9f1bbcb5930ad76c', commit_message='Upload tokenizer', commit_description='', oid='9bd95252d80b1e774cf9609c9f1bbcb5930ad76c', pr_url=None, pr_revision=None, pr_num=None)

# Load the model from the HF Hub and test it

In [1]:
!pip install bitsandbytes accelerate #restart kernel



In [2]:
import torch
from transformers import BitsAndBytesConfig

# Get the type
compute_dtype = getattr(torch, "float16")

# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype
)

  _torch_pytree._register_pytree_node(


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

hf_model_repo = "Kalishasur/TheBloke-llama-2-7b-domain-tuned"

# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained(hf_model_repo)

# Load the model
model = AutoModelForCausalLM.from_pretrained(hf_model_repo,
                                             quantization_config=bnb_config,
                                             device_map="auto")

Downloading tokenizer_config.json:   0%|          | 0.00/863 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
prompt = "What is Subdomain Discovery?"

In [None]:
# Generate response
%%time
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids
outputs = model.generate(input_ids=input_ids,
                         max_new_tokens=200,
                         temperature=0.6)

result = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

# Print the result
print(f"Generated response:\n{result}")


# Use Transformers Pipeline for Inference

In [None]:
import transformers

tokenizer = AutoTokenizer.from_pretrained("genaitraining/llama-2-7b-domain-tuned",  trust_remote_code=True)
pipeline = transformers.pipeline(
    "text-generation",
    model="genaitraining/llama-2-7b-domain-tuned",
    trust_remote_code=True

)

In [None]:
%%time
sequences = pipeline(
    prompt,
    temperature=0.6,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)

In [None]:
for seq in sequences:
    print(seq['generated_text'])