# Non-instruction, pretraining-style LLM fine-tuning on domain-specific data using a LoRA-based method

## install libraries

In [None]:
# Install/upgrade the libraries used for fine-tuning: peft, bitsandbytes,
# transformers and accelerate.
!pip install -U peft bitsandbytes transformers accelerate

In [None]:
!pip install PyMuPDF

In [None]:
from datasets import Dataset, load_dataset
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint

## Our own custom data (non-instruction data) for domain-specific fine-tuning

In [None]:
import fitz

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the stripped, non-empty text of each page of a PDF.

    Args:
        pdf_path: Location of the PDF document, handed to ``fitz.open``.

    Returns:
        A list of strings in page order. Pages whose text is empty after
        stripping surrounding whitespace are left out.
    """
    with fitz.open(pdf_path) as pdf:
        stripped_pages = [page.get_text("text").strip() for page in pdf]
    # Pages that yielded no text contribute nothing and are dropped.
    return [content for content in stripped_pages if content]

In [None]:
# Pull the per-page text out of the source PDF.
pdf_texts = extract_text_from_pdf("/content/Metformin.pdf")

In [None]:
# Show the extracted page texts for inspection.
pdf_texts

In [None]:
import re
def split_paragraphs(pages, min_length=30):
    """Split page texts into paragraph-sized chunks.

    Each page is split wherever a blank line occurs (two newlines separated
    only by optional whitespace). Every chunk is stripped of surrounding
    whitespace, and chunks that are too short are discarded.

    Args:
        pages: Iterable of page texts (strings).
        min_length: A chunk is kept only when its stripped length is strictly
            greater than this value. Defaults to 30, the threshold that was
            previously hard-coded.

    Returns:
        A list of paragraph strings, in the order they appear across pages.
    """
    # Compile once instead of looking the pattern up for every page.
    blank_line = re.compile(r'\n\s*\n')

    paragraphs = []
    for page_text in pages:
        stripped_chunks = (chunk.strip() for chunk in blank_line.split(page_text))
        paragraphs.extend(
            chunk for chunk in stripped_chunks if len(chunk) > min_length
        )
    return paragraphs

In [None]:
# Break the extracted page texts into paragraph-level chunks.
paragraphs = split_paragraphs(pdf_texts)

In [None]:
# Wrap each paragraph in a record with a single "text" field.
data = [{"text": p} for p in paragraphs]

In [None]:
# Print the records for a quick visual check.
print(data)

In [None]:
# Build a Dataset from the list of records.
dataset = Dataset.from_list(data)

In [None]:
# Show the dataset summary.
dataset

## Let's select the model

In [None]:
# Name of the base checkpoint passed to `from_pretrained`.
# NOTE(review): the variable `model` is rebound to the loaded model object in
# a later cell, so from then on it no longer holds this string.
model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

In [None]:
# Load the tokenizer for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model)

In [None]:
# Tokenization below pads to a fixed length, which needs a pad token; fall
# back to the EOS token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_fn(examples):
    """Tokenize the ``"text"`` field and attach causal-LM labels.

    Every text is truncated or padded to exactly 512 tokens. The labels are
    the input ids with every padded position replaced by ``-100``, the index
    the loss ignores, so the model is not trained to predict padding.

    Padding is identified through the attention mask rather than by token id:
    this notebook may reuse the EOS token as the pad token, and comparing ids
    would then also mask genuine EOS tokens.

    Args:
        examples: Mapping with a ``"text"`` entry holding either a single
            string or a list of strings (the latter when used with a batched
            ``Dataset.map``).

    Returns:
        The tokenizer output with an additional ``"labels"`` entry shaped like
        ``"input_ids"``.
    """
    ignore_index = -100

    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_attention_mask=True,
    )

    def mask_padding(ids, mask):
        # Keep real tokens as targets; padded positions get the ignore index.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    if isinstance(examples["text"], str):
        # Single example: input_ids / attention_mask are flat lists.
        tokens["labels"] = mask_padding(
            tokens["input_ids"], tokens["attention_mask"]
        )
    else:
        # Batch of examples: one list per text.
        tokens["labels"] = [
            mask_padding(ids, mask)
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
    return tokens

In [None]:
# Run tokenize_fn over the dataset; batched=True passes it batches of examples.
tokenized = dataset.map(tokenize_fn, batched=True)

In [None]:
# Show the tokenized dataset summary.
tokenized

In [None]:
# Load the base model with 8-bit bitsandbytes quantization. The quantization
# settings are passed through `quantization_config`; handing `load_in_8bit=True`
# straight to `from_pretrained` is deprecated in Transformers.
# NOTE: on entry `model` holds the checkpoint name; it is rebound to the loaded
# model here, so re-running this cell requires re-running the cell that sets
# the name first.
model = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

In [None]:
# LoRA adapter configuration for causal language modelling. Adapters are
# attached to the modules named "q_proj" and "v_proj" only, with
# r=8, lora_alpha=16 and 5% adapter dropout; bias="none" is passed for the
# bias setting.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    bias="none"
)

In [None]:
# The base model is loaded in 8-bit, so prepare it for k-bit training before
# attaching the adapters, as PEFT recommends for quantized models.
model = prepare_model_for_kbit_training(model)

# Wrap the prepared model with the LoRA adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Training hyper-parameters. Checkpoints and outputs go to ./tinyllama-lora.
# Batch size 1 per device combined with 8 gradient-accumulation steps is used,
# training runs for 5 epochs in fp16, logs every 20 steps, keeps at most one
# saved checkpoint (save_total_limit=1) and disables external reporting.
args = TrainingArguments(
    output_dir="./tinyllama-lora",
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_total_limit=1,
    report_to="none"
)

In [None]:
# Set up the Trainer with the LoRA-wrapped model, the training arguments and
# the tokenized dataset. No evaluation dataset or custom collator is given.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized
)

In [None]:
# Start fine-tuning.
trainer.train()

In [None]:
model_path = "/content/tinyllama-lora/checkpoint-5"

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

In [None]:
prompt = "Clinical trials demonstrated that combining Atorvastatin with Ezetimibe"

In [None]:
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

In [None]:
# Sample a continuation of the prompt: up to 100 new tokens, with sampling
# enabled (temperature 0.8, top_p 0.9) and a repetition penalty of 1.1.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.8,
    top_p=0.9,
    do_sample=True,
    repetition_penalty=1.1
)

In [None]:
# Decode the first generated sequence, dropping special tokens, and print it.
print("\nModel Output:\n")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))