In [None]:
!pip install bitsandbytes


In [None]:
# Prints a fixed string; looks like a leftover check that the kernel runs cells
# (nothing later depends on it) - safe to delete if so.
print("hello")

hello


In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install peft


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import DataCollatorForSeq2Seq
from transformers import DataCollatorForLanguageModeling

In [None]:
# Download CNN/DailyMail (config "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Work on a reduced sample: shuffle each split with a fixed seed, then keep the
# first 5,000 training rows and the first 1,000 validation rows.
small_train, small_valid = (
    dataset[split_name].shuffle(seed=42).select(range(row_count))
    for split_name, row_count in (("train", 5000), ("validation", 1000))
)

# Overwrite the full splits in place; any other split stays as loaded.
dataset["train"], dataset["validation"] = small_train, small_valid


In [None]:
# Hub checkpoint used for both the tokenizer and the model. If you swap it,
# make sure the LoRA `target_modules` configured later ("c_attn", "c_proj")
# name layers that exist in the new architecture.
model_name = "gpt2"  # You can replace this with another model



In [None]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Reuse the end-of-sequence token as the padding token so the
# padding="max_length" tokenizer call below has a pad token to use
# (GPT-2 checkpoints typically ship without one).
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(example):
    """Tokenize the "article" field of `example` into fixed-length inputs.

    Uses the notebook-level `tokenizer`. Text is truncated to 512 tokens and
    padded up to that same length; the tokenizer's output is returned as is.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    articles = example["article"]
    return tokenizer(articles, **encode_options)

# Run tokenize_function over every split in batches. The original text columns
# are still present afterwards and are dropped in a later cell.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Drop the raw text columns now that token ids exist, then inspect the result.
# Note: this cell is not re-runnable as is - a second run finds the columns gone.
tokenized_datasets = tokenized_datasets.remove_columns(["article", "highlights"])
print(tokenized_datasets)
first_train_row = tokenized_datasets["train"][0]
print(first_train_row["input_ids"])


In [None]:
# Have rows come back as torch tensors when read, then re-inspect the dataset
# and the first training example.
tokenized_datasets.set_format(type="torch")
print(tokenized_datasets)
first_train_row = tokenized_datasets["train"][0]
print(first_train_row["input_ids"])

In [None]:
# Load the base causal-LM weights in half precision; device_map="auto" leaves
# device placement to the library.
# NOTE(review): these fp16 weights are trained later with no mixed-precision
# flag in TrainingArguments - confirm the training loss stays finite.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

In [None]:
# Show the module tree of the base model; the layer names printed here are what
# the LoRA `target_modules` setting has to match.
print(model)

In [None]:
# Apply LoRA: rank-8 adapters (lora_alpha=16, 5% dropout) on the modules named
# "c_attn" and "c_proj".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    # Declare the task so PEFT wraps the model with its causal-LM class instead
    # of the generic wrapper, and records the task in the saved adapter config.
    task_type="CAUSAL_LM",
)

In [None]:
# Wrap the base model with the LoRA adapters described by `lora_config`.
# `model` is rebound to the wrapped model, so re-running this cell wraps it again.
model=get_peft_model(model,lora_config)

In [None]:
# Show the module tree again, now including the injected LoRA layers.
print(model)
# Check trainable parameters (After LoRA)
model.print_trainable_parameters()

In [None]:
import inspect

# Transformers renamed `evaluation_strategy` to `eval_strategy`, and newer
# releases reject the old spelling. Pick whichever keyword the installed
# TrainingArguments accepts so this cell runs on both older and newer versions.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./fine_tuned_llm",   # checkpoints are written here
    save_strategy="epoch",           # checkpoint once per epoch ...
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=500,
    save_total_limit=2,              # keep at most two checkpoints on disk
    report_to="none",                # no external experiment tracker
    **{_eval_strategy_key: "epoch"},  # ... and evaluate once per epoch
)

In [None]:
# With mlm=False the collator builds causal-LM batches: `labels` is a copy of
# `input_ids` with padding positions set to -100 so the loss ignores them. The
# one-token shift between inputs and targets happens inside the model, not here.
# NOTE(review): pad_token was set to eos_token earlier, so genuine EOS tokens
# are masked out of the loss too - confirm that is acceptable.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # causal-LM objective (the model here is gpt2); mlm=True is for BERT-like masked models
)

In [None]:


import inspect

# Newer Transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword. Use whichever name the installed Trainer
# accepts so the cell runs without the deprecation warning and keeps working
# once the old keyword is removed.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    **{_tokenizer_key: tokenizer},
)

  trainer=Trainer(


In [None]:
# Run the training loop configured above (one epoch; evaluation and a checkpoint
# are scheduled per epoch in TrainingArguments).
trainer.train()

In [None]:
# Write the trained model and the tokenizer files into the same directory so
# both can be reloaded from a single path.
# NOTE(review): `model` is PEFT-wrapped, so this presumably stores only the
# adapter weights rather than a full copy of the base model - confirm.
trainer.save_model("./fine_tuned_llm")
tokenizer.save_pretrained("./fine_tuned_llm")

In [None]:
# Reload from the directory the save cell wrote to. Using the same relative path
# as `trainer.save_model("./fine_tuned_llm")` resolves to the directory that was
# actually written, instead of assuming the working directory is /kaggle/working.
model_path = "./fine_tuned_llm"
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

In [None]:
def generate_text(prompt, max_lenght=100):
    """Sample a continuation of `prompt` from the notebook-level `model`.

    Args:
        prompt: Text to continue.
        max_lenght: Upper bound on the total sequence length (prompt plus
            generated tokens), forwarded to `generate` as `max_length`.
            The misspelled name is kept so existing keyword callers still work.

    Returns:
        The first returned sequence decoded to a string, special tokens removed.
    """
    # Move the inputs to wherever the model lives instead of assuming CUDA.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_length=max_lenght,  # previously hard-coded to 512, ignoring the argument
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=0.9,
            top_k=40,
            top_p=0.85,
            repetition_penalty=1.2,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Try the reloaded model on one prompt. generate_text samples (do_sample=True),
# so the printed text differs from run to run.
prompt = "The latest advancements in AI suggest that"
generated_text = generate_text(prompt)
print("Generated Output:\n", generated_text)

Generated Output:
 The latest advancements in AI suggest that robots can be made to recognise and recognize smells. Robots are now capable of detecting smell's cumbersomeness as they walk around, for example through the scent hearers get from their eyes; noddles will detect a whistle on an armchair speaker with air-powered microphones but don't have these tools anymore . [ccordingto news reports , scientists were able develop machines which would stop sniffing when faced withdescription (	) - this is what you'd expect if humans went into practitionesive moods or challengies? In contrast we're still trying it here at MIT – robotic vision has been debated since 2008 by academics who describe them not just too much like dogs without any sense about where your ears stand: We've got tissues! This time though I'll tell some science fiction tale : The future looks brighter than ever before because my favourite thing was having two scissors fitted onto each other while talkingensical sentences