In [None]:
# !pip install -q -U bitsandbytes
!pip install einops
!pip install -q -U pandas
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install gradio
!pip install sentencepiece

In [6]:

import pandas as pd
import torch
from peft import PeftModel  


from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig

from datasets import load_dataset, DatasetDict

# Base checkpoint to fine-tune.
model_id = "tiiuae/falcon-7b"

# Load the weights in 4-bit NF4 with double quantization; compute in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# The 4-bit base weights cannot be updated directly, and Trainer refuses to
# fine-tune a purely quantized model. Attach LoRA adapters so there are
# trainable parameters; "query_key_value" is Falcon's fused attention
# projection module.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Padding to a fixed length needs a pad token; fall back to EOS if none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

data = load_dataset("databricks/databricks-dolly-15k")

# Define the tokenization function
def tokenize_function(examples, tok=None, max_length=512):
    """Turn a batch of rows into position-aligned causal-LM training features.

    Every example becomes a single sequence ``prompt + response + EOS``.
    ``labels`` mirror ``input_ids`` position by position, with ``-100`` on the
    prompt and padding positions so the loss covers the response only.

    Args:
        examples: Batch mapping with parallel lists under the keys
            ``"instruction"``, ``"context"`` and ``"response"``.
        tok: Tokenizer to use. Defaults to the module-level ``tokenizer``.
        max_length: Length every returned sequence is truncated/padded to.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; each
        value is a list holding one ``max_length``-long list of ints per
        example.
    """
    if tok is None:
        tok = tokenizer

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    if not examples["response"]:
        return features

    eos_id = tok.eos_token_id
    eos_tail = [] if eos_id is None else [eos_id]
    pad_id = tok.pad_token_id
    if pad_id is None:
        # Padding positions are masked in both attention and labels, so the
        # concrete id is irrelevant; reuse EOS (or 0) when no pad token is set.
        pad_id = 0 if eos_id is None else eos_id

    # Instruction and context form the prompt; the newline marks where the
    # response starts.
    prompts = [
        f"{instruction} {context}".strip() + "\n"
        for instruction, context in zip(examples["instruction"], examples["context"])
    ]

    # Tokenize the two parts separately (no padding, no special tokens) so the
    # prompt/response boundary is known exactly in token space.
    prompt_ids_batch = tok(prompts, add_special_tokens=False)["input_ids"]
    response_ids_batch = tok(
        list(examples["response"]), add_special_tokens=False
    )["input_ids"]

    for prompt_ids, response_ids in zip(prompt_ids_batch, response_ids_batch):
        response_ids = list(response_ids) + eos_tail

        # Keep room for the response: a long prompt may only use what the
        # response leaves free, but never less than half the window. Without
        # this, an over-long prompt would leave no supervised tokens at all.
        prompt_budget = max(max_length - len(response_ids), max_length // 2)
        prompt_ids = list(prompt_ids)[:prompt_budget]

        input_ids = (prompt_ids + response_ids)[:max_length]
        n_prompt = min(len(prompt_ids), len(input_ids))
        n_pad = max_length - len(input_ids)

        features["input_ids"].append(input_ids + [pad_id] * n_pad)
        features["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        # Mask by position, not by token id: the pad token may be the EOS
        # token, and the real EOS after the response must stay supervised.
        features["labels"].append(
            [-100] * n_prompt + input_ids[n_prompt:] + [-100] * n_pad
        )

    return features

# Encode every split in batches; the raw text columns are dropped once tokenized.
raw_text_columns = ["instruction", "context", "response"]
dataset = data.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_text_columns,
)
# Expose only the model inputs, returned as torch tensors.
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Hold out 20% of the tokenized train split for evaluation.
split_data = dataset["train"].train_test_split(test_size=0.2)

# Rebind `dataset` to the resulting train/eval partition.
dataset = DatasetDict(
    {
        "train": split_data["train"],
        "eval": split_data["test"],
    }
)

# Short demonstration run: one sequence per device per forward pass, gradients
# accumulated over 3 passes, stopping after 10 optimizer steps.
training_kwargs = {
    "output_dir": "outputs",
    "max_steps": 10,
    "warmup_steps": 2,
    "learning_rate": 2e-4,
    "per_device_train_batch_size": 1,  # kept small to manage GPU memory
    "gradient_accumulation_steps": 3,
    "logging_steps": 1,
    "optim": "paged_adamw_8bit",
}
training_args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["eval"],
)

# Turn the generation cache off while training to silence the warnings;
# re-enable it before running inference.
model.config.use_cache = False

# Release cached GPU memory on either side of the training run.
torch.cuda.empty_cache()
trainer.train()
torch.cuda.empty_cache()




ModuleNotFoundError: No module named 'scipy'