In [None]:
!pip install peft
!pip install transformers torch auto-gptq optimum pip
!pip install optimum


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import transformers

def load_model_and_tokenizer(
    model_name="TheBloke/Mistral-7B-Instruct-v0.2-GPTQ",
    revision="main",
):
    """Load a causal LM and its tokenizer, then wrap the model with LoRA adapters.

    Args:
        model_name: Hugging Face Hub id (or local path) of the checkpoint to
            load. Defaults to the GPTQ-quantized Mistral-7B-Instruct model.
        revision: Branch, tag or commit of the model repository to load.

    Returns:
        A ``(model, tokenizer)`` tuple. ``model`` is the PEFT-wrapped model
        returned by ``get_peft_model``; it is left in training mode.
    """
    print("Loading model and tokenizer...")

    # NOTE(review): trust_remote_code=True lets the model repository execute
    # its own Python code at load time. Only use it with repositories you
    # trust; confirm whether this checkpoint actually needs it.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # This handles GPU allocation
        trust_remote_code=True,
        revision=revision,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    # Fall back to the EOS token for padding when the tokenizer defines no
    # pad token, and keep the model config in sync with that choice.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.eos_token_id

    # Prepare model for k-bit training
    model.train()  # model in training mode
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    # LoRA config: rank-8 adapters attached to the "q_proj" modules only.
    config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Get PEFT model
    model = get_peft_model(model, config)

    return model, tokenizer

def generate_response(model, tokenizer, comment):
    """Generate the model's reply to a single viewer comment.

    The comment is wrapped in an ``[INST] ... [/INST]`` prompt together with a
    fixed system-style instruction, tokenized, and decoded greedily
    (``do_sample=False``, ``num_beams=1``) for at most 140 new tokens.

    Args:
        model: Object exposing ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``,
            ``eos_token_id`` and ``batch_decode``.
        comment: The viewer comment to answer.

    Returns:
        The first decoded sequence returned by ``tokenizer.batch_decode``.
    """
    instructions_string = """NewGPT, functioning as a virtual data science \
consultant on YouTube, communicates in clear, accessible language, escalating \
to technical depth upon request."""

    prompt = f'[INST] {instructions_string}\n{comment}\n[/INST]'

    # Place the inputs on whatever device the model lives on instead of
    # assuming CUDA, so this also works on CPU-only hosts.
    target_device = model.device
    inputs = tokenizer(prompt, return_tensors="pt")
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly: it cannot be inferred reliably when
            # the pad token is the same as the EOS token.
            attention_mask=inputs.get("attention_mask"),
            max_new_tokens=140,
            num_beams=1,
            # Greedy decoding; no temperature is passed because sampling
            # parameters have no effect when do_sample is False.
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    outputs = outputs.cpu()
    response = tokenizer.batch_decode(outputs)[0]
    return response

def tokenize_function(examples):
    """Tokenize the ``"example"`` field of a batch with the global tokenizer.

    Sets the tokenizer's ``truncation_side`` to ``"left"`` and truncates each
    entry to at most 512 tokens, requesting NumPy output.

    Args:
        examples: Mapping with an ``"example"`` entry holding the text to
            tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    batch_text = examples["example"]

    # Truncate from the left so the end of each example is what survives.
    tokenizer.truncation_side = "left"

    encode_options = {
        "return_tensors": "np",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(batch_text, **encode_options)

def setup_training():
    """Build the ``Trainer`` used to fine-tune the global ``model``.

    Loads the "shawhin/shawgpt-youtube-comments" dataset, tokenizes it with
    ``tokenize_function``, and wires its "train" and "test" splits into a
    ``Trainer`` together with the hyperparameters below.

    Returns:
        The configured ``Trainer`` (training has not been started).
    """
    # Dataset: download, then tokenize every split in batches.
    raw_splits = load_dataset("shawhin/shawgpt-youtube-comments")
    tokenized_splits = raw_splits.map(tokenize_function, batched=True)

    # Collator with masked-language-modeling disabled (mlm=False).
    collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # Hyperparameters: logging, evaluation and checkpointing all happen once
    # per epoch.
    hyperparameters = {
        "output_dir": "shawgpt-ft",
        "learning_rate": 2e-4,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "num_train_epochs": 10,
        "weight_decay": 0.01,
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "gradient_accumulation_steps": 4,
        "warmup_steps": 2,
        "fp16": True,
        "optim": "paged_adamw_8bit",
    }
    run_args = TrainingArguments(**hyperparameters)

    return Trainer(
        model=model,
        train_dataset=tokenized_splits["train"],
        eval_dataset=tokenized_splits["test"],
        args=run_args,
        data_collator=collator,
    )

def train_model():
    """Fine-tune the global ``model`` with the trainer from ``setup_training``.

    ``model.config.use_cache`` is switched off for the duration of training
    and switched back on afterwards, including when training raises.

    Raises:
        Any exception raised by ``setup_training`` or ``trainer.train``.
    """
    trainer = setup_training()
    model.config.use_cache = False  # silence the warnings
    try:
        trainer.train()
    finally:
        # Restore caching even if training fails, so the model is not left
        # with the cache disabled for later use.
        model.config.use_cache = True

def main():
    """Report the runtime environment, load the model and run one test prompt.

    Stores the loaded model and tokenizer in the module-level ``model`` and
    ``tokenizer`` names so the other functions in this file can use them.
    Any exception is reported on stdout with its traceback rather than
    propagated.
    """
    global model, tokenizer

    try:
        # Environment report.
        print(f"PyTorch version: {torch.__version__}")
        cuda_ready = torch.cuda.is_available()
        print(f"CUDA available: {cuda_ready}")
        if cuda_ready:
            print(f"CUDA device: {torch.cuda.get_device_name(0)}")

        # Load once, then switch to eval mode for inference.
        model, tokenizer = load_model_and_tokenizer()
        model.eval()

        # Show how many parameters are trainable.
        model.print_trainable_parameters()

        # Smoke-test generation with a fixed question.
        question = "Can you explain what a neural network is?"
        print("\nGenerating response...")
        answer = generate_response(model, tokenizer, question)

        divider = "-" * 50
        print("\nModel response:")
        print(divider)
        print(answer)

    except Exception as exc:
        import traceback

        print(f"An error occurred: {exc}")
        traceback.print_exc()

# Run the demo only when this file is executed as the entry point, not when
# it is imported as a module.
if __name__ == "__main__":
    main()