In [1]:
# Install the Python dependencies with %pip so they land in the running kernel's environment.
%pip install transformers peft accelerate datasets pandas numpy
# Fetch the AI Stack Exchange dump; -nc skips the download when the archive is already present.
!wget -nc https://archive.org/download/stackexchange/ai.stackexchange.com.7z
# Extract the archive; -aoa overwrites existing files without prompting so the cell can be re-run.
!7z x -aoa ai.stackexchange.com.7z

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import torch
import transformers
import datasets
import pandas as pd
import xml.etree.ElementTree as ET
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
import json

# Parse XML and create Q&A pairs
def parse_posts(path='Posts.xml'):
    """Parse a Stack Exchange ``Posts.xml`` dump into questions and their answers.

    The file is streamed with ``ET.iterparse``. Each ``row`` element is cleared
    once it has been read so its attributes are not kept alive for the rest of
    the parse. Answers are matched to questions only after the whole file has
    been read, so an answer row that appears before its question row is kept.

    Args:
        path: Location of the XML file to read.

    Returns:
        A ``(questions, answers)`` tuple.
        ``questions`` maps a question Id to ``{'title', 'body', 'score'}``.
        ``answers`` maps a question Id to a list of ``{'body', 'score'}`` dicts
        and only contains Ids that are present in ``questions``.
    """
    questions = {}
    answers_by_parent = {}

    for _, elem in ET.iterparse(path):
        if elem.tag != 'row':
            continue

        post_type = elem.get('PostTypeId')
        if post_type == '1':  # Question
            questions[elem.get('Id')] = {
                'title': elem.get('Title'),
                'body': elem.get('Body'),
                'score': int(elem.get('Score', 0))
            }
        elif post_type == '2':  # Answer
            answers_by_parent.setdefault(elem.get('ParentId'), []).append({
                'body': elem.get('Body'),
                'score': int(elem.get('Score', 0))
            })

        # Everything needed has been copied out; release the row's attributes.
        elem.clear()

    # Drop answers whose parent question never showed up in the file.
    answers = {
        parent_id: replies
        for parent_id, replies in answers_by_parent.items()
        if parent_id in questions
    }
    return questions, answers

# Parse the dump from the default Posts.xml location
questions, answers = parse_posts()

# Pair each answered question's title with its top-scoring answer
training_pairs = []
for q_id, question in questions.items():
    candidates = answers.get(q_id)
    if candidates is None:
        # Unanswered question: nothing to learn from
        continue

    best_answer = max(candidates, key=lambda reply: reply['score'])
    training_pairs.append(
        {"text": f"Question: {question['title']}\n\nAnswer: {best_answer['body']}"}
    )

# Wrap the formatted strings in a single-column Hugging Face dataset
texts = [pair["text"] for pair in training_pairs]
dataset = datasets.Dataset.from_dict({"text": texts})
print(f"Created dataset with {len(dataset)} Q&A pairs")

Created dataset with 8577 Q&A pairs


In [3]:
# Base checkpoint: fetch its tokenizer and its weights in float16
model_name = "TinyLlama/TinyLlama-1.1B-step-50K-105b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# Reuse EOS as the padding token when the tokenizer does not define one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Prepare the dataset for training
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of examples into fixed-length sequences.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the strings to
            encode (the form ``Dataset.map(..., batched=True)`` passes in).
        max_length: Token count every sequence is truncated or padded to.
            Defaults to 256, the length used for the Q&A pairs.

    Returns:
        The tokenizer's encoding of the batch.
    """
    # No return_tensors here: Dataset.map stores the columns itself, so building
    # torch tensors would only add a conversion that is undone when storing.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

# Process dataset: tokenize in batches and drop the raw text column
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names
)

# Split into train and validation; the fixed seed keeps the split identical across re-runs
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

print(f"Train size: {len(train_dataset)}, Validation size: {len(eval_dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Map:   0%|          | 0/8577 [00:00<?, ? examples/s]

Train size: 7719, Validation size: 858


In [4]:
# LoRA adapter settings; the adapters attach to the attention modules listed here
lora_targets = ["q_proj", "v_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_targets,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the base model with the adapters and report the trainable parameter count
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir="./ml-qa-tinyllama",
    overwrite_output_dir=True,
    num_train_epochs=3,              # Train for 3 epochs
    per_device_train_batch_size=4,   # Smaller batch size for stability
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,   # Accumulate gradients for stability
    eval_strategy="steps",           # Current name of the deprecated `evaluation_strategy`
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    learning_rate=2e-4,
    fp16=True,                       # Use mixed precision
    logging_steps=100,
    optim="adamw_torch",
    report_to="none"
)

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023




In [None]:
# Build the batch collator first (mlm=False, i.e. no masked-language-model masking)
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize trainer with the model, its arguments and both dataset splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=causal_lm_collator,
)

# Train model
try:
    trainer.train()
except Exception as e:
    # Report the failure, then re-raise so the full traceback is shown and the
    # following cells do not run against a missing or partial checkpoint.
    print(f"An error occurred during training: {str(e)}")
    raise
else:
    # Save the model and tokenizer only after training finished without error
    model.save_pretrained("./ml-qa-tinyllama")
    tokenizer.save_pretrained("./ml-qa-tinyllama")
    print("Training completed and model saved successfully!")

Step,Training Loss,Validation Loss


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

def load_qa_model(model_path="./ml-qa-tinyllama"):
    """Load the fine-tuned Q&A model and its tokenizer from a saved checkpoint.

    The tokenizer and the PEFT config are read from ``model_path``. The base
    model named in that config is loaded in float16 with ``device_map="auto"``
    and then wrapped with the adapter stored at ``model_path``. Progress is
    printed at each step.

    Args:
        model_path: Directory holding the saved adapter and tokenizer.
            Defaults to the directory the training cell saves to.

    Returns:
        ``(model, tokenizer)`` on success. If any step raises, the error
        message is printed and ``(None, None)`` is returned instead.
    """
    try:
        print("Loading tokenizer...")
        # Load tokenizer from the saved path
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        print("Loading config...")
        config = PeftConfig.from_pretrained(model_path)

        print("Loading base model...")
        base_model = AutoModelForCausalLM.from_pretrained(
            config.base_model_name_or_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )

        print("Loading fine-tuned model...")
        model = PeftModel.from_pretrained(base_model, model_path)
        print("Model loaded successfully!")
        return model, tokenizer

    except Exception as e:
        # Callers check for (None, None), so report the error instead of raising
        print(f"Detailed error: {str(e)}")
        return None, None

def ask_ml_question(question, model, tokenizer, max_length=200):
    """Generate an answer to ``question`` with the given model and tokenizer.

    The question is placed in the same "Question: ... Answer:" prompt layout
    used for the training text, and the model samples a continuation.

    Args:
        question: Question text to insert into the prompt.
        model: Object exposing ``device`` and ``generate``; may be ``None``.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            may be ``None``.
        max_length: Forwarded unchanged to ``model.generate`` as ``max_length``.

    Returns:
        The stripped generated text. Returns "Model not properly loaded" when
        ``model`` or ``tokenizer`` is ``None``, and an
        "Error generating response: ..." string when generation raises.
    """
    if model is None or tokenizer is None:
        return "Model not properly loaded"

    try:
        prompt = f"Question: {question}\n\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )

        # Decode only the tokens produced after the prompt. Splitting the full
        # text on "Answer:" would cut off any answer that contains that marker.
        prompt_token_count = inputs["input_ids"].shape[-1]
        generated_ids = outputs[0][prompt_token_count:]
        answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        return answer
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Load the saved model, then try a single question when loading succeeded
print("Starting model load...")
model, tokenizer = load_qa_model()

if model is None or tokenizer is None:
    print("Failed to load model properly")
else:
    test_question = "What is gradient descent?"
    print("\nTesting with question:", test_question)
    answer = ask_ml_question(test_question, model, tokenizer)
    print("Answer:", answer)

In [None]:
import re

def ask_ml_question(question, model, tokenizer, max_length=200):
    """Generate an answer to ``question`` and strip HTML tags from it.

    The question is placed in the same "Question: ... Answer:" prompt layout
    used for the training text, and the model samples a continuation.

    Args:
        question: Question text to insert into the prompt.
        model: Object exposing ``device`` and ``generate``; may be ``None``.
        tokenizer: Tokenizer used to encode the prompt and decode the output;
            may be ``None``.
        max_length: Forwarded unchanged to ``model.generate`` as ``max_length``.

    Returns:
        The stripped generated text with ``<...>`` tags removed. Returns
        "Model not properly loaded" when ``model`` or ``tokenizer`` is
        ``None``, and an "Error generating response: ..." string when
        generation raises.
    """
    if model is None or tokenizer is None:
        return "Model not properly loaded"

    try:
        prompt = f"Question: {question}\n\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=0.7,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.pad_token_id
        )

        # Decode only the tokens produced after the prompt. Splitting the full
        # text on "Answer:" would cut off any answer that contains that marker.
        prompt_token_count = inputs["input_ids"].shape[-1]
        generated_ids = outputs[0][prompt_token_count:]
        answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        # Clean HTML tags
        clean_answer = re.sub('<[^<]+?>', '', answer)
        return clean_answer
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Run a few sample questions through the loaded model
test_questions = [
    "What is gradient descent?",
    "Explain how neural networks work",
    "What is the difference between supervised and unsupervised learning?",
]

for question in test_questions:
    print(f"\nQuestion: {question}")
    generated = ask_ml_question(question, model, tokenizer)
    print(f"Answer: {generated}")