# 🤖 Fine-tuning Babišova stylu - Google Colab

Tento notebook implementuje kompletní pipeline pro fine-tuning jazykového modelu na napodobení charakteristického stylu komunikace Andreje Babiše.

## 🎯 Cíl
Vytvořit model, který dokáže generovat satirické odpovědi ve stylu "babíšovštiny" - charakteristického jazykového stylu s mluvenou češtinou, slovensko-českými odchylkami a specifickými rétorickými prvky.

## 🚀 Výhody Google Colab
- ✅ **Zdarma GPU** (Tesla T4/P100)
- ✅ **Žádná instalace** - vše v prohlížeči
- ✅ **Automatické ukládání** na Google Drive
- ✅ **Sdílení** výsledků přes Hugging Face Hub
- ✅ **Monitoring** přes Weights & Biases

## 📦 Instalace a setup

In [None]:
#!/usr/bin/env python3

# Install the required libraries (IPython shell magics; -q keeps pip output short)
!pip install -q transformers datasets peft accelerate bitsandbytes wandb tiktoken
!pip install -q huggingface_hub gradio streamlit

# Restart the runtime after installation: signal 9 is sent to this very
# process, which terminates the current kernel.
# NOTE(review): this relies on Colab starting a fresh kernel afterwards;
# nothing below this line in the cell will run.
import os
os.kill(os.getpid(), 9)

In [None]:
#!/usr/bin/env python3

# Import knihoven
import torch
import json
import logging
from typing import Optional, Dict, Any
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
from datasets import Dataset, DatasetDict
import wandb
from huggingface_hub import HfApi, login
from google.colab import drive
from tqdm import tqdm

# Configure logging for the notebook session
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Report the PyTorch build and, when a GPU is present, its name and memory
cuda_available = torch.cuda.is_available()
print(f"PyTorch verze: {torch.__version__}")
print(f"CUDA dostupn√©: {cuda_available}")
if cuda_available:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is divided by 1e9 to show (decimal) gigabytes
    print(f"GPU pamƒõ≈•: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
#!/usr/bin/env python3

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

# Create the working directories: one on the local Colab disk and one on
# the mounted Drive (-p: no error if they already exist)
!mkdir -p /content/babis_finetune
!mkdir -p /content/drive/MyDrive/babis_finetune

print("Google Drive p≈ôipojen a adres√°≈ôe vytvo≈ôeny!")

## ⚙️ Konfigurace

In [None]:
#!/usr/bin/env python3

"""
Konfigurace pro fine-tuning Babi≈°ova stylu
"""
from dataclasses import dataclass
from typing import List

@dataclass
class ColabConfig:
    """Settings for the Google Colab fine-tuning run.

    Creating an instance has two side effects (see ``__post_init__``):
    ``target_modules`` receives a default list when it was left as ``None``,
    and ``output_dir`` / ``logging_dir`` are created on disk.
    """

    # --- Model ---
    base_model: str = "microsoft/DialoGPT-medium"  # small model for Colab
    model_name: str = "babis-dialogpt-colab"

    # --- Training (tuned for a Colab GPU) ---
    learning_rate: float = 2e-4
    num_train_epochs: int = 2
    per_device_train_batch_size: int = 2
    per_device_eval_batch_size: int = 2
    gradient_accumulation_steps: int = 8
    max_grad_norm: float = 0.3
    warmup_steps: int = 50
    logging_steps: int = 10
    save_steps: int = 200
    eval_steps: int = 200

    # --- LoRA ---
    lora_r: int = 8
    lora_alpha: int = 16
    lora_dropout: float = 0.1
    # None means "use the DialoGPT defaults filled in by __post_init__"
    target_modules: List[str] = None

    # --- Dataset ---
    max_seq_length: int = 512
    train_split: float = 0.9
    eval_split: float = 0.1

    # --- Output locations ---
    output_dir: str = "/content/babis_finetune"
    logging_dir: str = "/content/babis_finetune/logs"

    # --- Hardware / precision (tuned for Colab) ---
    fp16: bool = True
    bf16: bool = False
    use_8bit: bool = False
    use_4bit: bool = True

    # --- Evaluation / checkpoint selection ---
    evaluation_strategy: str = "steps"
    save_strategy: str = "steps"
    load_best_model_at_end: bool = True
    metric_for_best_model: str = "eval_loss"
    greater_is_better: bool = False

    def __post_init__(self):
        """Fill in the default LoRA targets and create the output directories."""
        import os

        # Only an explicit None triggers the default; an empty list is kept.
        if self.target_modules is None:
            # Module names used for DialoGPT
            self.target_modules = ["c_attn", "c_proj", "wte", "wpe"]

        # Make sure both directories exist before anything writes to them
        for directory in (self.output_dir, self.logging_dir):
            os.makedirs(directory, exist_ok=True)

# Instantiate the run configuration
config = ColabConfig()

# Echo the key settings so the cell output records what was used
print(
    "Konfigurace vytvo≈ôena:",
    f"Base model: {config.base_model}",
    f"Output dir: {config.output_dir}",
    f"LoRA r: {config.lora_r}, alpha: {config.lora_alpha}",
    sep="\n",
)

: 

## üìä P≈ô√≠prava datasetu

In [None]:
"""
Modul pro naƒç√≠t√°n√≠ a zpracov√°n√≠ datasetu
"""
import json
from pathlib import Path
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

def create_babis_dataset(data_dir="final", train_size=0.9, random_state=42):
    """Load the question/answer JSONL files and split them into train/validation.

    Every file matching ``batch_*_babis_output_qa.jsonl`` inside ``data_dir``
    is read line by line. Each line is expected to be a JSON object with the
    keys ``question`` and ``answer``.

    Args:
        data_dir: Directory containing the JSONL files.
        train_size: Fraction of the records placed in the training split
            (forwarded to ``train_test_split``).
        random_state: Seed for the shuffle performed by ``train_test_split``.

    Returns:
        A ``DatasetDict`` with the splits ``train`` and ``validation``. Each
        record has the fields ``prompt``, ``completion`` and
        ``full_conversation``.

    Raises:
        FileNotFoundError: No matching JSONL file exists in ``data_dir``.
        ValueError: The files contained no usable question/answer pair.
    """
    data_dir = Path(data_dir)
    # glob() yields files in filesystem order; sorting fixes the record order
    # so that the seeded split below is reproducible across machines.
    jsonl_files = sorted(data_dir.glob("batch_*_babis_output_qa.jsonl"))

    if not jsonl_files:
        raise FileNotFoundError(f"Nenalezeny ≈æ√°dn√© JSONL soubory v adres√°≈ôi {data_dir}")

    conversations = []
    skipped = 0

    for file_path in jsonl_files:
        print(f"Naƒç√≠t√°m {file_path.name}...")
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not errors
                    continue

                # Keep the try body minimal: only the parse can raise here
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Chyba p≈ôi parsov√°n√≠ JSON na ≈ô√°dku {line_num} v souboru {file_path.name}: {e}")
                    continue

                if not isinstance(data, dict):
                    # Valid JSON but not an object (list, number, ...)
                    skipped += 1
                    continue

                # `or ''` also covers keys that are present with a null value
                question = data.get('question') or ''
                answer = data.get('answer') or ''
                usable = (
                    isinstance(question, str)
                    and isinstance(answer, str)
                    and question.strip()
                    and answer.strip()
                )
                if not usable:
                    # A pair without question or answer has nothing to teach
                    skipped += 1
                    continue

                # Conversation record in the format used for fine-tuning
                conversations.append({
                    "prompt": question,
                    "completion": answer,
                    "full_conversation": f"U≈æivatel: {question}\nAndrej Babi≈°: {answer}"
                })

    print(f"Celkem naƒçteno {len(conversations)} konverzac√≠ z {len(jsonl_files)} soubor≈Ø")
    if skipped:
        print(f"Přeskočeno {skipped} neúplných nebo neplatných záznamů")

    if not conversations:
        raise ValueError("Nebyla naƒçtena ≈æ√°dn√° konverzace z JSONL soubor≈Ø")

    # Seeded train/validation split
    train_data, eval_data = train_test_split(
        conversations, train_size=train_size, random_state=random_state
    )

    return DatasetDict({
        'train': Dataset.from_list(train_data),
        'validation': Dataset.from_list(eval_data)
    })

# Build the dataset, then report the split sizes and show one sample record
dataset = create_babis_dataset()
print(
    "Dataset vytvo≈ôen:",
    f"Train samples: {len(dataset['train'])}",
    f"Validation samples: {len(dataset['validation'])}",
    "\nP≈ô√≠klad dat:",
    sep="\n",
)
print(dataset['train'][0])

## 🤖 Načtení modelu a tokenizeru

In [None]:
# Load the tokenizer that belongs to the base model
tokenizer = AutoTokenizer.from_pretrained(config.base_model)

# Reuse the EOS token for padding when the tokenizer defines no pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(
    f"Tokenizer naƒçten: {config.base_model}",
    f"Vocab size: {tokenizer.vocab_size}",
    f"Pad token: {tokenizer.pad_token}",
    sep="\n",
)

In [None]:
# Load the base model, optionally quantized to save GPU memory on Colab.
# Local import: only this cell needs the class.
from transformers import BitsAndBytesConfig

compute_dtype = torch.float16 if config.fp16 else torch.float32
model_kwargs = {
    "torch_dtype": compute_dtype,
    "device_map": "auto" if torch.cuda.is_available() else None
}

# Quantization is requested through an explicit BitsAndBytesConfig; passing
# load_in_4bit / load_in_8bit straight to from_pretrained is deprecated.
if config.use_4bit:
    model_kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_4bit=True,
        # Match the compute dtype to the training precision chosen above
        bnb_4bit_compute_dtype=compute_dtype,
    )
elif config.use_8bit:
    model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    config.base_model,
    **model_kwargs
)

# Prepare the quantized model for k-bit training
if config.use_4bit or config.use_8bit:
    model = prepare_model_for_kbit_training(model)

print(f"Model naƒçten: {config.base_model}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Model dtype: {next(model.parameters()).dtype}")

## 🎯 Nastavení LoRA

In [None]:
# Describe the LoRA adapter using the values from the run configuration
lora_settings = {
    "r": config.lora_r,
    "lora_alpha": config.lora_alpha,
    "target_modules": config.target_modules,
    "lora_dropout": config.lora_dropout,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
}
lora_config = LoraConfig(**lora_settings)

# Wrap the model with the adapter
model = get_peft_model(model, lora_config)

# Show how many parameters remain trainable
model.print_trainable_parameters()

print(
    "\nLoRA konfigurace nastavena!",
    f"LoRA r: {config.lora_r}",
    f"LoRA alpha: {config.lora_alpha}",
    f"Target modules: {config.target_modules}",
    sep="\n",
)

## 🔤 Tokenizace datasetu

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of conversations for causal-LM training.

    Args:
        examples: Batch dict supplied by ``Dataset.map(batched=True)``; the
            texts are read from its ``full_conversation`` column.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) as plain,
        unpadded lists truncated to ``config.max_seq_length``.
    """
    # No padding and no labels here on purpose. Padding inside map() pads
    # only to the longest text of each map batch, so rows from different map
    # batches end up with different lengths, and a precomputed `labels`
    # column of ragged length cannot be stacked by the data collator.
    # The DataCollatorForLanguageModeling(mlm=False) used for training pads
    # each training batch dynamically and derives labels from input_ids.
    return tokenizer(
        examples['full_conversation'],
        truncation=True,
        max_length=config.max_seq_length,
    )

# Tokenize both splits and drop the raw text columns
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Dataset tokenizov√°n!")
print(f"Train samples: {len(tokenized_dataset['train'])}")
print(f"Validation samples: {len(tokenized_dataset['validation'])}")
print(f"\nP≈ô√≠klad tokenizovan√Ωch dat:")
# Indexing a datasets.Dataset returns Python lists (no tensor format is set),
# so report lengths with len() rather than .shape
first_example = tokenized_dataset['train'][0]
print(f"Input length: {len(first_example['input_ids'])}")
print(f"Attention mask length: {len(first_example['attention_mask'])}")

## 🏋️ Fine-tuning

In [None]:
# Optional Weights & Biases tracking. On any failure the name `wandb` is
# rebound to None so that later cells can test it for truthiness.
try:
    wandb.init(project="babis-finetune-colab", name=config.model_name, config=vars(config))
except Exception as e:
    print(f"WandB inicializace selhala: {e}")
    wandb = None
else:
    print("Weights & Biases inicializov√°no")

In [None]:
# Build the training arguments from the run configuration
import inspect

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; use whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=config.output_dir,
    num_train_epochs=config.num_train_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=config.learning_rate,
    max_grad_norm=config.max_grad_norm,
    warmup_steps=config.warmup_steps,
    logging_steps=config.logging_steps,
    save_steps=config.save_steps,
    eval_steps=config.eval_steps,
    save_strategy=config.save_strategy,
    load_best_model_at_end=config.load_best_model_at_end,
    metric_for_best_model=config.metric_for_best_model,
    greater_is_better=config.greater_is_better,
    fp16=config.fp16,
    bf16=config.bf16,
    logging_dir=config.logging_dir,
    # The string "none" disables reporting. Passing None does not: in
    # transformers 4.x it is treated as "all installed integrations", which
    # would re-enable wandb even after its initialisation failed.
    report_to="wandb" if wandb else "none",
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    save_total_limit=3,
    prediction_loss_only=True,
    **{_eval_strategy_key: config.evaluation_strategy},
)

print("Training arguments nastaveny:")
print(f"Learning rate: {config.learning_rate}")
print(f"Epochs: {config.num_train_epochs}")
print(f"Batch size: {config.per_device_train_batch_size}")
print(f"Gradient accumulation: {config.gradient_accumulation_steps}")

In [None]:
# Collator for causal language modelling (mlm=False: no token masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Stop early after 3 evaluations without improvement
callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]

# Assemble the Trainer from the pieces prepared in the previous cells
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["validation"],
    "data_collator": data_collator,
    "callbacks": callbacks,
}
trainer = Trainer(**trainer_kwargs)

print(
    "Trainer nastaven!",
    f"Train samples: {len(tokenized_dataset['train'])}",
    f"Validation samples: {len(tokenized_dataset['validation'])}",
    sep="\n",
)

In [None]:
# Print a short banner describing the run before it starts
print(
    "üöÄ Zaƒç√≠n√°m fine-tuning...",
    f"Model: {config.base_model}",
    f"Dataset: {len(tokenized_dataset['train'])} train, {len(tokenized_dataset['validation'])} validation",
    f"Epochs: {config.num_train_epochs}",
    f"LoRA r: {config.lora_r}, alpha: {config.lora_alpha}",
    "-" * 50,
    sep="\n",
)

# Run the training loop
train_result = trainer.train()

# Report the final metrics; 'N/A' is shown for any metric that is missing
training_metrics = train_result.metrics
print("\n‚úÖ Fine-tuning dokonƒçen!")
print(f"Training loss: {training_metrics.get('train_loss', 'N/A')}")
print(f"Training time: {training_metrics.get('train_runtime', 'N/A')}s")

In [None]:
# Evaluate the model on the validation split
print("üìä Spou≈°t√≠m evaluaci...")

eval_results = trainer.evaluate()

# Print every returned metric with four decimal places
print("\nüìà V√Ωsledky evaluace:")
for metric_name in eval_results:
    print(f"{metric_name}: {eval_results[metric_name]:.4f}")

# Log the metrics and persist them through the Trainer
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", eval_results)

## 💾 Uložení modelu

In [None]:
# Save the model
print("üíæ Ukl√°d√°m model...")

# Save on the local Colab disk: the Trainer writes the model, then the
# tokenizer is written into the same output directory
trainer.save_model()
tokenizer.save_pretrained(config.output_dir)

# Copy the whole output directory to Google Drive
# NOTE(review): `cp -r SRC DST` nests SRC inside DST when DST already exists,
# so re-running this cell presumably changes the resulting layout; confirm.
drive_path = f"/content/drive/MyDrive/babis_finetune/{config.model_name}"
!cp -r {config.output_dir} {drive_path}

print(f"‚úÖ Model ulo≈æen:")
print(f"Colab: {config.output_dir}")
print(f"Google Drive: {drive_path}")

# Print the on-disk size of the output directory
!du -sh {config.output_dir} 

## 🚀 Push na Hugging Face Hub (volitelné)

In [None]:
# Optional upload to the Hugging Face Hub; an empty token skips the upload
from getpass import getpass

print("Pro push na Hugging Face Hub pot≈ôebujete token.")
print("Z√≠skejte ho na: https://huggingface.co/settings/tokens")
print("Pokud nechcete pushovat, stisknƒõte Enter.")

# Strip once so that stray whitespace from copy/paste is never sent as part
# of the token
hf_token = getpass("HF Token (voliteln√©): ").strip()

if hf_token:
    login(hf_token)
    print("‚úÖ P≈ôihl√°≈°eno k Hugging Face Hub")

    repo_name = f"babis-{config.model_name}"

    # Push through the model and tokenizer objects so both land in the same
    # repository. Trainer.push_to_hub() is not used because its first
    # positional parameter is the commit message, not the repository name.
    print("üöÄ Pushuji model na Hub...")
    model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    # Repositories live under the account namespace, so the public URL needs
    # the user name of the logged-in account
    hub_user = HfApi().whoami(token=hf_token)["name"]
    print(f"‚úÖ Model pushnut na Hub: {hub_user}/{repo_name}")
    print(f"URL: https://huggingface.co/{hub_user}/{repo_name}")
else:
    print("Model nebyl pushnut na Hub.")

## 🧪 Testování modelu

In [None]:
# Funkce pro generov√°n√≠ odpovƒõd√≠
def generate_babis_response(prompt, max_length=100, temperature=0.7):
    """Generate a continuation of ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text the model should continue.
        max_length: Maximum number of tokens to generate after the prompt.
            The prompt itself no longer counts against this budget, so a long
            prompt cannot use it up before generation starts.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        Only the newly generated text, stripped of surrounding whitespace.
    """
    # Tokenize the prompt (truncated to the configured sequence length)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.max_seq_length)

    # The tokenizer returns CPU tensors; move them to the model's device so
    # generation also works when the model sits on the GPU
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    prompt_token_count = inputs["input_ids"].shape[1]

    # Inference mode: switches off dropout that may still be active if the
    # model was last used for training
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            num_return_sequences=1,
            temperature=temperature,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the tokens that follow the prompt. Removing the prompt by
    # string replacement is unreliable because decoding does not always
    # reproduce the prompt text exactly.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Test questions, wrapped in the same dialogue template as the training data
test_questions = [
    "Jak√Ω je v√°≈° n√°zor na inflaci?",
    "Co si mysl√≠te o Bruselu?",
    "Jak hodnot√≠te opozici?",
    "Jak√© m√°te pl√°ny?",
]
test_prompts = [f"U≈æivatel: {question}\nAndrej Babi≈°: " for question in test_questions]

print("üß™ Testov√°n√≠ fine-tuned modelu:")
print("=" * 50)

# Generate and print one answer per prompt
for prompt in test_prompts:
    response = generate_babis_response(prompt)
    print(
        f"\nPrompt: {prompt.strip()}",
        f"Odpovƒõƒè: {response}",
        "-" * 30,
        sep="\n",
    )

## 🌐 Gradio Interface (volitelné)

In [None]:
# Vytvo≈ôen√≠ Gradio interface
import gradio as gr

def babis_chat(message, history):
    """Chat callback for Gradio: answer a single user message.

    Args:
        message: The user's latest message.
        history: Earlier turns of the chat. Accepted but not used; every
            message is answered independently of the previous ones.

    Returns:
        The generated reply text.
    """
    # Wrap the message in the dialogue template before generating
    return generate_babis_response(
        f"U≈æivatel: {message}\nAndrej Babi≈°: ",
        max_length=150,
        temperature=0.8,
    )

# Example questions offered in the chat UI
example_questions = [
    "Jak√Ω je v√°≈° n√°zor na inflaci?",
    "Co si mysl√≠te o Bruselu?",
    "Jak hodnot√≠te opozici?",
    "Jak√© m√°te pl√°ny do budoucna?",
    "Jak tr√°v√≠te ƒças s rodinou?",
]

# Build the chat interface around the babis_chat callback; each example is
# passed as a one-element list
iface = gr.ChatInterface(
    fn=babis_chat,
    title="ü§ñ Babi≈° Chat Bot",
    description="Fine-tuned model ve stylu Andreje Babi≈°e",
    examples=[[question] for question in example_questions],
)

# Launch the interface with a public share link
# NOTE(review): debug=True presumably keeps this cell blocked while the
# server runs, so the messages below would appear only afterwards; confirm.
iface.launch(share=True, debug=True)

print("üåê Gradio interface spu≈°tƒõn!")
print("Sd√≠lejte odkaz s ostatn√≠mi pro testov√°n√≠.")

## 📊 Shrnutí výsledků

In [None]:
# Summary of the run
def _fmt_metric(value, spec):
    """Format a numeric metric with ``spec``; pass anything else through as text.

    A missing metric is represented by the string 'N/A', which cannot be
    formatted with a float format spec such as '.4f'.
    """
    if isinstance(value, (int, float)):
        return format(value, spec)
    return str(value)


train_metrics = train_result.metrics

print("üéâ FINE-TUNING DOKONƒåEN!")
print("=" * 50)
print(f"Model: {config.base_model}")
print(f"Fine-tuned model: {config.model_name}")
print(f"Training loss: {_fmt_metric(train_metrics.get('train_loss', 'N/A'), '.4f')}")
print(f"Evaluation loss: {_fmt_metric(eval_results.get('eval_loss', 'N/A'), '.4f')}")
print(f"Training time: {_fmt_metric(train_metrics.get('train_runtime', 'N/A'), '.1f')}s")
print(f"LoRA parameters: r={config.lora_r}, alpha={config.lora_alpha}")
print(f"Dataset size: {len(tokenized_dataset['train'])} train, {len(tokenized_dataset['validation'])} validation")
print("\nüìÅ Ulo≈æen√© soubory:")
print(f"Colab: {config.output_dir}")
print(f"Google Drive: /content/drive/MyDrive/babis_finetune/{config.model_name}")

# The Hub cell is optional: when it was skipped, `hf_token` does not exist,
# so look it up defensively instead of raising NameError
if str(globals().get("hf_token") or "").strip():
    print(f"Hugging Face Hub: babis-{config.model_name}")

print("\nüöÄ Dal≈°√≠ kroky:")
print("1. St√°hnƒõte model z Google Drive")
print("2. Pou≈æijte ho ve vlastn√≠ aplikaci")
print("3. Sd√≠lejte v√Ωsledky s ostatn√≠mi")
print("4. Experimentujte s r≈Øzn√Ωmi prompty")

print("\n‚úÖ Hotovo! Model je p≈ôipraven k pou≈æit√≠.")