# Fine-tune Qwen-1.5B for Moroccan Darija Summarization
## Setup and Imports

In [None]:
!pip install -q transformers datasets accelerate peft bitsandbytes wandb

In [None]:
from typing import Dict, List

import pandas as pd
import torch
import wandb
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

## Configuration

In [None]:
class TrainingConfig:
    """Hyperparameters and paths shared by the training and inference cells."""

    def __init__(self):
        # Base checkpoint to fine-tune and directory for saved outputs.
        # NOTE(review): confirm this repo id exists on the Hugging Face Hub.
        self.model_id: str = "Qwen/Qwen-1_5B"
        self.output_dir: str = "qwen-darija-summarizer"

        # Optimisation settings, forwarded to TrainingArguments in setup_training().
        self.num_train_epochs: int = 3
        self.max_steps: int = -1
        self.per_device_train_batch_size: int = 4
        self.gradient_accumulation_steps: int = 4
        self.learning_rate: float = 2e-4
        self.lr_scheduler_type: str = "cosine"
        self.warmup_ratio: float = 0.03
        self.max_grad_norm: float = 0.3

        # LoRA adapter settings, forwarded to LoraConfig in prepare_model().
        self.lora_r: int = 64
        self.lora_alpha: int = 16
        self.lora_dropout: float = 0.1

        # Tokenization length cap and held-out fraction for evaluation.
        self.max_length: int = 512
        self.train_test_split: float = 0.1


# Single shared instance read by the functions below.
config = TrainingConfig()

## Data Preparation

In [None]:
def load_dataset(file_path: str) -> Dataset:
    """Load text/summary pairs from a CSV file into a ``Dataset``.

    Args:
        file_path: Path to a CSV file containing ``original_text`` and
            ``summary`` columns.

    Returns:
        A ``Dataset`` with ``text`` and ``summary`` string columns. Rows in
        which either value is missing are dropped.

    Raises:
        KeyError: If either required column is absent from the CSV.
    """
    source_columns = ["original_text", "summary"]
    df = pd.read_csv(file_path)

    # Empty CSV cells are read as NaN floats; drop those rows so both columns
    # contain only strings when the prompts are built later.
    df = df.dropna(subset=source_columns)

    # Create training pairs
    return Dataset.from_dict({
        'text': df['original_text'].astype(str).tolist(),
        'summary': df['summary'].astype(str).tolist(),
    })

def prepare_training_data(dataset: Dataset, tokenizer) -> Dataset:
    """Format text/summary pairs into prompts and tokenize them for causal-LM training.

    Args:
        dataset: Dataset with ``text`` and ``summary`` columns.
        tokenizer: Tokenizer used to encode the prompts. It must be able to
            pad, because every example is padded to ``config.max_length``.

    Returns:
        A dataset in which the original columns are replaced by the tokenizer
        output plus a ``labels`` column.
    """
    # Appended after each summary so the model sees an end-of-sequence marker
    # during training; empty when the tokenizer defines no EOS token.
    eos = getattr(tokenizer, "eos_token", None) or ""

    def format_prompt(text: str, summary: str) -> str:
        return f"Summarize the following text in Moroccan Darija:\n{text}\n\nSummary:\n{summary}{eos}"

    def tokenize(examples):
        prompts = [format_prompt(text, summary)
                   for text, summary in zip(examples['text'], examples['summary'])]

        # No return_tensors here: Dataset.map stores plain lists, tensors are
        # built later when batches are collated.
        encoded = tokenizer(
            prompts,
            truncation=True,
            max_length=config.max_length,
            padding='max_length',
        )

        # The model only returns a loss when labels are supplied. Labels are the
        # input ids, with padding positions set to -100 so the loss skips them.
        # The attention mask (not the pad id) decides what is padding, so a real
        # EOS token is kept even when the pad token equals the EOS token.
        encoded["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
        return encoded

    tokenized_dataset = dataset.map(
        tokenize,
        batched=True,
        remove_columns=dataset.column_names
    )

    return tokenized_dataset

## Model Preparation

In [None]:
def prepare_model():
    """Load the base model in 4-bit, wrap it with LoRA adapters, and load its tokenizer.

    Reads the model id and LoRA hyperparameters from the global ``config``.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped quantized model and
        the tokenizer loaded from the same model id.
    """
    # 4-bit NF4 quantization with double quantization; computation in float16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        config.model_id,
        quantization_config=bnb_config,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(config.model_id)

    # Training data is padded to a fixed length, which requires a pad token.
    # Fall back to the EOS token when the tokenizer does not define one.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token

    # Prepare for k-bit training
    model = prepare_model_for_kbit_training(model)

    # LoRA configuration (no target_modules given, so PEFT's defaults apply)
    lora_config = LoraConfig(
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Get PEFT model
    model = get_peft_model(model, lora_config)

    return model, tokenizer

## Training Setup

In [None]:
def setup_training(tokenized_dataset):
    """Build the ``TrainingArguments`` for the run from the global ``config``.

    Args:
        tokenized_dataset: Accepted for the caller's convenience; this
            function does not read it.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    # Settings whose TrainingArguments keyword matches the config attribute name.
    config_fields = (
        "output_dir",
        "num_train_epochs",
        "per_device_train_batch_size",
        "gradient_accumulation_steps",
        "learning_rate",
        "max_grad_norm",
        "max_steps",
        "warmup_ratio",
        "lr_scheduler_type",
    )
    from_config = {field: getattr(config, field) for field in config_fields}

    # Settings fixed in this notebook: mixed precision, logging, and a
    # checkpoint + evaluation pass every 100 steps, reported to wandb.
    fixed = {
        "fp16": True,
        "logging_steps": 10,
        "save_strategy": "steps",
        "save_steps": 100,
        "evaluation_strategy": "steps",
        "eval_steps": 100,
        "report_to": "wandb",
    }

    return TrainingArguments(**from_config, **fixed)

## Training Pipeline

In [None]:
def train():
    """Run the full fine-tuning pipeline and save the result to ``config.output_dir``.

    Steps: start a wandb run, load ``summarized_documents.csv``, split it into
    train/eval sets, load the model and tokenizer, tokenize both splits, train,
    then save the model and the tokenizer. The wandb run is always closed, and
    is marked as failed if any step raises.
    """
    # Initialize wandb
    wandb.init(project="qwen-darija-summarizer")

    try:
        # Load and prepare dataset
        dataset = load_dataset("summarized_documents.csv")

        # Train/test split
        dataset = dataset.train_test_split(test_size=config.train_test_split)

        # Prepare model and tokenizer
        model, tokenizer = prepare_model()

        # Prepare datasets
        train_dataset = prepare_training_data(dataset['train'], tokenizer)
        eval_dataset = prepare_training_data(dataset['test'], tokenizer)

        # Setup training arguments
        training_args = setup_training(train_dataset)

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset
        )

        # Train
        trainer.train()

        # Save model
        trainer.save_model()

        # generate_summary() loads the tokenizer from the same directory, so
        # it has to be saved next to the model.
        tokenizer.save_pretrained(config.output_dir)
    except BaseException:
        # Close the run as failed instead of leaving it open, then re-raise.
        wandb.finish(exit_code=1)
        raise

    wandb.finish()

## Run Training

In [None]:
# Entry point: run the fine-tuning pipeline when this code executes as the main module.
if __name__ == "__main__":
    train()

## Inference Example

In [None]:
def generate_summary(text: str, model_path: str) -> str:
    """Generate a Moroccan Darija summary of ``text`` with a fine-tuned model.

    Args:
        text: The document to summarize.
        model_path: Directory (or model id) from which both the model and the
            tokenizer are loaded.

    Returns:
        The generated summary only; the prompt is not included.
    """
    # Load fine-tuned model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Must mirror the prompt used in prepare_training_data, including the
    # trailing "Summary:" cue the model was trained to continue from.
    prompt = f"Summarize the following text in Moroccan Darija:\n{text}\n\nSummary:\n"

    # NOTE(review): truncation cuts from the end by default, so a very long
    # text could lose the "Summary:" cue — consider truncating `text` first.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=config.max_length)
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,  # temperature only takes effect when sampling
        temperature=0.7,
        num_return_sequences=1
    )

    # The output sequence starts with the prompt tokens; decode only what follows.
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

# Example usage: summarize a placeholder text with the model stored in config.output_dir
test_text = "Your test text here..."
summary = generate_summary(test_text, config.output_dir)
print(summary)