# 🎯 Telugu Poem Generator - Training Notebook

**CNN-Based Telugu Poem Analysis Inspired by Human Rote Learning**

This notebook trains the Telugu poem generation model using Google Colab's GPU.

## Step 1: Setup Environment

First, mount Google Drive and install dependencies.

In [None]:
# Mount Google Drive so files stored there are reachable from this runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Clone or upload your project
# Option 1: Clone from GitHub (if you have a repo)
# !git clone https://github.com/your-username/telugu-poem-generator.git

# Option 2: Upload project folder to Drive and copy
!cp -r '/content/drive/MyDrive/majorproject - A' /content/project
%cd /content/project

In [None]:
# Install dependencies
!pip install torch torchvision torchaudio
!pip install transformers
!pip install tqdm
!pip install pyyaml

In [None]:
# Report the PyTorch build and, when CUDA is usable, the first GPU's name and memory
import torch

print(f"PyTorch version: {torch.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

if cuda_ok:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

## Step 2: Load Dataset

Load the Telugu poem dataset (470 poems).

In [None]:
import json
from pathlib import Path

# Load Telugu poems
data_path = Path('data/processed/telugu_poems.json')

if not data_path.exists():
    # Generate dataset if not exists
    %cd scripts
    !python create_large_dataset.py
    %cd ..

with open(data_path, 'r', encoding='utf-8') as f:
    poems = json.load(f)

print(f"‚úÖ Loaded {len(poems)} Telugu poems")
print(f"\nüìù Sample poem:")
print(poems[0]['text'][:200])

## Step 3: Initialize Model

Create the Telugu poem generator model.

In [None]:
import sys

PROJECT_ROOT = '/content/project'
sys.path.insert(0, PROJECT_ROOT)

from src.models.telugu_backbone import create_telugu_generator

# Build the generator.
# Backbone options: 'distilmbert' (small), 'mbert' (medium), 'xlm-roberta' (large)
model = create_telugu_generator('distilmbert', freeze_backbone=False)

# Place the model on the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Tally parameters in one pass: (element count, requires_grad) per tensor
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total = sum(count for count, _ in param_info)
trainable = sum(count for count, needs_grad in param_info if needs_grad)

print(f"\nüìä Model Statistics:")
print(f"   Total parameters: {total:,}")
print(f"   Trainable: {trainable:,} ({100*trainable/total:.1f}%)")

## Step 4: Prepare DataLoader

In [None]:
from torch.utils.data import Dataset, DataLoader
from src.preprocessing.telugu_cleaner import TeluguTextCleaner

class TeluguPoemDataset(Dataset):
    """Map-style dataset that turns poems into fixed-length training tensors.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    ``labels`` is a copy of ``input_ids`` in which padded positions are set to
    -100 (the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``;
    presumably how the model's loss skips padding -- confirm against the model).
    """

    def __init__(self, poems, tokenizer, max_length=128):
        """Store the poems and the tools used to encode them.

        Args:
            poems: Sequence of poems. Each entry is either a dict holding the
                poem under the ``'text'`` key or the poem text itself.
            tokenizer: Object exposing ``encode_plus``; the mapping it returns
                must contain ``input_ids`` and ``attention_mask``.
            max_length: Length every sample is padded or truncated to.
        """
        self.poems = poems
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.cleaner = TeluguTextCleaner()

    def __len__(self):
        """Return the number of poems."""
        return len(self.poems)

    def __getitem__(self, idx):
        """Clean and tokenize poem ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        poem = self.poems[idx]
        # A dict entry may lack 'text' or carry None; fall back to an empty string.
        text = (poem.get('text') or '') if isinstance(poem, dict) else poem
        text = self.cleaner.clean(text)

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1 and yield a 0-dim tensor.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Mask padding through the attention mask rather than by comparing with
        # pad_token_id: that id may be None, or may be shared with a real token,
        # in which case genuine tokens would be masked or padding left unmasked.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Wrap the poems in a dataset and batch them for training
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 8

dataset = TeluguPoemDataset(poems, model.tokenizer, max_length=MAX_SEQ_LENGTH)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

num_batches = len(dataloader)
print(f"‚úÖ DataLoader ready: {num_batches} batches")

## Step 5: Training Loop

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm

# Training config
CONFIG = {
    'epochs': 10,
    'learning_rate': 3e-5,
    'warmup_steps': 100,  # NOTE(review): not consumed by the cosine schedule below
    'save_every': 2  # epochs
}

CHECKPOINT_DIR = '/content/drive/MyDrive/checkpoints'
# Create the directory once (including missing parents) instead of on every save.
Path(CHECKPOINT_DIR).mkdir(parents=True, exist_ok=True)


def save_checkpoint(save_path, epoch, loss):
    """Write model and optimizer state plus the epoch index and its average loss."""
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'loss': loss
    }, save_path)
    print(f"üíæ Saved: {save_path}")


# Optimizer; the scheduler is stepped once per batch, so it spans all steps
optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'])
total_steps = len(dataloader) * CONFIG['epochs']
scheduler = CosineAnnealingLR(optimizer, T_max=total_steps)

# Training
print("üöÄ Starting Telugu Poem Training...")
print(f"   Epochs: {CONFIG['epochs']}")
print(f"   Batches per epoch: {len(dataloader)}")
print(f"   Total steps: {total_steps}")

model.train()
best_loss = float('inf')

for epoch in range(CONFIG['epochs']):
    epoch_loss = 0.0
    steps_done = 0  # batches that actually produced a loss
    progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{CONFIG['epochs']}")

    for batch in progress:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs['loss']
        if loss is None:
            continue

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Read the scalar once; it is needed for both the sum and the progress bar.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        steps_done += 1
        progress.set_postfix({'loss': f'{batch_loss:.4f}'})

    if steps_done == 0:
        # Nothing was trained this epoch: there is no meaningful average and
        # nothing new worth checkpointing.
        print(f"\nEpoch {epoch+1} | no batch produced a loss; skipping checkpoint")
        continue

    # Average over the batches that contributed, not over len(dataloader),
    # so skipped batches do not bias the value.
    avg_loss = epoch_loss / steps_done
    print(f"\nüìä Epoch {epoch+1} | Loss: {avg_loss:.4f}")

    # Best-model and periodic checkpoints are independent: an improving epoch
    # must not suppress the periodic snapshot (previously it did, so periodic
    # files were never written while the loss kept decreasing).
    if avg_loss < best_loss:
        best_loss = avg_loss
        save_checkpoint(f'{CHECKPOINT_DIR}/best_telugu_model.pt', epoch, avg_loss)

    if (epoch + 1) % CONFIG['save_every'] == 0:
        save_checkpoint(f'{CHECKPOINT_DIR}/telugu_epoch_{epoch+1}.pt', epoch, avg_loss)

print("\n‚úÖ Training Complete!")

## Step 6: Test Generation

In [None]:
# Test Telugu poem generation
model.eval()

# Prompts are real Telugu text. The previous literals were mis-decoded bytes
# (UTF-8 read as Mac Roman), which fed the model garbage instead of Telugu.
test_prompts = [
    "చందమామ రావే",
    "తెలుగు భాష",
    "అమ్మ ప్రేమ",
    "నా దేశం"
]

print("üìù Telugu Poem Generation Test")
print("=" * 50)

# Generation is inference only, so skip autograd bookkeeping.
with torch.no_grad():
    for prompt in test_prompts:
        print(f"\nüîπ Prompt: {prompt}")
        generated = model.generate(prompt, max_length=50, temperature=0.8)
        print(f"   Generated: {generated}")

## Step 7: Save Final Model

In [None]:
# Save final model to Drive
final_path = '/content/drive/MyDrive/checkpoints/telugu_final_model.pt'

# torch.save does not create directories; make sure the target exists even if
# no checkpoint was written to it earlier.
Path(final_path).parent.mkdir(parents=True, exist_ok=True)

torch.save({
    'model_state_dict': model.state_dict(),
    'config': CONFIG,
    # NOTE(review): this stores the best epoch-average loss seen, which is not
    # necessarily the loss of the weights saved here (those are the current ones).
    'final_loss': best_loss
}, final_path)

print(f"‚úÖ Final model saved to: {final_path}")
print(f"\nüìã To use this model locally:")
print("   1. Download from Google Drive")
print("   2. Place in project checkpoints/ folder")
# map_location lets weights saved from a GPU runtime load on a CPU-only machine.
print("   3. Load with: torch.load('checkpoints/telugu_final_model.pt', map_location='cpu')")