## 1️⃣ Setup Environment

In [None]:
# Report the installed PyTorch build and whether a CUDA device is visible.
import torch

has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if not has_cuda:
    print("‚ö†Ô∏è No GPU detected! Go to Runtime > Change runtime type > GPU")
else:
    # Device index 0 is the only GPU queried here.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {gpu_name}")
    print(f"Memory: {gpu_total_bytes / 1e9:.1f} GB")

In [None]:
# Mount Google Drive and make sure the checkpoint folder exists inside it.
import os

from google.colab import drive

drive.mount('/content/drive')

# Every later cell that writes a checkpoint, result file or plot uses this path.
DRIVE_CHECKPOINT_DIR = '/content/drive/MyDrive/telugu_poem_checkpoints'
os.makedirs(DRIVE_CHECKPOINT_DIR, exist_ok=True)  # no error if it already exists
print(f"‚úì Checkpoints will be saved to: {DRIVE_CHECKPOINT_DIR}")

In [None]:
# Clone the project repository into /content and make it the working directory.
# The relative paths used by later cells (data/processed/..., requirements.txt)
# are resolved against this directory.
# ⚠️ IMPORTANT: Replace with your actual GitHub repository URL if you are
# working from a fork; the value below is the repository this notebook targets.
GITHUB_REPO = "https://github.com/maneendra03/CNN-Based-Telugu-Poem-Analysis-inspired-by-human-rote-learning.git"  # <-- change this when using a fork

import os

# Remove any previous clone first so that re-running this cell starts from a
# clean directory (`git clone` refuses to write into a non-empty directory).
!rm -rf /content/telugu-poem-generator

# Clone repository. `{GITHUB_REPO}` is IPython's interpolation of the Python
# variable into the shell command.
!git clone {GITHUB_REPO} /content/telugu-poem-generator

# Change to project directory. This raises FileNotFoundError if the clone
# above failed, which stops the notebook before later cells run.
os.chdir('/content/telugu-poem-generator')
print(f"\n‚úì Working directory: {os.getcwd()}")
# List the clone's top-level contents as a quick visual sanity check.
!ls -la

In [None]:
# Install dependencies
!pip install -q transformers datasets torch tqdm pyyaml

# Install any additional requirements
!pip install -q -r requirements.txt 2>/dev/null || echo "No requirements.txt or already satisfied"

print("\n‚úì Dependencies installed")

In [None]:
# Check that the source modules and data files used by later cells exist,
# relative to the current working directory.
import os
from pathlib import Path

required_files = [
    'src/models/enhanced_generator.py',
    'src/training/enhanced_trainer.py',
    'src/preprocessing/advanced_preprocessor.py',
    'src/interpretation/poem_interpreter.py',
    'data/processed/telugu_train.json',
    'data/processed/telugu_val.json',
]

print("üìÅ Checking project structure...")

# Pair each path with its presence flag, then print one status line per path.
presence = [(rel_path, os.path.exists(rel_path)) for rel_path in required_files]
for rel_path, found in presence:
    marker = '‚úì' if found else '‚úó'
    print(f"  {marker} {rel_path}")

all_present = all(found for _, found in presence)

if not all_present:
    print("\n‚ùå Some files missing! Check your repository.")
else:
    print("\n‚úÖ All required files present!")

## 2️⃣ Load and Verify Data

In [None]:
import json

# Split name -> JSON file expected to hold that split (relative to the repo root).
data_files = {
    'train': 'data/processed/telugu_train.json',
    'val': 'data/processed/telugu_val.json',
    'test': 'data/processed/telugu_test.json'
}

rule = "="*50
print("üìä Dataset Statistics:")
print(rule)

total_poems = 0
for split, path in data_files.items():
    # A missing split is reported and contributes nothing to the total.
    if not os.path.exists(path):
        print(f"  {split:>6}: NOT FOUND")
        continue

    with open(path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Count the 'poems' entry when the payload is a dict that has one;
    # otherwise count the payload itself.
    has_poems_key = isinstance(payload, dict) and 'poems' in payload
    count = len(payload['poems'] if has_poems_key else payload)

    total_poems += count
    print(f"  {split:>6}: {count:,} poems")

print(rule)
print(f"  {'Total':>6}: {total_poems:,} poems")

In [None]:
# Print a preview of the first three training poems.
with open('data/processed/telugu_train.json', 'r', encoding='utf-8') as handle:
    train_data = json.load(handle)

# Use the 'poems' entry when the file is a dict that has one; otherwise use
# the loaded object directly.
poems = (
    train_data['poems']
    if isinstance(train_data, dict) and 'poems' in train_data
    else train_data
)

print("üìú Sample Poems:")
print("="*60)
for number, poem in enumerate(poems[:3], start=1):
    # Dict entries: prefer 'text', then 'content', then the dict's repr.
    text = (
        poem.get('text', poem.get('content', str(poem)))
        if isinstance(poem, dict)
        else str(poem)
    )
    # Show at most 200 characters and mark truncation with an ellipsis.
    ellipsis = '...' if len(text) > 200 else ''
    print(f"\n[{number}] {text[:200]}{ellipsis}")
print("\n" + "="*60)

## 3️⃣ Initialize Model

In [None]:
import sys
import warnings

# Put the cloned repository first on sys.path so `src.*` imports resolve to it.
sys.path.insert(0, '/content/telugu-poem-generator')

# Silence all Python warnings and turn off tokenizer parallelism via its
# environment variable.
warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

from src.models.enhanced_generator import (
    TeluguPoemGeneratorV3,
    GenerationConfig,
    create_enhanced_generator
)

print("üöÄ Creating Telugu Poem Generator V3...")
print("="*60)

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the generator with the 'mbert' encoder, frozen, and place it on `device`.
# Other model_type values listed by the notebook author: 'indic-bert', 'xlm-roberta'.
model = create_enhanced_generator(
    model_type='mbert',
    freeze_encoder=True
).to(device)

# Parameter counts as reported by the model itself.
total_params, trainable_params = model.count_parameters()
frozen_params = total_params - trainable_params
print(f"\nüìä Model Statistics:")
print(f"   Device: {device}")
print(f"   Total Parameters: {total_params:,}")
print(f"   Trainable Parameters: {trainable_params:,}")
print(f"   Frozen Parameters: {frozen_params:,}")

In [None]:
# Test generation before training (untrained model) to get a baseline for
# comparison with the post-training samples.
print("üé≠ Testing Generation (Before Training):")
print("="*60)

model.eval()

# Kept under its own name: the global `config` is reserved for the
# TrainingConfig created later, which the dataset and save cells read.
# Reusing `config` here would clobber it whenever this cell is re-run.
pretrain_gen_config = GenerationConfig(
    max_length=80,
    min_length=20,
    temperature=0.85,
    repetition_penalty=1.8
)

test_prompts = ["‡∞§‡±Ü‡∞≤‡±Å‡∞ó‡±Å ‡∞≠‡∞æ‡∞∑", "‡∞Ö‡∞Æ‡±ç‡∞Æ ‡∞™‡±ç‡∞∞‡±á‡∞Æ"]

for prompt in test_prompts:
    output = model.generate(prompt, pretrain_gen_config)
    print(f"\nPrompt: {prompt}")
    # Append the ellipsis only when the preview was actually truncated,
    # matching the sample-poem preview earlier in the notebook.
    print(f"Output: {output[:150]}{'...' if len(output) > 150 else ''}")

## 4️⃣ Training Configuration

In [None]:
from src.training.enhanced_trainer import (
    EnhancedTrainer,
    TrainingConfig,
    TeluguPoemDataset
)

# Headline hyperparameters; tune these to the GPU memory you have.
EPOCHS = 100  # Full training
BATCH_SIZE = 16  # Reduce if OOM error
LEARNING_RATE = 5e-4
MAX_LENGTH = 128  # Sequence length

# All TrainingConfig arguments gathered in one place, grouped by concern.
training_kwargs = dict(
    # Model
    model_name='bert-base-multilingual-cased',
    freeze_encoder=True,

    # Training
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,  # Effective batch = 32
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,
    max_grad_norm=1.0,

    # Scheduler
    scheduler_type='cosine',

    # Mixed precision is enabled only when a CUDA device is available
    use_amp=torch.cuda.is_available(),

    # Regularization
    label_smoothing=0.1,
    dropout=0.2,

    # Loss weights
    coverage_weight=0.1,
    repetition_loss_weight=0.2,

    # Validation
    val_every_n_steps=500,
    patience=15,
    min_delta=1e-4,

    # Checkpoints - save to Google Drive
    checkpoint_dir=DRIVE_CHECKPOINT_DIR,
    save_every_n_steps=1000,
    max_checkpoints=5,

    # Data
    max_length=MAX_LENGTH,
    num_workers=2,
)
config = TrainingConfig(**training_kwargs)

# Echo every attribute stored on the config object for the run log.
print("‚öôÔ∏è Training Configuration:")
print("="*60)
for field_name, field_value in vars(config).items():
    print(f"  {field_name}: {field_value}")

In [None]:
# Build the train and validation datasets from the processed JSON files.
print("üìö Loading Datasets...")

# Both splits use the model's tokenizer and the configured sequence length.
shared_dataset_kwargs = {'max_length': config.max_length}

train_dataset = TeluguPoemDataset(
    'data/processed/telugu_train.json',
    model.tokenizer,
    **shared_dataset_kwargs
)
val_dataset = TeluguPoemDataset(
    'data/processed/telugu_val.json',
    model.tokenizer,
    **shared_dataset_kwargs
)

train_count = len(train_dataset)
val_count = len(val_dataset)
print(f"\n‚úì Train samples: {train_count:,}")
print(f"‚úì Validation samples: {val_count:,}")

In [None]:
# Wire the model, configuration, datasets and device into the trainer.
print("üèãÔ∏è Creating Trainer...")

trainer_kwargs = {
    'model': model,
    'config': config,
    'train_dataset': train_dataset,
    'val_dataset': val_dataset,
    'device': device,
}
trainer = EnhancedTrainer(**trainer_kwargs)

print("‚úì Trainer ready!")

## 5️⃣ Run Training (100 Epochs)

In [None]:
# Start training!
# Runs trainer.train(), then prints the elapsed time and loss summary.
import time
import traceback

print("üöÄ Starting Training...")
print("="*60)
print(f"Training for {EPOCHS} epochs")
print(f"Checkpoints will be saved to: {DRIVE_CHECKPOINT_DIR}")
print("="*60)

# Bind `results` before training so the later save/plot cells still run
# (with empty data) when training is interrupted or fails, instead of
# raising NameError.
results = {}
start_time = time.time()

try:
    # Only the training call is guarded; a falsy return is normalised to {}.
    results = trainer.train() or {}
except KeyboardInterrupt:
    print("\n‚ö†Ô∏è Training interrupted! Checkpoints are saved.")
except Exception as e:
    # Notebook top-level boundary: report the failure with its traceback.
    print(f"\n‚ùå Training error: {e}")
    traceback.print_exc()
else:
    # Success path kept outside the `try` so a problem while printing the
    # summary is not misreported as a training error.
    training_time = time.time() - start_time
    hours = int(training_time // 3600)
    minutes = int((training_time % 3600) // 60)

    print("\n" + "="*60)
    print("‚úÖ Training Complete!")
    print("="*60)
    print(f"Total Time: {hours}h {minutes}m")
    print(f"Best Val Loss: {results.get('best_val_loss', 'N/A')}")

    # Guard against a missing or empty loss history before indexing [-1].
    train_losses = results.get('train_losses') or []
    if train_losses:
        print(f"Final Train Loss: {train_losses[-1]:.4f}")
    else:
        print("Final Train Loss: N/A")

## 6️⃣ Evaluate Trained Model

In [None]:
# Restore the weights from best_model.pt when that file exists in the
# checkpoint directory; otherwise keep the in-memory model as it is.
import glob

best_model_path = os.path.join(DRIVE_CHECKPOINT_DIR, 'best_model.pt')

if not os.path.exists(best_model_path):
    print("‚ö†Ô∏è Best model not found, using current model")
else:
    print(f"üì• Loading best model from: {best_model_path}")
    # map_location remaps the stored tensors onto the active device.
    checkpoint = torch.load(best_model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    # 'epoch' and 'loss' are optional metadata; show N/A when absent.
    saved_epoch = checkpoint.get('epoch', 'N/A')
    saved_loss = checkpoint.get('loss', 'N/A')
    print(f"   Epoch: {saved_epoch}")
    print(f"   Loss: {saved_loss}")
    print("‚úì Best model loaded!")

In [None]:
# Generate a poem for each of a fixed set of prompts with the current model.
print("üé≠ Testing Generation (After Training):")
print("="*60)

model.eval()

# Generation settings; the style-conditioned and interpretation cells reuse
# this same `gen_config` object.
gen_config = GenerationConfig(
    max_length=120,
    min_length=30,
    temperature=0.85,
    top_k=50,
    top_p=0.92,
    repetition_penalty=1.8,
    no_repeat_ngram_size=4,
    diversity_penalty=0.5
)

test_prompts = [
    "‡∞§‡±Ü‡∞≤‡±Å‡∞ó‡±Å ‡∞≠‡∞æ‡∞∑",
    "‡∞Ö‡∞Æ‡±ç‡∞Æ ‡∞™‡±ç‡∞∞‡±á‡∞Æ",
    "‡∞ß‡∞∞‡±ç‡∞Æ‡∞Ç ‡∞Æ‡∞æ‡∞∞‡±ç‡∞ó‡∞Ç",
    "‡∞µ‡∞ø‡∞¶‡±ç‡∞Ø ‡∞®‡±á‡∞∞‡±ç‡∞ö‡±Å‡∞ï‡±ã",
    "‡∞∏‡±ç‡∞®‡±á‡∞π‡∞Ç ‡∞Æ‡∞Ç‡∞ö‡∞ø‡∞¶‡∞ø"
]

# One divider before the first sample and one after each sample.
divider = "-"*60
print("\n" + divider)
for prompt in test_prompts:
    output = model.generate(prompt, gen_config)
    print(f"\nüìù Prompt: {prompt}")
    print(f"üìú Generated:\n{output}")
    print(divider)

In [None]:
# Generate from one prompt under each named style, reusing `gen_config`.
print("üé® Style-Conditioned Generation:")
print("="*60)

styles = ['vemana', 'sumati']
prompt = "‡∞®‡±Ä‡∞§‡∞ø ‡∞¨‡±ã‡∞ß‡∞®"

short_rule = "-"*40
for style in styles:
    output = model.generate_with_style(prompt, style=style, config=gen_config)
    # One print per sample: style label, generated text, closing rule.
    print(f"\nüè∑Ô∏è Style: {style}\nüìú Output:\n{output}\n{short_rule}")

In [None]:
# Generate one poem and run it through the project's interpreter.
from src.interpretation.poem_interpreter import TeluguPoemInterpreter

interpreter = TeluguPoemInterpreter()

print("üìä Interpretation Analysis:")
print("="*60)

# Generate and analyze
prompt = "‡∞ß‡∞∞‡±ç‡∞Æ‡∞Ç ‡∞∏‡∞§‡±ç‡∞Ø‡∞Ç"
generated = model.generate(prompt, gen_config)

print(f"\nüìù Generated Poem:\n{generated}")
print("\n" + "-"*40)

interpretation = interpreter.interpret(generated)

# Each field is read immediately before it is printed, so a missing key
# fails at the same point in the output as it would with inline lookups.
dominant_rasa = interpretation['rasa']['dominant']
print(f"\nüé≠ Rasa (Emotion): {dominant_rasa}")

# First element of each of the first three primary-theme entries.
top_themes = [entry[0] for entry in interpretation['themes']['primary'][:3]]
print(f"üìö Themes: {top_themes}")

satakam_style = interpretation.get('satakam', 'Not detected')
print(f"üìñ ≈öatakam Style: {satakam_style}")

overall_quality = interpretation['quality']['overall']
print(f"‚≠ê Quality Score: {overall_quality:.2f}")

## 7️⃣ Save Final Model

In [None]:
# Save final model to Drive, then list every .pt file in the checkpoint folder.
# `glob` is imported here so this cell does not depend on the optional
# "load best model" cell having been run first.
import glob

final_model_path = os.path.join(DRIVE_CHECKPOINT_DIR, 'final_model.pt')

# Bundle the weights with the training configuration and the two model
# dimensions stored on the model object.
torch.save({
    'model_state_dict': model.state_dict(),
    'config': vars(config),
    'vocab_size': model.vocab_size,
    'hidden_dim': model.hidden_dim,
}, final_model_path)

print(f"üíæ Final model saved to: {final_model_path}")

# List all checkpoints, sorted by file name, with sizes in megabytes
print("\nüìÅ Saved Checkpoints:")
for checkpoint_file in sorted(glob.glob(os.path.join(DRIVE_CHECKPOINT_DIR, '*.pt'))):
    size = os.path.getsize(checkpoint_file) / 1e6
    print(f"   {os.path.basename(checkpoint_file)}: {size:.1f} MB")

In [None]:
# Write the headline hyperparameters and the loss history to a JSON file
# next to the checkpoints.
import json

results_path = os.path.join(DRIVE_CHECKPOINT_DIR, 'training_results.json')

# Hyperparameters first, then the metrics pulled from `results`; each metric
# falls back to an empty value when the key is absent.
training_summary = {
    'epochs': EPOCHS,
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
}
metric_defaults = (
    ('train_losses', []),
    ('val_losses', []),
    ('best_val_loss', None),
)
for metric_key, fallback in metric_defaults:
    training_summary[metric_key] = results.get(metric_key, fallback)

with open(results_path, 'w') as out_file:
    json.dump(training_summary, out_file, indent=2)

print(f"üìä Training results saved to: {results_path}")

In [None]:
# Plot the recorded loss histories and save the figure beside the checkpoints.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 5))

# Draw a curve only for histories that are present and non-empty.
curve_specs = (
    ('train_losses', 'Train Loss'),
    ('val_losses', 'Val Loss'),
)
for history_key, curve_label in curve_specs:
    if history_key in results and results[history_key]:
        ax.plot(results[history_key], label=curve_label, alpha=0.8)

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Telugu Poem Generator - Training Progress')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
curve_path = os.path.join(DRIVE_CHECKPOINT_DIR, 'training_curve.png')
plt.savefig(curve_path, dpi=150)  # saved before show() is called
plt.show()

print(f"üìà Training curve saved!")

## 8️⃣ Download Trained Model

In [None]:
# Export the trained weights to a local file and hand it to the browser.
from google.colab import files

print("üì• Preparing model for download...")

# Export bundle: weights plus the two model dimensions, without the training
# config that final_model.pt also carries.
export_path = '/content/telugu_poem_model_trained.pt'
export_payload = {
    'model_state_dict': model.state_dict(),
    'vocab_size': model.vocab_size,
    'hidden_dim': model.hidden_dim,
}
torch.save(export_payload, export_path)

export_size_mb = os.path.getsize(export_path) / 1e6
print(f"Model size: {export_size_mb:.1f} MB")
print("\n‚¨áÔ∏è Click the download link below:")

files.download(export_path)

---

## ✅ Training Complete!

Your trained model has been saved to:
- **Google Drive**: `/content/drive/MyDrive/telugu_poem_checkpoints/`
- **Best Model**: `best_model.pt`
- **Final Model**: `final_model.pt`

### Next Steps:
1. Download the trained model to your local machine
2. Copy checkpoints from Google Drive to your project
3. Update your project to load the trained weights

### To use the trained model locally:
```python
from src.models.enhanced_generator import create_enhanced_generator
import torch

model = create_enhanced_generator('mbert')
checkpoint = torch.load('path/to/best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Generate poems
output = model.generate("తెలుగు భాష")
print(output)
```