# GL RL Model - Complete SageMaker Setup

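This notebook installs every dependency needed on a SageMaker notebook instance: `sentencepiece` and `pyarrow` come from conda-forge as prebuilt binaries, and the remaining packages are installed with pip. It then verifies the installation and prepares a small sample dataset.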

## 1. Install problematic packages using conda-forge

In [None]:
# Install sentencepiece and pyarrow from the conda-forge channel (prebuilt binaries,
# so nothing has to be compiled on the instance).
# `-c conda-forge` selects the channel; `-y` auto-confirms the install so the cell
# runs without an interactive prompt.
%conda install -c conda-forge sentencepiece pyarrow -y

## 2. Install PyTorch and Transformers

In [None]:
# Install PyTorch (CPU version for ml.t2.medium).
# `--index-url` points pip at PyTorch's CPU wheel index for this command, so the
# CPU builds of torch, torchvision and torchaudio are fetched from there.
# NOTE(review): versions are unpinned, so each run resolves to the latest release.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
# Install the Transformers ecosystem in a single pip call:
# transformers, tokenizers, huggingface-hub, accelerate and datasets.
# NOTE(review): versions are unpinned, so each run resolves to the latest
# releases - consider pinning them for a reproducible environment.
%pip install transformers tokenizers huggingface-hub accelerate datasets

In [None]:
# Install the fine-tuning libraries (peft and trl); both are imported by the
# verification cell further down.
# NOTE(review): versions are unpinned - consider pinning for reproducibility.
%pip install peft trl

In [None]:
# Install the remaining utility packages with pip.
# NOTE(review): numpy and pandas are installed unpinned after the packages above;
# confirm the resolved versions stay compatible with the conda-forge pyarrow build.
%pip install numpy pandas protobuf tqdm fsspec aiohttp

## 3. Verify Installation

In [None]:
# Smoke-test the environment: report the interpreter version, import every
# installed package, then print each package's version and the compute device.
import sys
print(f"Python: {sys.version}\n")

# The imports stay in this exact order and at module level because later cells
# rely on these names (e.g. `torch`) being available in the kernel namespace.
import torch
import transformers
import datasets
import peft
import trl
import pyarrow
import sentencepiece
import pandas as pd
import numpy as np

print("✅ All packages imported successfully!\n")

# (display label, imported module) pairs, in the order they are reported.
version_report = (
    ("PyTorch", torch),
    ("Transformers", transformers),
    ("Datasets", datasets),
    ("PEFT", peft),
    ("TRL", trl),
    ("PyArrow", pyarrow),
    ("Sentencepiece", sentencepiece),
    ("Pandas", pd),
    ("NumPy", np),
)
for label, module in version_report:
    print(f"{label}: {module.__version__}")

# Report whether a CUDA device is visible to PyTorch.
device_name = 'CUDA' if torch.cuda.is_available() else 'CPU'
print(f"\nDevice: {device_name}")

## 4. Test Qwen Tokenizer Loading

In [None]:
from transformers import AutoTokenizer

# Smoke-test that the Qwen tokenizer can be downloaded and that it round-trips
# a few sample queries through encode/decode.
# NOTE(review): Qwen2.5 checkpoints appear to ship a byte-level BPE tokenizer
# backed by `tokenizers` rather than sentencepiece - confirm before treating this
# cell as a sentencepiece check.
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
print(f"Loading tokenizer for {model_name}...\n")

test_queries = [
    "SELECT * FROM users WHERE age > 25;",
    "Show me all customers",
    "Calculate total revenue by month"
]

# `tokenizer` stays None when loading fails, so a stale tokenizer from an
# earlier run of this cell can never be mistaken for a fresh successful load.
tokenizer = None
try:
    # `trust_remote_code` is deliberately left at its default (False): this model
    # is expected to load with the tokenizer classes bundled in transformers, so
    # there is no need to allow code from the model repository to execute.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except OSError as e:
    # Download / missing-file problems are reported by from_pretrained as OSError.
    print(f"❌ Error loading tokenizer: {e}")
    print("Please check your internet connection and try again.")
except Exception as e:
    # Anything else (e.g. a missing optional dependency) is not a connectivity
    # problem, so report the exception type instead of blaming the network.
    print(f"❌ Error loading tokenizer: {type(e).__name__}: {e}")

if tokenizer is not None:
    print("✅ Qwen tokenizer loaded successfully!\n")
    print(f"Vocabulary size: {tokenizer.vocab_size}\n")

    # Encode each query and decode it again to confirm the round trip.
    for query in test_queries:
        tokens = tokenizer.encode(query)
        decoded = tokenizer.decode(tokens)
        print(f"Query: '{query}'")
        print(f"  Tokens: {len(tokens)}")
        print(f"  Decoded: '{decoded}'\n")

## 5. Clone Repository and Set Up Data

In [None]:
import os
import json
import subprocess

REPO_URL = "https://github.com/maddinenisri/gl_rl_model.git"
REPO_NAME = "gl_rl_model"


def run_git(*args):
    """Run ``git <args>``, echo its output, and report whether it succeeded.

    Args:
        *args: Arguments passed to the ``git`` executable (e.g. ``"pull"``).

    Returns:
        True when git exited with status 0, False when it exited non-zero or
        could not be started at all (e.g. git is not installed).
    """
    try:
        result = subprocess.run(["git", *args], capture_output=True, text=True)
    except OSError as err:
        print(f"Could not run git: {err}")
        return False
    # git writes progress and errors to stderr, so show both streams.
    output = (result.stdout + result.stderr).strip()
    if output:
        print(output)
    return result.returncode == 0


# Navigate to SageMaker directory
sagemaker_dir = "/home/ec2-user/SageMaker"
if os.path.exists(sagemaker_dir):
    os.chdir(sagemaker_dir)
    print(f"Changed to directory: {os.getcwd()}\n")

# When this cell is re-run outside SageMaker, the working directory is already
# the checkout; detect that so a second copy is not cloned inside the first.
inside_repo = os.path.basename(os.getcwd()) == REPO_NAME and os.path.isdir(".git")

# Clone or update repository
if inside_repo:
    print("Repository already exists")
else:
    if not os.path.exists(REPO_NAME):
        # Only report success when git actually succeeded; without a checkout
        # the rest of the notebook cannot run, so fail loudly here.
        if not run_git("clone", REPO_URL):
            raise RuntimeError(f"git clone of {REPO_URL} failed; see the output above.")
        print("Repository cloned successfully!")
    else:
        print("Repository already exists")
    os.chdir(REPO_NAME)

# Updating is best-effort: an existing checkout is still usable offline.
if not run_git("pull", "origin", "main"):
    print("Warning: git pull failed; continuing with the existing checkout.")

print(f"\nCurrent directory: {os.getcwd()}")

In [None]:
# Build a tiny natural-language-to-SQL sample set, persist it as JSON Lines,
# then read it back and preview the first few records.
os.makedirs("data/training", exist_ok=True)

# Each record pairs a question with its SQL answer and the relevant table schema.
sample_data = [
    {
        "query": "Show me all customers",
        "sql": "SELECT * FROM customers;",
        "context": "customers(id, name, email, created_at)",
    },
    {
        "query": "Get total sales by month",
        "sql": "SELECT DATE_FORMAT(date, '%Y-%m') as month, SUM(amount) as total FROM sales GROUP BY month;",
        "context": "sales(id, date, amount, product_id)",
    },
    {
        "query": "Find top 5 products by revenue",
        "sql": "SELECT p.name, SUM(s.amount) as revenue FROM products p JOIN sales s ON p.id = s.product_id GROUP BY p.id ORDER BY revenue DESC LIMIT 5;",
        "context": "products(id, name, price), sales(id, product_id, amount)",
    },
    {
        "query": "List users who registered today",
        "sql": "SELECT * FROM users WHERE DATE(created_at) = CURDATE();",
        "context": "users(id, name, email, created_at)",
    },
    {
        "query": "Calculate average order value",
        "sql": "SELECT AVG(total_amount) as avg_order_value FROM orders;",
        "context": "orders(id, customer_id, total_amount, order_date)",
    },
]

# One JSON object per line (JSON Lines format).
data_file = "data/training/query_pairs.jsonl"
with open(data_file, 'w') as sink:
    sink.writelines(json.dumps(record) + '\n' for record in sample_data)

print(f"✅ Created training data: {data_file}\n")

# Read the file back to confirm it parses line by line.
loaded_data = []
with open(data_file, 'r') as source:
    for raw_line in source:
        loaded_data.append(json.loads(raw_line))

print(f"Loaded {len(loaded_data)} training examples")
for position, record in enumerate(loaded_data[:3], start=1):
    # Long statements are cut to their first 50 characters for the preview.
    sql_text = record['sql']
    if len(sql_text) > 50:
        sql_preview = f"{sql_text[:50]}..."
    else:
        sql_preview = sql_text
    print(f"\nExample {position}:")
    print(f"  Query: {record['query']}")
    print(f"  SQL: {sql_preview}")

## 6. Test Data Loading with Datasets

In [None]:
from datasets import load_dataset

# Confirm the JSON Lines file written earlier can be read by the `datasets`
# library. Every step stays inside the `try` so that any failure - loading or
# inspecting the dataset - is reported as a message rather than a traceback.
try:
    dataset = load_dataset(
        'json',
        data_files='data/training/query_pairs.jsonl',
        split='train',
    )
    print("✅ Dataset loaded successfully!", "\nDataset info:", sep="\n")
    example_count = len(dataset)
    print(f"  Number of examples: {example_count}")
    print(f"  Features: {dataset.features}")
    print("\nFirst example:")
    print(f"  {dataset[0]}")
except Exception as load_error:
    print(f"Error loading dataset: {load_error}")

## 7. Environment Summary

In [None]:
# Imported here as well so the summary cell can run on its own in a fresh kernel
# instead of depending on names leaked from earlier cells.
import json
import os
import platform

import torch

SAGEMAKER_METADATA_PATH = '/opt/ml/metadata/resource-metadata.json'


def read_instance_type(metadata_path):
    """Read the ``InstanceType`` entry from a resource-metadata JSON file.

    Args:
        metadata_path: Path of the JSON file to read.

    Returns:
        The value stored under ``'InstanceType'``; ``'Unknown'`` when the file
        holds a JSON object without that key; ``None`` when the file cannot be
        opened, is not valid JSON, or does not contain a JSON object.
    """
    try:
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
    except (OSError, ValueError):
        # ValueError also covers json.JSONDecodeError (malformed content).
        return None
    if not isinstance(metadata, dict):
        return None
    return metadata.get('InstanceType', 'Unknown')


print("="*60)
print("ENVIRONMENT SUMMARY")
print("="*60)

# System info
print(f"\n📊 System Information:")
print(f"  Platform: {platform.platform()}")
print(f"  Python: {platform.python_version()}")
print(f"  Machine: {platform.machine()}")

# Check if in SageMaker (the metadata file's presence is used as the signal)
if os.path.exists(SAGEMAKER_METADATA_PATH):
    print(f"\n🚀 SageMaker: Yes")
    instance_type = read_instance_type(SAGEMAKER_METADATA_PATH)
    if instance_type is None:
        # Surface the problem instead of silently skipping the line.
        print("  Instance Type: Unknown (metadata file could not be read)")
    else:
        print(f"  Instance Type: {instance_type}")
else:
    print(f"\n💻 SageMaker: No (local environment)")

# GPU status
print(f"\n🎮 GPU Status:")
if torch.cuda.is_available():
    print(f"  CUDA Available: Yes")
    print(f"  GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print(f"  CUDA Available: No (CPU only)")
    print(f"  Note: Use SageMaker Training Jobs for GPU")

print("\n" + "="*60)
print("✅ SETUP COMPLETE - ALL DEPENDENCIES INSTALLED!")
print("="*60)

print("\n📚 Next Steps:")
print("1. Open GL_RL_Model_Quick_Start.ipynb for training")
print("2. Use this notebook for development")
print("3. Launch GPU training with SageMaker Training Jobs")
print("\n💡 Remember to stop the instance when not in use!")

## ✅ Success!

All dependencies including **sentencepiece** and **pyarrow** are now installed.

### Key Points:
- Used `conda-forge` for problematic packages (avoids compilation)
- Used `pip` for remaining packages
- Qwen tokenizer loads and round-trips the sample queries (`sentencepiece` is installed for tokenizers that depend on it)
- Ready for training on this CPU instance or GPU via Training Jobs

### To reinstall if needed:
```python
%conda install -c conda-forge sentencepiece pyarrow -y
%pip install transformers datasets peft trl accelerate
```