### Phase-4: Model Loading and Inference Testing

Load the fine-tuned model from shared storage and test it to verify training worked:


In [None]:
%pip install transformers>=4.36.0
%pip install peft>=0.6.0

In [20]:
# Model paths in shared storage
trained_model_path = "/opt/app-root/src/shared/models/granite-3.1-2b-instruct-synthetic2"
base_model_cache_path = "/opt/app-root/src/shared/huggingface_cache/hub/models--ibm-granite--granite-3.1-2b-instruct"
base_model_name = "ibm-granite/granite-3.1-2b-instruct"

In [21]:
# Load both original and fine-tuned models from shared storage
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import os
import glob

print("Loading models for comparison...")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Function to find the actual model path in HuggingFace cache
def find_model_snapshot_path(cache_path):
    """Find the actual model files in HuggingFace cache structure"""
    if not os.path.exists(cache_path):
        return None
    
    # Look for snapshots directory
    snapshots_dir = os.path.join(cache_path, "snapshots")
    if not os.path.exists(snapshots_dir):
        return None
    
    # Get the latest snapshot (usually there's only one)
    snapshot_dirs = [d for d in os.listdir(snapshots_dir) if os.path.isdir(os.path.join(snapshots_dir, d))]
    if not snapshot_dirs:
        return None
    
    # Use the first (or only) snapshot
    snapshot_path = os.path.join(snapshots_dir, snapshot_dirs[0])
    
    # Verify it contains model files
    if os.path.exists(os.path.join(snapshot_path, "config.json")):
        return snapshot_path
    
    return None

# Load tokenizer from local cache
print(f"Loading tokenizer...")
local_model_path = find_model_snapshot_path(base_model_cache_path)

if local_model_path:
    print(f"   Using cached model at: {local_model_path}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            local_model_path,
            trust_remote_code=True,
            local_files_only=True
        )
        print("Tokenizer loaded from local cache")
    except Exception as e:
        print(f"   Cache loading failed: {e}")
        print("   Falling back to HuggingFace download...")
        tokenizer = AutoTokenizer.from_pretrained(
            base_model_name,
            trust_remote_code=True
        )
        print("Tokenizer loaded from HuggingFace")
else:
    print("   Local cache not found, downloading from HuggingFace...")
    tokenizer = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=True
    )
    print("Tokenizer loaded from HuggingFace")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 1. Load original untrained model from local cache
print("\n1. Loading original untrained model...")

if local_model_path:
    print(f"   Loading from cached path: {local_model_path}")
    try:
        original_model = AutoModelForCausalLM.from_pretrained(
            local_model_path,
            dtype=torch.bfloat16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None,
            trust_remote_code=True,
            local_files_only=True
        )
        print("Original model loaded from local cache")
    except Exception as e:
        print(f"Cache loading failed: {e}")
        print("Falling back to HuggingFace download...")
        try:
            original_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                dtype=torch.bfloat16 if device == "cuda" else torch.float32,
                device_map="auto" if device == "cuda" else None,
                trust_remote_code=True
            )
            print("Original model loaded from HuggingFace")
        except Exception as e2:
            print(f"Failed to load original model: {e2}")
            original_model = None
else:
    print("Local cache not found, downloading from HuggingFace...")
    try:
        original_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            dtype=torch.bfloat16 if device == "cuda" else torch.float32,
            device_map="auto" if device == "cuda" else None,
            trust_remote_code=True
        )
        print("Original model loaded from HuggingFace")
    except Exception as e:
        print(f"Failed to load original model: {e}")
        original_model = None


Loading models for comparison...
Using device: cuda
Loading tokenizer...
   Using cached model at: /opt/app-root/src/shared/huggingface_cache/hub/models--ibm-granite--granite-3.1-2b-instruct/snapshots/bbc2aed595bd38bd770263dc3ab831db9794441d
Tokenizer loaded from local cache

1. Loading original untrained model...
   Loading from cached path: /opt/app-root/src/shared/huggingface_cache/hub/models--ibm-granite--granite-3.1-2b-instruct/snapshots/bbc2aed595bd38bd770263dc3ab831db9794441d


Original model loaded from local cache


In [22]:

# 2. Load fine-tuned model with LoRA adapter
print("\n2. Loading fine-tuned model...")
print(f"   Checking path: {trained_model_path}")

if os.path.exists(trained_model_path):
    print("   Fine-tuned model path exists")
    try:
        # Load base model for LoRA (use same logic as original model)
        if local_model_path:
            print("   Loading base model from cache for LoRA...")
            base_for_lora = AutoModelForCausalLM.from_pretrained(
                local_model_path,
                torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
                device_map="auto" if device == "cuda" else None,
                trust_remote_code=True,
                local_files_only=True
            )
        else:
            print("   Loading base model from HuggingFace for LoRA...")
            base_for_lora = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
                device_map="auto" if device == "cuda" else None,
                trust_remote_code=True
            )
        
        # Load LoRA adapter and merge
        trained_model = PeftModel.from_pretrained(base_for_lora, trained_model_path)
        trained_model = trained_model.merge_and_unload()  # Merge LoRA weights for inference
        
        print(f"Fine-tuned model loaded from: {trained_model_path}")
        print("LoRA weights merged successfully")
        
        # Clean up base model used for LoRA loading
        del base_for_lora
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            
    except Exception as e:
        print(f"Error loading fine-tuned model: {e}")
        trained_model = None
        
else:
    print(f"Fine-tuned model not found at: {trained_model_path}")
    print("Training may not have completed or path is incorrect")
    trained_model = None



2. Loading fine-tuned model...
   Checking path: /opt/app-root/src/shared/models/granite-3.1-2b-instruct-synthetic2
   Fine-tuned model path exists
   Loading base model from cache for LoRA...


Fine-tuned model loaded from: /opt/app-root/src/shared/models/granite-3.1-2b-instruct-synthetic2
LoRA weights merged successfully


In [23]:
print(f"\nModels loaded successfully!")
print(f"   Original model: {'Ready' if original_model else 'Not available'}")
print(f"   Fine-tuned model: {'Ready' if trained_model else 'Not available'}")
print(f"   Device: {device}")
print(f"   Cache path used: {local_model_path if local_model_path else 'HuggingFace download'}")


Models loaded successfully!
   Original model: Ready
   Fine-tuned model: Ready
   Device: cuda
   Cache path used: /opt/app-root/src/shared/huggingface_cache/hub/models--ibm-granite--granite-3.1-2b-instruct/snapshots/bbc2aed595bd38bd770263dc3ab831db9794441d


In [24]:

import json
import re
from typing import List, Dict, Any

def generate_response(question: str, model, tokenizer, max_length: int = 512) -> str:
    """Generate response using the model"""
    # Format as chat message
    messages = [{"role": "user", "content": question}]
    
    # Apply chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode response (remove input prompt)
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    return response.strip()

def load_test_data(file_path: str, num_samples: int = 5) -> List[Dict[str, Any]]:
    """Load test samples from synthetic dataset"""
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)
        
        # Take a subset for testing
        if len(data) > num_samples:
            # Take samples from different parts of the dataset
            step = len(data) // num_samples
            test_samples = [data[i] for i in range(0, len(data), step)][:num_samples]
        else:
            test_samples = data[:num_samples]
            
        return test_samples
    except Exception as e:
        print(f"Error loading test data: {e}")
        return []

def extract_final_number(text: str) -> str:
    """Extract the final numerical answer from text"""
    numbers = re.findall(r'\d+', text)
    return numbers[-1] if numbers else None

def evaluate_accuracy(expected: str, response: str) -> bool:
    """Simple accuracy check based on final number"""
    expected_num = extract_final_number(expected)
    response_num = extract_final_number(response)
    
    if expected_num and response_num:
        return expected_num == response_num
    return False

test_data_path = "/opt/app-root/src/shared/synthetic_data_v2/synthetic_dataset.json"
test_samples = load_test_data(test_data_path, num_samples=20)

if test_samples:
    print(f"Loaded {len(test_samples)} test samples")
    print(f"   Source: {test_data_path}")
else:
    print("No test data available")
    # Create fallback test samples
    test_samples = [
        {
            "question": "A bakery sold 45 cupcakes in the morning and 38 cupcakes in the afternoon. How many cupcakes did they sell in total?",
            "answer": "The bakery sold 45 + 38 = 83 cupcakes in total.",
            "source": "fallback"
        },
        {
            "question": "Sarah has 24 stickers. She gives 8 stickers to her friend and buys 15 more stickers. How many stickers does Sarah have now?",
            "answer": "Sarah had 24 stickers, gave away 8, so she had 24 - 8 = 16 stickers. Then she bought 15 more, so she has 16 + 15 = 31 stickers now.",
            "source": "fallback"
        }
    ]
    print(f"Using {len(test_samples)} fallback test samples")
print(test_samples)


Loaded 20 test samples
   Source: /opt/app-root/src/shared/synthetic_data_v2/synthetic_dataset.json


In [25]:

print("\n" + "="*80)
print("SIDE-BY-SIDE MODEL COMPARISON")
print("="*80)

if original_model and tokenizer:
    original_correct = 0
    trained_correct = 0
    
    for i, sample in enumerate(test_samples, 1):
        question = sample["question"]
        expected_answer = sample["answer"]
        
        print(f"\nTest Problem {i}/{len(test_samples)}:")
        print(f"Question: {question}")
        print(f"Expected: {expected_answer}")
        print("-" * 60)
        
        # Test original model
        print("ORIGINAL MODEL (Untrained):")
        try:
            original_response = generate_response(question, original_model, tokenizer)
            print(f"   Response: {original_response}")
            
            # Check accuracy
            is_correct = evaluate_accuracy(expected_answer, original_response)
            if is_correct:
                original_correct += 1
                print("   Correct")
            else:
                print("   Incorrect")
                
        except Exception as e:
            print(f"   Error: {e}")
        
        print()
        
        # Test fine-tuned model
        if trained_model:
            print("FINE-TUNED MODEL (After Training):")
            try:
                trained_response = generate_response(question, trained_model, tokenizer)
                print(f"   Response: {trained_response}")
                
                # Check accuracy
                is_correct = evaluate_accuracy(expected_answer, trained_response)
                if is_correct:
                    trained_correct += 1
                    print("   Correct")
                else:
                    print("   Incorrect")
                    
            except Exception as e:
                print(f"   Error: {e}")
        else:
            print("FINE-TUNED MODEL: Not available")
        
        print("=" * 80)
    
    print(f"\nPERFORMANCE COMPARISON SUMMARY")
    print("=" * 50)
    
    total_samples = len(test_samples)
    original_accuracy = (original_correct / total_samples) * 100
    
    print(f"Original Model Performance:")
    print(f"   Correct Answers: {original_correct}/{total_samples}")
    print(f"   Accuracy: {original_accuracy:.1f}%")
    
    if trained_model:
        trained_accuracy = (trained_correct / total_samples) * 100
        improvement = trained_accuracy - original_accuracy
        
        print(f"\nFine-tuned Model Performance:")
        print(f"   Correct Answers: {trained_correct}/{total_samples}")
        print(f"   Accuracy: {trained_accuracy:.1f}%")
        
        print(f"\nTraining Impact:")
        print(f"   Accuracy Change: {improvement:+.1f}%")
        
        if improvement > 20:
            print("   Excellent! Training significantly improved performance")
        elif improvement > 0:
            print("   Good! Training showed positive results")
        elif improvement == 0:
            print("   No change. Consider adjusting training parameters")
        else:
            print("   Performance decreased. Check training setup")
    else:
        print(f"\nFine-tuned Model: Not available for comparison")
    
    # ============================================================================
    # DETAILED ANALYSIS
    # ============================================================================
    
    print(f"\nDETAILED ANALYSIS")
    print("=" * 40)
    print("Key improvements to look for in fine-tuned model:")
    print("   Step-by-step mathematical reasoning")
    print("   Correct arithmetic calculations")
    print("   Clear explanation of the process")
    print("   Consistent answer format")
    print("   Better handling of word problems")
    
else:
    print("Cannot run comparison - models not loaded")

# ============================================================================
# CLEANUP
# ============================================================================

print(f"\nCLEANING UP RESOURCES")
print("=" * 30)

# Clear models from memory
if 'original_model' in locals() and original_model:
    del original_model
    print("Original model cleared from memory")

if 'trained_model' in locals() and trained_model:
    del trained_model
    print("Fine-tuned model cleared from memory")

if 'tokenizer' in locals():
    del tokenizer
    print("Tokenizer cleared from memory")

# Clear CUDA cache if using GPU
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU memory cache cleared")



PERFORMANCE COMPARISON SUMMARY
Original Model Performance:
   Correct Answers: 7/20
   Accuracy: 35.0%

Fine-tuned Model Performance:
   Correct Answers: 10/20
   Accuracy: 50.0%

Training Impact:
   Accuracy Change: +15.0%
   Good! Training showed positive results

DETAILED ANALYSIS
Key improvements to look for in fine-tuned model:
   Step-by-step mathematical reasoning
   Correct arithmetic calculations
   Clear explanation of the process
   Consistent answer format
   Better handling of word problems

CLEANING UP RESOURCES
Original model cleared from memory
Fine-tuned model cleared from memory
Tokenizer cleared from memory
GPU memory cache cleared
