In [1]:
import os
import time
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer
import requests
from dotenv import load_dotenv
import json

# Load environment variables from parent directory
load_dotenv("../.env")
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

def format_time(seconds):
    """Format time in seconds to a readable string."""
    if seconds < 1:
        return f"{seconds*1000:.2f} ms"
    else:
        return f"{seconds:.2f} s"

In [2]:
def phi4_format(system, user, assistant):
    system_part = f"<|im_start|>system<|im_sep|>{system}<|im_end|>" if system else ""
    user_part = f"<|im_start|>user<|im_sep|>{user}<|im_end|>" if user else ""
    assistant_part = f"<|im_start|>assistant<|im_sep|>{assistant}" if assistant else "<|im_start|>assistant<|im_sep|>"
    return f"{system_part}{user_part}{assistant_part}"

In [3]:
# Load model and tokenizer
model_id = "microsoft/phi-4"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model with appropriate GPU settings
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32 if device == "cuda" else torch.float32,
    device_map="auto" if device == "cuda" else None
)

Using device: cuda


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [4]:
def generate_with_local_phi4(model, tokenizer, device, prompt, max_tokens=2000):
    """Generate text using locally loaded Phi-4 model."""
    print("Loading Phi-4 model locally to generate a large corpus of text...")

    start_time = time.time()    

    prompt = phi4_format(system='You are a helpful assistant that can answer questions and help with tasks.', user=prompt, assistant=None)

    # Tokenize input and get input length
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    input_token_count = len(inputs.input_ids[0])
    print(f"Input tokens: {input_token_count}")
    
    # Generate text
    print("Generating with local Phi-4 model...")
    start_time = time.time()
    
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )
    
    end_time = time.time()
    
    # Get generated text and token count
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    output_token_count = len(output[0]) - input_token_count
    
    total_time = end_time - start_time
    tokens_per_second = output_token_count / total_time
    
    result = {
        "generated_text": generated_text,
        "output_token_count": output_token_count,
        "total_time": total_time,
        "tokens_per_second": tokens_per_second,
    }
    
    return result

In [5]:
def generate_with_openrouter(prompt, max_tokens=2000):
    """Generate text using Phi-4 via OpenRouter API."""
    print("Generating with OpenRouter (Phi-4)...")
    
    if not OPENROUTER_API_KEY:
        raise ValueError("OpenRouter API key not found in environment variables")
    
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json"
    }
    
    payload = {
        "model": "microsoft/phi-4",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.7,
        "top_p": 0.9
    }
    
    start_time = time.time()
    response = requests.post(url, headers=headers, json=payload)
    response.raise_for_status()
    end_time = time.time()
    
    result = response.json()
    generated_text = result["choices"][0]["message"]["content"]
    output_token_count = result["usage"]["completion_tokens"]
    
    total_time = end_time - start_time
    tokens_per_second = output_token_count / total_time
    
    return {
        "generated_text": generated_text,
        "output_token_count": output_token_count,
        "total_time": total_time,
        "tokens_per_second": tokens_per_second,
        "api_response": result
    }

In [10]:
def run_comparison():
    """Run comparison between local and API-based Phi-4."""
    prompt = """Write a comprehensive explanation of how large language models work, 
    including details about transformer architectures, attention mechanisms, 
    and training processes. Include information about tokenization, embedding, 
    and the computational requirements for training and inference. Output a really large chunk of text and be very detailed."""
    
    # Run local generation
    try:
        local_result = generate_with_local_phi4(model, tokenizer, device, prompt)
        print(f"\nLocal Phi-4 Results:")
        print(f"Generated {local_result['output_token_count']} tokens in {format_time(local_result['total_time'])}")
        print(f"Speed: {local_result['tokens_per_second']:.2f} tokens/second")
    except Exception as e:
        print(f"Error running local model: {e}")
        local_result = None
    
    # Run OpenRouter API generation
    try:
        api_result = generate_with_openrouter(prompt)
        print(f"\nOpenRouter Phi-4 Results:")
        print(f"Generated {api_result['output_token_count']} tokens in {format_time(api_result['total_time'])}")
        print(f"Speed: {api_result['tokens_per_second']:.2f} tokens/second")
    except Exception as e:
        print(f"Error with OpenRouter API: {e}")
        api_result = None
    
    # Compare results
    if local_result and api_result:
        speedup = api_result['tokens_per_second'] / local_result['tokens_per_second'] if local_result['tokens_per_second'] > 0 else 0
        print(f"\nComparison:")
        print(f"OpenRouter is {speedup:.2f}x {'faster' if speedup > 1 else 'slower'} than local inference")
        
        # Save results to file
        with open("phi4_comparison_results.json", "w") as f:
            json.dump({
                "local": {
                    "output_token_count": local_result["output_token_count"],
                    "total_time": local_result["total_time"],
                    "tokens_per_second": local_result["tokens_per_second"],
                },
                "openrouter": {
                    "output_token_count": api_result["output_token_count"],
                    "total_time": api_result["total_time"],
                    "tokens_per_second": api_result["tokens_per_second"]
                },
                "comparison": {
                    "speedup_factor": speedup,
                    "faster": "OpenRouter" if speedup > 1 else "Local"
                }
            }, f, indent=2)
        print(f"Results saved to phi4_comparison_results.json")

In [11]:
run_comparison()

Loading Phi-4 model locally to generate a large corpus of text...
Input tokens: 84
Generating with local Phi-4 model...



Local Phi-4 Results:
Generated 1157 tokens in 32.82 s
Speed: 35.26 tokens/second
Generating with OpenRouter (Phi-4)...

OpenRouter Phi-4 Results:
Generated 1152 tokens in 10.71 s
Speed: 107.52 tokens/second

Comparison:
OpenRouter is 3.05x faster than local inference
Results saved to phi4_comparison_results.json


In [12]:
games = 100
steps = 70
players = 7
tokens = 2000
speed = 30

total_tokens = games * steps * players * tokens

total_time = total_tokens / (speed * 60 * 60)
print(f'Sed life, yer task shall take {total_time} hours')

Sed life, yer task shall take 907.4074074074074 hours
