In [40]:
%reload_ext autoreload
%autoreload 2

import getpass
import os
import json
import mlflow 
from openai import OpenAI

In [41]:
# Point the MLflow client at the local tracking server. 127.0.0.1 is used
# rather than 0.0.0.0 because 0.0.0.0 is a "listen on all interfaces" bind
# address, not a portable destination (connecting to it fails on Windows).
os.environ["MLFLOW_TRACKING_URI"] = "http://127.0.0.1:5001"
# Ask for the key interactively so it is never stored in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter OpenAI API key:")

# Shared OpenAI client used by the functions in this notebook.
openai = OpenAI()
mlflow.set_experiment("assignment-5-evaluate-llm")
# Trace OpenAI SDK calls automatically.
mlflow.openai.autolog()

In [42]:
@mlflow.trace
def evaluation(prompt: str, deck_data: dict) -> dict:
    """
    Evaluate a Clash Royale deck with the OpenAI API and log the result to MLflow.

    Args:
        prompt: The original user request; logged as the "user prompt" run parameter.
        deck_data: Deck description with the keys "deck_name",
            "average_elixir_cost" and "cards". The whole object is embedded
            in the evaluation prompt as JSON.

    Returns:
        The evaluation object parsed from the model response.

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON.
        KeyError: If deck_data or the model response lacks a required key.
        TypeError, ValueError: If a score in the model response is not numeric.
    """

    # Read the deck fields up front: a malformed deck fails here, before an
    # API call is paid for and before an MLflow run is opened.
    run_params = {
        "user prompt": prompt,
        "deck_name": deck_data["deck_name"],
        "average_elixir_cost": deck_data["average_elixir_cost"],
        "cards": deck_data["cards"],
    }

    evaluation_prompt = """
#  Identity
You are a professional Clash Royale coach and deck analyst.

#  Instructions
Your task is to evaluate any 8-card Clash Royale deck using a structured and in-depth scoring system inspired by Deckshop. Your evaluation must include both **numeric ratings** and **detailed explanations** based on:

- Individual card roles
- Meta relevance
- Synergy potential (classic + emerging combos)
- Overall deck balance
- Spell composition

---

 **Rate the deck from 1 to 10** in the following categories:

1. **Overall Power** – How viable the deck is across ladder, global tournaments, and special challenges.
2. **Defense** – Assess how well the deck can handle:
   -  Air threats (e.g., Minions, Balloon, Lava Hound)
   -  Swarms (e.g., Skeleton Army, Bats)
   - Tanks (e.g., Giant, Royal Giant, Golem)
   - Splash resistance
   -  Spell defense (e.g., vs Goblin Barrel, Graveyard)
3. **Attack** – Consider:
   - Presence of a clear **win condition**
   - Strength of **support troops**
   - **Breakthrough ability** (vs buildings, swarms)
   - **Pressure** (dual-lane, bridge spam, counter-push)
4. **Synergy** – Evaluate:
   - Known combo effectiveness (e.g., Miner + Poison, Hog + Ice Spirit)
   - Cycle consistency and elixir pacing
   - Spell synergy (e.g., Log + Fireball)
   - Role diversity (tank, DPS, splash, control)
   - Potential for **new synergy discovery**
5. **Versatility** – Rate:
   - Matchups vs all major archetypes (siege, bait, beatdown, etc.)
   - Recovery ability after a bad rotation
   - Adaptability in both ladder and competitive modes
   - Flexibility for switching between offense and defense
6. difficulty of the deck:
   - how easy it is to play this deck on different levels and how much experience is needed
   - 1 being the easiest and 10 being the hardest only pros can play

---

**Average Elixir Cost**
Return the float as provided (e.g., `3.50`).

---

 **Card Role & Spell Balance Guidelines**

Spells are divided into:
-  **Small Spells** (): Log, Zap, Snowball, Barbarian Barrel, Arrows, Goblin Curse
-  **Big Spells** (): Fireball, Poison, Lightning, Rocket, Earthquake

 Every well-balanced deck **usually  includes one small and one big spell.**


---
 **Deck Usage**
-in the end of the output, yu must give it as a comments:
1 liner explanation of where and when and how to use each card
-saying which cards are defenders, which ones are win condintions and which ones are support
format should be:
-card 1 explanation
-card 2 explanation, etc.

 **Deck Archetype Classification**
Choose the most accurate one:

- Beatdown
- Hybrid Beatdown
- Sparky Beatdown
- Air Beatdown
- Control
- Graveyard Control
- Royal Giant Control
- Splashyard
- Cycle
- Hog Cycle
- Mega Minion Cycle
- Miner Wall Breakers Cycle
- Bridge Spam
- Siege
- Spell Bait
- Off-Meta / Experimental
- Troll / Meme Deck

---

 **Deck Input Format (JSON)**:

The deck is provided as a JSON object with these fields:
- deck_name: string
- average_elixir_cost: number
- cards: list of strings (card names)
- comment: string (markdown summary of the deck’s analysis, including key points, pros, and cons)

Note: Keep `comment` brief—limit to 6 or 7 sentences max.

**Evaluate this deck:**

{{deck_json}}

- After evaluating the deck, give the comment of explanation of strong and weak sides of deck, synergies, and how, where and when spawn the cards. Also say which are win condintions of the deck, what is synergetic duo/trios and etc.

---

### Output JSON Format:

```json
{
  "overall": X,
  "defense": X,
  "attack": X,
  "synergy": X,
  "versatility": X,
  "avg_elixir": X.XX,
  "difficulty": X,
  "deck_type": "Deck Archetype",
  "comments": "format the output as said here for some given deck:  **Deck Usage**
  comments:
Hog Rider - Win condition, apply pressure, punish opponent's mistakes
Mega Knight - Tank and splash damage, counter big pushes
Firecracker - Support, splash damage, anti-air defense
Skeletons - Cycle, distract, chip damage
Ice Spirit - Cycle, freeze, support Hog pushes
Tesla - Defensive building, distracts, counters tanks
Fireball - Spell, support for Hog pushes, eliminate swarms
The Log - Spell, clear swarms, push back units, support Hog pushes

This Hog Cycle Control deck excels in defense, with the Mega Knight and Tesla providing sturdy defense against various threats. The Hog Rider serves as the primary win condition, applying pressure and punishing mistakes. Firecracker adds splash damage and anti-air support, complementing the
Hog pushes. The deck has good synergy and cycle consistency, allowing for quick and effective gameplay. The versatile card selection enables adaptability in different matchups and scenarios, making it a solid choice for ladder and competitive play."
}
```
"""

    # Substitute the deck into the prompt template's {{deck_json}} placeholder
    evaluation_prompt = evaluation_prompt.replace(
        "{{deck_json}}",
        json.dumps(deck_data),
    )

    completion = openai.chat.completions.create(
        model="gpt-4.1",
        response_format={"type": "json_object"},
        messages=[
            {
                "role": "system",
                "content": "You are a Clash Royale deck evaluation assistant.",
            },
            {
                "role": "user",
                "content": evaluation_prompt,
            }
        ],
    )

    content = completion.choices[0].message.content
    print("Raw model output:", repr(content))

    # Attempt to parse the content as JSON (the raw text is already printed above)
    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Failed to parse evaluation response as JSON: {e}")
        raise

    # Extract and coerce every score BEFORE opening the run, so an incomplete
    # or non-numeric response raises here instead of leaving a half-logged
    # run in the experiment that the monitoring section aggregates.
    score_keys = (
        "overall",
        "defense",
        "attack",
        "synergy",
        "versatility",
        "avg_elixir",
        "difficulty",
    )
    run_metrics = {f"eval_{key}": float(parsed[key]) for key in score_keys}

    with mlflow.start_run():
        for name, value in run_params.items():
            mlflow.log_param(name, value)
        for name, value in run_metrics.items():
            mlflow.log_metric(name, value)

    return parsed


@mlflow.trace
def ask_agent(user_input: str) -> dict:
    """
    Ask the OpenAI API for a deck recommendation, then evaluate that deck.

    Args:
        user_input: Free-text request describing the wanted deck and the
            cards the user has available.

    Returns:
        The recommended deck parsed from the model response (the prompt asks
        for the fields deck_name, average_elixir_cost, cards and comment).

    Raises:
        json.JSONDecodeError: If the model response is not valid JSON.
    """

    instruction_prompt = """
## Identity
You are a Clash Royale deck-building assistant. Provide concise, practical deck-building advice.
    
___

## Instructions
- Output only viable decks.
- Consider user's available cards if provided.
- Reference meta decks only if relevant.
- Include average elixir cost and key stats.
- Avoid unnecessary explanations.
- Output must be valid JSON matching the provided schema.

___

## Output Format

Return a JSON object with these fields:
- deck_name: string
- average_elixir_cost: number
- cards: list of strings (card names)
- comment: string (markdown summary of the deck’s analysis, including key points, pros, and cons)

Note: Keep `comment` brief—limit to 6 or 7 sentences max.

___

## Current Meta Decks

```json
[
    {"deck_name": "Giant Wizard Control", "average_elixir_cost": 3.9, "cards": [ "Giant", "Mini P.E.K.K.A", "Wizard", "Arrows", "Bomber", "Musketeer", "Valkyrie", "Electro Spirit" ]},
    {"deck_name": "Anti-Swarm Control", "average_elixir_cost": 3.6, "cards": [ "Knight", "Archers", "Wizard", "Arrows", "Bomber", "Mini P.E.K.K.A", "Giant", "Fireball" ]}
]
```
"""
    # NOTE: the example list above must itself be valid JSON (no trailing
    # comma), since the prompt demands valid JSON from the model.

    response = openai.responses.create(
        model="gpt-4.1",
        input=[
            {"role": "system", "content": instruction_prompt},
            {"role": "user", "content": user_input},
        ],
        text={"format": {"type": "json_object"}},
    )
    content = response.output_text

    # Same failure reporting as evaluation(): show what the model actually
    # returned before re-raising, otherwise the bad payload is lost.
    try:
        deck_data = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Failed to parse deck response as JSON: {e}")
        print("Raw content:", content)
        raise

    evaluation(user_input, deck_data)

    return deck_data

In [43]:
# Sample requests: each one is sent to the agent, which recommends a deck
# and then has that deck evaluated and logged.
deck_requests = (
    """
I need a deck for Hog Cycle Control. I have the following cards available: Mega Knight, Firecracker, Tesla, Hog Rider, Skeletons, Ice Spirit, Fireball, The Log.
Please provide a deck that is effective in the current meta, with a focus on Hog Cycle Control.
""",
    """
I want to counter a Giant deck with a Hog Cycle Control deck. I have the following cards available: Mega Knight, Firecracker, Tesla, Hog Rider, Skeletons, Ice Spirit, Fireball, The Log.
Please provide a deck that is effective in the current meta, with a focus on Hog Cycle Control.
""",
    """
I need a deck for Wizard Control. I have the following cards available: Giant, Mini P.E.K.K.A, Wizard, Arrows, Bomber, Musketeer, Valkyrie, Electro Spirit.
Please provide a deck that is effective in the current meta, with a focus on Wizard Control.
""",
)

for deck_request in deck_requests:
    ask_agent(deck_request)

APIConnectionError: Connection error.

# Monitoring LLM Evaluation Results

This section monitors the quality and relevance of the LLM evaluation results to detect potential issues such as:
- Inconsistent scoring patterns
- Evaluation drift over time
- Outlier scores that may indicate model issues

In [47]:
import pandas as pd
import numpy as np
from datetime import datetime
import pendulum
from sqlalchemy import Boolean, Column, Float, Integer, String, DateTime
from sqlalchemy.orm import declarative_base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from evidently import Dataset
from evidently import Report
from evidently.presets import DataDriftPreset
import warnings

# Silence warnings globally. Note that this filter ignores EVERY warning
# category from every library for the rest of the session, not only NumPy's
# division-by-zero / invalid-value warnings.
warnings.filterwarnings('ignore')
np.seterr(all='ignore')  # Make NumPy ignore all floating-point error conditions

# Database setup for Grafana
# NOTE(review): credentials are hard-coded and end up inside the URI string
# below; consider reading them from the environment.
USER = "admin"
PASSWORD = "admin"
MONITORING_DB_URI = f"postgresql+psycopg2://{USER}:{PASSWORD}@127.0.0.1:5432/monitoring_db"

# Declarative base class that the monitoring table model derives from
Base = declarative_base()

class LLMEvaluationTable(Base):
    """ORM model for the "llm_evaluation_monitoring" table.

    Each row is one snapshot of aggregate statistics computed over the
    MLflow evaluation runs.
    """
    __tablename__ = "llm_evaluation_monitoring"
    
    # Surrogate primary key
    id = Column(Integer, primary_key=True)
    # Snapshot time stored as a float; the monitoring script in this file
    # writes pendulum.now().timestamp() here
    timestamp = Column(Float)
    # Mean of each evaluation score across the runs
    avg_overall_score = Column(Float)
    avg_defense_score = Column(Float)
    avg_attack_score = Column(Float)
    avg_synergy_score = Column(Float)
    avg_versatility_score = Column(Float)
    avg_difficulty_score = Column(Float)
    # Variance of the overall score
    score_variance = Column(Float)
    # Number of individual scores that fell below the low-score threshold
    low_score_count = Column(Integer)
    # Number of runs the snapshot was computed from
    total_evaluations = Column(Integer)
    # Health label derived from the average overall score (e.g. 'HEALTHY')
    system_health_status = Column(String)

def create_monitoring_db():
    """Create the monitoring tables in the monitoring database.

    Issues CREATE TABLE for every model registered on ``Base`` against
    ``MONITORING_DB_URI``; tables that already exist are left untouched.
    Connection errors from SQLAlchemy propagate to the caller.
    """
    engine = create_engine(MONITORING_DB_URI)
    try:
        Base.metadata.create_all(engine)
    finally:
        # This engine is only used here, so release its connection pool
        # instead of leaving an open connection behind on every call.
        engine.dispose()
    print("✅ LLM Evaluation monitoring database created")

def get_llm_monitoring_metrics(runs_df):
    """Aggregate monitoring statistics from an MLflow runs DataFrame.

    Args:
        runs_df: DataFrame with one row per run. Scores are read from the
            columns ``metrics.eval_overall``, ``metrics.eval_defense``,
            ``metrics.eval_attack``, ``metrics.eval_synergy``,
            ``metrics.eval_versatility`` and ``metrics.eval_difficulty``;
            missing columns and NaN cells are skipped.

    Returns:
        dict with:
          * ``avg_<metric>_score`` (float) for every metric that has at least
            one value (key omitted otherwise; non-finite means become 0.0),
          * ``score_variance`` (float): sample variance of the overall score,
            0.0 with fewer than two values,
          * ``low_score_count`` (int): number of quality scores below 3.0,
          * ``total_evaluations`` (int): number of rows in ``runs_df``,
          * ``system_health_status`` (str): 'HEALTHY' (avg overall >= 7),
            'CAUTION' (>= 5), 'CRITICAL' (< 5) or 'UNKNOWN' when there is no
            overall score to judge by.
    """
    eval_metrics = ['eval_overall', 'eval_defense', 'eval_attack', 'eval_synergy', 'eval_versatility', 'eval_difficulty']
    # Difficulty is a 1 (easiest) .. 10 (hardest) scale, so a low value means
    # an easy deck, not a poor evaluation; keep it out of the low-score count.
    quality_metrics = [name for name in eval_metrics if name != 'eval_difficulty']
    low_threshold = 3.0

    def _scores(metric):
        """Return the non-null values logged for *metric* (empty if absent)."""
        col_name = f'metrics.{metric}'
        if col_name not in runs_df.columns:
            return pd.Series(dtype=float)
        return runs_df[col_name].dropna()

    def _finite_or_zero(value):
        """Convert to a plain float, mapping NaN/inf to 0.0."""
        value = float(value)
        return value if np.isfinite(value) else 0.0

    metrics = {}

    # Per-metric averages; a metric without any value gets no key at all
    for metric in eval_metrics:
        values = _scores(metric)
        if len(values) > 0:
            metrics[f'avg_{metric}_score'] = _finite_or_zero(values.mean())

    # Variance of the overall score (instability indicator); the sample
    # variance is undefined for fewer than two values
    overall_scores = _scores('eval_overall')
    if len(overall_scores) > 1:
        metrics['score_variance'] = _finite_or_zero(overall_scores.var())
    else:
        metrics['score_variance'] = 0.0

    # Count low scores (quality issues)
    metrics['low_score_count'] = sum(
        int((_scores(metric) < low_threshold).sum()) for metric in quality_metrics
    )
    metrics['total_evaluations'] = len(runs_df)

    # Health status: without any overall score there is nothing to judge,
    # which must not be reported as a CRITICAL quality problem
    avg_overall = metrics.get('avg_eval_overall_score')
    if avg_overall is None:
        status = 'UNKNOWN'
    elif avg_overall >= 7.0:
        status = 'HEALTHY'
    elif avg_overall >= 5.0:
        status = 'CAUTION'
    else:
        status = 'CRITICAL'
    metrics['system_health_status'] = status

    return metrics

# Run one monitoring pass: make sure the table exists, aggregate the MLflow
# evaluation runs, store one snapshot row for Grafana and run a drift check.
from sqlalchemy.exc import SQLAlchemyError  # lets the handlers below single out database failures

try:
    create_monitoring_db()

    # Retrieve MLflow experiment data
    experiment = mlflow.get_experiment_by_name("assignment-5-evaluate-llm")
    if experiment is None:
        raise LookupError('MLflow experiment "assignment-5-evaluate-llm" does not exist')
    # Request the runs oldest-first. search_runs defaults to newest-first,
    # which would turn the "reference" half of the drift split below into the
    # most recent runs. The ordering does not affect the aggregate metrics.
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["attributes.start_time ASC"],
    )

    if len(runs) > 0:
        print(f"🔍 Monitoring {len(runs)} evaluation runs...")

        # Extract monitoring metrics
        monitoring_metrics = get_llm_monitoring_metrics(runs)

        # Display current monitoring status
        print("\n📊 LLM Evaluation Monitoring Report:")
        print(f"   • Total evaluations: {monitoring_metrics['total_evaluations']}")
        print(f"   • Average overall score: {monitoring_metrics.get('avg_eval_overall_score', 0):.2f}")
        print(f"   • Score variance: {monitoring_metrics['score_variance']:.3f}")
        print(f"   • Low scores detected: {monitoring_metrics['low_score_count']}")
        print(f"   • System status: {monitoring_metrics['system_health_status']}")

        # Create monitoring record - convert NumPy types to Python types
        timestamp = pendulum.now().timestamp()
        monitoring_record = LLMEvaluationTable(
            timestamp=timestamp,
            avg_overall_score=float(monitoring_metrics.get('avg_eval_overall_score', 0)),
            avg_defense_score=float(monitoring_metrics.get('avg_eval_defense_score', 0)),
            avg_attack_score=float(monitoring_metrics.get('avg_eval_attack_score', 0)),
            avg_synergy_score=float(monitoring_metrics.get('avg_eval_synergy_score', 0)),
            avg_versatility_score=float(monitoring_metrics.get('avg_eval_versatility_score', 0)),
            avg_difficulty_score=float(monitoring_metrics.get('avg_eval_difficulty_score', 0)),
            score_variance=float(monitoring_metrics['score_variance']),
            low_score_count=int(monitoring_metrics['low_score_count']),
            total_evaluations=int(monitoring_metrics['total_evaluations']),
            system_health_status=str(monitoring_metrics['system_health_status'])
        )

        # Log metrics to database for Grafana. The context manager closes the
        # session even when add/commit raises.
        engine = create_engine(MONITORING_DB_URI)
        Session = sessionmaker(bind=engine)
        with Session() as session:
            session.add(monitoring_record)
            session.commit()

        print("✅ Monitoring metrics logged to database for Grafana visualization")
        # Never echo the password into the notebook output
        print(f"   Database URI: {engine.url.render_as_string(hide_password=True)}")
        print("   Table: llm_evaluation_monitoring")

        # Generate Evidently report for detailed analysis
        if len(runs) >= 2:
            # Use only the score columns that were actually logged, so a
            # metric that is missing everywhere does not abort the analysis
            eval_columns = [
                column
                for column in (
                    'metrics.eval_overall', 'metrics.eval_defense', 'metrics.eval_attack',
                    'metrics.eval_synergy', 'metrics.eval_versatility', 'metrics.eval_difficulty',
                )
                if column in runs.columns
            ]
            eval_data = runs[eval_columns].dropna()

            if eval_columns and len(eval_data) >= 2:
                # Split data into reference (older half) and current (newer half)
                split_idx = len(eval_data) // 2
                reference_data = eval_data.iloc[:split_idx]
                current_data = eval_data.iloc[split_idx:]

                # Create Evidently datasets
                reference_dataset = Dataset.from_pandas(reference_data)
                current_dataset = Dataset.from_pandas(current_data)

                # Create report to detect evaluation drift; keep whatever
                # run() returns so it can be inspected in a later cell
                report = Report([DataDriftPreset()])
                drift_result = report.run(current_data=current_dataset, reference_data=reference_dataset)

                print("\n📈 Evidently Report Generated:")
                print(f"   • Reference evaluations: {len(reference_data)}")
                print(f"   • Current evaluations: {len(current_data)}")
                print("   • Drift analysis completed")
    else:
        print("❌ No evaluation runs found in MLflow experiment")

except SQLAlchemyError as e:
    print(f"❌ Error setting up monitoring: {e}")
    print("Make sure PostgreSQL is running and accessible")
except Exception as e:
    # Notebook top-level boundary: report the failure instead of a traceback.
    # The PostgreSQL hint is deliberately not printed here, since the cause is
    # MLflow, Evidently or the data rather than the database.
    print(f"❌ Error setting up monitoring: {e}")

✅ LLM Evaluation monitoring database created
🔍 Monitoring 23 evaluation runs...

📊 LLM Evaluation Monitoring Report:
   • Total evaluations: 23
   • Average overall score: 7.64
   • Score variance: 0.229
   • Low scores detected: 0
   • System status: HEALTHY
✅ Monitoring metrics logged to database for Grafana visualization
   Database URI: postgresql+psycopg2://admin:admin@127.0.0.1:5432/monitoring_db
   Table: llm_evaluation_monitoring

📈 Evidently Report Generated:
   • Reference evaluations: 10
   • Current evaluations: 11
   • Drift analysis completed

📈 Evidently Report Generated:
   • Reference evaluations: 10
   • Current evaluations: 11
   • Drift analysis completed
