# üß™ Semantic Similarity Testing with Gemini Embeddings

This notebook tests the updated `deduplicate.py` with:
1. **Generic frame detection** - penalizes minified stacks like `app.js:1`
2. **Gemini semantic embeddings** - understands meaning, not just word overlap
3. **Error type matching** - penalizes mismatched error types when stack is generic

In [32]:
# Setup - Import the updated deduplicate module
import os
import json
import pandas as pd

# Set your Gemini API key (you can also set it as environment variable)
os.environ['GEMINI_API_KEY'] = 'AIzaSyCtu2O_qGF9HsVFD3Alhb20E_ty_w-jWmk'

# Import the updated deduplicate module
from deduplicate import (
    run_pipeline, 
    USE_SEMANTIC_EMBEDDINGS,
    get_frame_quality,
    calculate_semantic_similarities
)

print(f"‚úÖ Semantic embeddings enabled: {USE_SEMANTIC_EMBEDDINGS}")

‚úÖ Semantic embeddings enabled: True


In [33]:
# Load real Veoci entries
veoci_entries = pd.read_json('Get_Veoci_Form_Entries.json')
print(f"Loaded {len(veoci_entries)} entries from Veoci")
print(f"Columns: {list(veoci_entries.columns)[:10]}...")

Loaded 1 entries from Veoci
Columns: ['json', 'pairedItem']...


In [34]:
# Convert DataFrame to list of dicts for the pipeline
# The JSON structure is: {"json": {"entries": [...]}}
if 'json' in veoci_entries.columns and not veoci_entries.empty:
    candidates_raw = veoci_entries['json'][0]['entries']
    print(f"Extracted {len(candidates_raw)} candidate entries from Veoci JSON")
else:
    candidates_raw = veoci_entries.to_dict('records')
    print(f"Converted {len(candidates_raw)} entries to candidate format")

# Preview a sample entry structure
if candidates_raw:
    sample = candidates_raw[0]
    print(f"\nSample entry keys: {list(sample.keys())[:8]}...")
    print(f"Sample entry ID: {sample.get('id', 'N/A')}")

Extracted 213 candidate entries from Veoci JSON

Sample entry keys: ['id', 'orgSequenceId', 'objectType', 'container', 'name', 'lastModified', 'properties', 'created']...
Sample entry ID: 1580841464


## üìù Define Incoming Error to Test

Use the same incoming entry from your n8n workflow, or modify it to test different scenarios:

In [35]:
# Real incoming entry from your n8n workflow (modify as needed)
incoming_entry_raw = [
  {
    "body": {
      "5": "TypeError: undefined is not an object (evaluating 't.address')",
      "6": "/",
      "8": "Stacktrace ionic://localhost/js/app.js:1 - /******/ (function() { // webpackBootstrap src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2726 - if ((reverse && position !== 'right') || (!reverse && position === 'right')) { src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2726",
      "13": "Mozilla/5.0 (iPhone; CPU iPhone OS 18_2 like Mac OS X)",
      "15": "iOS",
      "16": "Mobile Safari UI/WKWebView 18.2",
      "18": "6.0.545322-ios",
      "21": "5f1875273e1741000a1a4a25",  # Project ID
      "24": [],
      "25": "PROD-MOBILE",
      "26": "veoci-mobile-client",
      "27": "TypeError",
      "39": "https://app.veoci.com/v/c/67813/dashboard",
      "id": "test-incoming-001",
      "formId": "35430484",
      "name": "Test: TypeError address property"
    }
  }
]

print("üì• Incoming Error:")
print(f"   Message: {incoming_entry_raw[0]['body']['5']}")
print(f"   Error Type: {incoming_entry_raw[0]['body'].get('27', 'Unknown')}")
print(f"   Environment: {incoming_entry_raw[0]['body'].get('25', 'Unknown')}")

üì• Incoming Error:
   Message: TypeError: undefined is not an object (evaluating 't.address')
   Error Type: TypeError
   Environment: PROD-MOBILE


## üöÄ Run the Pipeline with Semantic Embeddings

In [36]:
# Run the updated pipeline
import time

start_time = time.time()
result = run_pipeline(incoming_entry_raw, candidates_raw)
elapsed = time.time() - start_time

print(f"‚è±Ô∏è  Pipeline completed in {elapsed:.2f} seconds")
print(f"üìä Method used: {result.get('metadata', {}).get('similarity_method', 'unknown')}")
print(f"‚ö†Ô∏è  Generic stack detected: {result.get('metadata', {}).get('generic_stack_detected', False)}")
print(f"üìà Frame quality: {result.get('metadata', {}).get('incoming_frame_quality', 'unknown')}")

‚è±Ô∏è  Pipeline completed in 66.24 seconds
üìä Method used: semantic_embeddings
‚ö†Ô∏è  Generic stack detected: True
üìà Frame quality: low


Using Gemini embeddings for message similarity


In [37]:
# Debug: Check if candidates have matching project IDs
from deduplicate import normalize_veoci_entry, normalize_incoming_entry, passes_hard_gates

incoming_normalized = normalize_incoming_entry(incoming_entry_raw)
print(f"Incoming Project ID: {incoming_normalized.get('project')}")

# Check first few candidates
passed_count = 0
failed_reasons = []
for i, cand in enumerate(candidates_raw[:5]):
    cand_normalized = normalize_veoci_entry(cand)
    passed, reasons = passes_hard_gates(incoming_normalized, cand_normalized)
    print(f"\nCandidate {i+1} (ID: {cand_normalized['entry_id']}):")
    print(f"  Project: {cand_normalized.get('project')}")
    print(f"  Passed: {passed}")
    if not passed:
        print(f"  Reasons: {reasons}")
        failed_reasons.extend(reasons)
    else:
        passed_count += 1

print(f"\n‚úÖ Passed: {passed_count}/5 candidates checked")

Incoming Project ID: 5f1875273e1741000a1a4a25

Candidate 1 (ID: 1580841464):
  Project: 5f1875273e1741000a1a4a25
  Passed: True

Candidate 2 (ID: 1574820389):
  Project: 5f1875273e1741000a1a4a25
  Passed: True

Candidate 3 (ID: 1578047584):
  Project: 5f1875273e1741000a1a4a25
  Passed: True

Candidate 4 (ID: 1577908710):
  Project: 5f1875273e1741000a1a4a25
  Passed: True

Candidate 5 (ID: 1578115614):
  Project: 5f1875273e1741000a1a4a25
  Passed: True

‚úÖ Passed: 5/5 candidates checked


In [38]:
# Display summary
summary = result['batchSummary']
print("=" * 60)
print("üìã BATCH SUMMARY")
print("=" * 60)
print(f"Total Analyzed: {summary['totalAnalyzed']}")
print(f"Related Found:  {summary['relatedFound']}")
print(f"High Confidence:   {summary['confidenceCounts']['High']}")
print(f"Medium Confidence: {summary['confidenceCounts']['Medium']}")
print(f"Low Confidence:    {summary['confidenceCounts']['Low']}")

üìã BATCH SUMMARY
Total Analyzed: 213
Related Found:  206
High Confidence:   0
Medium Confidence: 0
Low Confidence:    206


In [39]:
# Display top matches as a DataFrame for easy analysis
entries = result['relatedEntries']

if entries:
    df_results = pd.DataFrame([
        {
            'Entry ID': e['entryId'],
            'Name': e.get('name', '')[:50] + '...' if e.get('name') and len(e.get('name', '')) > 50 else e.get('name', ''),
            'Score': f"{e['score']:.2f}",
            'Confidence': e['confidence'],
            'Message Sim': f"{e['breakdown']['message']:.2f}",
            'Stack Score': e['breakdown']['stack'],
            'Signals': ', '.join(e.get('signals', [])),
            'Error Type': e.get('errorType', ''),
            'Environment': e.get('environment', '')
        }
        for e in entries[:15]  # Top 15
    ])
    
    print("\nüìä TOP MATCHES (sorted by score):\n")
    display(df_results)
else:
    print("No matches found!")


üìä TOP MATCHES (sorted by score):



Unnamed: 0,Entry ID,Name,Score,Confidence,Message Sim,Stack Score,Signals,Error Type,Environment
0,1537105308,2025-Oct-16 13:59: Bugsnag ( fixed) - PROD-MO...,0.48,Low,0.96,20,message,TypeError,
1,1512402078,2025-Sep-18 10:46: Bugsnag ( fixed) - PROD-MO...,0.48,Low,0.96,20,message,TypeError,
2,1520892913,2025-Sep-28 16:02: Bugsnag ( fixed) - PROD-MO...,0.48,Low,0.96,20,message,TypeError,
3,1430181430,2025-Jun-19 16:22: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.81,20,message,TypeError,
4,1369970745,2025-Apr-16 15:26: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.81,20,message,TypeError,
5,1423564633,2025-Jun-12 15:06: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.81,20,message,TypeError,
6,1515285679,2025-Sep-21 19:14: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.71,20,message,TypeError,
7,1391369188,2025-May-10 09:27: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.71,20,message,TypeError,
8,1179746823,2024-Sep-10 06:40: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.78,20,message,TypeError,
9,1463049764,2025-Jul-24 08:19: Bugsnag ( fixed) - PROD-MO...,0.43,Low,0.71,20,message,TypeError,


In [40]:
# Detailed view of High confidence matches
print("=" * 60)
print("üéØ HIGH CONFIDENCE MATCHES - DETAILED VIEW")
print("=" * 60)

high_confidence = [e for e in entries if e['confidence'] == 'High']

for i, entry in enumerate(high_confidence[:5], 1):
    print(f"\n--- Match #{i} (Score: {entry['score']:.2f}) ---")
    print(f"Entry ID: {entry['entryId']}")
    print(f"Name: {entry.get('name', 'N/A')}")
    print(f"Message Similarity: {entry['breakdown']['message']:.2f}")
    print(f"Stack Score: {entry['breakdown']['stack']}")
    print(f"Signals: {', '.join(entry.get('signals', []))}")
    print(f"Explanation: {entry['explanation'][:150]}...")
    if entry.get('linkedTickets'):
        print(f"üîó Linked Tickets: {entry['linkedTickets']}")

if not high_confidence:
    print("\nNo high confidence matches found - this is expected with generic stacks!")

üéØ HIGH CONFIDENCE MATCHES - DETAILED VIEW

No high confidence matches found - this is expected with generic stacks!


## üî¨ Compare: Semantic vs TF-IDF Similarity

Let's see how Gemini embeddings compare to the old TF-IDF approach:

In [41]:
# Compare semantic understanding on specific error messages
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine

test_messages = [
    "TypeError: Cannot read property 'address' of undefined",  # Incoming
    "TypeError: undefined is not an object (evaluating 't.address')",  # Same concept
    "Cannot access property 'address' on undefined value",  # Same concept, different words
    "null is not an object (evaluating 'this.barCodeValue.match')",  # Different bug
    "Attempted to assign to readonly property",  # Completely different
]

print("üî¨ SEMANTIC vs TF-IDF COMPARISON")
print("=" * 70)
print(f"\nIncoming: {test_messages[0]}\n")

# Gemini semantic similarities
semantic_sims = calculate_semantic_similarities(test_messages[0], test_messages[1:])

# TF-IDF similarities
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words="english")
tfidf_matrix = vectorizer.fit_transform(test_messages)
tfidf_sims = sklearn_cosine(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

# Compare
comparison_df = pd.DataFrame({
    'Error Message': test_messages[1:],
    'Semantic (Gemini)': [f"{s:.2f}" for s in semantic_sims],
    'TF-IDF': [f"{s:.2f}" for s in tfidf_sims],
    'Difference': [f"{semantic_sims[i] - tfidf_sims[i]:+.2f}" for i in range(len(semantic_sims))]
})

display(comparison_df)

üî¨ SEMANTIC vs TF-IDF COMPARISON

Incoming: TypeError: Cannot read property 'address' of undefined



Unnamed: 0,Error Message,Semantic (Gemini),TF-IDF,Difference
0,TypeError: undefined is not an object (evaluat...,0.93,0.24,0.69
1,Cannot access property 'address' on undefined ...,0.96,0.41,0.55
2,null is not an object (evaluating 'this.barCod...,0.66,0.0,0.66
3,Attempted to assign to readonly property,0.65,0.07,0.58


In [42]:
# Output full JSON result for debugging
print("üìÑ FULL JSON RESULT (for debugging):")
print(json.dumps(result, indent=2, default=str)[:3000] + "...")

üìÑ FULL JSON RESULT (for debugging):
{
  "batchSummary": {
    "totalAnalyzed": 213,
    "relatedFound": 206,
    "confidenceCounts": {
      "High": 0,
      "Medium": 0,
      "Low": 206
    }
  },
  "relatedEntries": [
    {
      "entryId": "1537105308",
      "name": "2025-Oct-16 13:59: Bugsnag ( fixed) -  PROD-MOBILE:undefined is not an object (evaluating 't.address')",
      "score": 0.48,
      "confidence": "Low",
      "signals": [
        "message"
      ],
      "explanation": "Top frame (app) matches (\u26a0\ufe0f generic/minified); Line 1 match (ignored - likely minified); Secondary frame match: chart.mjs; High stack similarity (100.0%) (\u26a0\ufe0f generic stack).",
      "breakdown": {
        "stack": 20,
        "message": 0.96,
        "environment": 3,
        "temporal": 0
      },
      "topFrame": "app:1",
      "stackOverlap": 100.0,
      "errorType": "TypeError",
      "route": "/",
      "environment": null,
      "linkedTickets": "{'id': '1501144846', 'or

In [43]:
import numpy as np
import pandas as pd
import re
from datetime import datetime
from typing import List, Dict, Any
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
# Read the local file
VeociEntries = pd.read_json('Get_Veoci_Form_Entries.json')

> **Note:** This file simulates Veoci API output and will later be replaced by an HTTP payload.

In [45]:
incoming_entry_raw = [
  {
    "headers": {
      "x-forwarded-for": "3.208.214.5",
      "x-forwarded-proto": "https",
      "x-forwarded-port": "443",
      "host": "n8n.stg.veoci.com",
      "x-amzn-trace-id": "Root=1-6933355c-32df78546884c9ae2876637f",
      "content-length": "1692",
      "content-type": "application/json; charset=UTF-8",
      "user-agent": "Apache-HttpClient/4.5.13 (Java/21.0.9)",
      "accept-encoding": "gzip,deflate"
    },
    "params": {},
    "query": {},
    "body": {
      "5": "Fri Dec 05 2025 14:41:07 GMT-0500 (Eastern Standard Time): User logged out",
      "6": "/",
      "8": "Stacktrace ionic://localhost/js/app.js:1 - /******/ (function() { // webpackBootstrap src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2726 - if ((reverse && position !== 'right') || (!reverse && position === 'right')) { src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2726 - if ((reverse && position !== 'right') || (!reverse && position === 'right')) { src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2758 - titleX = offsetFromEdge(scale, position, offset); src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2758 - titleX = offsetFromEdge(scale, position, offset); src-cordova/www/js/webpack:/node_modules/chart.js/dist/chart.mjs:2758 - titleX = offsetFromEdge(scale, position, offset);",
      "16": "Mobile Safari UI/WKWebView - 18.6.2",
      "18": "6.0.545322-ios - PROD-MOBILE",
      "21": "5f1875273e1741000a1a4a25",
      "24": [],
      "25": "PROD-MOBILE",
      "26": "veoci-mobile-client",
      "27": "Error",
      "34": "Error ID: 693335535d860bbd20b41487 Error:Fri Dec 05 2025 14:41:07 GMT-0500 (Eastern Standard Time): User logged out Error occurred at: ionic://localhost/js/app.js:1 - /******/ (function() { // webpackBootstrap View error in Bugsnag",
      "id": "1580841464",
      "formId": "35430484",
      "name": "2025-Dec-05 14:41: Bugsnag ( open) -  PROD-MOBILE:Fri Dec 05 2025 14:41:07 GMT-0500 (Eastern Standard Time): User logged out",
      "lastModified": "2025-12-05T19:41:11Z",
      "created": "2025-12-05T19:41:11Z",
      "containerName": "Veoci Ticketing",
      "containerId": "67813"
    },
    "webhookUrl": "https://flows.stg.veoci.com/webhook/bugsnag-triage",
    "executionMode": "production"
  }
]

In [46]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

# VeociEntries

In [47]:
# --- 1. HELPER FUNCTIONS ---

def is_vendor_file(file_path: str) -> bool:
    if not file_path: return False
    vendor_patterns = [
        r"^node_modules/", r"^@vue/", r"vuetify/", r"core-js", r"zone\.js",
        r"runtime-core\.esm-bundler\.js", r"reactivity\.esm-bundler\.js",
        r"proxiedModel\.mjs", r"app\..*\.js", r"vendor\..*\.js", r"chunk-.*\.js",
        r"LogbackBugsnagAppender\.java", r"AppenderBase\.java",
        r"AppenderAttachableImpl\.java", r"Logger\.java", r"ch\.qos\.logback\.",
        r"org\.springframework\.", r"org\.apache\.commons\.", r"java\.util\.",
        r"javax?\.", r"sun\.reflect\."
    ]
    return any(re.search(p, file_path) for p in vendor_patterns)

def parse_stack_frames(input_str: Any) -> List[Dict[str, Any]]:
    if not input_str or not isinstance(input_str, str): return []
    frames = []
    seen = set()

    # HTML format: <strong>file:line</strong> - code
    html_matches = re.findall(r'<strong>(.*?)</strong>', input_str)
    for raw in html_matches:
        raw = raw.strip()
        if raw in seen: continue
        seen.add(raw)
        
        # Handle "file:line - code"
        parts = raw.split(' - ', 1)
        file_line = parts[0]
        
        last_colon = file_line.rfind(':')
        if last_colon == -1: continue
        
        file_path = file_line[:last_colon]
        line_str = file_line[last_colon+1:]
        file_name = file_path.split('/')[-1]
        
        frames.append({
            'file': file_name,
            'line': int(line_str) if line_str.isdigit() else None,
            'vendor': is_vendor_file(file_path),
            'full_path': file_path
        })

    # Fallback: Plain text format
    if not frames:
        plain_matches = re.findall(r'([\w@:\/\.\-]+?\.(?:vue|js|ts|mjs|jsx|tsx|java)):(\d+)', input_str)
        for full_path, line_num in plain_matches:
            raw = f"{full_path}:{line_num}"
            if raw in seen: continue
            seen.add(raw)
            
            file_name = full_path.split('/')[-1]
            frames.append({
                'file': file_name,
                'line': int(line_num),
                'vendor': is_vendor_file(full_path),
                'full_path': full_path
            })
            
    return frames

In [48]:
# --- 2. NORMALIZATION ---

def normalize_veoci_entry(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Extracts fields from Veoci API format (nested in 'values') into Canonical Schema"""
    values = entry.get('values', {})
    
    def get_val(key):
        field = values.get(key, {})
        if not field: return None
        val = field.get('data', {}).get('value')
        if isinstance(val, list): return ", ".join(val) # Handle multi-value
        return val

    return {
        'entry_id': str(entry.get('id')),
        'project': get_val('21'),          # Project ID
        'release_stage': get_val('25'),    # Release Stage
        'app_version': get_val('18'),      # App Version
        'timestamp': entry.get('lastModified'), # Using lastModified as timestamp
        'error_message': get_val('5'),     # Error message
        'stack_frames': parse_stack_frames(get_val('8')), # Stack trace
        'name': entry.get('name')          # Keep name for reference
    }

def normalize_incoming_entry(entry_wrapper: Any) -> Dict[str, Any]:
    """Extracts fields from the Incoming_entry structure into Canonical Schema"""
    # Incoming_entry is a list containing a dict with 'body'
    if isinstance(entry_wrapper, list):
        entry = entry_wrapper[0].get('body', {})
    else:
        entry = entry_wrapper.get('body', {})

    return {
        'entry_id': str(entry.get('id')),
        'project': entry.get('21'),
        'release_stage': entry.get('25'),
        'app_version': entry.get('18'),
        'timestamp': entry.get('lastModified'),
        'error_message': entry.get('5'),
        'stack_frames': parse_stack_frames(entry.get('8')),
        'name': entry.get('name')
    }

In [49]:
# --- 3. APPLY NORMALIZATION ---

# Normalize Incoming Entry
incoming_entry = normalize_incoming_entry(incoming_entry_raw)
print(f"Normalized Incoming Entry: {incoming_entry['entry_id']}")

# Normalize Candidate Entries
candidate_entries = []
if "json" in VeociEntries.columns and not VeociEntries.empty:
    raw_entries = VeociEntries["json"][0]["entries"]
    candidate_entries = [normalize_veoci_entry(e) for e in raw_entries]

print(f"Normalized {len(candidate_entries)} Candidate Entries")

Normalized Incoming Entry: 1580841464
Normalized 213 Candidate Entries


### Canonical ErrorEntry schema

```json
{
  "entry_id": "str",
  "project": "str",
  "release_stage": "str",
  "app_version": "str",
  "timestamp": "str (ISO8601) or int",
  "error_message": "str",
  "stack_frames": [
    {"file": "str", "line": "int", "vendor": "bool"}
  ]
}
```

In [50]:
# --- 4. HARD GATES ---

def passes_hard_gates(incoming: Dict[str, Any], candidate: Dict[str, Any]) -> tuple[bool, List[str]]:
    """
    Returns (passed, reasons)
    passed: True if the candidate should be considered for scoring.
    reasons: List of reasons for failure (empty if passed).
    """
    reasons = []

    # 1. Self-comparison check
    if incoming['entry_id'] == candidate['entry_id']:
        reasons.append("Self-comparison")

    # 2. Project Mismatch (Critical)
    if incoming['project'] and candidate['project']:
        if incoming['project'] != candidate['project']:
            reasons.append(f"Project mismatch: {incoming['project']} != {candidate['project']}")
            
    # 3. Release Stage Mismatch (Optional)
    # if incoming['release_stage'] and candidate['release_stage']:
    #     if incoming['release_stage'] != candidate['release_stage']:
    #         reasons.append(f"Release stage mismatch: {incoming['release_stage']} != {candidate['release_stage']}")

    return len(reasons) == 0, reasons

In [51]:
# --- 5. SCORING LOGIC ---

def calculate_stack_score(incoming_frames: List[Dict], candidate_frames: List[Dict]) -> Dict[str, Any]:
    """Computes stack trace similarity score (0-40) and reasons."""
    score = 0
    reasons = []
    
    # Helper to select top N non-vendor frames
    def select_frames(frames):
        non_vendor = [f for f in frames if not f['vendor']]
        vendor = [f for f in frames if f['vendor']]
        chosen = non_vendor if non_vendor else vendor
        unique = []
        seen = set()
        for f in chosen:
            if f['file'] not in seen:
                seen.add(f['file'])
                unique.append(f)
            if len(unique) >= (3 if non_vendor else 2): break
        return unique

    inc_frames = select_frames(incoming_frames)
    cand_frames = select_frames(candidate_frames)
    
    matched_files = set()
    
    if inc_frames and cand_frames:
        # Top frame match (Critical)
        if inc_frames[0]['file'] == cand_frames[0]['file']:
            score += 25
            reasons.append(f"Top frame match: {inc_frames[0]['file']}")
            matched_files.add(inc_frames[0]['file'])
            
        # Secondary frame match
        if len(inc_frames) > 1 and len(cand_frames) > 1:
            if inc_frames[1]['file'] == cand_frames[1]['file']:
                score += 10
                reasons.append(f"Secondary frame match: {inc_frames[1]['file']}")
                matched_files.add(inc_frames[1]['file'])
                
        # Overlap
        for f in inc_frames:
            for cf in cand_frames:
                if f['file'] == cf['file'] and f['file'] not in matched_files:
                    score += 5
                    reasons.append(f"Frame overlap: {f['file']}")
                    matched_files.add(f['file'])

    return {"score": min(score, 40), "reasons": reasons}

def calculate_time_score(incoming_ts: str, candidate_ts: str) -> Dict[str, Any]:
    """Computes time decay score (-15 to +5) and reasons."""
    score = 0
    reasons = []
    
    if incoming_ts and candidate_ts:
        try:
            dt_inc = pd.to_datetime(incoming_ts)
            dt_cand = pd.to_datetime(candidate_ts)
            days_diff = abs((dt_inc - dt_cand).days)
            
            if days_diff <= 7:
                score += 5
                reasons.append("Recent (<= 7 days)")
            
            if days_diff > 180: score -= 15
            elif days_diff > 90: score -= 8
        except:
            pass
            
    return {"score": score, "reasons": reasons}

def calculate_context_score(incoming: Dict, candidate: Dict) -> Dict[str, Any]:
    """Computes metadata context score (0-30) and reasons."""
    score = 0
    reasons = []
    
    if incoming['project'] and candidate['project'] and incoming['project'] == candidate['project']:
        score += 15
        reasons.append("Same Project")
        
    if incoming['app_version'] and candidate['app_version'] and incoming['app_version'] == candidate['app_version']:
        score += 10
        reasons.append("Same App Version")
        
    if incoming['release_stage'] and candidate['release_stage'] and incoming['release_stage'] == candidate['release_stage']:
        score += 5
        reasons.append("Same Release Stage")
        
    return {"score": score, "reasons": reasons}

def calculate_total_score(
    incoming: Dict[str, Any], 
    candidate: Dict[str, Any], 
    message_similarity: float = 0.0
) -> Dict[str, Any]:
    """
    Pure function to calculate the total similarity score between an incoming error and a candidate.
    Aggregates Stack, Message, Context, and Time scores.
    """
    # 1. Stack Score (0-40)
    stack_res = calculate_stack_score(incoming['stack_frames'], candidate['stack_frames'])
    stack_score = stack_res['score']
    
    # 2. Message Score (Raw 0-1 input -> Scaled 0-30)
    # We store the RAW score for diagnostics, but use SCALED for total
    message_score_scaled = message_similarity * 30
    
    # 3. Context Score (0-30)
    context_res = calculate_context_score(incoming, candidate)
    context_score = context_res['score']
    
    # 4. Time Score (-15 to +5)
    time_res = calculate_time_score(incoming['timestamp'], candidate['timestamp'])
    time_score = time_res['score']
    
    # Aggregate
    total_score = (
        stack_score + 
        message_score_scaled + 
        context_score + 
        time_score
    )
    
    # Cap at 100, Floor at 0
    final_score = max(0, min(total_score, 100))
    
    all_reasons = (
        stack_res['reasons'] + 
        [f"Message similarity: {message_similarity:.2f}"] + 
        context_res['reasons'] + 
        time_res['reasons']
    )
    
    return {
        "entry_id": candidate['entry_id'],
        "name": candidate['name'],
        "final_score": final_score,
        "scores": {
            "stack": stack_score,
            "message": message_similarity, # Raw 0-1
            "context": context_score,
            "time": time_score
        },
        "reasons": all_reasons
    }

In [52]:
# --- 6. BUILD TF-IDF CORPUS ---

# Build TF-IDF corpus (incoming + gated candidates)
corpus = [incoming_entry["error_message"] or ""]

eligible_candidates = []
candidate_index_map = []  # maps TF-IDF index -> candidate index

for idx, candidate in enumerate(candidate_entries):
    passed, _ = passes_hard_gates(incoming_entry, candidate)
    if passed:
        corpus.append(candidate["error_message"] or "")
        eligible_candidates.append(candidate)
        candidate_index_map.append(idx)

print(f"TF-IDF Corpus Size: {len(corpus)} (1 Incoming + {len(eligible_candidates)} Candidates)")

TF-IDF Corpus Size: 213 (1 Incoming + 212 Candidates)


In [53]:
# --- 7. FIT TF-IDF ---

if len(corpus) > 1:
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        min_df=1,
        max_df=0.95
    )

    tfidf_matrix = vectorizer.fit_transform(corpus)
    print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
else:
    print("Not enough data for TF-IDF.")
    tfidf_matrix = None

TF-IDF Matrix Shape: (213, 1109)


In [54]:
# --- 8. COMPUTE COSINE SIMILARITIES ---

if tfidf_matrix is not None:
    incoming_vector = tfidf_matrix[0]
    candidate_vectors = tfidf_matrix[1:]

    message_similarities = cosine_similarity(
        incoming_vector,
        candidate_vectors
    )[0]
    print(f"Computed {len(message_similarities)} similarity scores.")
else:
    print("Skipping TF-IDF computation (no eligible candidates or corpus too small).")
    message_similarities = []

Computed 212 similarity scores.


### Context score
Context score captures metadata alignment (e.g., app version, release stage).
- **Range**: 0‚Äì30
- **Weight**: Low (supporting signal only)
- **Purpose**: Boosts candidates that match the environment of the incoming error, even if the stack/message match is imperfect.

In [55]:
# --- 9. EXECUTION PIPELINE (INTEGRATED) ---

print("Starting Integrated Similarity Pipeline...")

results = []

for i, candidate in enumerate(eligible_candidates):
    # Get pre-computed message similarity
    raw_msg_sim = message_similarities[i] if i < len(message_similarities) else 0.0
    
    # Calculate Total Score using the pure function
    result = calculate_total_score(incoming_entry, candidate, raw_msg_sim)
    
    if result['final_score'] > 0:
        results.append(result)

# Sort and Display
results.sort(key=lambda x: x['final_score'], reverse=True)
print(f"Found {len(results)} matches.")

Starting Integrated Similarity Pipeline...
Found 212 matches.


In [56]:
# --- 10. SANITY CHECK ---

if results:
    display_df = pd.DataFrame(results)
    
    # Flatten scores for display
    display_df['stack_score'] = display_df['scores'].apply(lambda x: x['stack'])
    display_df['message_raw'] = display_df['scores'].apply(lambda x: x['message']) # Raw 0-1
    display_df['context_score'] = display_df['scores'].apply(lambda x: x['context'])
    display_df['time_score'] = display_df['scores'].apply(lambda x: x['time'])
    
    cols = ['entry_id', 'final_score', 'stack_score', 'message_raw', 'context_score', 'time_score', 'name']
    display(display_df[cols].head(10))
else:
    print("No matches found.")

Unnamed: 0,entry_id,final_score,stack_score,message_raw,context_score,time_score,name
0,1577493686,80.549355,35,0.518312,30,0,2025-Dec-01 21:36: Bugsnag ( open) - PROD-MOB...
1,1573468818,72.746747,35,0.258225,30,0,2025-Nov-26 10:22: Bugsnag ( open) - PROD-MOB...
2,1574883386,66.411104,35,0.047037,30,0,2025-Nov-28 20:41: Bugsnag ( ignored) - PROD-...
3,1578456898,65.934662,35,0.031155,30,0,2025-Dec-03 11:15: Bugsnag ( ignored) - PROD-...
4,1577908710,65.0,35,0.0,30,0,2025-Dec-02 20:41: Bugsnag ( ignored) - PROD-...
5,1574414416,65.0,35,0.0,30,0,2025-Nov-28 04:43: Bugsnag ( open) - PROD-MOB...
6,1530220994,61.269356,35,0.208979,20,0,2025-Oct-09 06:28: Bugsnag ( open) - PROD-MOB...
7,1574820389,55.0,25,0.0,30,0,2025-Nov-28 07:55: Bugsnag ( open) - PROD-MOB...
8,1575214338,55.0,25,0.0,30,0,2025-Nov-29 08:20: Bugsnag ( open) - PROD-MOB...
9,1537105308,55.0,35,0.0,20,0,2025-Oct-16 13:59: Bugsnag ( fixed) - PROD-MO...


In [57]:
# --- 11. SIGNAL DISAGREEMENT ANALYSIS (FALSE POSITIVE REVIEW) ---

if results:
    analysis_df = pd.DataFrame(results)
    
    # Normalize scores to 0-1 for comparison
    # Stack max = 40, Message max = 1 (raw)
    analysis_df['stack_norm'] = analysis_df['scores'].apply(lambda x: x['stack']) / 40.0
    analysis_df['message_norm'] = analysis_df['scores'].apply(lambda x: x['message']) 
    
    # Calculate Disagreement (Message - Stack)
    # Positive: Message is stronger signal
    # Negative: Stack is stronger signal
    analysis_df['signal_diff'] = analysis_df['message_norm'] - analysis_df['stack_norm']
    
    # Prepare display columns
    analysis_df['stack_score'] = analysis_df['scores'].apply(lambda x: x['stack'])
    analysis_df['message_raw'] = analysis_df['scores'].apply(lambda x: x['message'])
    cols = ['entry_id', 'final_score', 'stack_score', 'message_raw', 'signal_diff', 'name']
    
    print("--- High Message / Low Stack (Potential Text-Driven Matches) ---")
    print("Look for: Different errors that happen to have similar words.")
    high_msg = analysis_df[analysis_df['signal_diff'] > 0.3].sort_values('signal_diff', ascending=False)
    if not high_msg.empty:
        display(high_msg[cols].head(5))
    else:
        print("No significant High Message / Low Stack cases found.")
        
    print("\n--- High Stack / Low Message (Potential Generic Stack Matches) ---")
    print("Look for: Same error code path, but different error message (e.g. dynamic error strings).")
    high_stack = analysis_df[analysis_df['signal_diff'] < -0.3].sort_values('signal_diff', ascending=True)
    if not high_stack.empty:
        display(high_stack[cols])
    else:
        print("No significant High Stack / Low Message cases found.")

--- High Message / Low Stack (Potential Text-Driven Matches) ---
Look for: Different errors that happen to have similar words.
No significant High Message / Low Stack cases found.

--- High Stack / Low Message (Potential Generic Stack Matches) ---
Look for: Same error code path, but different error message (e.g. dynamic error strings).


Unnamed: 0,entry_id,final_score,stack_score,message_raw,signal_diff,name
5,1574414416,65.000000,35,0.000000,-0.875000,2025-Nov-28 04:43: Bugsnag ( open) - PROD-MOB...
4,1577908710,65.000000,35,0.000000,-0.875000,2025-Dec-02 20:41: Bugsnag ( ignored) - PROD-...
13,1512402078,55.000000,35,0.000000,-0.875000,2025-Sep-18 10:46: Bugsnag ( fixed) - PROD-MO...
14,1321167631,55.000000,35,0.000000,-0.875000,2025-Feb-24 07:03: Bugsnag ( fixed) - PROD-MO...
12,1369970745,55.000000,35,0.000000,-0.875000,2025-Apr-16 15:26: Bugsnag ( fixed) - PROD-MO...
...,...,...,...,...,...,...
121,1548648440,40.000000,25,0.000000,-0.625000,2025-Oct-30 06:14: Bugsnag ( open) - STAGE-MO...
1,1573468818,72.746747,35,0.258225,-0.616775,2025-Nov-26 10:22: Bugsnag ( open) - PROD-MOB...
0,1577493686,80.549355,35,0.518312,-0.356688,2025-Dec-01 21:36: Bugsnag ( open) - PROD-MOB...
39,1561273567,53.183985,25,0.272799,-0.352201,2025-Nov-13 13:54: Bugsnag ( open) - PROD-MOB...


In [58]:
# --- 12. GENERATE TRIAGE REPORT (AGENT-STYLE) ---
import json

def generate_triage_report(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Generates a report similar to the Bugsnag Triage Agent's output.
    """
    
    report = {
        "batchSummary": {
            "totalAnalyzed": len(eligible_candidates),
            "relatedFound": len(results),
            "confidenceCounts": {"High": 0, "Medium": 0, "Low": 0}
        },
        "relatedEntries": []
    }
    
    for res in results:
        # Map 0-100 score to Confidence
        score = res['final_score']
        if score >= 85: confidence = "High"
        elif score >= 70: confidence = "Medium"
        else: confidence = "Low"
        
        report["batchSummary"]["confidenceCounts"][confidence] += 1
        
        # Generate Explanation from Reasons
        explanation = f"Match found with {confidence} confidence ({score:.1f}/100). "
        explanation += "Key signals: " + "; ".join(res['reasons'][:3]) + "."
        
        entry = {
            "entryId": res['entry_id'],
            "score": score / 100.0, # Normalize to 0-1 for compatibility
            "confidence": confidence,
            "signals": list(res['scores'].keys()),
            "explanation": explanation,
            "breakdown": res['scores']
        }
        report["relatedEntries"].append(entry)
        
    return report

# Generate and Display Report
triage_report = generate_triage_report(results)

print("--- TRIAGE REPORT (AGENT STYLE) ---")
# Print only the first 2 entries to avoid token limits
short_report = triage_report.copy()
short_report['relatedEntries'] = short_report['relatedEntries'][:2]
print(json.dumps(short_report, indent=2))

--- TRIAGE REPORT (AGENT STYLE) ---
{
  "batchSummary": {
    "totalAnalyzed": 212,
    "relatedFound": 212,
    "confidenceCounts": {
      "High": 0,
      "Medium": 2,
      "Low": 210
    }
  },
  "relatedEntries": [
    {
      "entryId": "1577493686",
      "score": 0.8054935456015423,
      "confidence": "Medium",
      "signals": [
        "stack",
        "message",
        "context",
        "time"
      ],
      "explanation": "Match found with Medium confidence (80.5/100). Key signals: Top frame match: app.js; Secondary frame match: chart.mjs; Message similarity: 0.52.",
      "breakdown": {
        "stack": 35,
        "message": 0.5183118186718078,
        "context": 30,
        "time": 0
      }
    },
    {
      "entryId": "1573468818",
      "score": 0.7274674666421115,
      "confidence": "Medium",
      "signals": [
        "stack",
        "message",
        "context",
        "time"
      ],
      "explanation": "Match found with Medium confidence (72.7/100). Key 