In [1]:
import re
from transformers import pipeline
import numpy as np

class TextModerationSystem:
    def __init__(self):
        # Initialize pre-trained models
        self.toxicity_classifier = pipeline(
            "text-classification",
            model="unitary/toxic-bert"
        )
        self.spam_detector = pipeline(
            "text-classification",
            model="madhurjindal/autonlp-Gibberish-Detector-492513457"
        )

        # Define moderation categories for your platform
        self.moderation_rules = {
            'spam_keywords': [
                'buy followers', 'fake engagement', 'guaranteed viral',
                'instant fame', 'bot followers', 'cheap views'
            ],
            'inappropriate_contact': [
                'whatsapp', 'telegram', 'personal phone', 'meet offline'
            ],
            'scam_patterns': [
                'send money first', 'payment upfront', 'wire transfer',
                'bitcoin payment', 'western union'
            ]
        }


In [2]:
def moderate_message(self, text, context="general"):
    """
    Moderate text messages between influencers and brands
    """
    results = {
        "approved": True,
        "confidence": 0.0,
        "flags": [],
        "severity": "safe"
    }

    # 1. Toxicity Detection
    toxicity_result = self.toxicity_classifier(text)
    toxicity_score = toxicity_result[0]['score'] if toxicity_result[0]['label'] == 'TOXIC' else 0

    if toxicity_score > 0.8:
        results["approved"] = False
        results["flags"].append("high_toxicity")
        results["severity"] = "high"
    elif toxicity_score > 0.5:
        results["flags"].append("moderate_toxicity")
        results["severity"] = "medium"

    # 2. Spam Detection
    spam_result = self.spam_detector(text)
    if spam_result[0]['label'] == 'spam' and spam_result[0]['score'] > 0.7:
        results["approved"] = False
        results["flags"].append("spam_content")

    # 3. Platform-specific rule checking
    text_lower = text.lower()

    # Check for spam keywords
    for keyword in self.moderation_rules['spam_keywords']:
        if keyword in text_lower:
            results["flags"].append(f"spam_keyword: {keyword}")
            results["severity"] = "medium"

    # Check for inappropriate contact attempts
    for contact_attempt in self.moderation_rules['inappropriate_contact']:
        if contact_attempt in text_lower:
            results["flags"].append("inappropriate_contact_sharing")
            results["severity"] = "medium"

    # Check for scam patterns
    for scam_pattern in self.moderation_rules['scam_patterns']:
        if scam_pattern in text_lower:
            results["approved"] = False
            results["flags"].append("potential_scam")
            results["severity"] = "high"

    results["confidence"] = max(toxicity_score, 0.5 if results["flags"] else 0.1)

    return results


In [3]:

def moderate_profile_data(self, profile_data):
    """
    Moderate influencer/brand profile information
    """
    moderation_results = {}

    # Moderate different profile fields
    fields_to_check = ['bio', 'description', 'company_name', 'tagline']

    for field in fields_to_check:
        if field in profile_data:
            field_result = self.moderate_message(
                profile_data[field],
                context="profile"
            )
            moderation_results[field] = field_result

    # Overall profile approval
    overall_approved = all(
        result["approved"] for result in moderation_results.values()
    )

    return {
        "profile_approved": overall_approved,
        "field_results": moderation_results,
        "action_required": not overall_approved
    }


In [14]:
# --- FIXED MODERATION SYSTEM (Case Sensitivity Bug Fixed) ---

from transformers import pipeline
import re

class FixedModerationSystem:
    def __init__(self):
        print("🔧 Loading AI moderation model...")

        self.toxicity_classifier = pipeline(
            "text-classification",
            model="unitary/toxic-bert"
        )

        # Platform-specific rules
        self.platform_rules = {
            'spam_keywords': [
                'buy followers', 'fake followers', 'guaranteed viral',
                'instant fame', 'cheap views', 'bot followers'
            ],
            'scam_patterns': [
                'send money first', 'payment upfront', 'wire transfer',
                'bitcoin payment', 'advance payment'
            ],
            'contact_sharing': [
                'whatsapp', 'telegram', 'my number', 'call me at'
            ]
        }

        print("✅ Fixed moderation system ready!")

    def moderate_message(self, text):
        """
        FIXED: Case-insensitive AI toxicity detection
        """
        results = {
            "approved": True,
            "confidence": 0.0,
            "flags": [],
            "ai_decision": None
        }

        # 1. AI Toxicity Detection (FIXED: case-insensitive)
        try:
            ai_result = self.toxicity_classifier(text)

            for item in ai_result:
                label = item['label'].lower()  # FIXED: Convert to lowercase
                score = item['score']

                results["ai_decision"] = {
                    "label": label,
                    "score": score,
                    "original_label": item['label']  # Keep original for debugging
                }

                # FIXED: Check for lowercase 'toxic'
                if label == 'toxic' and score > 0.5:
                    results["approved"] = False
                    results["flags"].append("ai_toxic")
                    results["confidence"] = score
                    print(f"🚫 BLOCKING: AI detected toxic content (score: {score:.3f})")
                    return results
                elif label == 'toxic':
                    results["confidence"] = score
                    print(f"⚠️ Low toxicity detected (score: {score:.3f})")

        except Exception as e:
            print(f"❌ AI Error: {e}")

        # 2. Platform-specific rules
        text_lower = text.lower()

        # Spam detection
        for spam in self.platform_rules['spam_keywords']:
            if spam in text_lower:
                results["approved"] = False
                results["flags"].append("spam")
                results["confidence"] = 0.8
                return results

        # Scam detection
        for scam in self.platform_rules['scam_patterns']:
            if scam in text_lower:
                results["approved"] = False
                results["flags"].append("scam")
                results["confidence"] = 0.9
                return results

        # Contact sharing (flag only)
        for contact in self.platform_rules['contact_sharing']:
            if contact in text_lower:
                results["flags"].append("contact_sharing")

        # Set default confidence if not set by AI
        if results["confidence"] == 0.0:
            results["confidence"] = 0.1

        return results

def fixed_display(text, result):
    """Clean result display"""
    print(f"\n📝 TESTING: '{text}'")
    print("=" * 50)

    # Status with clear indication
    if result["approved"]:
        print("✅ STATUS: APPROVED")
    else:
        print("❌ STATUS: BLOCKED")

    print(f"🎯 CONFIDENCE: {result['confidence']:.3f}")

    if result["flags"]:
        print(f"🚩 FLAGS: {', '.join(result['flags'])}")

    # Show AI decision details
    if result.get("ai_decision"):
        ai_info = result["ai_decision"]
        print(f"🤖 AI DECISION:")
        print(f"   Label: {ai_info['label']} (original: {ai_info['original_label']})")
        print(f"   Score: {ai_info['score']:.3f}")

    print("=" * 50)

def test_fixed_system():
    """Test the fixed moderation system"""
    moderator = FixedModerationSystem()

    # Test the cases that were failing
    print("\n🧪 TESTING PREVIOUSLY FAILING CASES:")
    failing_cases = ["fuck you", "kill you", "hello there", "buy followers"]

    for test_text in failing_cases:
        result = moderator.moderate_message(test_text)
        fixed_display(test_text, result)

    print("\n🔬 INTERACTIVE TESTING:")
    while True:
        user_input = input("\nEnter message to test (or 'quit'): ").strip()

        if user_input.lower() == 'quit':
            break
        elif not user_input:
            continue

        result = moderator.moderate_message(user_input)
        fixed_display(user_input, result)

# Run the fixed system
test_fixed_system()



🔧 Loading AI moderation model...


Device set to use cpu


✅ Fixed moderation system ready!

🧪 TESTING PREVIOUSLY FAILING CASES:
🚫 BLOCKING: AI detected toxic content (score: 0.998)

📝 TESTING: 'fuck you'
❌ STATUS: BLOCKED
🎯 CONFIDENCE: 0.998
🚩 FLAGS: ai_toxic
🤖 AI DECISION:
   Label: toxic (original: toxic)
   Score: 0.998
🚫 BLOCKING: AI detected toxic content (score: 0.922)

📝 TESTING: 'kill you'
❌ STATUS: BLOCKED
🎯 CONFIDENCE: 0.922
🚩 FLAGS: ai_toxic
🤖 AI DECISION:
   Label: toxic (original: toxic)
   Score: 0.922
⚠️ Low toxicity detected (score: 0.001)

📝 TESTING: 'hello there'
✅ STATUS: APPROVED
🎯 CONFIDENCE: 0.001
🤖 AI DECISION:
   Label: toxic (original: toxic)
   Score: 0.001
⚠️ Low toxicity detected (score: 0.003)

📝 TESTING: 'buy followers'
❌ STATUS: BLOCKED
🎯 CONFIDENCE: 0.800
🚩 FLAGS: spam
🤖 AI DECISION:
   Label: toxic (original: toxic)
   Score: 0.003

🔬 INTERACTIVE TESTING:

Enter message to test (or 'quit'): kill you
🚫 BLOCKING: AI detected toxic content (score: 0.922)

📝 TESTING: 'kill you'
❌ STATUS: BLOCKED
🎯 CONFIDENCE: 0.92

In [17]:
# To save the toxicity classifier model, you can use the save_pretrained method.
# First, instantiate the FixedModerationSystem to load the model
moderation_system = FixedModerationSystem()

# Now save the toxicity_classifier
save_directory = "./saved_toxicity_model"
moderation_system.toxicity_classifier.save_pretrained(save_directory)

print(f"Toxicity classifier model saved to {save_directory}")

🔧 Loading AI moderation model...


Device set to use cpu


✅ Fixed moderation system ready!
Toxicity classifier model saved to ./saved_toxicity_model


In [19]:
def interpret_toxicity_result(text, ai_result):
    """
    Convert AI toxicity scores into actionable decisions
    """
    score = ai_result[0]['score']

    if score > 0.8:
        return {
            "action": "block",
            "reason": "high_toxicity",
            "confidence": score,
            "message": f"Content blocked - {score:.1%} toxic confidence"
        }
    elif score > 0.5:
        return {
            "action": "flag",
            "reason": "moderate_toxicity",
            "confidence": score,
            "message": f"Content flagged for review - {score:.1%} toxic confidence"
        }
    elif score > 0.3:
        return {
            "action": "warn",
            "reason": "borderline_content",
            "confidence": score,
            "message": f"Borderline content detected - {score:.1%} toxic confidence"
        }
    else:
        return {
            "action": "approve",
            "reason": "safe_content",
            "confidence": score,
            "message": f"Content approved - {score:.1%} toxic confidence"
        }

# Test with your examples
test_texts = [
    "This is a completely harmless message.",
    "You are a terrible person!",
    "I hate everything about this.",
    "What a wonderful day!"
]

print("🧪 INTERPRETING YOUR MODEL RESULTS:")
print("=" * 50)

for text in test_texts:
    ai_result = loaded_toxicity_classifier(text)
    interpretation = interpret_toxicity_result(text, ai_result)

    print(f"\n📝 Text: '{text}'")
    print(f"🤖 AI Score: {ai_result[0]['score']:.1%}")
    print(f"⚡ Action: {interpretation['action'].upper()}")
    print(f"💬 Message: {interpretation['message']}")


🧪 INTERPRETING YOUR MODEL RESULTS:

📝 Text: 'This is a completely harmless message.'
🤖 AI Score: 0.1%
⚡ Action: APPROVE
💬 Message: Content approved - 0.1% toxic confidence

📝 Text: 'You are a terrible person!'
🤖 AI Score: 92.5%
⚡ Action: BLOCK
💬 Message: Content blocked - 92.5% toxic confidence

📝 Text: 'I hate everything about this.'
🤖 AI Score: 31.9%
⚡ Action: WARN
💬 Message: Borderline content detected - 31.9% toxic confidence

📝 Text: 'What a wonderful day!'
🤖 AI Score: 0.1%
⚡ Action: APPROVE
💬 Message: Content approved - 0.1% toxic confidence


In [20]:
# --- SAVE COMPLETE TOXICITY MODEL PIPELINE ---

from transformers import pipeline
import os
import json
from datetime import datetime

# Your current loaded model
toxicity_classifier = pipeline(
    "text-classification",
    model="unitary/toxic-bert"
)

def save_toxicity_model_complete(model_pipeline, save_path="toxicity_model_v1"):
    """
    Save the complete toxicity model pipeline with metadata
    """
    print(f"💾 Saving toxicity model to {save_path}...")

    # Create directory structure
    os.makedirs(save_path, exist_ok=True)

    # Save the model pipeline
    model_pipeline.save_pretrained(save_path)

    # Save model metadata
    metadata = {
        "model_name": "unitary/toxic-bert",
        "model_type": "toxicity_classifier",
        "saved_date": datetime.now().isoformat(),
        "performance_thresholds": {
            "block_threshold": 0.7,
            "flag_threshold": 0.4,
            "approve_threshold": 0.1
        },
        "test_cases": [
            {"text": "This is harmless", "expected_score": "< 0.1"},
            {"text": "You are terrible", "expected_score": "> 0.8"},
            {"text": "I hate this", "expected_score": "0.3-0.4"}
        ]
    }

    with open(os.path.join(save_path, "model_metadata.json"), 'w') as f:
        json.dump(metadata, f, indent=2)

    print(f"✅ Model saved successfully to {save_path}/")
    return save_path

# Save your model
saved_path = save_toxicity_model_complete(toxicity_classifier)


Device set to use cpu


💾 Saving toxicity model to toxicity_model_v1...
✅ Model saved successfully to toxicity_model_v1/


In [21]:
# --- TEST SAVED TOXICITY MODEL ---

from transformers import pipeline
import json
import pickle

def test_saved_toxicity_model():
    """Test your saved toxicity detection model"""
    print("🧪 TESTING SAVED TOXICITY MODEL")
    print("=" * 40)

    try:
        # Load the saved model (adjust path as needed)
        loaded_model = pipeline(
            "text-classification",
            model="./saved_toxicity_basic"  # or your saved path
        )
        print("✅ Model loaded successfully!")

        # Test cases to verify functionality
        test_cases = [
            ("Safe content", "Hi! I'd love to collaborate on your campaign"),
            ("Toxic content", "You are a terrible person"),
            ("Spam content", "Buy 10k followers for $20"),
            ("Borderline", "I really hate this situation"),
            ("Professional", "When can we discuss partnership terms?")
        ]

        print("\n📋 TESTING RESULTS:")
        print("-" * 30)

        for category, text in test_cases:
            result = loaded_model(text)
            score = result[0]['score']

            # Determine action based on score
            if score > 0.7:
                action = "🚫 BLOCK"
            elif score > 0.4:
                action = "⚠️ FLAG"
            else:
                action = "✅ APPROVE"

            print(f"{category.upper():12} | {action} | Score: {score:.3f}")
            print(f"             | Text: '{text[:50]}...'")
            print()

        return True

    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return False

# Run the test
model_works = test_saved_toxicity_model()


🧪 TESTING SAVED TOXICITY MODEL
❌ Error loading model: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './saved_toxicity_basic'.


In [22]:
# --- CHECK WHAT MODELS EXIST ---

import os
import glob

def check_saved_models():
    """Check what model files actually exist"""
    print("🔍 CHECKING FOR SAVED MODELS")
    print("=" * 35)

    # Check current directory for model files
    current_dir_files = os.listdir('.')
    print("📁 Files in current directory:")
    for file in current_dir_files:
        if 'model' in file.lower() or 'toxicity' in file.lower():
            print(f"  • {file}")

    # Check for common model directories
    possible_model_dirs = [
        'saved_toxicity_basic',
        'toxicity_model_v1',
        'automated_content_moderation_v1',
        'influencer_brand_model'
    ]

    print(f"\n📂 Checking for model directories:")
    existing_dirs = []
    for dir_name in possible_model_dirs:
        if os.path.exists(dir_name):
            print(f"  ✅ {dir_name} - EXISTS")
            existing_dirs.append(dir_name)
        else:
            print(f"  ❌ {dir_name} - NOT FOUND")

    # Check for pickle files
    pickle_files = glob.glob("*.pkl")
    if pickle_files:
        print(f"\n🥒 Pickle files found:")
        for file in pickle_files:
            print(f"  • {file}")

    return existing_dirs, pickle_files

# Check what you have
existing_models, pickle_files = check_saved_models()


🔍 CHECKING FOR SAVED MODELS
📁 Files in current directory:
  • saved_toxicity_model
  • toxicity_model_v1

📂 Checking for model directories:
  ❌ saved_toxicity_basic - NOT FOUND
  ✅ toxicity_model_v1 - EXISTS
  ❌ automated_content_moderation_v1 - NOT FOUND
  ❌ influencer_brand_model - NOT FOUND


In [23]:
# --- TEST YOUR EXISTING MODELS ---

from transformers import pipeline
import os

def test_existing_models():
    """Test the models you already have"""
    print("🧪 TESTING YOUR EXISTING MODELS")
    print("=" * 40)

    models_to_test = [
        ("toxicity_model_v1", "directory"),
        ("saved_toxicity_model", "file_or_directory")
    ]

    successful_models = []

    for model_path, model_type in models_to_test:
        print(f"\n📥 Testing: {model_path}")
        print("-" * 30)

        try:
            # Try loading the model
            model = pipeline("text-classification", model=model_path)
            print(f"✅ {model_path} loaded successfully!")

            # Test with sample texts
            test_cases = [
                ("Safe", "Hello! I'd love to collaborate."),
                ("Toxic", "You are terrible!"),
                ("Profanity", "fuck you"),
                ("Spam", "Buy followers cheap!")
            ]

            print("Testing results:")
            for category, text in test_cases:
                result = model(text)
                score = result[0]['score']
                action = "BLOCK" if score > 0.5 else "ALLOW"
                print(f"  {category:<8} | {action:<5} | {score:.3f} | '{text}'")

            successful_models.append((model_path, model))

        except Exception as e:
            print(f"❌ Failed to load {model_path}: {e}")

    return successful_models

# Test your existing models
working_models = test_existing_models()


Device set to use cpu


🧪 TESTING YOUR EXISTING MODELS

📥 Testing: toxicity_model_v1
------------------------------
✅ toxicity_model_v1 loaded successfully!
Testing results:
  Safe     | ALLOW | 0.001 | 'Hello! I'd love to collaborate.'
  Toxic    | BLOCK | 0.939 | 'You are terrible!'
  Profanity | BLOCK | 0.998 | 'fuck you'
  Spam     | ALLOW | 0.004 | 'Buy followers cheap!'

📥 Testing: saved_toxicity_model
------------------------------


Device set to use cpu


✅ saved_toxicity_model loaded successfully!
Testing results:
  Safe     | ALLOW | 0.001 | 'Hello! I'd love to collaborate.'
  Toxic    | BLOCK | 0.939 | 'You are terrible!'
  Profanity | BLOCK | 0.998 | 'fuck you'
  Spam     | ALLOW | 0.004 | 'Buy followers cheap!'


In [24]:
# --- DETAILED TESTING OF WORKING MODELS ---

def detailed_model_testing(working_models):
    """Run comprehensive tests on working models"""
    print(f"\n🔬 COMPREHENSIVE MODEL TESTING")
    print("=" * 50)

    if not working_models:
        print("❌ No working models found to test")
        return

    # Use the first working model for detailed testing
    model_path, model = working_models[0]
    print(f"🎯 Using model: {model_path}")

    # Comprehensive test cases
    comprehensive_tests = [
        # Safe content
        ("Business", "Hi! I'd love to collaborate on your skincare campaign."),
        ("Professional", "When would be a good time to discuss partnership details?"),
        ("Friendly", "Thank you for considering me for this opportunity."),

        # Toxic content
        ("Profanity", "fuck you and your stupid company"),
        ("Insult", "You are a terrible person!"),
        ("Harassment", "I hate you so much"),

        # Threats
        ("Threat", "I'll kill you"),
        ("Violence", "I'm gonna hurt you"),

        # Spam/Scam
        ("Spam", "Buy 10k followers for just $20!"),
        ("Scam", "Send money first via Bitcoin payment"),

        # Borderline cases
        ("Frustration", "This is really annoying"),
        ("Criticism", "Your service sucks"),
        ("Mild negative", "I don't like this deal")
    ]

    print(f"\n📊 DETAILED TEST RESULTS:")
    print("-" * 70)
    print(f"{'CATEGORY':<12} | {'ACTION':<8} | {'SCORE':<6} | {'CONFIDENCE':<10} | TEXT")
    print("-" * 70)

    results_summary = {
        "total": len(comprehensive_tests),
        "blocked": 0,
        "allowed": 0,
        "high_confidence": 0
    }

    for category, text in comprehensive_tests:
        try:
            result = model(text)
            score = result[0]['score']

            # Determine action and confidence
            if score > 0.7:
                action = "🚫 BLOCK"
                confidence = "HIGH"
                results_summary["blocked"] += 1
                results_summary["high_confidence"] += 1
            elif score > 0.4:
                action = "⚠️ FLAG"
                confidence = "MEDIUM"
                results_summary["blocked"] += 1
            else:
                action = "✅ ALLOW"
                confidence = "LOW"
                results_summary["allowed"] += 1

            if score > 0.8:
                results_summary["high_confidence"] += 1

            print(f"{category:<12} | {action:<8} | {score:<6.3f} | {confidence:<10} | {text[:25]}...")

        except Exception as e:
            print(f"{category:<12} | ERROR    | ------ | ERROR      | {text[:25]}...")

    print("-" * 70)
    print(f"📈 SUMMARY:")
    print(f"  Total tests: {results_summary['total']}")
    print(f"  Blocked/Flagged: {results_summary['blocked']}")
    print(f"  Allowed: {results_summary['allowed']}")
    print(f"  High confidence decisions: {results_summary['high_confidence']}")

    # Calculate accuracy estimate
    expected_blocks = 8  # Toxic, threats, spam categories
    actual_blocks = results_summary['blocked']
    accuracy_estimate = min(actual_blocks / expected_blocks, 1.0) * 100

    print(f"  Estimated accuracy: {accuracy_estimate:.1f}%")

    return results_summary

# Run detailed testing
if working_models:
    test_summary = detailed_model_testing(working_models)



🔬 COMPREHENSIVE MODEL TESTING
🎯 Using model: toxicity_model_v1

📊 DETAILED TEST RESULTS:
----------------------------------------------------------------------
CATEGORY     | ACTION   | SCORE  | CONFIDENCE | TEXT
----------------------------------------------------------------------
Business     | ✅ ALLOW  | 0.001  | LOW        | Hi! I'd love to collabora...
Professional | ✅ ALLOW  | 0.001  | LOW        | When would be a good time...
Friendly     | ✅ ALLOW  | 0.001  | LOW        | Thank you for considering...
Profanity    | 🚫 BLOCK  | 0.998  | HIGH       | fuck you and your stupid ...
Insult       | 🚫 BLOCK  | 0.925  | HIGH       | You are a terrible person...
Harassment   | 🚫 BLOCK  | 0.930  | HIGH       | I hate you so much...
Threat       | 🚫 BLOCK  | 0.902  | HIGH       | I'll kill you...
Violence     | 🚫 BLOCK  | 0.864  | HIGH       | I'm gonna hurt you...
Spam         | ✅ ALLOW  | 0.004  | LOW        | Buy 10k followers for jus...
Scam         | ✅ ALLOW  | 0.004  | LOW        | 

In [25]:
# --- INTERACTIVE TESTING ---

def interactive_testing(working_models):
    """Interactive testing interface for your models"""
    if not working_models:
        print("❌ No working models available for interactive testing")
        return

    model_path, model = working_models[0]
    print(f"🎮 INTERACTIVE TESTING WITH: {model_path}")
    print("=" * 50)
    print("Type messages to test toxicity detection.")
    print("Commands: 'quit' to exit, 'stats' for statistics")
    print("-" * 50)

    test_stats = {"total": 0, "blocked": 0, "flagged": 0, "allowed": 0}

    while True:
        user_input = input("\nEnter text to test: ").strip()

        if user_input.lower() == 'quit':
            break
        elif user_input.lower() == 'stats':
            print(f"\n📊 SESSION STATS:")
            print(f"  Tests run: {test_stats['total']}")
            print(f"  Blocked: {test_stats['blocked']}")
            print(f"  Flagged: {test_stats['flagged']}")
            print(f"  Allowed: {test_stats['allowed']}")
            continue
        elif not user_input:
            print("⚠️ Please enter some text to test")
            continue

        try:
            # Test the input
            result = model(user_input)
            score = result[0]['score']
            test_stats["total"] += 1

            # Determine action
            if score > 0.7:
                action = "🚫 BLOCKED"
                reason = "High toxicity detected"
                test_stats["blocked"] += 1
            elif score > 0.4:
                action = "⚠️ FLAGGED"
                reason = "Moderate toxicity detected"
                test_stats["flagged"] += 1
            else:
                action = "✅ ALLOWED"
                reason = "Content appears safe"
                test_stats["allowed"] += 1

            print(f"\n📝 TEXT: '{user_input}'")
            print(f"🤖 SCORE: {score:.3f} ({score*100:.1f}% toxic)")
            print(f"⚡ ACTION: {action}")
            print(f"💭 REASON: {reason}")

        except Exception as e:
            print(f"❌ Error testing text: {e}")

# Run interactive testing
if working_models:
    interactive_testing(working_models)


🎮 INTERACTIVE TESTING WITH: toxicity_model_v1
Type messages to test toxicity detection.
Commands: 'quit' to exit, 'stats' for statistics
--------------------------------------------------

Enter text to test: i have to know about you

📝 TEXT: 'i have to know about you'
🤖 SCORE: 0.001 (0.1% toxic)
⚡ ACTION: ✅ ALLOWED
💭 REASON: Content appears safe

Enter text to test: can you provide your website link

📝 TEXT: 'can you provide your website link'
🤖 SCORE: 0.001 (0.1% toxic)
⚡ ACTION: ✅ ALLOWED
💭 REASON: Content appears safe

Enter text to test: i will kill you

📝 TEXT: 'i will kill you'
🤖 SCORE: 0.907 (90.7% toxic)
⚡ ACTION: 🚫 BLOCKED
💭 REASON: High toxicity detected

Enter text to test: you are tribble

📝 TEXT: 'you are tribble'
🤖 SCORE: 0.681 (68.1% toxic)
⚡ ACTION: ⚠️ FLAGGED
💭 REASON: Moderate toxicity detected

Enter text to test: quit
