# OCR-Based Spam and Gambling Detection System


In [1]:
!pip install -q transformers torch easyocr pillow opencv-python

print("[INFO] All libraries installed successfully")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/2.9 MB[0m [31m20.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.9/2.9 MB[0m [31m56.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.2/978.2 kB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.6/300.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25h[INFO] All libraries installed successfully


In [2]:
import torch
from transformers import pipeline
import easyocr
import cv2
from PIL import Image
import numpy as np
import os
import json
from datetime import datetime

# Pick the inference device once: `device` is the index later handed to the
# transformers pipeline (0 when CUDA is available, -1 otherwise) and
# `device_name` is the human-readable label used for logging.
if torch.cuda.is_available():
    device = 0
    device_name = 'GPU (CUDA)'
else:
    device = -1
    device_name = 'CPU'

print(f"[INFO] Device: {device_name}")
print(f"[INFO] PyTorch version: {torch.__version__}")

[INFO] Device: GPU (CUDA)
[INFO] PyTorch version: 2.9.0+cu126


In [3]:
# Model loading cell: builds the two module-level objects used by later cells,
# `ocr_reader` (EasyOCR) and `zero_shot_classifier` (transformers pipeline), and
# reports how long each load took.
print("[INFO] Loading models... This may take 2-3 minutes on first run.\n")

# Load EasyOCR
# The reader is created for the 'en' language list only and uses the GPU when
# CUDA is available.
print("[INFO] Loading EasyOCR...")
start = datetime.now()
ocr_reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
elapsed = (datetime.now() - start).total_seconds()
print(f"[SUCCESS] OCR loaded in {elapsed:.1f}s\n")

# Load Zero-Shot Classifier
# `device` comes from the device-selection cell (0 for CUDA, -1 for CPU).
# `start` / `elapsed` are reused here, so after this cell they describe the
# classifier load, not the OCR load.
print("[INFO] Loading Zero-Shot Classifier (facebook/bart-large-mnli)...")
start = datetime.now()
zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=device
)
elapsed = (datetime.now() - start).total_seconds()
print(f"[SUCCESS] Zero-Shot Classifier loaded in {elapsed:.1f}s\n")

print("[SUCCESS] All models ready")



[INFO] Loading models... This may take 2-3 minutes on first run.

[INFO] Loading EasyOCR...
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/515 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[SUCCESS] Zero-Shot Classifier loaded in 26.8s

[SUCCESS] All models ready


In [4]:
def extract_text_from_video(video_path, num_frames=16):
    """
    Extract on-screen text from evenly spaced video frames using OCR.

    Frames are sampled every ``total_frames // num_frames`` frames (at least 1),
    converted from BGR to RGB and passed to the module-level ``ocr_reader``.
    Only OCR detections with confidence above 0.3 are kept.

    Args:
        video_path (str): Path to video file
        num_frames (int): Number of frames to sample evenly from video;
            must be at least 1

    Returns:
        dict: On success contains combined_text, frame_texts, frames_processed
        and video_duration. On failure (invalid num_frames, missing file,
        unreadable video) contains the same keys with empty/zero values plus
        an 'error' message.
    """

    def _failure(message):
        # Error results carry every key of a successful result so callers can
        # read them without checking for 'error' first.
        return {
            'error': message,
            'combined_text': '',
            'frame_texts': [],
            'frames_processed': 0,
            'video_duration': 0
        }

    # A non-positive frame count would otherwise divide by zero when the
    # sampling interval is computed.
    if num_frames < 1:
        return _failure('num_frames must be a positive integer')

    # Validate video file
    if not os.path.exists(video_path):
        return _failure('Video file not found')

    # Open video; the capture handle is always released in the `finally` below,
    # including when an unexpected exception escapes the frame loop.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return _failure('Cannot open video')

        # Get video metadata
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0

        print(f"[INFO] Video metadata: {total_frames} frames, {fps:.1f} FPS, {duration:.1f}s duration")

        # Calculate frame sampling interval
        skip = max(total_frames // num_frames, 1)

        frame_texts = []
        frames_processed = 0

        print(f"[INFO] Extracting text from {num_frames} frames...\n")

        for i in range(num_frames):
            frame_num = i * skip

            # Videos shorter than num_frames run out of frames early.
            if frame_num >= total_frames:
                break

            # Seek to the sampled frame and decode it
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()

            # Skip frames that fail to decode; they do not count as processed.
            if not ret:
                continue

            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Run OCR; a failure on one frame is logged and does not abort the
            # remaining frames.
            try:
                results = ocr_reader.readtext(frame_rgb)

                # Extract text with confidence threshold
                frame_text = ' '.join([text for (bbox, text, conf) in results if conf > 0.3])

                if frame_text.strip():
                    frame_texts.append(frame_text)
                    preview = frame_text[:50] + '...' if len(frame_text) > 50 else frame_text
                    print(f"Frame {i+1}/{num_frames}: '{preview}'")
                else:
                    print(f"Frame {i+1}/{num_frames}: (no text detected)")

                frames_processed += 1

            except Exception as e:
                print(f"Frame {i+1}/{num_frames}: Error - {e}")
    finally:
        cap.release()

    # Combine all extracted text
    combined_text = ' '.join(frame_texts).strip()

    print(f"\n[SUCCESS] Processed {frames_processed} frames")
    print(f"[INFO] Total text extracted: {len(combined_text)} characters")

    return {
        'combined_text': combined_text,
        'frame_texts': frame_texts,
        'frames_processed': frames_processed,
        'video_duration': duration
    }

In [5]:
def check_gambling_keywords(text):
    """
    Pattern matching for gambling keywords and phrases

    Keywords are matched case-insensitively as whole words/phrases (an optional
    plural "s"/"es" is accepted), so 'bet' matches "bet" and "bets" but not
    "better", "alphabet" or "betting" ("betting" is its own keyword and is
    therefore counted once, not twice).

    Scoring: 20 points per gambling keyword, 15 per money keyword, 10 per
    urgency keyword, plus a 25-point bonus when gambling and money keywords
    appear together. A score of 30 or more sets gambling_detected.

    Args:
        text (str): Text to analyze; None or empty text yields a zero score

    Returns:
        dict: Detection results with gambling_detected flag, score, and matched keywords
    """
    import re  # local import: `re` is not part of the notebook's import cell

    text_lower = (text or '').lower()

    def _find(keywords):
        # Whole-word match with optional plural suffix.
        # NOTE(review): OCR output that glues words together (e.g.
        # "casinobonus") is no longer matched; confirm this trade-off against
        # real OCR samples.
        return [
            kw for kw in keywords
            if re.search(r'(?<!\w)' + re.escape(kw) + r'(?:s|es)?(?!\w)', text_lower)
        ]

    # Gambling-specific keywords
    gambling_keywords = [
        'baccarat', 'poker', 'blackjack', 'roulette', 'slot', 'casino',
        'bet', 'wager', 'odds', 'betting', 'gamble', 'dice game',
        'jackpot'
    ]

    # Money and promotion keywords
    money_keywords = [
        'deposit', 'cashback', 'bonus', 'win big', 'big win',
        'easy money', 'fast cash'
    ]

    # Urgency indicators
    urgency_keywords = [
        'play now', 'bet now', 'limited time', 'expires',
        'hurry', 'last chance'
    ]

    # Find matches (each list preserves the keyword order defined above)
    gambling_matches = _find(gambling_keywords)
    money_matches = _find(money_keywords)
    urgency_matches = _find(urgency_keywords)

    # Calculate weighted score
    score = 0
    score += len(gambling_matches) * 20  # Gambling words heavily weighted
    score += len(money_matches) * 15     # Money indicators moderately weighted
    score += len(urgency_matches) * 10   # Urgency adds pressure

    # Combination bonus: gambling + money is highly suspicious
    if gambling_matches and money_matches:
        score += 25

    return {
        'gambling_detected': score >= 30,  # Lowered threshold for better recall
        'pattern_score': score,
        'matched_keywords': gambling_matches + money_matches + urgency_matches,
        'gambling_words': gambling_matches,
        'money_words': money_matches
    }

In [18]:
def check_scam_keywords(text):
    """
    Pattern matching for scam and phishing indicators
    Uses weighted severity scoring to avoid false positives

    Keywords are matched case-insensitively as whole words/phrases (an optional
    plural "s"/"es" is accepted), so 'win' no longer fires on "window",
    "winter" or "winner", and 'free' no longer fires on "freedom". Link
    patterns are URL fragments and are still matched as plain substrings.

    Scoring: 25 points per high-severity keyword, 10 per medium-severity
    keyword, plus 30 when any link pattern is present. A score of 60 or more
    sets scam_detected.

    Args:
        text (str): Text to analyze; None or empty text yields a zero score

    Returns:
        dict: Detection results with scam_detected flag, score, and matched keywords
    """
    import re  # local import: `re` is not part of the notebook's import cell

    text_lower = (text or '').lower()

    def _find(keywords):
        # Whole-word match with optional plural suffix.
        # NOTE(review): OCR output that glues words together (e.g.
        # "claimprize") is no longer matched; confirm this trade-off against
        # real OCR samples.
        return [
            kw for kw in keywords
            if re.search(r'(?<!\w)' + re.escape(kw) + r'(?:s|es)?(?!\w)', text_lower)
        ]

    # HIGH-SEVERITY scam keywords (clear fraud indicators)
    high_severity_scam = [
        'congratulations', 'you won', 'claim prize', 'claim now',
        'verify account', 'click link', 'tap here',
        'selected', 'winner', 'you have been chosen'
    ]

    # MEDIUM-SEVERITY scam keywords (common in both scams and legitimate promotions)
    medium_severity_scam = [
        'win', 'free', 'free trip', 'free gift',
        'exclusive deal', 'limited offer', 'limited time',
        'urgent', 'expires soon', 'act now'
    ]

    # URL and link patterns
    link_patterns = ['http://', 'https://', '.com/', 'bit.ly', 'tinyurl', 'tapit']

    # Find matches (each list preserves the keyword order defined above)
    high_matches = _find(high_severity_scam)
    medium_matches = _find(medium_severity_scam)
    link_found = any(lp in text_lower for lp in link_patterns)

    # Calculate weighted score
    score = 0
    score += len(high_matches) * 25      # High-severity: 25 points each
    score += len(medium_matches) * 10    # Medium-severity: 10 points each
    if link_found:
        score += 30  # Suspicious links add significant weight

    all_matches = high_matches + medium_matches

    return {
        'scam_detected': score >= 60,  # Threshold for automatic detection
        'pattern_score': score,
        'matched_keywords': all_matches,
        'high_severity_matches': high_matches,
        'medium_severity_matches': medium_matches,
        'link_found': link_found
    }

In [19]:
def classify_text_spam(text, classifier):
    """
    Classify text using hybrid zero-shot AI + keyword matching
    Only overrides AI when there's strong evidence and agreement

    Flow:
      1. Text shorter than 3 non-whitespace-trimmed characters is returned as
         'normal video content' without running any detector.
      2. Gambling and scam keyword detectors are run on the text.
      3. The zero-shot classifier scores the text against five candidate labels.
      4. A cascade of rules (CASE 1-5 below) combines keyword and AI evidence
         into the final category, confidence and spam flags.
      5. If the classifier raises, the result falls back to the gambling
         keyword verdict, or to an 'unknown' result carrying an 'error' key.

    Args:
        text (str): Text to classify
        classifier: Loaded zero-shot classification pipeline

    Returns:
        dict: Classification results with category, confidence, scores, and detection flags
    """

    # Handle empty or very short text
    if not text or len(text.strip()) < 3:
        return {
            'category': 'normal video content',
            'confidence': 1.0,
            'all_scores': {},
            'is_spam': False,
            'is_safe': True,
            'reasoning': 'No text detected in video',
            'keyword_detection': None,
            'scam_keyword_detection': None,
            'gambling_score': 0.0,
            'scam_score': 0.0,
            'text_detected': ''
        }

    # Step 1: Keyword pattern matching (fast)
    print("\n[INFO] Running keyword pattern matching...")
    keyword_result = check_gambling_keywords(text)

    if keyword_result['matched_keywords']:
        keywords_str = ', '.join(keyword_result['matched_keywords'][:5])
        print(f"[DETECT] Gambling keywords found: {keywords_str}")
        print(f"[INFO] Gambling keyword score: {keyword_result['pattern_score']}")

    scam_keyword_result = check_scam_keywords(text)

    if scam_keyword_result['matched_keywords']:
        keywords_str = ', '.join(scam_keyword_result['matched_keywords'][:5])
        print(f"[DETECT] Scam keywords found: {keywords_str}")
        print(f"[INFO] Scam keyword score: {scam_keyword_result['pattern_score']}")
        if scam_keyword_result.get('link_found'):
            print(f"[DETECT] Suspicious link detected in text")

    # Step 2: Zero-shot classification (smart)
    candidate_labels = [
        "gambling or casino advertisement",
        "lottery or prize scam message",
        "general spam or phishing message",
        "legitimate promotional content",
        "normal video content"
    ]

    print("\n[INFO] Running zero-shot classification...")
    text_preview = text[:120] + '...' if len(text) > 120 else text
    # Real newline here; the previous escaped form printed a literal "\n".
    print(f"[INFO] Text preview: '{text_preview}'\n")

    try:
        # Run AI classification
        result = classifier(text, candidate_labels, multi_label=False)

        # Extract scores
        scores = {label: score for label, score in zip(result['labels'], result['scores'])}

        # Get individual category scores; the two scam-like labels are summed
        gambling_score = scores.get("gambling or casino advertisement", 0)
        scam_score = scores.get("lottery or prize scam message", 0) + \
                     scores.get("general spam or phishing message", 0)

        high_severity_scam = scam_keyword_result.get('high_severity_matches', [])
        has_link = scam_keyword_result.get('link_found', False)
        scam_keyword_score = scam_keyword_result.get('pattern_score', 0)

        # Boost scam score if link found. The sum is capped at 1.0 because it is
        # reported as a confidence (CASE 3) and displayed as a percentage.
        scam_score_total = min(1.0, scam_score + (0.20 if has_link else 0))

        # Get AI's top choice
        ai_category = result['labels'][0]
        ai_confidence = result['scores'][0]

        # Step 3: Smart boosting logic - only override when there's strong evidence

        # CASE 1: Very strong scam - high-severity keywords OR keywords + link
        if (len(high_severity_scam) >= 2) or (scam_keyword_score >= 60 and has_link):
            print("\n[DECISION] VERY STRONG SCAM SIGNAL (high-severity keywords or keywords + link)")
            category = "lottery or prize scam message"
            confidence = 0.95
            scam_score_total = 0.95
            is_spam = True
            is_safe = False

        # CASE 2: Very strong gambling - keywords (60+) AND AI agrees (>20%)
        elif keyword_result.get('gambling_detected', False) and keyword_result.get('pattern_score', 0) >= 60 and gambling_score > 0.20:
            print("\n[DECISION] STRONG GAMBLING SIGNAL (keywords + AI agreement)")
            category = "gambling or casino advertisement"
            confidence = 0.95
            gambling_score = 0.95
            is_spam = True
            is_safe = False

        # CASE 3: High AI scam confidence (>60%)
        elif scam_score_total > 0.60:
            print("\n[DECISION] HIGH AI SCAM CONFIDENCE")
            category = "lottery or prize scam message"
            confidence = scam_score_total
            is_spam = True
            is_safe = False

        # CASE 4: High AI gambling confidence (>45%) OR moderate keywords with AI support
        elif gambling_score > 0.45 or (keyword_result.get('gambling_detected', False) and gambling_score > 0.15):
            print("\n[DECISION] MODERATE GAMBLING SIGNAL")
            category = "gambling or casino advertisement"
            if keyword_result.get('gambling_detected', False):
                # Keyword agreement lifts the reported confidence to at least 70%
                confidence = max(gambling_score, 0.70)
                gambling_score = confidence
            else:
                confidence = gambling_score
            is_spam = True
            is_safe = False

        # CASE 5: Trust AI (no strong keyword override)
        else:
            print("\n[DECISION] USING AI CLASSIFICATION (no strong keyword override)")
            category = ai_category
            confidence = ai_confidence
            # Only mark as spam if AI confidently thinks it's spam
            is_spam = (category in ["gambling or casino advertisement",
                                   "lottery or prize scam message",
                                   "general spam or phishing message"]) and confidence > 0.50
            is_safe = not is_spam

        # Display final scores
        print("\n[RESULTS] Final Classification Scores:")
        display_scores = [
            ("gambling or casino advertisement", gambling_score),
            ("lottery or prize scam message", scam_score_total),
            ("legitimate promotional content", scores.get("legitimate promotional content", 0)),
            ("normal video content", scores.get("normal video content", 0))
        ]

        for label, score in sorted(display_scores, key=lambda x: x[1], reverse=True):
            indicator = "[HIGH]" if score > 0.7 else "[MEDIUM]" if score > 0.4 else "[LOW]"
            boost_note = " (BOOSTED)" if (label == "gambling or casino advertisement" and
                                          keyword_result.get('gambling_detected', False) and
                                          category == "gambling or casino advertisement") else ""
            print(f"  {indicator} {label}: {score*100:.1f}%{boost_note}")

        return {
            'category': category,
            'confidence': confidence,
            'all_scores': {
                'gambling or casino advertisement': gambling_score,
                'lottery or prize scam message': scam_score_total,
                'legitimate promotional content': scores.get('legitimate promotional content', 0),
                'normal video content': scores.get('normal video content', 0)
            },
            'is_spam': is_spam,
            'is_safe': is_safe,
            'gambling_score': gambling_score,
            'scam_score': scam_score_total,
            'keyword_detection': keyword_result,
            'scam_keyword_detection': scam_keyword_result,
            'text_detected': text
        }

    except Exception as e:
        print(f"[ERROR] Classification failed: {e}")

        # Fallback to keyword detection only
        if keyword_result.get('gambling_detected', False):
            return {
                'category': 'gambling or casino advertisement',
                'confidence': 0.90,
                'all_scores': {'gambling or casino advertisement': 0.90},
                'is_spam': True,
                'is_safe': False,
                'gambling_score': 0.90,
                'scam_score': 0.0,
                'keyword_detection': keyword_result,
                'scam_keyword_detection': scam_keyword_result,
                'text_detected': text
            }

        # No keyword verdict available: report the failure via the 'error' key
        return {
            'error': str(e),
            'category': 'unknown',
            'confidence': 0.0,
            'all_scores': {},
            'is_spam': False,
            'is_safe': True,
            'gambling_score': 0.0,
            'scam_score': 0.0,
            'keyword_detection': keyword_result,
            'scam_keyword_detection': scam_keyword_result,
            'text_detected': text
        }

In [20]:
def make_spam_decision(classification_result, text_length):
    """
    Convert classification results to BLOCK/REVIEW/APPROVE decision
    Prioritizes strong signals, then moderate, then borderline

    Rules are evaluated top to bottom and the first match wins:
      - no text                      -> APPROVE
      - classification error         -> REVIEW
      - very strong keyword/AI hits  -> BLOCK
      - moderate/borderline signals  -> REVIEW
      - nothing matched              -> APPROVE

    Missing or None keyword detection entries (as produced for very short text)
    are treated as "no keywords matched".

    Args:
        classification_result (dict): Output from classify_text_spam
        text_length (int): Length of extracted text

    Returns:
        dict: Decision with recommended_action, reasoning, confidence_level
    """

    # Handle no text case
    if text_length == 0:
        return {
            'recommended_action': 'APPROVE',
            'reasoning': 'No text detected in video',
            'confidence_level': 'HIGH'
        }

    # Handle errors
    if 'error' in classification_result:
        return {
            'recommended_action': 'REVIEW',
            'reasoning': f"Classification error: {classification_result['error']}",
            'confidence_level': 'LOW'
        }

    # Extract scores and detection results.
    # `or {}` matters: the key can be present with value None, in which case
    # `.get(key, {})` would still return None and later `.get` calls would crash.
    gambling_score = classification_result.get('gambling_score', 0)
    scam_score = classification_result.get('scam_score', 0)
    keyword_detection = classification_result.get('keyword_detection') or {}
    scam_keyword_detection = classification_result.get('scam_keyword_detection') or {}
    text_detected = classification_result.get('text_detected', '') or ''

    keyword_score = keyword_detection.get('pattern_score', 0)
    scam_keyword_score = scam_keyword_detection.get('pattern_score', 0)
    has_link = scam_keyword_detection.get('link_found', False)
    has_high_severity = len(scam_keyword_detection.get('high_severity_matches', [])) > 0

    keyword_gambling = keyword_detection.get('gambling_detected', False)

    # ========== PRIORITY 1: VERY STRONG SIGNALS (BLOCK) ==========

    # Very strong gambling keywords (70+)
    if keyword_gambling and keyword_score >= 70:
        gambling_words = keyword_detection.get('gambling_words', [])
        words_str = ', '.join(gambling_words[:3])
        return {
            'recommended_action': 'BLOCK',
            'reasoning': f'Very strong gambling indicators detected (keywords: {words_str}, score: {keyword_score}, AI confidence: {gambling_score*100:.1f}%)',
            'confidence_level': 'HIGH'
        }

    # Very strong scam - either high-severity keywords OR (60+ with link) OR (80+ without link)
    if has_high_severity or (scam_keyword_score >= 60 and has_link) or scam_keyword_score >= 80:
        scam_words = scam_keyword_detection.get('matched_keywords', [])
        words_str = ', '.join(scam_words[:3])
        reason_detail = "high-severity keywords" if has_high_severity else \
                       "keywords + suspicious link" if has_link else "very high keyword score"
        return {
            'recommended_action': 'BLOCK',
            'reasoning': f'Very strong scam indicators detected ({reason_detail}: {words_str}, score: {scam_keyword_score})',
            'confidence_level': 'HIGH'
        }

    # High AI gambling confidence (70%+)
    if gambling_score > 0.7:
        return {
            'recommended_action': 'BLOCK',
            'reasoning': f'Gambling advertisement detected (AI confidence: {gambling_score*100:.1f}%)',
            'confidence_level': 'HIGH'
        }

    # High AI scam confidence (70%+)
    if scam_score > 0.7:
        return {
            'recommended_action': 'BLOCK',
            'reasoning': f'Spam/scam detected (AI confidence: {scam_score*100:.1f}%)',
            'confidence_level': 'HIGH'
        }

    # ========== PRIORITY 2: MODERATE SIGNALS (REVIEW) ==========

    # Moderate gambling keywords (30-69)
    if keyword_gambling and 30 <= keyword_score < 70:
        keywords = keyword_detection.get('matched_keywords', [])
        keywords_str = ', '.join(keywords[:3])
        return {
            'recommended_action': 'REVIEW',
            'reasoning': f'Moderate gambling language detected ({keywords_str}, score: {keyword_score}) - requires human review',
            'confidence_level': 'MEDIUM'
        }

    # Moderate scam keywords (40-79 without link, OR 30-59 with link)
    if (40 <= scam_keyword_score < 80 and not has_link) or \
       (30 <= scam_keyword_score < 60 and has_link):
        keywords = scam_keyword_detection.get('matched_keywords', [])
        keywords_str = ', '.join(keywords[:3])
        link_note = " with suspicious link" if has_link else ""
        return {
            'recommended_action': 'REVIEW',
            'reasoning': f'Moderate scam indicators detected ({keywords_str}, score: {scam_keyword_score}{link_note}) - requires human review',
            'confidence_level': 'MEDIUM'
        }

    # Moderate AI confidence (40-70%)
    if gambling_score > 0.4 or scam_score > 0.4:
        max_score = max(gambling_score, scam_score)
        spam_type = 'gambling' if gambling_score > scam_score else 'scam'
        return {
            'recommended_action': 'REVIEW',
            'reasoning': f'Possible {spam_type} content detected (AI confidence: {max_score*100:.1f}%) - requires human review',
            'confidence_level': 'MEDIUM'
        }

    # ========== PRIORITY 3: BORDERLINE SIGNALS (REVIEW) ==========

    # Borderline scam keywords (20-39 without high severity)
    if 20 <= scam_keyword_score < 40 and not has_high_severity:
        keywords = scam_keyword_detection.get('matched_keywords', [])
        keywords_str = ', '.join(keywords[:3])
        return {
            'recommended_action': 'REVIEW',
            'reasoning': f'Borderline scam indicators detected ({keywords_str}, score: {scam_keyword_score}) - requires human review',
            'confidence_level': 'MEDIUM'
        }

    # Suspicious combination: deposit + games
    text_lower = text_detected.lower()
    has_deposit = 'deposit' in keyword_detection.get('money_words', [])
    has_games = 'game' in text_lower or 'play' in text_lower

    if has_deposit and has_games:
        return {
            'recommended_action': 'REVIEW',
            'reasoning': 'Suspicious combination detected: deposit + games context - may be gambling disguised as promotion',
            'confidence_level': 'MEDIUM'
        }

    # Prize language + gambling context
    if scam_keyword_detection and gambling_score > 0.10:
        scam_words = scam_keyword_detection.get('matched_keywords', [])
        prize_words = [w for w in scam_words if w in ['win', 'free', 'free trip', 'prize']]
        if prize_words:
            prize_str = ', '.join(prize_words)
            return {
                'recommended_action': 'REVIEW',
                'reasoning': f'Prize language ({prize_str}) with gambling context detected - requires verification',
                'confidence_level': 'MEDIUM'
            }

    # ========== PRIORITY 4: SAFE CONTENT (APPROVE) ==========

    category = classification_result.get('category', '')
    confidence = classification_result.get('confidence', 0)
    return {
        'recommended_action': 'APPROVE',
        'reasoning': f'Content classified as: {category} (AI confidence: {confidence*100:.1f}%)',
        'confidence_level': 'HIGH'
    }

In [32]:
# Combined validation set + sweep in one cell.
# Hand-written validation set used by the threshold sweep below.
# Each entry is {"text": <short caption-like string>, "label": <int>} where
# label: 1 = spam/gambling/scam, 0 = safe/legitimate
manual_samples = [
    {"text": "Win big now. Claim your prize today", "label": 1},
    {"text": "Official brand promo: limited time discount", "label": 0},
    {"text": "Deposit bonus for roulette players", "label": 1},
    {"text": "Subscribe for more tech tips", "label": 0},
    {"text": "You won a free gift. Click link to verify", "label": 1},
    {"text": "New episode tonight at 9 PM", "label": 0},
    {"text": "Casino jackpot odds boosted", "label": 1},
    {"text": "Huge cashback offer on betting app", "label": 1},
    {"text": "Community event: free workshop signup", "label": 0},
    {"text": "Limited time: play now and win", "label": 1},
    {"text": "Product review and honest feedback", "label": 0},
    {"text": "Tap here to claim your reward", "label": 1},
    {"text": "Sports highlights and commentary", "label": 0},
    {"text": "Bet now for higher odds", "label": 1},
    {"text": "Meet our team and mission", "label": 0},
    {"text": "Winner selected. Verify account now", "label": 1},
    {"text": "Cooking tips for beginners", "label": 0},
    {"text": "Free trip giveaway ends soon", "label": 1},
    {"text": "Normal gameplay video and chat", "label": 0},
    {"text": "Casino bonus: fast cash offer", "label": 1},
    {"text": "Claim your prize now, click the link", "label": 1},
    {"text": "Congratulations, you won. Verify account", "label": 1},
    {"text": "Limited time casino bonus, deposit now", "label": 1},
    {"text": "Play now to win big jackpot", "label": 1},
    {"text": "Betting odds boosted this weekend", "label": 1},
    {"text": "Exclusive deal: free gift, act now", "label": 1},
    {"text": "Roulette and blackjack bonus", "label": 1},
    {"text": "Fast cash with betting app", "label": 1},
    {"text": "Winner selected, claim prize today", "label": 1},
    {"text": "Click link to receive bonus", "label": 1},
    {"text": "Deposit and play casino games", "label": 1},
    {"text": "Win big with slots today", "label": 1},
    {"text": "Urgent: verify account to receive reward", "label": 1},
    {"text": "Get cashback on wagers", "label": 1},
    {"text": "Lucky draw winner, claim now", "label": 1},
    {"text": "Casino promo limited time", "label": 1},
    {"text": "Spin the wheel and win", "label": 1},
    {"text": "Prize message: http://bit.ly/offer", "label": 1},
    {"text": "Today weather forecast update", "label": 0},
    {"text": "Music video release announcement", "label": 0},
    {"text": "Travel vlog day 3", "label": 0},
    {"text": "University open day registration", "label": 0},
    {"text": "Charity fundraiser livestream", "label": 0},
    {"text": "Official movie trailer", "label": 0},
    {"text": "Fitness routine for beginners", "label": 0},
    {"text": "Recipe: pasta in 10 minutes", "label": 0},
    {"text": "Tech tutorial: setup wifi", "label": 0},
    {"text": "Art showcase and behind the scenes", "label": 0},
    {"text": "Local community news update", "label": 0}
]

def _predict_spam_with_thresholds(text, classifier, thresholds):
    """
    Binary spam prediction for one text under a given set of thresholds.

    Combines the gambling/scam keyword scores with the zero-shot classifier
    scores and returns 1 as soon as any BLOCK- or REVIEW-level rule fires
    (both count as "spam" for evaluation), otherwise 0.

    Args:
        text (str): Text to evaluate
        classifier: Zero-shot classification pipeline (called with
            ``multi_label=False``)
        thresholds (dict): Must provide 'gambling_block', 'scam_block',
            'scam_block_strict', 'ai_block', 'gambling_review', 'scam_review'
            and 'ai_review'

    Returns:
        int: 1 if flagged as spam, 0 otherwise
    """
    gambling_kw = check_gambling_keywords(text)
    scam_kw = check_scam_keywords(text)

    labels = [
        "gambling or casino advertisement",
        "lottery or prize scam message",
        "general spam or phishing message",
        "legitimate promotional content",
        "normal video content"
    ]
    output = classifier(text, labels, multi_label=False)
    label_scores = dict(zip(output["labels"], output["scores"]))

    gambling_score = label_scores.get("gambling or casino advertisement", 0.0)
    # Both scam-like labels contribute to a single scam score.
    scam_score = (
        label_scores.get("lottery or prize scam message", 0.0)
        + label_scores.get("general spam or phishing message", 0.0)
    )
    # A detected link boosts the AI scam score by a flat 0.20.
    scam_score += 0.20 if scam_kw.get("link_found", False) else 0.0

    gambling_kw_score = gambling_kw.get("pattern_score", 0)
    scam_kw_score = scam_kw.get("pattern_score", 0)
    link_present = scam_kw.get("link_found", False)
    severe_hit = len(scam_kw.get("high_severity_matches", [])) > 0

    # Decision logic (BLOCK/REVIEW treated as spam for evaluation).
    # Rules are checked in priority order; `or` short-circuits on the first hit.
    flagged = (
        gambling_kw_score >= thresholds["gambling_block"]
        or severe_hit
        or (scam_kw_score >= thresholds["scam_block"] and link_present)
        or scam_kw_score >= thresholds["scam_block_strict"]
        or gambling_score >= thresholds["ai_block"]
        or scam_score >= thresholds["ai_block"]
        or gambling_kw_score >= thresholds["gambling_review"]
        or scam_kw_score >= thresholds["scam_review"]
        or max(gambling_score, scam_score) >= thresholds["ai_review"]
    )
    return 1 if flagged else 0

def evaluate_thresholds(samples, classifier, thresholds):
    """
    Score one threshold configuration against labelled samples.

    Args:
        samples (list[dict]): Items with a "text" string and a "label" (1 = spam,
            0 = safe)
        classifier: Zero-shot classification pipeline forwarded to the predictor
        thresholds (dict): Threshold configuration forwarded to the predictor

    Returns:
        dict: precision, recall, f1 and the confusion-matrix counts tp, fp, tn,
        fn. Ratios are 0.0 when their denominator is zero.
    """
    # Map (prediction, label) pairs to a confusion-matrix cell; any pair not
    # listed here is counted as a false negative.
    cell_for = {(1, 1): "tp", (1, 0): "fp", (0, 0): "tn"}
    counts = {"tp": 0, "fp": 0, "tn": 0, "fn": 0}

    for sample in samples:
        predicted = _predict_spam_with_thresholds(sample["text"], classifier, thresholds)
        actual = sample["label"]
        counts[cell_for.get((predicted, actual), "fn")] += 1

    tp, fp, tn, fn = counts["tp"], counts["fp"], counts["tn"], counts["fn"]

    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0

    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {"precision": precision, "recall": recall, "f1": f1, "tp": tp, "fp": fp, "tn": tn, "fn": fn}

def sweep_thresholds(samples, classifier, ai_block_grid=None, ai_review_grid=None, fixed_thresholds=None):
    """
    Grid-search the AI block/review thresholds and keep the best pair by F1.

    Every (ai_block, ai_review) pair with ai_review < ai_block is scored via
    evaluate_thresholds; the keyword thresholds stay fixed during the sweep.

    Args:
        samples: Labelled samples, forwarded to evaluate_thresholds
        classifier: Classifier object, forwarded to evaluate_thresholds
        ai_block_grid: Candidate "ai_block" values.
            Defaults to (0.60, 0.65, 0.70, 0.75).
        ai_review_grid: Candidate "ai_review" values.
            Defaults to (0.35, 0.40, 0.45, 0.50).
        fixed_thresholds (dict): Optional overrides for the keyword thresholds
            ("gambling_block", "gambling_review", "scam_block",
            "scam_block_strict", "scam_review"). "ai_block"/"ai_review"
            entries are ignored because those values come from the grids.

    Returns:
        dict or None: The winning thresholds merged with their metrics, or
        None when no (ai_block, ai_review) pair was evaluated (empty grid, or
        every pair had ai_review >= ai_block). On equal F1 the pair
        encountered first wins.
    """
    if ai_block_grid is None:
        ai_block_grid = (0.60, 0.65, 0.70, 0.75)
    if ai_review_grid is None:
        ai_review_grid = (0.35, 0.40, 0.45, 0.50)

    # Keyword-score thresholds that are not part of the sweep
    keyword_thresholds = {
        "gambling_block": 70,
        "gambling_review": 30,
        "scam_block": 60,
        "scam_block_strict": 80,
        "scam_review": 40
    }
    if fixed_thresholds:
        keyword_thresholds.update(
            (key, value)
            for key, value in fixed_thresholds.items()
            if key not in ("ai_block", "ai_review")
        )

    best = None
    for ai_block in ai_block_grid:
        for ai_review in ai_review_grid:
            # The review threshold must sit strictly below the block threshold
            if ai_review >= ai_block:
                continue
            thresholds = {"ai_block": ai_block, "ai_review": ai_review, **keyword_thresholds}
            metrics = evaluate_thresholds(samples, classifier, thresholds)
            # Strict '>' keeps the first of several equally good combinations
            if best is None or metrics["f1"] > best["f1"]:
                best = {**thresholds, **metrics}
    return best

# manual_samples and zero_shot_classifier must already be defined when this runs
best = sweep_thresholds(manual_samples, zero_shot_classifier)
print("Best thresholds (by F1):", best, sep="\n")

Best thresholds (by F1):
{'ai_block': 0.6, 'ai_review': 0.35, 'gambling_block': 70, 'gambling_review': 30, 'scam_block': 60, 'scam_block_strict': 80, 'scam_review': 40, 'precision': 1.0, 'recall': 0.9655172413793104, 'f1': 0.9824561403508771, 'tp': 28, 'fp': 0, 'tn': 20, 'fn': 1}


In [23]:
def analyze_video_for_spam(video_path, num_frames=16):
    """
    Complete spam/gambling detection pipeline

    Runs OCR over the video, classifies the combined text with the global
    zero_shot_classifier, turns the classification into a moderation
    decision, and prints a progress log and final verdict along the way.

    Args:
        video_path (str or os.PathLike): Path to video file
        num_frames (int): Number of frames to analyze

    Returns:
        dict: Complete analysis results in standardized format. Both the
        success and the error result carry 'text_detected', 'text_length',
        'frames_processed', 'spam_category', 'confidence', 'all_scores',
        'recommended_action', 'reasoning', 'confidence_level', 'is_safe'
        and 'is_spam'. The error result (OCR step reported an 'error')
        additionally carries 'status' == 'error' and 'error', and always
        recommends 'REVIEW'.
    """
    banner = "=" * 70
    rule = "-" * 70

    print(banner)
    print("SPAM/GAMBLING DETECTION ANALYSIS")
    # basename also handles path-like objects, unlike str.split('/')
    print(f"Video: {os.path.basename(video_path)}")
    print(banner)

    # STEP 1: OCR Text Extraction
    print("\n[STEP 1] OCR Text Extraction")
    print(rule)
    ocr_result = extract_text_from_video(video_path, num_frames)

    if 'error' in ocr_result:
        error_message = ocr_result['error']
        # Same keys as the success result so callers can read 'reasoning',
        # 'confidence', etc. without special-casing failures.
        return {
            'status': 'error',
            'error': error_message,
            'text_detected': '',
            'text_length': 0,
            'frames_processed': ocr_result.get('frames_processed', 0),
            'spam_category': 'unknown',
            'confidence': 0,
            'all_scores': {},
            'recommended_action': 'REVIEW',
            'reasoning': f"OCR extraction failed: {error_message}",
            'confidence_level': 'LOW',  # no classification was run
            'is_safe': False,
            'is_spam': True
        }

    combined_text = ocr_result['combined_text']

    # STEP 2: Text Classification
    print("\n[STEP 2] Text Classification")
    print(rule)
    classification = classify_text_spam(combined_text, zero_shot_classifier)

    # STEP 3: Decision Making
    print("\n[STEP 3] Decision Logic")
    print(rule)
    decision = make_spam_decision(classification, len(combined_text))

    # Keep is_spam / is_safe in sync with recommended_action:
    # BLOCK and REVIEW are both reported as spam / not safe, anything else
    # (i.e. APPROVE) as safe.
    recommended_action = decision['recommended_action']
    flagged = recommended_action in ('BLOCK', 'REVIEW')
    is_spam = flagged
    is_safe = not flagged

    # Compile final results
    final_result = {
        'text_detected': combined_text,
        'text_length': len(combined_text),
        'frames_processed': ocr_result['frames_processed'],
        'spam_category': classification.get('category', 'unknown'),
        'confidence': classification.get('confidence', 0),
        'all_scores': classification.get('all_scores', {}),
        'recommended_action': recommended_action,
        'reasoning': decision['reasoning'],
        'confidence_level': decision['confidence_level'],
        'is_safe': is_safe,
        'is_spam': is_spam
    }

    # Display final verdict
    print("\n" + banner)
    print("FINAL VERDICT")
    print(banner)

    print(f"\nRecommended Action: {recommended_action}")
    print(f"Confidence Level: {decision['confidence_level']}")
    print(f"Reasoning: {decision['reasoning']}")
    print(f"Is Spam: {is_spam}")
    print(f"Is Safe: {is_safe}")

    if combined_text:
        # Only the first 200 characters are echoed to keep the log readable
        if len(combined_text) > 200:
            text_preview = combined_text[:200] + '...'
        else:
            text_preview = combined_text
        print(f"\nText Detected: '{text_preview}'")

    print("\n" + banner)

    return final_result


def check_spam_gambling(video_path):
    """
    Entry point exposed for integration with Person 3's system.

    Delegates to analyze_video_for_spam, leaving num_frames at its default.

    Args:
        video_path (str): Path to video file

    Returns:
        dict: The result of analyze_video_for_spam for this video, unchanged
    """
    analysis = analyze_video_for_spam(video_path)
    return analysis

In [24]:
# Gambling advertisement sample: run the full pipeline on it
test_video = '/content/video.mp4'  # Replace with actual path
result = analyze_video_for_spam(test_video)

# Pretty-print the result dict as JSON under a banner
print("\n\nJSON OUTPUT:", "=" * 70, sep="\n")
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: video.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 115 frames, 30.0 FPS, 3.8s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: 'crichatch PLAY. Big Win BTG G E R BACCARAT BILLION...'
Frame 2/16: 'crichatch PLAY. Big Win BACCARAT BILLIONAIRE BASH ...'
Frame 3/16: 'crichatch PLAY. Big Wi BTG G E R BACCARAT BILLIONA...'
Frame 4/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONPI...'
Frame 5/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONAI...'
Frame 6/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONAI...'
Frame 7/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONPI...'
Frame 8/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONPI...'
Frame 9/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONPI...'
Frame 10/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONPI...'
Frame 11/16: 'crichatch PLAY. Big Wi BTG G ER BACCARAT BILLIONPI...

In [25]:
# Borderline promotional sample: run the full pipeline on it
test_video = '/content/mixed.mp4'  # Replace with actual path
result = analyze_video_for_spam(test_video)

# Pretty-print the result dict as JSON under a banner
print("\n\nJSON OUTPUT:", "=" * 70, sep="\n")
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: mixed.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 54 frames, 30.0 FPS, 1.8s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: '"Win a free trip when you shop today:" 'Get bonus ...'
Frame 2/16: '"Win a free trip when you shop today:" 'Get bonus ...'
Frame 3/16: '"Win a free trip when you shop today:" 'Get bonus ...'
Frame 4/16: '"Win a free trip when you shop today:' "Get bonus ...'
Frame 5/16: '"Win a free trip when you shop today:' "Get bonus ...'
Frame 6/16: ''Win a free trip when you shop "Get bonus cash on ...'
Frame 7/16: ''Win a free trip when you shop "Get bonus cash on ...'
Frame 8/16: ''Win a free trip when you shop "Get bonus cash on ...'
Frame 9/16: ''Win a free trip when you shop "Get bonus cash on ...'
Frame 10/16: ''Win a free trip when you shop "Get bonus cash on ...'
Frame 11/16: ''Win a free trip when you shop "Get bonus cash on ...'

In [26]:
# Legitimate promotional sample: run the full pipeline on it
test_video = '/content/legitimate.mp4'  # Replace with actual path
result = analyze_video_for_spam(test_video)

# Pretty-print the result dict as JSON under a banner
print("\n\nJSON OUTPUT:", "=" * 70, sep="\n")
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: legitimate.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 32 frames, 30.0 FPS, 1.1s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: '"Get 30% off all summer collections this weekend a...'
Frame 2/16: '"Get 30% off all summer collections this weekend a...'
Frame 3/16: '"Get 30% off all summer collections this weekend a...'
Frame 4/16: '"Get 30% off all summer collections this weekend a...'
Frame 5/16: '"Get 30% off all summer collections this weekend a...'
Frame 6/16: '"Get 30% off all summer collections this weekend a...'
Frame 7/16: '"Get 30% off all summer collections this weekend a...'
Frame 8/16: '"Get 30% off all summer collections this weekend a...'
Frame 9/16: '"Get 30% off all summer collections this weekend a...'
Frame 10/16: '"Get 30% off all summer collections this weekend a...'
Frame 11/16: '"Get 30% off all summer collections this weekend 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Frame 16/16: '"Get 30% off all summer collections this weekend a...'

[SUCCESS] Processed 16 frames
[INFO] Total text extracted: 1902 characters

[STEP 2] Text Classification
----------------------------------------------------------------------

[INFO] Running keyword pattern matching...
[DETECT] Scam keywords found: free
[INFO] Scam keyword score: 10

[INFO] Running zero-shot classification...
[INFO] Text preview: '"Get 30% off all summer collections this weekend at UrbanWear" 1 Get 1 Free on select shoes limited store offer' "Buy "G...'\n

[DECISION] USING AI CLASSIFICATION (no strong keyword override)

[RESULTS] Final Classification Scores:
  [HIGH] legitimate promotional content: 81.5%
  [LOW] lottery or prize scam message: 8.5%
  [LOW] normal video content: 5.1%
  [LOW] gambling or casino advertisement: 4.9%

[STEP 3] Decision Logic
----------------------------------------------------------------------

FINAL VERDICT

Recommended Action: APPROVE
Confidence Level: HIGH
Reasoning: 

In [27]:

# Spam sample (going by the file name): run the full pipeline on it
test_video = '/content/spam.mp4'  # Replace with actual path
result = analyze_video_for_spam(test_video)

# Pretty-print the result dict as JSON under a banner
print("\n\nJSON OUTPUT:", "=" * 70, sep="\n")
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: spam.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 56 frames, 30.0 FPS, 1.9s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 2/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 3/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 4/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 5/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 6/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 7/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 8/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 9/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 10/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'
Frame 11/16: 'AT&T 12:15 PM 94% Messages JBStore Contact Text Me...'


In [14]:

# Gambling sample (going by the file name): run the full pipeline on it
test_video = '/content/gamb.mp4'  # Replace with actual path
result = analyze_video_for_spam(test_video)

# Pretty-print the result dict as JSON under a banner
print("\n\nJSON OUTPUT:", "=" * 70, sep="\n")
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: gamb.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 66 frames, 30.0 FPS, 2.2s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 2/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 3/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 4/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 5/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 6/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 7/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 8/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 9/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 10/16: ''Bet now and win big with Zx odds tonightl" "Join ...'
Frame 11/16: ''Bet now and win big with Zx odds tonightl" "Join ...'


In [15]:
# Test on legitimate content
test_video = '/content/normal.mp4'  # Replace with actual path

result = analyze_video_for_spam(test_video)

# Display JSON output
print("\n\nJSON OUTPUT:")
print("="*70)
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: normal.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 47 frames, 30.0 FPS, 1.6s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: ''Meeting moved to 3 PM" "Can you send me the repor...'
Frame 2/16: ''Meeting moved to 3 PM" "Can you send me the repor...'
Frame 3/16: ''Meeting moved to 3 PM" "Can you send me the repor...'
Frame 4/16: '"Meeting moved to 3 PM" "Can you send me the repor...'
Frame 5/16: ''Meeting moved to 3 PM" "Can you send me the repor...'
Frame 6/16: '"Meeting moved to 3 PM" "Can you send me the repor...'
Frame 7/16: '"Meeting moved to 3 PM" "Can you send me the repor...'
Frame 8/16: ''Meeting moved to 3 PM" "Can you send me the repor...'
Frame 9/16: '"Meeting moved to 3 PM" "Can you send me the repor...'
Frame 10/16: ''Meeting moved to 3 PM" "Can you send me the repor...'
Frame 11/16: '"Meeting moved to 3 PM" "Can you send me the repor...

In [28]:

# General spam sample (going by the file name): run the full pipeline on it
test_video = '/content/spam_gen.mp4'  # Replace with actual path
result = analyze_video_for_spam(test_video)

# Pretty-print the result dict as JSON under a banner
print("\n\nJSON OUTPUT:", "=" * 70, sep="\n")
print(json.dumps(result, indent=2))

SPAM/GAMBLING DETECTION ANALYSIS
Video: spam_gen.mp4

[STEP 1] OCR Text Extraction
----------------------------------------------------------------------
[INFO] Video metadata: 59 frames, 30.0 FPS, 2.0s duration
[INFO] Extracting text from 16 frames...

Frame 1/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 2/16: '"Congratulations! You have won a free iPhone. Clic...'
Frame 3/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 4/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 5/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 6/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 7/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 8/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 9/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 10/16: '"Congratulations! You have won a free iPhone: Clic...'
Frame 11/16: '"Congratulations! You have won a free iPhone: Clic.