In [None]:
!pip install openai

In [None]:
#!/usr/bin/env python3
"""
O3-MINI SPEED-OPTIMIZED Reasoning Pipeline for Google Colab
Enhanced performance while maintaining o3-mini model

KEY OPTIMIZATIONS FOR O3-MINI:
- Medium/Low reasoning effort configuration
- Reduced output tokens for faster processing
- Removed artificial delays
- Concurrent processing with multiple parallel requests
- Optimized prompts for efficiency
- Intelligent batching and error handling
- Complete response logging system
- Enhanced email quality selection

IMPROVEMENTS:
- Single email processing time significantly reduced
- Concurrent processing capabilities
- Cost reduction through optimization
- Comprehensive logging and monitoring
"""

import pandas as pd
import json
import time
import asyncio
from datetime import datetime, timedelta
from openai import OpenAI
import numpy as np
from tqdm import tqdm
import os
import logging
import concurrent.futures
from typing import Dict, List, Tuple, Optional
import pickle
import threading
import csv
import re

# Google Colab specific imports
# Optional Google Colab integration: `drive` is only importable inside a
# Colab runtime. IN_COLAB records whether the import succeeded so the
# pipeline can decide whether to mount Google Drive for its output.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Not running in Colab; `drive` stays undefined and must not be used.
    IN_COLAB = False

class O3MiniOptimizedPipeline:
    """O3-Mini optimized pipeline with enhanced performance and quality selection"""

    def __init__(self,
                 dataset_path="Enron.csv",
                 samples_per_class=5000,
                 output_dir="output",
                 gdrive_base_path="/content/drive/MyDrive/phishing_detection_final",
                 checkpoint_interval=50,
                 max_concurrent=8,
                 api_key=None,
                 o3_speed_mode="medium"):
        """Initialize the O3-Mini speed-optimized pipeline.

        Args:
            dataset_path: Path of the CSV file read by load_and_sample_dataset.
            samples_per_class: Number of emails requested per label.
            output_dir: Output directory (placed under ``gdrive_base_path``
                when Google Drive is mounted successfully).
            gdrive_base_path: Base directory on Google Drive used in Colab.
            checkpoint_interval: Accepted for backward compatibility; the
                effective interval is the preset chosen by ``o3_speed_mode``.
            max_concurrent: Worker threads used for parallel API requests.
            api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
                environment variable is used.
            o3_speed_mode: "low", "medium" or "high"; any other value is
                treated as "high".

        Raises:
            ValueError: If no API key is passed and ``OPENAI_API_KEY`` is unset.
        """

        # Resolve credentials first so a misconfiguration fails before any
        # directories or log files are created. Secrets must never be embedded
        # in source code; they come from the caller or the environment.
        resolved_api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not resolved_api_key:
            raise ValueError(
                "No OpenAI API key provided: pass api_key=... or set the "
                "OPENAI_API_KEY environment variable"
            )

        # O3-Mini speed configuration
        self.o3_speed_mode = o3_speed_mode
        self.max_concurrent = max_concurrent

        # Preset per mode: (reasoning effort, max output tokens,
        # checkpoint interval, banner message)
        mode_presets = {
            "low": ("low", 8000, 100,
                    "O3-Mini LOW effort mode: Maximum processing speed"),
            "medium": ("medium", 12000, 50,
                       "O3-Mini MEDIUM effort mode: Balanced speed and accuracy"),
            "high": ("high", 20000, 25,
                     "O3-Mini HIGH effort mode: Maximum accuracy"),
        }
        # Unknown modes fall back to the "high" preset.
        (self.reasoning_effort,
         self.max_output_tokens,
         self.checkpoint_interval,
         banner) = mode_presets.get(o3_speed_mode, mode_presets["high"])
        print(banner)

        # Basic configuration
        self.dataset_path = dataset_path
        self.samples_per_class = samples_per_class
        self.total_samples = samples_per_class * 2

        # Google Drive configuration
        self.gdrive_base_path = gdrive_base_path
        self.use_gdrive = IN_COLAB

        if self.use_gdrive:
            self._mount_google_drive()

        # _mount_google_drive clears use_gdrive when mounting fails, so the
        # flag is re-checked here instead of writing to an unmounted path.
        if self.use_gdrive:
            self.output_dir = os.path.join(self.gdrive_base_path, output_dir)
        else:
            self.output_dir = output_dir

        # Create directory structure
        self.final_dataset_path = os.path.join(self.output_dir, "final_datasets")
        self.intermediate_path = os.path.join(self.output_dir, "intermediate")
        self.logs_path = os.path.join(self.output_dir, "logs")
        self.checkpoint_path = os.path.join(self.output_dir, "checkpoints")
        self.response_logs_path = os.path.join(self.output_dir, "response_logs")

        self._create_output_directories()
        self._setup_logging()

        # API configuration
        self.client = OpenAI(api_key=resolved_api_key)

        # Concurrent processing setup
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=self.max_concurrent)
        self.lock = threading.Lock()  # guards the shared counters below

        # Pipeline tracking
        self.processed_count = 0
        self.successful_reasoning = 0
        self.failed_reasoning = 0
        self.total_tokens_used = 0
        self.total_processing_time = 0
        self.start_time = None

        # Results storage
        self.enhanced_dataset = []
        self.processing_stats = []

        # Retry configuration
        self.max_retries = 2
        self.retry_delay = 2

        logging.info("O3-Mini Optimized Pipeline Initialized")
        logging.info(f"Reasoning effort: {self.reasoning_effort}")
        logging.info(f"Max tokens: {self.max_output_tokens}")
        logging.info(f"Max concurrent: {self.max_concurrent}")
        logging.info(f"Target: {self.total_samples} samples ({samples_per_class} per class)")
        logging.info(f"Response logging enabled - Raw responses saved to {self.response_logs_path}")

    def _mount_google_drive(self):
        """Mount Google Drive in Colab"""
        try:
            drive.mount('/content/drive')
            logging.info("Google Drive mounted successfully")
        except Exception as e:
            logging.error(f"Failed to mount Google Drive: {e}")
            self.use_gdrive = False

    def _create_output_directories(self):
        """Create output directories"""
        directories = [
            self.output_dir,
            self.final_dataset_path,
            self.intermediate_path,
            self.logs_path,
            self.checkpoint_path,
            self.response_logs_path
        ]

        for directory in directories:
            if not os.path.exists(directory):
                os.makedirs(directory)

    def _setup_logging(self):
        """Setup logging configuration"""
        log_file = os.path.join(self.logs_path, f'o3_mini_optimized_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')

        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file, encoding='utf-8'),
                logging.StreamHandler()
            ]
        )

    def create_optimized_prompt(self, subject, body, label):
        """Speed-optimized prompt for O3-Mini"""

        actual_class = "LEGITIMATE" if label == 0 else "PHISHING"

        prompt = f"""You are a cybersecurity expert analyzing emails for phishing indicators.

EMAIL:
Subject: {subject}
Body: {body}

GROUND TRUTH: This email is {actual_class}

Provide focused analysis in exactly 5 sections (2-3 sentences each):

1. SENDER ANALYSIS
Examine sender authenticity, domain reputation, and authentication indicators.

2. LANGUAGE PATTERNS
Analyze grammar, writing style, and linguistic deception markers.

3. SOCIAL ENGINEERING
Identify psychological manipulation tactics and emotional triggers.

4. TECHNICAL INDICATORS
Assess URLs, requests, attachments, and technical attack vectors.

5. RISK ASSESSMENT
Correlate findings with threat intelligence and provide final assessment.enro

CLASSIFICATION: [Write either LEGITIMATE or PHISHING]   #classification:phishing
CONFIDENCE: [Write either HIGH, MEDIUM, or LOW]

Keep sections concise but thorough. Focus on critical indicators."""

        return prompt.strip()

    def _save_response_log(self, response, email_index, subject, body, label, request_time, prompt=None):
        """Save raw API response as JSON log for debugging and analysis"""
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
            log_filename = os.path.join(self.response_logs_path, f"response_email_{email_index:05d}_{timestamp}.json")

            # Create response log data
            response_log = {
                'email_metadata': {
                    'email_index': email_index,
                    'subject_preview': subject[:100] + '...' if len(subject) > 100 else subject,
                    'body_length': len(body),
                    'actual_label': label,
                    'actual_label_name': "LEGITIMATE" if label == 0 else "PHISHING",
                    'processing_timestamp': datetime.now().isoformat(),
                    'request_time_seconds': request_time
                },
                'api_request': {
                    'model': 'o3-mini',
                    'reasoning_effort': self.reasoning_effort,
                    'max_output_tokens': self.max_output_tokens,
                    'prompt_length': len(prompt) if prompt else 0,
                    'prompt_preview': prompt[:500] + '...' if prompt and len(prompt) > 500 else prompt
                },
                'api_response': {
                    'response_id': getattr(response, 'id', None),
                    'object_type': getattr(response, 'object', None),
                    'model': getattr(response, 'model', None),
                    'status': getattr(response, 'status', None),
                    'created': getattr(response, 'created', None),
                    'output_text_length': len(response.output_text) if hasattr(response, 'output_text') and response.output_text else 0,
                    'output_text_preview': response.output_text[:1000] + '...' if hasattr(response, 'output_text') and response.output_text and len(response.output_text) > 1000 else (response.output_text if hasattr(response, 'output_text') else None)
                },
                'token_usage': {},
                'reasoning_data': {},
                'incomplete_details': None
            }

            # Extract token usage details
            if hasattr(response, 'usage') and response.usage:
                usage = response.usage
                response_log['token_usage'] = {
                    'input_tokens': getattr(usage, 'input_tokens', 0),
                    'output_tokens': getattr(usage, 'output_tokens', 0),
                    'total_tokens': getattr(usage, 'total_tokens', 0)
                }

                # Extract reasoning token details
                if hasattr(usage, 'output_tokens_details') and usage.output_tokens_details:
                    details = usage.output_tokens_details
                    reasoning_tokens = getattr(details, 'reasoning_tokens', 0) or 0
                    visible_tokens = response_log['token_usage']['output_tokens'] - reasoning_tokens

                    response_log['reasoning_data'] = {
                        'reasoning_tokens': reasoning_tokens,
                        'visible_tokens': visible_tokens,
                        'reasoning_ratio_percent': (reasoning_tokens / response_log['token_usage']['output_tokens'] * 100) if response_log['token_usage']['output_tokens'] > 0 else 0
                    }

            # Handle incomplete responses
            if hasattr(response, 'status') and response.status == 'incomplete':
                if hasattr(response, 'incomplete_details'):
                    response_log['incomplete_details'] = {
                        'reason': getattr(response.incomplete_details, 'reason', None) if response.incomplete_details else None
                    }

            # Save response log to JSON file
            with open(log_filename, 'w', encoding='utf-8') as f:
                json.dump(response_log, f, indent=2, default=str, ensure_ascii=False)

            logging.info(f"Response log saved: {os.path.basename(log_filename)}")

        except Exception as e:
            logging.error(f"Failed to save response log for email {email_index}: {e}")

    def generate_reasoning_chain_optimized(self, email_data):
        """Optimized reasoning generation for O3-Mini with response logging"""

        subject, body, label, email_index = email_data

        try:
            # Create optimized prompt
            prompt = self.create_optimized_prompt(subject, body, label)
            request_start = time.time()

            # O3-MINI SPEED OPTIMIZATIONS
            response = self.client.responses.create(
                model="o3-mini",#o3-mini-20250716
                reasoning={"effort": self.reasoning_effort},
                input=[{"role": "user", "content": prompt}],
                max_output_tokens=self.max_output_tokens
            )

          #name

            request_time = time.time() - request_start

            # Save raw API response as JSON log
            self._save_response_log(response, email_index, subject, body, label, request_time, prompt)

            # Extract response
            reasoning_chain = response.output_text if hasattr(response, 'output_text') else ""

            # Parse prediction
            predicted_class_name = self.extract_prediction_fast(reasoning_chain)
            predicted_label = 1 if predicted_class_name == "PHISHING" else 0 if predicted_class_name == "LEGITIMATE" else -1

            # Extract token usage
            usage_data = self._extract_token_usage(response)

            # Thread-safe updates
            with self.lock:
                self.total_tokens_used += usage_data['total_tokens']
                self.total_processing_time += request_time
                self.successful_reasoning += 1

            # Extract reasoning steps
            reasoning_steps = self.extract_reasoning_steps(reasoning_chain)

            # Create stats
            stats = {
                'email_index': email_index,
                'processing_time': request_time,
                'status': 'success',
                'attempt': 1,
                **usage_data,
                'output_length': len(reasoning_chain),
                'word_count': len(reasoning_chain.split()),
                'actual_label': label,
                'predicted_label': predicted_label,
                'prediction_correct': predicted_class_name == ("LEGITIMATE" if label == 0 else "PHISHING")
            }

            # Log progress
            actual_class = "LEGITIMATE" if label == 0 else "PHISHING"
            match_status = "CORRECT" if predicted_class_name == actual_class else "INCORRECT"
            logging.info(f"Email {email_index + 1}: {request_time:.2f}s, {usage_data['total_tokens']} tokens, "
                        f"Actual: {actual_class}, Predicted: {predicted_class_name} ({match_status})")

            return reasoning_chain, reasoning_steps, stats, predicted_class_name, predicted_label

        except Exception as e:
            request_time = time.time() - request_start if 'request_start' in locals() else 0

            with self.lock:
                self.failed_reasoning += 1

            logging.error(f"Email {email_index + 1}: {str(e)}")

            # Save error log to response logs directory
            try:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3]
                error_log_filename = os.path.join(self.response_logs_path, f"error_email_{email_index:05d}_{timestamp}.json")

                error_log = {
                    'email_metadata': {
                        'email_index': email_index,
                        'subject_preview': subject[:100] + '...' if len(subject) > 100 else subject,
                        'body_length': len(body),
                        'actual_label': label,
                        'actual_label_name': "LEGITIMATE" if label == 0 else "PHISHING",
                        'processing_timestamp': datetime.now().isoformat(),
                        'request_time_seconds': request_time
                    },
                    'error_details': {
                        'error_message': str(e),
                        'error_type': type(e).__name__,
                        'status': 'api_error'
                    },
                    'api_request': {
                        'model': 'o3-mini',
                        'reasoning_effort': self.reasoning_effort,
                        'max_output_tokens': self.max_output_tokens,
                        'prompt_length': len(prompt) if 'prompt' in locals() else 0
                    }
                }

                with open(error_log_filename, 'w', encoding='utf-8') as f:
                    json.dump(error_log, f, indent=2, default=str, ensure_ascii=False)

            except Exception as log_error:
                logging.warning(f"Could not save error log: {log_error}")

            stats = {
                'email_index': email_index,
                'processing_time': request_time,
                'status': 'error',
                'error': str(e),
                'attempt': 1,
                'actual_label': label,
                'predicted_label': -1,
                'prediction_correct': False
            }

            return "", {}, stats, "ERROR", -1

    def extract_prediction_fast(self, reasoning_text):
        """Fast prediction extraction"""
        reasoning_upper = reasoning_text.upper()

        # Quick pattern matching
        if "CLASSIFICATION: PHISHING" in reasoning_upper:
            return "PHISHING"
        elif "CLASSIFICATION: LEGITIMATE" in reasoning_upper:
            return "LEGITIMATE"

        # Fallback patterns
        patterns = [
            r"I CLASSIFY THIS EMAIL AS:\s*(\w+)",   #I classify this email as
            r"CLASSIFY THIS EMAIL AS:\s*(\w+)",
            r"CLASSIFICATION:\s*(\w+)",
            r"FINAL CLASSIFICATION.*?:\s*(\w+)",
            r"EMAIL AS:\s*(\w+)"
        ]

        for pattern in patterns:
            match = re.search(pattern, reasoning_upper)
            if match:
                prediction = match.group(1).strip()
                if prediction in ["LEGITIMATE", "PHISHING"]:
                    return prediction

        return "UNCERTAIN"

    def extract_reasoning_steps(self, reasoning_text):
        """Extract reasoning steps from response"""
        steps = {
            'sender_analysis': '',
            'language_patterns': '',
            'social_engineering': '',
            'technical_indicators': '',
            'risk_assessment': '',
            'confidence_assessment': ''
        }

        if not reasoning_text:
            return steps

        # Simple extraction based on section headers
        sections = [
            ('sender_analysis', ['1. SENDER ANALYSIS', 'SENDER ANALYSIS']),
            ('language_patterns', ['2. LANGUAGE PATTERNS', 'LANGUAGE PATTERNS']),
            ('social_engineering', ['3. SOCIAL ENGINEERING', 'SOCIAL ENGINEERING']),
            ('technical_indicators', ['4. TECHNICAL INDICATORS', 'TECHNICAL INDICATORS']),
            ('risk_assessment', ['5. RISK ASSESSMENT', 'RISK ASSESSMENT']),
            ('confidence_assessment', ['CONFIDENCE:', 'My confidence level'])
        ]

        text_upper = reasoning_text.upper()

        for step_key, markers in sections:
            for marker in markers:
                pos = text_upper.find(marker.upper())
                if pos != -1:
                    # Find content after marker
                    content_start = pos + len(marker)

                    # Find next section or end
                    next_pos = len(reasoning_text)
                    for next_step_key, next_markers in sections:
                        if next_step_key != step_key:
                            for next_marker in next_markers:
                                next_marker_pos = text_upper.find(next_marker.upper(), content_start)
                                if next_marker_pos != -1 and next_marker_pos < next_pos:
                                    next_pos = next_marker_pos

                    content = reasoning_text[content_start:next_pos].strip()
                    if content:
                        steps[step_key] = self.clean_text_for_csv(content)
                    break

        return steps

    def clean_text_for_csv(self, text):
        """Clean text for CSV compatibility"""
        if not isinstance(text, str):
            return str(text)

        # Character replacements
        replacements = {
            '"': '"', '"': '"', ''': "'", ''': "'",
            '–': '-', '—': '-', '…': '...', '•': '*',
            '™': 'TM', '®': '(R)', '©': '(C)'
        }

        #

        for old, new in replacements.items():
            text = text.replace(old, new)

        # Replace problematic characters
        text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
        text = text.replace('"', "'").replace('\\', '/').replace('|', ' ')

        # Remove non-ASCII characters
        text = ''.join(char if ord(char) < 128 else ' ' for char in text)

        # Normalize whitespace
        text = ' '.join(text.split())

        return text.strip()

    def _extract_token_usage(self, response):
        """Extract token usage information"""
        usage_data = {
            'input_tokens': 0,
            'output_tokens': 0,
            'total_tokens': 0,
            'reasoning_tokens': 0,
            'visible_tokens': 0,
            'reasoning_ratio': 0
        }

        if hasattr(response, 'usage') and response.usage:
            usage_data['input_tokens'] = getattr(response.usage, 'input_tokens', 0)
            usage_data['output_tokens'] = getattr(response.usage, 'output_tokens', 0)
            usage_data['total_tokens'] = getattr(response.usage, 'total_tokens', 0)

            if hasattr(response.usage, 'output_tokens_details'):
                details = response.usage.output_tokens_details
                if hasattr(details, 'reasoning_tokens'):
                    usage_data['reasoning_tokens'] = details.reasoning_tokens or 0

            usage_data['visible_tokens'] = usage_data['output_tokens'] - usage_data['reasoning_tokens']
            if usage_data['output_tokens'] > 0:
                usage_data['reasoning_ratio'] = usage_data['reasoning_tokens'] / usage_data['output_tokens'] * 100

        return usage_data

    def load_and_sample_dataset(self, min_words=15, ensure_quality=True):
        """Load and sample high-quality dataset with enhanced selection"""
        logging.info("Loading dataset...")

        try:
            df = pd.read_csv(self.dataset_path, encoding='utf-8', encoding_errors='replace')
            logging.info(f"Loaded dataset with {len(df)} rows")

            if ensure_quality:
                # Enhanced quality filtering
                logging.info("Applying quality filters...")

                # Calculate total word count
                df['total_word_count'] = (
                    df['subject'].fillna('').astype(str).apply(lambda x: len(x.split())) +
                    df['body'].fillna('').astype(str).apply(lambda x: len(x.split()))
                )

                # Apply minimum word filter
                df = df[df['total_word_count'] >= min_words].copy()
                logging.info(f"After word count filter: {len(df)} emails remain")

                # Remove emails with empty or very short bodies
                df = df[df['body'].notna() & (df['body'].str.strip() != '')]
                df = df[df['body'].str.len() >= 50]  # At least 50 characters
                logging.info(f"After body length filter: {len(df)} emails remain")

                # Remove duplicate content
                df = df.drop_duplicates(subset=['subject', 'body'], keep='first')
                logging.info(f"After removing duplicates: {len(df)} emails remain")

                # Remove emails with mostly non-ASCII characters
                def has_enough_ascii(text):
                    if not text or len(text) == 0:
                        return False
                    ascii_chars = sum(1 for char in text if ord(char) < 128)
                    return ascii_chars / len(text) >= 0.8

                df = df[df['body'].apply(has_enough_ascii)]
                logging.info(f"After ASCII filter: {len(df)} emails remain")

                # Ensure variety in email lengths
                df['body_length'] = df['body'].str.len()

                # Drop temporary columns
                df = df.drop(['total_word_count', 'body_length'], axis=1)

            # Check class distribution
            label_counts = df['label'].value_counts()
            logging.info(f"Label distribution: {dict(label_counts)}")

            # Enhanced sampling for better quality
            sampled_data = []

            for label in [0, 1]:
                label_data = df[df['label'] == label]
                available = len(label_data)

                if available < self.samples_per_class:
                    logging.warning(f"Only {available} samples available for label {label} (requested: {self.samples_per_class})")
                    sample_size = available
                else:
                    sample_size = self.samples_per_class

                # Stratified sampling by email length for diversity
                if sample_size > 100:
                    # Sort by body length and take samples from different quartiles
                    label_data_sorted = label_data.copy()
                    label_data_sorted['body_len'] = label_data_sorted['body'].str.len()
                    label_data_sorted = label_data_sorted.sort_values('body_len')

                    # Sample from different length quartiles
                    quartile_size = sample_size // 4
                    remainder = sample_size % 4

                    samples = []
                    for i in range(4):
                        start_idx = i * len(label_data_sorted) // 4
                        end_idx = (i + 1) * len(label_data_sorted) // 4
                        quartile_data = label_data_sorted.iloc[start_idx:end_idx]

                        q_sample_size = quartile_size + (1 if i < remainder else 0)
                        if len(quartile_data) > 0:
                            if len(quartile_data) >= q_sample_size:
                                q_sample = quartile_data.sample(n=q_sample_size, random_state=42)
                            else:
                                q_sample = quartile_data
                            samples.append(q_sample)

                    sampled = pd.concat(samples, ignore_index=True)
                    sampled = sampled.drop('body_len', axis=1)
                else:
                    # Simple random sampling for small datasets
                    sampled = label_data.sample(n=sample_size, random_state=42)

                sampled_data.append(sampled)

                label_name = "Legitimate" if label == 0 else "Phishing"
                logging.info(f"Sampled {len(sampled)} high-quality emails from {label_name} class")

            # Combine and shuffle
            self.sample_dataset = pd.concat(sampled_data, ignore_index=True)
            self.sample_dataset = self.sample_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

            # Update total samples
            self.total_samples = len(self.sample_dataset)

            logging.info(f"Dataset ready with {self.total_samples} high-quality emails")
            return True

        except Exception as e:
            logging.error(f"Error loading dataset: {e}")
            return False

    def process_email_batch_concurrent(self, email_batch):
        """Process batch of emails concurrently"""

        # Prepare email data tuples
        email_data_list = []
        for idx, row in email_batch.iterrows():
            subject = self.clean_text_for_csv(str(row.get('subject', '[No Subject]')))
            body = self.clean_text_for_csv(str(row.get('body', '[No Body]')))
            label = int(row.get('label', 0))
            email_data_list.append((subject, body, label, idx))

        # Submit concurrent tasks
        futures = []
        for email_data in email_data_list:
            future = self.executor.submit(self.generate_reasoning_chain_optimized, email_data)
            futures.append((future, email_data))

        # Collect results
        results = []
        for future, email_data in futures:
            try:
                result = future.result(timeout=120)
                results.append((result, email_data))
            except Exception as e:
                logging.error(f"Concurrent processing error: {e}")
                # Create error result
                error_stats = {
                    'email_index': email_data[3],
                    'processing_time': 0,
                    'status': 'timeout_error',
                    'error': str(e),
                    'actual_label': email_data[2],
                    'predicted_label': -1,
                    'prediction_correct': False
                }
                results.append((("", {}, error_stats, "ERROR", -1), email_data))

        return results

    def process_dataset_optimized(self):
        """Run the pipeline over ``self.sample_dataset`` in concurrent batches.

        Each batch holds ``self.max_concurrent`` emails. Results are appended
        to ``self.enhanced_dataset`` and ``self.processing_stats``; a
        checkpoint is saved whenever at least ``self.checkpoint_interval``
        emails have been processed since the previous one, and once more at
        the end before the final results and summary are produced. The
        thread pool is shut down afterwards.

        Returns:
            True when processing completes.
        """
        logging.info("Starting O3-Mini optimized reasoning chain generation...")

        self.start_time = datetime.now()

        # Process in batches for concurrent execution
        batch_size = self.max_concurrent
        total_emails = len(self.sample_dataset)
        total_batches = (total_emails + batch_size - 1) // batch_size

        # Progress tracking
        correct_predictions = 0
        total_predictions = 0

        # processed_count only changes in steps of batch_size, so an exact
        # "count % interval == 0" test would almost never fire; track the
        # count at the last checkpoint instead.
        last_checkpoint_count = self.processed_count

        # Create progress bar
        pbar = tqdm(total=total_emails, desc="Processing emails with O3-Mini")

        try:
            # Process batches
            for batch_idx in range(total_batches):
                start_idx = batch_idx * batch_size
                end_idx = min(start_idx + batch_size, total_emails)

                batch_df = self.sample_dataset.iloc[start_idx:end_idx]

                logging.info(f"Processing batch {batch_idx + 1}/{total_batches} ({len(batch_df)} emails)")

                # Process batch concurrently
                batch_results = self.process_email_batch_concurrent(batch_df)

                # Process results
                for (reasoning_chain, reasoning_steps, stats, predicted_class_name, predicted_label), email_data in batch_results:
                    email_index = email_data[3]
                    row = self.sample_dataset.iloc[email_index]
                    original_label = int(row.get('label', 0))

                    # Update tracking (-1 means no usable prediction)
                    if predicted_label != -1:
                        total_predictions += 1
                        if predicted_label == original_label:
                            correct_predictions += 1

                    # Create enhanced record with full reasoning steps
                    enhanced_record = {
                        'original_index': email_index,
                        'subject': self.clean_text_for_csv(str(row.get('subject', ''))),
                        'body': self.clean_text_for_csv(str(row.get('body', ''))),
                        'original_label': original_label,
                        'original_label_name': "LEGITIMATE" if original_label == 0 else "PHISHING",
                        'model_predicted_label': predicted_label,
                        'model_predicted_label_name': predicted_class_name,
                        'prediction_correct': predicted_label == original_label,
                        'reasoning_chain': self.clean_text_for_csv(reasoning_chain),
                        'sender_analysis': reasoning_steps.get('sender_analysis', ''),
                        'language_patterns': reasoning_steps.get('language_patterns', ''),
                        'social_engineering': reasoning_steps.get('social_engineering', ''),
                        'technical_indicators': reasoning_steps.get('technical_indicators', ''),
                        'risk_assessment': reasoning_steps.get('risk_assessment', ''),
                        'confidence_assessment': reasoning_steps.get('confidence_assessment', ''),
                        'reasoning_generated': predicted_label != -1,
                        'reasoning_length': len(reasoning_chain),
                        'reasoning_word_count': len(reasoning_chain.split()),
                        'processing_time': stats['processing_time'],
                        'tokens_used': stats.get('total_tokens', 0),
                        'reasoning_tokens': stats.get('reasoning_tokens', 0),
                        'o3_mini_effort': self.reasoning_effort,
                        'generation_timestamp': datetime.now().isoformat()
                    }

                    self.enhanced_dataset.append(enhanced_record)
                    self.processing_stats.append(stats)
                    self.processed_count += 1

                    # Update progress bar
                    pbar.update(1)

                # Progress update
                accuracy = (correct_predictions / total_predictions * 100) if total_predictions > 0 else 0
                elapsed = (datetime.now() - self.start_time).total_seconds()
                emails_per_sec = self.processed_count / elapsed if elapsed > 0 else 0

                pbar.set_postfix({
                    'Accuracy': f"{accuracy:.1f}%",
                    'Speed': f"{emails_per_sec:.1f} emails/sec",
                    'Success': f"{self.successful_reasoning}/{self.processed_count}"
                })

                # Save checkpoint periodically. force=True because
                # _save_checkpoint applies its own exact-multiple check
                # when not forced.
                if self.processed_count - last_checkpoint_count >= self.checkpoint_interval:
                    self._save_checkpoint(force=True)
                    last_checkpoint_count = self.processed_count
        finally:
            # Always release the progress bar, even if a batch fails
            pbar.close()

        # Final save and summary
        self._save_checkpoint(force=True)
        self._save_final_results()
        self._print_summary()

        # Cleanup
        self.executor.shutdown(wait=True)

        return True

    def _save_checkpoint(self, force=False):
        """Save checkpoint"""
        if not force and self.processed_count % self.checkpoint_interval != 0:
            return

        try:
            checkpoint_file = os.path.join(self.checkpoint_path, f'o3_mini_checkpoint_{datetime.now().strftime("%Y%m%d_%H%M%S")}.pkl')

            checkpoint_data = {
                'processed_count': self.processed_count,
                'successful_reasoning': self.successful_reasoning,
                'failed_reasoning': self.failed_reasoning,
                'total_tokens_used': self.total_tokens_used,
                'total_processing_time': self.total_processing_time,
                'enhanced_dataset': self.enhanced_dataset,
                'processing_stats': self.processing_stats,
                'start_time': self.start_time,
                'o3_mini_config': {
                    'reasoning_effort': self.reasoning_effort,
                    'max_output_tokens': self.max_output_tokens,
                    'max_concurrent': self.max_concurrent,
                    'speed_mode': self.o3_speed_mode
                },
                'timestamp': datetime.now()
            }

            with open(checkpoint_file, 'wb') as f:
                pickle.dump(checkpoint_data, f)

            # Also save as CSV for inspection
            if self.enhanced_dataset:
                csv_file = checkpoint_file.replace('.pkl', '.csv')
                pd.DataFrame(self.enhanced_dataset).to_csv(csv_file, index=False, encoding='utf-8')

            logging.info(f"Checkpoint saved: {checkpoint_file}")

        except Exception as e:
            logging.error(f"Failed to save checkpoint: {e}")

    def _save_final_results(self):
        """Save final results"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        try:
            # Save enhanced dataset
            enhanced_df = pd.DataFrame(self.enhanced_dataset)
            enhanced_filename = os.path.join(self.final_dataset_path, f"o3_mini_optimized_dataset_{timestamp}.csv")
            enhanced_df.to_csv(enhanced_filename, index=False, encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC)

            logging.info(f"Enhanced dataset saved: {enhanced_filename}")

            # Save processing stats
            stats_filename = os.path.join(self.final_dataset_path, f"processing_stats_{timestamp}.json")
            with open(stats_filename, 'w', encoding='utf-8') as f:
                json.dump({
                    'pipeline_summary': self._get_pipeline_summary(),
                    'processing_stats': self.processing_stats
                }, f, indent=2, default=str)
            logging.info(f"Processing stats saved: {stats_filename}")

        except Exception as e:
            logging.error(f"Error saving results: {e}")

    def _get_pipeline_summary(self):
        """Get pipeline execution summary"""
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds() if self.start_time else 0

        correct = sum(1 for r in self.enhanced_dataset if r.get('prediction_correct', False))
        total_pred = sum(1 for r in self.enhanced_dataset if r.get('model_predicted_label', -1) != -1)
        accuracy = (correct / total_pred * 100) if total_pred > 0 else 0

        return {
            'start_time': self.start_time.isoformat() if self.start_time else None,
            'end_time': end_time.isoformat(),
            'total_duration_seconds': duration,
            'total_duration_formatted': str(timedelta(seconds=int(duration))),
            'total_emails_processed': self.processed_count,
            'successful_reasoning': self.successful_reasoning,
            'failed_reasoning': self.failed_reasoning,
            'success_rate': (self.successful_reasoning / self.processed_count * 100) if self.processed_count > 0 else 0,
            'prediction_accuracy': accuracy,
            'correct_predictions': correct,
            'total_predictions': total_pred,
            'total_tokens_used': self.total_tokens_used,
            'average_time_per_email': duration / self.processed_count if self.processed_count > 0 else 0,
            'average_tokens_per_email': self.total_tokens_used / self.processed_count if self.processed_count > 0 else 0,
            'o3_mini_configuration': {
                'reasoning_effort': self.reasoning_effort,
                'max_output_tokens': self.max_output_tokens,
                'max_concurrent': self.max_concurrent,
                'speed_mode': self.o3_speed_mode
            },
            'samples_per_class': self.samples_per_class,
            'google_drive_enabled': self.use_gdrive,
            'checkpoint_interval': self.checkpoint_interval
        }

    def _print_summary(self):
        """Print execution summary"""
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()

        correct = sum(1 for r in self.enhanced_dataset if r.get('prediction_correct', False))
        total_pred = sum(1 for r in self.enhanced_dataset if r.get('model_predicted_label', -1) != -1)
        accuracy = (correct / total_pred * 100) if total_pred > 0 else 0

        print("\n" + "="*80)
        print("O3-MINI OPTIMIZED REASONING PIPELINE COMPLETE")
        print("="*80)

        print(f"\nO3-MINI CONFIGURATION:")
        print(f"  Model: o3-mini ({self.reasoning_effort} effort)")
        print(f"  Max tokens: {self.max_output_tokens:,}")
        print(f"  Concurrent requests: {self.max_concurrent}")
        print(f"  Speed mode: {self.o3_speed_mode}")

        print(f"\nPROCESSING SUMMARY:")
        print(f"  Total Emails: {self.processed_count:,}")
        print(f"  Successful: {self.successful_reasoning:,}")
        print(f"  Failed: {self.failed_reasoning:,}")
        print(f"  Success Rate: {(self.successful_reasoning/self.processed_count*100):.1f}%")
        print(f"  Prediction Accuracy: {accuracy:.1f}%")

        print(f"\nPERFORMANCE:")
        print(f"  Total Duration: {timedelta(seconds=int(duration))}")
        print(f"  Avg Time/Email: {duration/self.processed_count:.2f}s")
        print(f"  Emails/Second: {self.processed_count/duration:.2f}")

        print(f"\nRESOURCE USAGE:")
        print(f"  Total Tokens: {self.total_tokens_used:,}")
        print(f"  Avg Tokens/Email: {self.total_tokens_used/self.processed_count:.0f}")

        print(f"\nOUTPUT LOCATION:")
        print(f"  Directory: {self.output_dir}")
        print(f"  Google Drive: {'Enabled' if self.use_gdrive else 'Disabled'}")

        print(f"\nFILES CREATED:")
        print(f"  Enhanced Dataset: final_datasets/o3_mini_optimized_dataset_*.csv")
        print(f"  Processing Stats: final_datasets/processing_stats_*.json")
        print(f"  Response Logs: response_logs/response_email_*.json")
        print(f"  Checkpoints: checkpoints/o3_mini_checkpoint_*.pkl")
        print(f"  Pipeline Logs: logs/o3_mini_optimized_*.log")

        print("="*80)

# ==================================================
# O3-MINI OPTIMIZED GOOGLE COLAB FUNCTIONS
# ==================================================

def run_o3_mini_speed_test(samples_per_class=10, speed_mode="medium",
                           max_concurrent=8, api_key=None):
    """Run a small end-to-end pass of the pipeline as a speed test.

    Args:
        samples_per_class: Number of emails to sample per class; the test
            processes twice this many emails in total.
        speed_mode: o3-mini speed mode passed to the pipeline
            ("low", "medium" or "high").
        max_concurrent: Number of parallel requests. Defaults to 8, the
            value that was previously hard-coded here.
        api_key: Optional API key forwarded to the pipeline, matching
            ``run_o3_mini_optimized_pipeline``. ``None`` keeps the
            pipeline's own default behaviour.

    Returns:
        The result of ``process_dataset_optimized()`` on success, or
        ``False`` when the dataset could not be loaded or any exception
        was raised (the exception is printed, not propagated).
    """
    print("O3-MINI SPEED TEST MODE")
    print("=" * 60)
    print(f"Speed mode: {speed_mode}")
    print(f"Testing with: {samples_per_class * 2} emails")

    try:
        pipeline = O3MiniOptimizedPipeline(
            dataset_path="Enron.csv",
            samples_per_class=samples_per_class,
            o3_speed_mode=speed_mode,
            max_concurrent=max_concurrent,
            api_key=api_key,
        )

        if not pipeline.load_and_sample_dataset():
            return False

        return pipeline.process_dataset_optimized()

    except Exception as e:
        # Notebook-facing entry point: report the failure instead of raising.
        print(f"O3-Mini speed test failed: {e}")
        return False

def run_o3_mini_optimized_pipeline(samples_per_class=5000,
                                  speed_mode="medium",
                                  max_concurrent=8,
                                  api_key=None):
    """Run the complete O3-Mini pipeline over ``2 * samples_per_class`` emails.

    Args:
        samples_per_class: Emails to sample per class.
        speed_mode: o3-mini speed mode ("low", "medium" or "high").
        max_concurrent: Number of parallel requests.
        api_key: Optional API key forwarded to the pipeline.

    Returns:
        The result of ``process_dataset_optimized()``, or ``False`` when
        the dataset fails to load or an exception occurs (the exception
        is printed, not propagated).
    """
    print("O3-MINI OPTIMIZED PIPELINE")
    print("=" * 60)
    print(f"Speed mode: {speed_mode}")
    print(f"Max concurrent: {max_concurrent}")
    print(f"Total emails: {samples_per_class * 2:,}")

    settings = {
        'samples_per_class': samples_per_class,
        'o3_speed_mode': speed_mode,
        'max_concurrent': max_concurrent,
        'api_key': api_key,
    }

    try:
        runner = O3MiniOptimizedPipeline(**settings)
        if runner.load_and_sample_dataset():
            return runner.process_dataset_optimized()
        return False
    except Exception as e:
        print(f"O3-Mini pipeline failed: {e}")
        return False

def show_o3_mini_speed_options():
    """Print a usage guide for the available o3-mini speed modes.

    The guide lists example calls for the low, medium and high effort
    modes, the quick test, concurrency suggestions and the optimization
    features. Nothing is returned.
    """
    usage_guide = """
O3-MINI SPEED MODES:

LOW EFFORT MODE (Maximum Speed):
```python
run_o3_mini_optimized_pipeline(
    samples_per_class=5000,
    speed_mode="low",
    max_concurrent=8
)
```
- Reasoning: o3-mini with LOW effort
- Processing: Fastest available mode
- Accuracy: Good performance maintained
- Cost: Significant reduction

MEDIUM EFFORT MODE (Recommended):
```python
run_o3_mini_optimized_pipeline(
    samples_per_class=5000,
    speed_mode="medium",
    max_concurrent=8
)
```
- Reasoning: o3-mini with MEDIUM effort
- Processing: Balanced speed and accuracy
- Accuracy: High quality results
- Cost: Moderate reduction

HIGH EFFORT MODE (Best Quality):
```python
run_o3_mini_optimized_pipeline(
    samples_per_class=5000,
    speed_mode="high",
    max_concurrent=6
)
```
- Reasoning: o3-mini with HIGH effort
- Processing: Enhanced quality analysis
- Accuracy: Maximum available
- Cost: Optimized from original

QUICK TEST:
```python
run_o3_mini_speed_test(samples_per_class=10, speed_mode="medium")
```

CONCURRENT PROCESSING OPTIONS:
- max_concurrent=8: Recommended for most accounts
- max_concurrent=6: Conservative approach
- max_concurrent=10: Aggressive processing

OPTIMIZATION FEATURES:
- Optimized prompts for efficiency
- Reduced token limits for speed
- Removed artificial delays
- Concurrent processing capabilities
- Smart batching and error handling
- Complete response logging system
- Enhanced email quality selection
- Real-time progress tracking
- Google Drive integration maintained
"""
    separator = "=" * 60

    print("O3-MINI SPEED OPTIMIZATION OPTIONS")
    print(separator)
    print(usage_guide)

if __name__ == "__main__":
    # When executed directly, list the notebook-facing entry points.
    for usage_line in (
        "O3-Mini Optimized Reasoning Pipeline",
        "Use these functions in your notebook:",
        "- run_o3_mini_speed_test(10)",
        "- run_o3_mini_optimized_pipeline(5000)",
        "- show_o3_mini_speed_options()",
    ):
        print(usage_line)

In [None]:
# Quick test with 4 emails (samples_per_class=2, i.e. 2 per class)
run_o3_mini_speed_test(samples_per_class=2, speed_mode="medium")  #reasoning

In [None]:

# Full pipeline with 10,000 emails (5,000 per class)
run_o3_mini_optimized_pipeline(
    samples_per_class=5000,
    speed_mode="medium",  # alternatives: "low" (maximum speed) or "high" (best quality)
    max_concurrent=8
)

#Data Analysis

In [None]:
!pip install pandas numpy matplotlib seaborn plotly wordcloud nltk textstat textblob scikit-learn

In [None]:
#!/usr/bin/env python3
"""
COMPLETE O3-MINI PHISHING DETECTION DATASET ANALYSIS
Final Version - Ready to Run in Google Colab

Research: Enhancing GPT-4o's Phishing Detection with o3-mini–Generated Chain-of-Thought Augmentation
Dataset: /content/drive/MyDrive/phishing_detection_final/output/final_datasets/o3_mini_optimized_dataset_20250709_185906.csv

Usage: Simply run all cells in order, or run the complete analysis with run_complete_analysis()
"""

# ===========================
# STEP 1: INSTALL DEPENDENCIES
# ===========================

# Install required packages
!pip install wordcloud textstat textblob plotly -q

# Mount Google Drive so that files under /content/drive (including the
# dataset configured below) are reachable from this notebook.
from google.colab import drive
try:
    drive.mount('/content/drive')
    print("✅ Google Drive mounted successfully")
except Exception as e:
    # Best-effort: report the problem and let the cell continue instead of
    # aborting; loading the dataset later will fail if the mount is missing.
    print(f"⚠️ Drive mounting issue: {e}")

# ===========================
# STEP 2: IMPORT LIBRARIES
# ===========================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Text analysis libraries
import re
import json
from collections import Counter
from datetime import datetime
import os
from wordcloud import WordCloud

# NLTK setup
import nltk
try:
    # Fetch the tokenizer and stopword resources, then build the English
    # stopword set used by the word clouds.
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize, sent_tokenize
    stop_words = set(stopwords.words('english'))
except Exception as e:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate. The fallback is made
    # visible because it silently degrades stopword filtering otherwise.
    print(f"⚠️ NLTK resources unavailable ({e}); using a minimal built-in stopword list")
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}

# Additional libraries
from textstat import flesch_reading_ease, flesch_kincaid_grade
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Set up plotting: use matplotlib's default style and make seaborn's
# "husl" palette the default colour cycle for subsequent plots.
plt.style.use('default')
sns.set_palette("husl")

# Confirmation that the import section above ran to completion.
print("🔧 All libraries imported successfully!")

# ===========================
# STEP 3: DATASET CONFIGURATION
# ===========================

# Your specific dataset path.
# NOTE: this is a hard-coded, timestamped file on the mounted Google Drive;
# update it to point at the CSV you want to analyse.
DATASET_PATH = "/content/drive/MyDrive/phishing_detection_final/output/final_datasets/o3_mini_optimized_dataset_20250709_185906.csv"

print(f"📂 Target dataset: {DATASET_PATH}")

# ===========================
# STEP 4: MAIN ANALYSIS CLASS
# ===========================

class O3MiniDatasetAnalyzer:
    """Complete analyzer for o3-mini generated phishing detection dataset"""

    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
        self.df = None
        self.analysis_results = {}

    def load_dataset(self):
        """Load and validate the dataset"""
        print("\n📊 LOADING O3-MINI DATASET")
        print("="*60)

        if not os.path.exists(self.dataset_path):
            print(f"❌ Dataset file not found: {self.dataset_path}")
            return False

        try:
            # Try multiple encodings
            for encoding in ['utf-8', 'latin-1', 'cp1252']:
                try:
                    self.df = pd.read_csv(self.dataset_path, encoding=encoding)
                    print(f"✅ Successfully loaded with {encoding} encoding")
                    break
                except UnicodeDecodeError:
                    continue

            if self.df is None:
                raise ValueError("Could not load dataset with any encoding")

            # Basic info
            print(f"📈 Dataset Shape: {self.df.shape[0]:,} rows × {self.df.shape[1]} columns")
            print(f"💾 Memory Usage: {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
            print(f"📅 File Size: {os.path.getsize(self.dataset_path) / 1024**2:.2f} MB")

            return True

        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return False

    def basic_overview(self):
        """Display basic dataset overview"""
        print("\n🔍 DATASET OVERVIEW")
        print("="*60)

        # Column information
        print("📋 COLUMNS:")
        for i, col in enumerate(self.df.columns, 1):
            dtype = str(self.df[col].dtype)
            null_count = self.df[col].isnull().sum()
            null_pct = (null_count / len(self.df)) * 100
            print(f"  {i:2d}. {col:<30} | {dtype:<12} | Nulls: {null_count:>4} ({null_pct:>4.1f}%)")

        # Key statistics
        print(f"\n🎯 KEY STATISTICS:")

        # Class distribution
        if 'original_label' in self.df.columns:
            label_counts = self.df['original_label'].value_counts()
            print(f"  Class Distribution:")
            print(f"    Legitimate (0): {label_counts.get(0, 0):,} ({label_counts.get(0, 0)/len(self.df)*100:.1f}%)")
            print(f"    Phishing (1):   {label_counts.get(1, 0):,} ({label_counts.get(1, 0)/len(self.df)*100:.1f}%)")

        # Model performance
        if 'prediction_correct' in self.df.columns:
            accuracy = self.df['prediction_correct'].mean() * 100
            correct_count = self.df['prediction_correct'].sum()
            print(f"  O3-Mini Accuracy: {accuracy:.2f}% ({correct_count:,}/{len(self.df):,})")

        # Processing efficiency
        if 'processing_time' in self.df.columns:
            avg_time = self.df['processing_time'].mean()
            total_time = self.df['processing_time'].sum()
            print(f"  Processing Time: {avg_time:.2f}s avg, {total_time/3600:.2f}h total")

        if 'tokens_used' in self.df.columns:
            avg_tokens = self.df['tokens_used'].mean()
            total_tokens = self.df['tokens_used'].sum()
            print(f"  Token Usage: {avg_tokens:.0f} avg, {total_tokens:,} total")

        # Store results
        self.analysis_results['basic_stats'] = {
            'total_emails': len(self.df),
            'columns': len(self.df.columns),
            'memory_mb': self.df.memory_usage(deep=True).sum() / 1024**2
        }

    def analyze_text_characteristics(self):
        """Analyze text characteristics of emails and reasoning"""
        print("\n📝 TEXT ANALYSIS")
        print("="*60)

        text_analysis = {}

        # Analyze key text columns
        text_columns = {
            'subject': 'Email Subjects',
            'body': 'Email Bodies',
            'reasoning_chain': 'Reasoning Chains'
        }

        for col, label in text_columns.items():
            if col in self.df.columns:
                print(f"\n📊 {label.upper()}:")

                texts = self.df[col].fillna('').astype(str)

                # Length statistics
                char_lengths = texts.str.len()
                word_counts = texts.apply(lambda x: len(x.split()))

                print(f"  Character Length - Mean: {char_lengths.mean():.0f}, Median: {char_lengths.median():.0f}")
                print(f"  Word Count - Mean: {word_counts.mean():.0f}, Median: {word_counts.median():.0f}")
                print(f"  Range: {char_lengths.min():.0f} - {char_lengths.max():.0f} characters")

                text_analysis[col] = {
                    'avg_chars': char_lengths.mean(),
                    'avg_words': word_counts.mean(),
                    'median_chars': char_lengths.median(),
                    'median_words': word_counts.median()
                }

        # Reasoning component analysis
        reasoning_components = [
            'sender_analysis', 'language_patterns', 'social_engineering',
            'technical_indicators', 'risk_assessment'
        ]

        print(f"\n🧠 REASONING COMPONENTS:")
        component_analysis = {}

        for component in reasoning_components:
            if component in self.df.columns:
                comp_texts = self.df[component].fillna('').astype(str)
                non_empty = (comp_texts.str.len() > 10).sum()
                completeness = non_empty / len(self.df) * 100
                avg_words = comp_texts.apply(lambda x: len(x.split())).mean()

                print(f"  {component.replace('_', ' ').title():<20}: {completeness:>5.1f}% complete, {avg_words:>4.0f} words avg")

                component_analysis[component] = {
                    'completeness': completeness,
                    'avg_words': avg_words
                }

        self.analysis_results['text_analysis'] = text_analysis
        self.analysis_results['reasoning_components'] = component_analysis

    def analyze_model_performance(self):
        """Print and record classification metrics for the model predictions.

        Only rows whose ``model_predicted_label`` is 0 or 1 are evaluated.
        Reports accuracy, a 2x2 confusion matrix, per-class
        precision/recall/F1 and, when both ``reasoning_length`` and
        ``prediction_correct`` exist, accuracy per reasoning-length
        quartile. Results are stored under
        ``analysis_results['model_performance']``.

        Returns early (after printing a message) when the required columns
        are missing or no valid predictions exist.
        """
        print("\n🤖 O3-MINI MODEL PERFORMANCE")
        print("="*60)

        if not all(col in self.df.columns for col in ['original_label', 'model_predicted_label']):
            print("❌ Required columns not found for performance analysis")
            return

        # Filter valid predictions
        valid_mask = self.df['model_predicted_label'].isin([0, 1])
        valid_df = self.df[valid_mask].copy()

        if len(valid_df) == 0:
            print("❌ No valid predictions found")
            return

        y_true = valid_df['original_label']
        y_pred = valid_df['model_predicted_label']

        # Pin the label set so the matrix is always 2x2 and the report always
        # has both classes, even when only one class occurs in the data.
        # Without this, cm[0, 1] raises IndexError and target_names mismatches.
        class_labels = [0, 1]

        # Overall metrics
        accuracy = accuracy_score(y_true, y_pred)
        print(f"🎯 Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

        # Confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=class_labels)
        print("\n📊 Confusion Matrix:")
        print("                 Predicted")
        print("Actual     Legit    Phishing")
        print(f"Legit      {cm[0,0]:>5}    {cm[0,1]:>8}")
        print(f"Phishing   {cm[1,0]:>5}    {cm[1,1]:>8}")

        # Detailed classification report
        print("\n📈 Classification Report:")
        report = classification_report(
            y_true,
            y_pred,
            labels=class_labels,
            target_names=['Legitimate', 'Phishing'],
            output_dict=True,
            zero_division=0,  # a class with no samples scores 0
        )

        for class_name in ('Legitimate', 'Phishing'):
            metrics = report[class_name]
            print(f"  {class_name}:")
            print(f"    Precision: {metrics['precision']:.3f}")
            print(f"    Recall:    {metrics['recall']:.3f}")
            print(f"    F1-Score:  {metrics['f1-score']:.3f}")

        # Performance by reasoning quality (needs both columns).
        if 'reasoning_length' in valid_df.columns and 'prediction_correct' in valid_df.columns:
            print("\n🧠 Performance by Reasoning Length:")
            try:
                valid_df['reasoning_quartile'] = pd.qcut(
                    valid_df['reasoning_length'],
                    q=4,
                    labels=['Q1 (Short)', 'Q2', 'Q3', 'Q4 (Long)']
                )
            except ValueError as e:
                # qcut raises when quartile edges coincide (many equal
                # lengths or too few rows); skip the breakdown in that case.
                print(f"    Skipped: could not form four quartiles ({e})")
            else:
                quartile_accuracy = valid_df.groupby(
                    'reasoning_quartile', observed=False
                )['prediction_correct'].mean()
                for quartile, acc in quartile_accuracy.items():
                    print(f"    {quartile}: {acc:.3f} ({acc*100:.1f}%)")

        self.analysis_results['model_performance'] = {
            'accuracy': accuracy,
            'confusion_matrix': cm.tolist(),
            'classification_report': report
        }

    def analyze_processing_efficiency(self):
        """Analyze processing efficiency and costs"""
        print("\n⚡ PROCESSING EFFICIENCY")
        print("="*60)

        # Processing time analysis
        if 'processing_time' in self.df.columns:
            time_stats = self.df['processing_time'].describe()
            total_time = self.df['processing_time'].sum()

            print(f"⏱️  Processing Time Statistics:")
            print(f"  Total Time: {total_time/3600:.2f} hours")
            print(f"  Average: {time_stats['mean']:.2f}s per email")
            print(f"  Median: {time_stats['50%']:.2f}s")
            print(f"  Range: {time_stats['min']:.2f}s - {time_stats['max']:.2f}s")
            print(f"  Throughput: {len(self.df)/total_time:.2f} emails/second")

        # Token usage and cost analysis
        if 'tokens_used' in self.df.columns:
            token_stats = self.df['tokens_used'].describe()
            total_tokens = self.df['tokens_used'].sum()

            # Cost calculation (o3-mini pricing: $1.10 per million input tokens)
            # estimated_cost = (total_tokens / 1_000_000) * 1.10
            # cost_per_email = estimated_cost / len(self.df)

            print(f"\n🪙 Token Usage & Cost Analysis:")
            print(f"  Total Tokens: {total_tokens:,}")
            print(f"  Average per Email: {token_stats['mean']:.0f}")
            print(f"  Median: {token_stats['50%']:.0f}")
            print(f"  Range: {token_stats['min']:.0f} - {token_stats['max']:.0f}")
            # print(f"\n💰 Cost Estimation:")
            # print(f"  Total Cost: ${estimated_cost:.2f}")
            # print(f"  Cost per Email: ${cost_per_email:.4f}")
            # print(f"  Cost per 1000 emails: ${cost_per_email * 1000:.2f}")

            self.analysis_results['efficiency'] = {
                'total_tokens': int(total_tokens),
                'avg_tokens': token_stats['mean']
                # 'estimated_cost': estimated_cost,
                # 'cost_per_email': cost_per_email
            }

    def create_visualizations(self):
        """Create comprehensive visualizations"""
        print("\n📊 CREATING VISUALIZATIONS")
        print("="*60)

        # Create a comprehensive dashboard
        fig = plt.figure(figsize=(20, 24))

        # 1. Class Distribution
        if 'original_label' in self.df.columns:
            plt.subplot(4, 3, 1)
            label_counts = self.df['original_label'].value_counts()
            colors = ['#2E8B57', '#DC143C']
            plt.pie(label_counts.values, labels=['Legitimate', 'Phishing'],
                   autopct='%1.1f%%', colors=colors, startangle=90)
            plt.title('Class Distribution', fontsize=14, fontweight='bold')

        # 2. Model Accuracy
        if 'prediction_correct' in self.df.columns:
            plt.subplot(4, 3, 2)
            correct_counts = self.df['prediction_correct'].value_counts()
            colors = ['#FF6B6B', '#4ECDC4']
            plt.pie(correct_counts.values, labels=['Incorrect', 'Correct'],
                   autopct='%1.1f%%', colors=colors, startangle=90)
            plt.title('Prediction Accuracy', fontsize=14, fontweight='bold')

        # 3. Processing Time Distribution
        if 'processing_time' in self.df.columns:
            plt.subplot(4, 3, 3)
            plt.hist(self.df['processing_time'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
            plt.xlabel('Processing Time (seconds)')
            plt.ylabel('Frequency')
            plt.title('Processing Time Distribution', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)

        # 4. Token Usage Distribution
        if 'tokens_used' in self.df.columns:
            plt.subplot(4, 3, 4)
            plt.hist(self.df['tokens_used'], bins=50, alpha=0.7, color='lightcoral', edgecolor='black')
            plt.xlabel('Tokens Used')
            plt.ylabel('Frequency')
            plt.title('Token Usage Distribution', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)

        # 5. Subject Length by Class
        if 'subject' in self.df.columns and 'original_label' in self.df.columns:
            plt.subplot(4, 3, 5)
            subject_lengths = self.df['subject'].fillna('').str.len()
            for label, name, color in [(0, 'Legitimate', 'green'), (1, 'Phishing', 'red')]:
                data = subject_lengths[self.df['original_label'] == label]
                plt.hist(data, bins=30, alpha=0.6, label=name, color=color, density=True)
            plt.xlabel('Subject Length (characters)')
            plt.ylabel('Density')
            plt.title('Subject Length by Class', fontsize=14, fontweight='bold')
            plt.legend()
            plt.grid(True, alpha=0.3)

        # 6. Body Word Count by Class
        if 'body' in self.df.columns and 'original_label' in self.df.columns:
            plt.subplot(4, 3, 6)
            body_words = self.df['body'].fillna('').apply(lambda x: len(x.split()))
            for label, name, color in [(0, 'Legitimate', 'green'), (1, 'Phishing', 'red')]:
                data = body_words[self.df['original_label'] == label]
                plt.hist(data, bins=30, alpha=0.6, label=name, color=color, density=True)
            plt.xlabel('Body Word Count')
            plt.ylabel('Density')
            plt.title('Email Body Length by Class', fontsize=14, fontweight='bold')
            plt.legend()
            plt.grid(True, alpha=0.3)

        # 7. Reasoning Length Distribution
        if 'reasoning_length' in self.df.columns:
            plt.subplot(4, 3, 7)
            plt.hist(self.df['reasoning_length'], bins=50, alpha=0.7, color='gold', edgecolor='black')
            plt.xlabel('Reasoning Length (characters)')
            plt.ylabel('Frequency')
            plt.title('Reasoning Chain Length', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)

        # 8. Confusion Matrix Heatmap
        if 'original_label' in self.df.columns and 'model_predicted_label' in self.df.columns:
            plt.subplot(4, 3, 8)
            valid_mask = self.df['model_predicted_label'].isin([0, 1])
            if valid_mask.any():
                cm = confusion_matrix(
                    self.df.loc[valid_mask, 'original_label'],
                    self.df.loc[valid_mask, 'model_predicted_label']
                )
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                           xticklabels=['Legitimate', 'Phishing'],
                           yticklabels=['Legitimate', 'Phishing'])
                plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
                plt.ylabel('Actual')
                plt.xlabel('Predicted')

        # 9. Processing Time vs Tokens
        if 'processing_time' in self.df.columns and 'tokens_used' in self.df.columns:
            plt.subplot(4, 3, 9)
            plt.scatter(self.df['processing_time'], self.df['tokens_used'],
                       alpha=0.6, c='purple', s=10)
            plt.xlabel('Processing Time (seconds)')
            plt.ylabel('Tokens Used')
            plt.title('Processing Time vs Token Usage', fontsize=14, fontweight='bold')
            plt.grid(True, alpha=0.3)

            corr = self.df['processing_time'].corr(self.df['tokens_used'])
            plt.text(0.05, 0.95, f'Correlation: {corr:.3f}',
                    transform=plt.gca().transAxes,
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

        # 10. Reasoning Component Completeness
        reasoning_components = ['sender_analysis', 'language_patterns', 'social_engineering',
                              'technical_indicators', 'risk_assessment']
        available_components = [col for col in reasoning_components if col in self.df.columns]

        if available_components:
            plt.subplot(4, 3, 10)
            completeness = []
            for component in available_components:
                non_empty = (self.df[component].fillna('').astype(str).str.len() > 10).sum()
                completeness.append(non_empty / len(self.df) * 100)

            bars = plt.bar(range(len(available_components)), completeness,
                          color='lightgreen', alpha=0.8)
            plt.xlabel('Reasoning Components')
            plt.ylabel('Completeness (%)')
            plt.title('Reasoning Component Completeness', fontsize=14, fontweight='bold')
            plt.xticks(range(len(available_components)),
                      [comp.replace('_', '\n') for comp in available_components], rotation=45)
            plt.grid(True, alpha=0.3)

            for bar, value in zip(bars, completeness):
                plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                        f'{value:.1f}%', ha='center', va='bottom', fontsize=10)

        # 11. Word Clouds
        if 'body' in self.df.columns and 'original_label' in self.df.columns:
            # Legitimate emails word cloud
            plt.subplot(4, 3, 11)
            legitimate_texts = ' '.join(
                self.df[self.df['original_label'] == 0]['body'].fillna('').astype(str)
            )
            if legitimate_texts.strip():
                wordcloud = WordCloud(width=400, height=300, background_color='white',
                                    stopwords=stop_words, max_words=50,
                                    colormap='Greens').generate(legitimate_texts)
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.title('Legitimate Emails - Top Words', fontsize=14, fontweight='bold')
                plt.axis('off')

        # 12. Phishing word cloud
        if 'body' in self.df.columns and 'original_label' in self.df.columns:
            plt.subplot(4, 3, 12)
            phishing_texts = ' '.join(
                self.df[self.df['original_label'] == 1]['body'].fillna('').astype(str)
            )
            if phishing_texts.strip():
                wordcloud = WordCloud(width=400, height=300, background_color='white',
                                    stopwords=stop_words, max_words=50,
                                    colormap='Reds').generate(phishing_texts)
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.title('Phishing Emails - Top Words', fontsize=14, fontweight='bold')
                plt.axis('off')

        plt.tight_layout(pad=3.0)
        plt.savefig('o3_mini_complete_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("✅ Comprehensive visualization saved as 'o3_mini_complete_analysis.png'")

    def generate_summary_report(self):
        """Generate a comprehensive summary report"""
        print("\n📄 GENERATING SUMMARY REPORT")
        print("="*60)

        # Create detailed report
        report = f"""
# O3-MINI PHISHING DETECTION DATASET ANALYSIS REPORT

## Research Project
**Title:** Enhancing GPT-4o's Phishing Detection with o3-mini–Generated Chain-of-Thought Augmentation
**Dataset:** o3_mini_optimized_dataset_20250709_185906.csv
**Analysis Date:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Dataset Overview
- **Total Emails:** {len(self.df):,}
- **Total Columns:** {len(self.df.columns)}
- **Memory Usage:** {self.df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
- **File Size:** {os.path.getsize(self.dataset_path) / 1024**2:.2f} MB

## Key Findings
"""

        # Add key metrics
        if 'prediction_correct' in self.df.columns:
            accuracy = self.df['prediction_correct'].mean() * 100
            report += f"- **O3-Mini Model Accuracy:** {accuracy:.2f}%\n"

        if 'original_label' in self.df.columns:
            label_dist = self.df['original_label'].value_counts()
            report += f"- **Class Distribution:** Legitimate: {label_dist.get(0, 0):,}, Phishing: {label_dist.get(1, 0):,}\n"

        if 'tokens_used' in self.df.columns:
            avg_tokens = self.df['tokens_used'].mean()
            total_tokens = self.df['tokens_used'].sum()
            estimated_cost = (total_tokens / 1_000_000) * 1.10
            report += f"- **Token Usage:** {avg_tokens:.0f} avg per email, {total_tokens:,} total\n"
            report += f"- **Estimated Cost:** ${estimated_cost:.2f} total, ${estimated_cost/len(self.df):.4f} per email\n"

        if 'processing_time' in self.df.columns:
            avg_time = self.df['processing_time'].mean()
            total_time = self.df['processing_time'].sum()
            report += f"- **Processing Time:** {avg_time:.2f}s avg per email, {total_time/3600:.2f}h total\n"

        # Add research implications
        report += f"""
## Research Implications
1. **Perfect Accuracy Achievement:** The o3-mini model achieved exceptional performance on this dataset
2. **Comprehensive Reasoning:** All emails have complete 5-component reasoning chains
3. **Scalable Processing:** Average processing time of {self.df['processing_time'].mean():.2f}s per email
4. **Cost-Effective:** Estimated cost of ${(self.df['tokens_used'].sum() / 1_000_000) * 1.10 / len(self.df):.4f} per email analysis

## Technical Quality
- **Data Completeness:** High quality dataset with minimal missing values
- **Reasoning Components:** All 5 reasoning dimensions consistently populated
- **Chain-of-Thought Quality:** Comprehensive reasoning chains averaging {self.df['reasoning_chain'].str.len().mean():.0f} characters

## Next Steps
1. Use this dataset for GPT-4o fine-tuning
2. Evaluate enhanced model against baselines
3. Test on novel phishing attack vectors
4. Publish methodology and results

---
Generated by O3-Mini Dataset Analyzer
"""

        # Save report
        with open('o3_mini_analysis_report.md', 'w', encoding='utf-8') as f:
            f.write(report)

        # Save analysis results as JSON
        with open('o3_mini_analysis_results.json', 'w', encoding='utf-8') as f:
            json.dump(self.analysis_results, f, indent=2, default=str)

        print("✅ Summary report saved as 'o3_mini_analysis_report.md'")
        print("✅ Analysis results saved as 'o3_mini_analysis_results.json'")

        return report

    def run_complete_analysis(self):
        """Execute every analysis stage in order and list the generated files.

        Returns:
            bool: False when ``load_dataset`` reports failure (no stage is run),
            True once all stages have finished.
        """
        banner = "="*80
        print("🚀 STARTING COMPLETE O3-MINI DATASET ANALYSIS")
        print(banner)

        # Without data there is nothing to analyse.
        dataset_ready = self.load_dataset()
        if not dataset_ready:
            return False

        # Stages are looked up by name one at a time, in execution order.
        stage_names = (
            'basic_overview',
            'analyze_text_characteristics',
            'analyze_model_performance',
            'analyze_processing_efficiency',
            'create_visualizations',
            'generate_summary_report',
        )
        for stage_name in stage_names:
            getattr(self, stage_name)()

        closing_lines = (
            "\n🎉 ANALYSIS COMPLETE!",
            banner,
            "📁 Generated Files:",
            "   - o3_mini_complete_analysis.png (Comprehensive visualizations)",
            "   - o3_mini_analysis_report.md (Detailed summary report)",
            "   - o3_mini_analysis_results.json (Raw analysis data)",
            banner,
        )
        for line in closing_lines:
            print(line)

        return True

# ===========================
# STEP 5: QUICK ANALYSIS FUNCTIONS
# ===========================

def quick_load_and_preview():
    """Read the CSV at ``DATASET_PATH`` and print a short preview of it.

    Prints the shape, the first three rows and, when the columns exist, the
    class distribution and the model accuracy.

    Returns:
        The loaded DataFrame, or None if any step raised (the error is printed).
    """
    print("🔍 QUICK DATASET PREVIEW")
    print("="*50)

    try:
        df = pd.read_csv(DATASET_PATH, encoding='utf-8')
        row_count, column_count = df.shape
        print("✅ Dataset loaded successfully!")
        print(f"📊 Shape: {row_count:,} rows × {column_count} columns")

        # Head of the table, rendered without truncation
        print("\n📋 First 3 rows:")
        print(df.head(3).to_string())

        # Optional statistics, shown only for columns that are present
        print("\n📈 Quick Statistics:")
        available = df.columns
        if 'original_label' in available:
            distribution = df['original_label'].value_counts().to_dict()
            print(f"   Class distribution: {distribution}")
        if 'prediction_correct' in available:
            accuracy_pct = df['prediction_correct'].mean()*100
            print(f"   Model accuracy: {accuracy_pct:.2f}%")

        return df

    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return None

def run_quick_analysis():
    """Load the dataset and draw a 2x2 grid of quick-look charts.

    Panels (each drawn only when its column exists): class distribution,
    prediction correctness, processing-time histogram, token-usage histogram.

    Pie labels are derived from the values in each ``value_counts()`` index.
    ``value_counts()`` orders entries by frequency, so a fixed label list can
    attach the wrong name to a slice, and fails outright when only one value
    is present.

    Returns:
        The DataFrame from ``quick_load_and_preview``, or None when loading failed.
    """
    print("⚡ QUICK ANALYSIS MODE")
    print("="*50)

    df = quick_load_and_preview()
    if df is None:
        return None

    # Quick visualizations
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Class distribution
    if 'original_label' in df.columns:
        label_counts = df['original_label'].value_counts()
        class_names = {0: 'Legitimate', 1: 'Phishing'}
        # Unknown label values fall back to their own text instead of being mislabelled.
        class_labels = [class_names.get(value, str(value)) for value in label_counts.index]
        axes[0, 0].pie(label_counts.values, labels=class_labels,
                      autopct='%1.1f%%', startangle=90)
        axes[0, 0].set_title('Class Distribution')

    # Model accuracy
    if 'prediction_correct' in df.columns:
        correct_counts = df['prediction_correct'].value_counts()
        # True/False keys also match 1/0 because they hash and compare equal.
        outcome_names = {True: 'Correct', False: 'Incorrect'}
        outcome_labels = [outcome_names.get(value, str(value)) for value in correct_counts.index]
        axes[0, 1].pie(correct_counts.values, labels=outcome_labels,
                      autopct='%1.1f%%', startangle=90)
        axes[0, 1].set_title('Model Accuracy')

    # Processing time
    if 'processing_time' in df.columns:
        axes[1, 0].hist(df['processing_time'], bins=30, alpha=0.7, color='skyblue')
        axes[1, 0].set_title('Processing Time Distribution')
        axes[1, 0].set_xlabel('Seconds')

    # Token usage
    if 'tokens_used' in df.columns:
        axes[1, 1].hist(df['tokens_used'], bins=30, alpha=0.7, color='lightcoral')
        axes[1, 1].set_title('Token Usage Distribution')
        axes[1, 1].set_xlabel('Tokens')

    plt.tight_layout()
    plt.show()

    print("✅ Quick analysis complete!")
    return df

# ===========================
# STEP 6: MAIN EXECUTION
# ===========================

def run_complete_analysis():
    """Create an analyzer for ``DATASET_PATH`` and run its full analysis.

    Returns:
        Whatever the analyzer's ``run_complete_analysis`` method returns.
    """
    return O3MiniDatasetAnalyzer(DATASET_PATH).run_complete_analysis()

# ===========================
# STEP 7: READY-TO-RUN COMMANDS
# ===========================

# Announce the analyzer and list the entry points defined in this cell.
print("🔬 O3-MINI DATASET ANALYZER - READY TO RUN")
print("="*80)
print("📂 Dataset Path:", DATASET_PATH)
print("\nChoose your analysis level:")
print("1. Quick Preview: run quick_load_and_preview()")
print("2. Quick Analysis: run_quick_analysis()")
print("3. Complete Analysis: run_complete_analysis()")
print("="*80)

# The complete analysis runs as soon as this cell executes.
# Comment out the call below to pick an analysis level manually instead.
run_complete_analysis()

# Alternative: run the lighter quick analysis instead (disabled by default).
# df = run_quick_analysis()

In [None]:
#!/usr/bin/env python3
"""
Complete GPT-4o Mini Fine-tuning Pipeline
Converts o3-mini generated dataset to GPT-4o mini fine-tuning format and handles training

Research: Enhancing GPT-4o's Phishing Detection with o3-mini–Generated Chain-of-Thought Augmentation

FIXED VERSION - Key Improvements:
- Corrected token limits (128K context window)
- Enhanced data validation
- Improved system prompt
- Complete fine-tuning workflow
- Removed cost calculations (tokens only)
- Added monitoring capabilities
"""

# Install required packages
!pip install tiktoken openai -q

import pandas as pd
import json
import tiktoken
import numpy as np
from collections import defaultdict
import os
from datetime import datetime
import time
from openai import OpenAI

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

print("🔧 COMPLETE GPT-4o MINI FINE-TUNING PIPELINE")
print("="*70)

# Dataset configuration - FIXED VALUES
DATASET_PATH = "/content/drive/MyDrive/phishing_detection_final/output/final_datasets/o3_mini_optimized_dataset_20250709_185906.csv"

# GPT-4o mini parameters
# NOTE(review): the fine-tuning per-example token limit may be lower than the
# 128K inference context window — confirm against the current OpenAI
# fine-tuning documentation before relying on this value.
MAX_TOKENS_PER_EXAMPLE = 100000    # Upper bound applied when filtering examples
ENCODING_NAME = "o200k_base"       # GPT-4o family encoding
TARGET_TRAINING_EXAMPLES = 5000    # Recommended range
TARGET_VALIDATION_EXAMPLES = 1000

# OpenAI API configuration
# SECURITY: the secret key is read from the environment and must never be
# committed to source. The key that was previously embedded here has been
# exposed and should be revoked in the OpenAI dashboard.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_API_KEY:
    print("⚠️  OPENAI_API_KEY is not set; export it before uploading files or starting a job.")

print(f"📂 Source Dataset: {DATASET_PATH}")
print(f"🎯 Target Training Examples: {TARGET_TRAINING_EXAMPLES:,}")
print(f"🎯 Target Validation Examples: {TARGET_VALIDATION_EXAMPLES:,}")
print(f"🪙 Token Limit per Example: {MAX_TOKENS_PER_EXAMPLE:,}")

class CompleteFinetuningPipeline:
    """Complete pipeline for GPT-4o mini fine-tuning with monitoring"""

    def __init__(self, dataset_path, api_key):
        """Create the tokenizer and API client and reset all pipeline state.

        Args:
            dataset_path: Path of the CSV later read by ``load_dataset``.
            api_key: Key handed to the ``OpenAI`` client constructor.
        """
        # Input location; the DataFrame itself is filled in by load_dataset().
        self.dataset_path, self.df = dataset_path, None

        # External handles: tokenizer for counting tokens, client for API calls.
        self.encoding = tiktoken.get_encoding(ENCODING_NAME)
        self.client = OpenAI(api_key=api_key)

        # Prepared examples, populated by prepare_training_data().
        self.training_data = []
        self.validation_data = []

        # Identifiers recorded as the fine-tuning workflow progresses.
        self.training_file_id = None
        self.validation_file_id = None
        self.job_id = None
        self.fine_tuned_model = None

    def load_dataset(self):
        """Load and examine the dataset"""
        print("\n📊 LOADING DATASET")
        print("-" * 40)

        try:
            self.df = pd.read_csv(self.dataset_path, encoding='utf-8')
            print(f"✅ Dataset loaded: {len(self.df):,} rows")

            # Show relevant columns
            key_columns = ['subject', 'body', 'original_label', 'original_label_name',
                          'reasoning_chain', 'sender_analysis', 'language_patterns',
                          'social_engineering', 'technical_indicators', 'risk_assessment']

            print(f"\n📋 KEY COLUMNS FOR FINE-TUNING:")
            for i, col in enumerate(key_columns, 1):
                if col in self.df.columns:
                    non_null = self.df[col].notna().sum()
                    print(f"  {i:2d}. {col:<25}: {non_null:,}/{len(self.df):,} ({non_null/len(self.df)*100:.1f}%)")
                else:
                    print(f"  {i:2d}. {col:<25}: ❌ Missing")

            # Class distribution
            if 'original_label' in self.df.columns:
                dist = self.df['original_label'].value_counts()
                print(f"\n🎯 Class Distribution:")
                print(f"   Legitimate (0): {dist.get(0, 0):,}")
                print(f"   Phishing (1):   {dist.get(1, 0):,}")

            return True

        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return False

    def create_enhanced_system_prompt(self):
        """Enhanced system prompt for phishing detection"""
        return """You are an expert cybersecurity analyst specializing in email threat detection. Your task is to analyze emails for phishing indicators using a structured approach.

ANALYSIS FRAMEWORK:
1. SENDER ANALYSIS: Examine sender authenticity, domain reputation, email authentication indicators
2. LANGUAGE PATTERNS: Analyze grammar, writing style, urgency indicators, linguistic manipulation
3. SOCIAL ENGINEERING: Identify psychological manipulation, emotional triggers, deceptive tactics
4. TECHNICAL INDICATORS: Assess URLs, attachments, technical attack vectors, suspicious elements
5. RISK ASSESSMENT: Synthesize findings and provide evidence-based classification

OUTPUT FORMAT:
- Provide detailed analysis for each component
- End with "**FINAL CLASSIFICATION:** LEGITIMATE" or "**FINAL CLASSIFICATION:** PHISHING"
- Be specific about evidence and reasoning

Your analysis should be thorough, evidence-based, and follow cybersecurity best practices."""

    def enhanced_data_validation(self, row):
        """Enhanced validation for training examples"""
        subject = str(row.get('subject', '')).strip()
        body = str(row.get('body', '')).strip()

        # Check minimum content length
        if len(subject) < 5 or len(body) < 50:
            return False, "Content too short"

        # Check for actual reasoning content
        reasoning_fields = ['sender_analysis', 'language_patterns', 'social_engineering',
                           'technical_indicators', 'risk_assessment']

        valid_reasoning = 0
        for field in reasoning_fields:
            content = str(row.get(field, '')).strip()
            if content and content != 'nan' and len(content) > 20:
                valid_reasoning += 1

        if valid_reasoning < 3:  # Require at least 3 reasoning components
            return False, "Insufficient reasoning content"

        return True, "Valid"

    def create_training_example(self, row):
        """Create a single training example in GPT-4o mini format"""

        # Validate first
        is_valid, reason = self.enhanced_data_validation(row)
        if not is_valid:
            raise ValueError(f"Invalid data: {reason}")

        # Extract data
        subject = str(row.get('subject', 'No Subject')).strip()
        body = str(row.get('body', '')).strip()
        label = row.get('original_label', 0)
        label_name = "LEGITIMATE" if label == 0 else "PHISHING"

        # Get reasoning components
        reasoning_components = {
            'sender_analysis': str(row.get('sender_analysis', '')).strip(),
            'language_patterns': str(row.get('language_patterns', '')).strip(),
            'social_engineering': str(row.get('social_engineering', '')).strip(),
            'technical_indicators': str(row.get('technical_indicators', '')).strip(),
            'risk_assessment': str(row.get('risk_assessment', '')).strip()
        }

        # Create user message (email content)
        user_content = f"""Please analyze this email for phishing indicators:

SUBJECT: {subject}

BODY: {body}

Provide detailed analysis through all five components and your final classification."""

        # Create assistant response (detailed reasoning + classification)
        assistant_parts = []

        for component, content in reasoning_components.items():
            if content and content != 'nan' and len(content) > 10:
                component_title = component.replace('_', ' ').title()
                assistant_parts.append(f"**{component_title.upper()}:**\n{content}")

        # Add final classification
        assistant_parts.append(f"**FINAL CLASSIFICATION:** {label_name}")

        assistant_content = "\n\n".join(assistant_parts)

        # Create the training example
        example = {
            "messages": [
                {
                    "role": "system",
                    "content": self.create_enhanced_system_prompt()
                },
                {
                    "role": "user",
                    "content": user_content
                },
                {
                    "role": "assistant",
                    "content": assistant_content
                }
            ]
        }

        return example

    def count_tokens(self, messages):
        """Count tokens in a message sequence"""
        total_tokens = 0

        for message in messages:
            # Count tokens for role and content
            total_tokens += len(self.encoding.encode(message["role"]))
            total_tokens += len(self.encoding.encode(message["content"]))
            total_tokens += 3  # OpenAI message formatting tokens

        total_tokens += 3  # Additional formatting tokens
        return total_tokens

    def prepare_training_data(self):
        """Sample, convert and split the dataset into training and validation sets.

        Rows missing subject, body or label are dropped; an equal number of
        rows is sampled per class, shuffled, converted with
        ``create_training_example`` and filtered by ``MAX_TOKENS_PER_EXAMPLE``.
        The surviving examples are stored in ``self.training_data`` and
        ``self.validation_data``.

        Returns:
            bool: True when at least one valid example was produced.
        """
        print("\n🔧 PREPARING TRAINING DATA")
        print("-" * 40)

        # Filter out rows with missing critical data
        required_cols = ['subject', 'body', 'original_label']
        complete_rows = self.df[required_cols].notna().all(axis=1)
        clean_df = self.df[complete_rows].copy()

        print(f"📊 Clean examples: {len(clean_df):,}/{len(self.df):,}")

        # Split by class so both classes can be sampled equally
        labels = clean_df['original_label']
        legitimate_df = clean_df[labels == 0]
        phishing_df = clean_df[labels == 1]

        print(f"   Legitimate: {len(legitimate_df):,}")
        print(f"   Phishing:   {len(phishing_df):,}")

        # Per-class quota: half of the overall target, capped by the smaller class
        total_needed = TARGET_TRAINING_EXAMPLES + TARGET_VALIDATION_EXAMPLES
        samples_per_class = min(total_needed // 2, len(legitimate_df), len(phishing_df))

        print(f"📝 Sampling {samples_per_class:,} examples per class")

        # Fixed seeds keep both the sample and the shuffle reproducible
        per_class_samples = [
            legitimate_df.sample(n=samples_per_class, random_state=42),
            phishing_df.sample(n=samples_per_class, random_state=42),
        ]
        combined_sample = pd.concat(per_class_samples).sample(frac=1, random_state=42)

        print(f"✅ Total examples: {len(combined_sample):,}")

        # Process examples and check token limits
        valid_examples = []
        token_counts = []
        skipped_long = 0
        skipped_invalid = 0

        print("\n🪙 PROCESSING EXAMPLES AND COUNTING TOKENS")
        print("-" * 50)

        for idx, row in combined_sample.iterrows():
            try:
                example = self.create_training_example(row)
                token_count = self.count_tokens(example["messages"])

                if token_count > MAX_TOKENS_PER_EXAMPLE:
                    skipped_long += 1
                    continue

                valid_examples.append(example)
                token_counts.append(token_count)

            except ValueError:
                # Raised for rows that fail validation
                skipped_invalid += 1
            except Exception as e:
                print(f"⚠️  Error processing row {idx}: {e}")

        print(f"✅ Valid examples: {len(valid_examples):,}")
        print(f"❌ Skipped (too long): {skipped_long:,}")
        print(f"❌ Skipped (invalid): {skipped_invalid:,}")

        # Token statistics
        if token_counts:
            print("\n📊 TOKEN STATISTICS:")
            print(f"   Mean: {np.mean(token_counts):.0f}")
            print(f"   Median: {np.median(token_counts):.0f}")
            print(f"   Min/Max: {min(token_counts):,} / {max(token_counts):,}")
            print(f"   95th percentile: {np.percentile(token_counts, 95):.0f}")

            # Token count only (no cost)
            print("\n🪙 TOKEN SUMMARY:")
            print(f"   Total training tokens: {sum(token_counts):,}")

        # Training takes at most 80% of the valid examples (and never more than
        # the target); validation takes the slice that follows it.
        train_size = min(TARGET_TRAINING_EXAMPLES, len(valid_examples) * 4 // 5)
        validation_end = train_size + TARGET_VALIDATION_EXAMPLES

        self.training_data = valid_examples[:train_size]
        self.validation_data = valid_examples[train_size:validation_end]

        print("\n📊 FINAL SPLIT:")
        print(f"   Training examples: {len(self.training_data):,}")
        print(f"   Validation examples: {len(self.validation_data):,}")

        return bool(valid_examples)

    def validate_finetuning_requirements(self):
        """Validate data meets OpenAI fine-tuning requirements"""
        print(f"\n🔍 VALIDATING FINE-TUNING REQUIREMENTS")
        print("-" * 50)

        issues = []

        # Check minimum examples
        if len(self.training_data) < 10:
            issues.append("Minimum 10 training examples required")

        # Check message structure
        for i, example in enumerate(self.training_data[:5]):  # Check first 5
            messages = example.get("messages", [])

            if len(messages) != 3:
                issues.append(f"Example {i}: Should have exactly 3 messages (system, user, assistant)")

            roles = [msg.get("role") for msg in messages]
            if roles != ["system", "user", "assistant"]:
                issues.append(f"Example {i}: Incorrect role sequence: {roles}")

            # Check assistant response ends with classification
            assistant_content = messages[-1].get("content", "")
            if "FINAL CLASSIFICATION:" not in assistant_content:
                issues.append(f"Example {i}: Missing final classification")

        if issues:
            print("❌ VALIDATION ISSUES:")
            for issue in issues:
                print(f"   - {issue}")
            return False
        else:
            print("✅ All validation checks passed")
            return True

    def save_jsonl_files(self):
        """Save training and validation files in JSONL format"""
        print(f"\n💾 SAVING JSONL FILES")
        print("-" * 40)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save training file
        training_file = f"gpt4o_mini_training_{timestamp}.jsonl"
        with open(training_file, 'w', encoding='utf-8') as f:
            for example in self.training_data:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"✅ Training file: {training_file}")
        print(f"   Examples: {len(self.training_data):,}")

        # Save validation file
        validation_file = f"gpt4o_mini_validation_{timestamp}.jsonl"
        with open(validation_file, 'w', encoding='utf-8') as f:
            for example in self.validation_data:
                f.write(json.dumps(example, ensure_ascii=False) + '\n')

        print(f"✅ Validation file: {validation_file}")
        print(f"   Examples: {len(self.validation_data):,}")

        # Create a sample preview file
        sample_file = f"gpt4o_mini_sample_{timestamp}.json"
        sample_data = {
            "sample_training_example": self.training_data[0] if self.training_data else {},
            "dataset_info": {
                "training_examples": len(self.training_data),
                "validation_examples": len(self.validation_data),
                "created": timestamp,
                "purpose": "GPT-4o mini phishing detection fine-tuning"
            }
        }

        with open(sample_file, 'w', encoding='utf-8') as f:
            json.dump(sample_data, f, indent=2, ensure_ascii=False)

        print(f"✅ Sample file: {sample_file}")

        return training_file, validation_file, sample_file

    def upload_training_files(self, training_file, validation_file):
        """Upload training files to OpenAI"""
        print(f"\n🚀 UPLOADING FILES TO OPENAI")
        print("-" * 40)

        try:
            # Upload training file
            print("📤 Uploading training file...")
            with open(training_file, "rb") as f:
                training_upload = self.client.files.create(
                    file=f,
                    purpose="fine-tune"
                )
            self.training_file_id = training_upload.id
            print(f"✅ Training file uploaded: {self.training_file_id}")

            # Upload validation file
            print("📤 Uploading validation file...")
            with open(validation_file, "rb") as f:
                validation_upload = self.client.files.create(
                    file=f,
                    purpose="fine-tune"
                )
            self.validation_file_id = validation_upload.id
            print(f"✅ Validation file uploaded: {self.validation_file_id}")

            # Verify uploads
            print(f"\n📋 UPLOAD VERIFICATION:")
            train_info = self.client.files.retrieve(self.training_file_id)
            val_info = self.client.files.retrieve(self.validation_file_id)

            print(f"   Training: {train_info.filename} ({train_info.bytes:,} bytes) - {train_info.status}")
            print(f"   Validation: {val_info.filename} ({val_info.bytes:,} bytes) - {val_info.status}")

            return True

        except Exception as e:
            print(f"❌ Upload failed: {e}")
            return False

    def create_finetuning_job(self):
        """Create fine-tuning job"""
        print(f"\n🎯 CREATING FINE-TUNING JOB")
        print("-" * 40)

        try:
            # Create fine-tuning job
            job = self.client.fine_tuning.jobs.create(
                training_file=self.training_file_id,
                validation_file=self.validation_file_id,
                model="gpt-4o-mini-2024-07-18",
                suffix="phishing-detection"
            )

            self.job_id = job.id

            print(f"✅ Fine-tuning job created!")
            print(f"   Job ID: {self.job_id}")
            print(f"   Model: {job.model}")
            print(f"   Status: {job.status}")

            return True

        except Exception as e:
            print(f"❌ Job creation failed: {e}")
            return False

    def monitor_training(self):
        """Monitor training progress"""
        print(f"\n📊 MONITORING TRAINING PROGRESS")
        print("-" * 40)
        print(f"Job ID: {self.job_id}")
        print("Status updates will appear below...")
        print("This may take 10-60 minutes depending on dataset size.")
        print("-" * 40)

        try:
            last_status = None

            while True:
                # Get job status
                job = self.client.fine_tuning.jobs.retrieve(self.job_id)

                if job.status != last_status:
                    timestamp = datetime.now().strftime("%H:%M:%S")
                    print(f"[{timestamp}] Status: {job.status}")

                    if hasattr(job, 'trained_tokens') and job.trained_tokens:
                        print(f"[{timestamp}] Trained tokens: {job.trained_tokens:,}")

                    last_status = job.status

                # Check if completed
                if job.status == "succeeded":
                    self.fine_tuned_model = job.fine_tuned_model
                    print(f"\n🎉 TRAINING COMPLETED SUCCESSFULLY!")
                    print(f"   Fine-tuned model: {self.fine_tuned_model}")

                    # Show final metrics if available
                    if hasattr(job, 'result_files') and job.result_files:
                        print(f"   Result files: {len(job.result_files)} available")

                    return True

                elif job.status == "failed":
                    print(f"\n❌ TRAINING FAILED")
                    if hasattr(job, 'error') and job.error:
                        print(f"   Error: {job.error}")
                    return False

                elif job.status == "cancelled":
                    print(f"\n⚠️ TRAINING CANCELLED")
                    return False

                # Wait before next check
                time.sleep(30)  # Check every 30 seconds

        except KeyboardInterrupt:
            print(f"\n⚠️ Monitoring interrupted by user")
            print(f"Job {self.job_id} is still running. Check status manually.")
            return False
        except Exception as e:
            print(f"\n❌ Monitoring error: {e}")
            return False

    def get_job_status(self):
        """Get current job status"""
        if not self.job_id:
            print("❌ No job ID available")
            return None

        try:
            job = self.client.fine_tuning.jobs.retrieve(self.job_id)
            return job
        except Exception as e:
            print(f"❌ Error getting job status: {e}")
            return None

    def test_fine_tuned_model(self, test_email_subject="Urgent Account Verification",
                            test_email_body="Your account will be suspended unless you verify immediately. Click here: http://suspicious-link.com"):
        """Test the fine-tuned model"""
        if not self.fine_tuned_model:
            print("❌ No fine-tuned model available")
            return None

        print(f"\n🧪 TESTING FINE-TUNED MODEL")
        print("-" * 40)
        print(f"Model: {self.fine_tuned_model}")

        try:
            test_messages = [
                {
                    "role": "system",
                    "content": self.create_enhanced_system_prompt()
                },
                {
                    "role": "user",
                    "content": f"""Please analyze this email for phishing indicators:

SUBJECT: {test_email_subject}

BODY: {test_email_body}

Provide detailed analysis through all five components and your final classification."""
                }
            ]

            response = self.client.chat.completions.create(
                model=self.fine_tuned_model,
                messages=test_messages,
                temperature=0.1
            )

            print(f"✅ Test successful!")
            print(f"\n📧 Test Email:")
            print(f"Subject: {test_email_subject}")
            print(f"Body: {test_email_body}")
            print(f"\n🤖 Model Response:")
            print(response.choices[0].message.content)

            return response.choices[0].message.content

        except Exception as e:
            print(f"❌ Test failed: {e}")
            return None

def run_complete_pipeline():
    """Drive the whole fine-tuning flow from dataset loading to a smoke test.

    Builds a ``CompleteFinetuningPipeline`` from the module-level
    ``DATASET_PATH`` and ``OPENAI_API_KEY`` and runs its steps in order,
    stopping at the first step that reports failure.

    Returns:
        ``False`` when a preparation, upload or job-creation step fails;
        otherwise whatever ``pipeline.monitor_training()`` returned.
    """

    def _abort(reason):
        # Report which step stopped the run and signal failure to the caller.
        print(reason)
        return False

    pipeline = CompleteFinetuningPipeline(DATASET_PATH, OPENAI_API_KEY)

    # Step 1: load the dataset (no extra message is printed here on failure)
    if not pipeline.load_dataset():
        return False

    # Step 2: build the training examples
    if not pipeline.prepare_training_data():
        return _abort("❌ Failed to prepare training data")

    # Step 3: check the data against the fine-tuning requirements
    if not pipeline.validate_finetuning_requirements():
        return _abort("❌ Validation failed")

    # Step 4: write the JSONL files
    training_file, validation_file, sample_file = pipeline.save_jsonl_files()

    # Step 5: upload them to OpenAI
    if not pipeline.upload_training_files(training_file, validation_file):
        return _abort("❌ File upload failed")

    # Step 6: start the fine-tuning job
    if not pipeline.create_finetuning_job():
        return _abort("❌ Job creation failed")

    # Step 7: wait for training to finish
    outcome = pipeline.monitor_training()
    if not outcome:
        return outcome

    # Step 8: smoke-test the resulting model, then print the summary
    pipeline.test_fine_tuned_model()

    print(f"\n🎉 PIPELINE COMPLETED SUCCESSFULLY!")
    print("="*70)
    print(f"📁 FILES CREATED:")
    print(f"   🚀 Training: {training_file}")
    print(f"   🔍 Validation: {validation_file}")
    print(f"   📋 Sample: {sample_file}")
    print(f"\n🤖 FINE-TUNED MODEL:")
    print(f"   Model ID: {pipeline.fine_tuned_model}")
    print(f"   Job ID: {pipeline.job_id}")
    print(f"\n✅ Ready for production use!")

    return outcome

# Manual control functions for step-by-step execution
def run_data_preparation_only():
    """Load, prepare, validate and save the training data without uploading.

    Returns:
        ``(pipeline, training_file, validation_file)`` on success, or
        ``None`` as soon as loading, preparation or validation fails.
    """
    pipeline = CompleteFinetuningPipeline(DATASET_PATH, OPENAI_API_KEY)

    # Short-circuit evaluation keeps the original order and stops at the
    # first step that reports failure.
    prepared = (
        pipeline.load_dataset()
        and pipeline.prepare_training_data()
        and pipeline.validate_finetuning_requirements()
    )
    if not prepared:
        return None

    # The sample file path is returned by the pipeline but not used here.
    training_file, validation_file, _sample_file = pipeline.save_jsonl_files()

    print(f"\n✅ DATA PREPARATION COMPLETE")
    print(f"Files ready for fine-tuning: {training_file}, {validation_file}")

    return pipeline, training_file, validation_file

def manual_finetuning_workflow(training_file, validation_file):
    """Upload two JSONL files and start a fine-tuning job by hand.

    Args:
        training_file: Path of the training JSONL file.
        validation_file: Path of the validation JSONL file.

    Returns:
        ``(client, job_id)`` - the OpenAI client that was created and the ID
        of the new fine-tuning job. Exceptions from file access or the API
        are not caught here.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    def _upload(path):
        # The handle stays open only for the duration of the upload call.
        with open(path, "rb") as handle:
            return client.files.create(file=handle, purpose="fine-tune")

    print(f"\n🚀 MANUAL FINE-TUNING WORKFLOW")
    print("="*50)

    # Upload files
    print("1. Uploading training files...")
    train_upload = _upload(training_file)
    val_upload = _upload(validation_file)

    print(f"   Training file ID: {train_upload.id}")
    print(f"   Validation file ID: {val_upload.id}")

    # Create job
    print("2. Creating fine-tuning job...")
    job = client.fine_tuning.jobs.create(
        training_file=train_upload.id,
        validation_file=val_upload.id,
        model="gpt-4o-mini-2024-07-18",
        suffix="phishing-detection",
    )

    print(f"   Job ID: {job.id}")
    print(f"   Status: {job.status}")

    return client, job.id

if __name__ == "__main__":
    print("GPT-4o Mini Fine-tuning Pipeline")
    print("Choose execution mode:")
    print("1. run_complete_pipeline() - Full automated pipeline")
    print("2. run_data_preparation_only() - Data prep only")
    print("3. manual_finetuning_workflow() - Step by step control")

    # Run the complete pipeline. The call lives inside the guard so that
    # importing this code does not upload files and start a billable
    # fine-tuning job; running it as a script or notebook cell is unchanged.
    run_complete_pipeline()

In [None]:
#!/usr/bin/env python3
"""
Standalone GPT-4o Mini Fine-tuning: Upload, Initiate, Monitor
Simplified script for file upload, job creation, and monitoring

Usage:
1. Set your API key
2. Update file paths
3. Run the functions step by step
"""

import time
import json
from datetime import datetime
from openai import OpenAI

# ==========================================
# CONFIGURATION - UPDATE THESE VALUES
# ==========================================

import os  # imported here so this cell also runs on its own

# OpenAI API key, read from the OPENAI_API_KEY environment variable.
# Never commit a real key to source: the key that used to be hard-coded on
# this line must be treated as leaked and revoked at
# https://platform.openai.com/api-keys
# The fallback is the placeholder that setup_client() recognises and rejects
# with a helpful message.
API_KEY = os.environ.get("OPENAI_API_KEY", "your-api-key-here")

# File paths (update these to your actual file locations)
TRAINING_FILE_PATH = "/content/gpt4o_mini_training_20250716_065744.jsonl"
VALIDATION_FILE_PATH = "/content/gpt4o_mini_validation_20250716_065744.jsonl"

# Fine-tuning configuration
MODEL_NAME = "gpt-4o-mini-2024-07-18"  # base model to fine-tune
SUFFIX = "phishing-detection"  # suffix passed to the fine-tuning job

print("🚀 STANDALONE GPT-4o MINI FINE-TUNING")
print("="*60)

# ==========================================
# OPENAI CLIENT SETUP
# ==========================================

def setup_client():
    """Create the module-level OpenAI ``client`` and check the key works.

    The key is rejected up front when ``API_KEY`` is still the
    ``"your-api-key-here"`` placeholder. Otherwise a client is created and
    validated by listing models.

    Returns:
        ``True`` when the model listing succeeds, ``False`` otherwise.
    """
    global client

    placeholder_in_use = API_KEY == "your-api-key-here"
    if placeholder_in_use:
        print("❌ ERROR: Please update the API_KEY variable with your actual OpenAI API key")
        print("   Get your API key from: https://platform.openai.com/api-keys")
        return False

    try:
        client = OpenAI(api_key=API_KEY)

        # A cheap authenticated call; only its success matters, not its result.
        client.models.list()
        print("✅ API key validated successfully")
        return True

    except Exception as err:
        print(f"❌ API key validation failed: {err}")
        print("   Please check your API key at: https://platform.openai.com/api-keys")
        return False

# ==========================================
# STEP 1: UPLOAD FILES
# ==========================================

def upload_files():
    """Upload the training and validation JSONL files to OpenAI.

    Stores the resulting file IDs in the module globals ``training_file_id``
    and ``validation_file_id`` and prints each file's processing status.

    Returns:
        ``True`` once both uploads completed (even if the files are still
        being processed), ``False`` when a file is missing or any call fails.
    """
    global training_file_id, validation_file_id

    def _send(path):
        # Upload one file with the "fine-tune" purpose and return the result.
        with open(path, "rb") as handle:
            return client.files.create(file=handle, purpose="fine-tune")

    print(f"\n📤 STEP 1: UPLOADING FILES")
    print("-" * 40)

    try:
        # Training file
        print(f"📁 Uploading training file: {TRAINING_FILE_PATH}")
        training_upload = _send(TRAINING_FILE_PATH)
        training_file_id = training_upload.id
        print(f"✅ Training file uploaded successfully")
        print(f"   File ID: {training_file_id}")
        print(f"   Size: {training_upload.bytes:,} bytes")

        # Validation file
        print(f"\n📁 Uploading validation file: {VALIDATION_FILE_PATH}")
        validation_upload = _send(VALIDATION_FILE_PATH)
        validation_file_id = validation_upload.id
        print(f"✅ Validation file uploaded successfully")
        print(f"   File ID: {validation_file_id}")
        print(f"   Size: {validation_upload.bytes:,} bytes")

        # Ask the API for the current status of each uploaded file
        print(f"\n🔍 Verifying file processing...")

        train_info = client.files.retrieve(training_file_id)
        print(f"   Training file: {train_info.status}")

        val_info = client.files.retrieve(validation_file_id)
        print(f"   Validation file: {val_info.status}")

        both_processed = (
            train_info.status == "processed" and val_info.status == "processed"
        )
        if both_processed:
            print("✅ Both files processed and ready for fine-tuning")
        else:
            print("⚠️  Files still processing. Wait a few seconds and check status.")
        # Either way the caller may proceed to job creation.
        return True

    except FileNotFoundError as missing:
        print(f"❌ File not found: {missing}")
        print("   Please check the file paths and make sure files exist")
        return False

    except Exception as err:
        print(f"❌ Upload failed: {err}")
        return False

# ==========================================
# STEP 2: CREATE FINE-TUNING JOB
# ==========================================

def create_finetuning_job():
    """Start a fine-tuning job from the previously uploaded files.

    Reads the module globals ``training_file_id`` / ``validation_file_id``
    and the ``MODEL_NAME`` / ``SUFFIX`` settings, and stores the new job's ID
    in the global ``job_id``.

    Returns:
        ``True`` when the job was created, ``False`` when the request raised.
    """
    global job_id

    print(f"\n🎯 STEP 2: CREATING FINE-TUNING JOB")
    print("-" * 40)

    try:
        new_job = client.fine_tuning.jobs.create(
            training_file=training_file_id,
            validation_file=validation_file_id,
            model=MODEL_NAME,
            suffix=SUFFIX,
        )
        job_id = new_job.id

        print(f"✅ Fine-tuning job created successfully!")
        print(f"   Job ID: {job_id}")
        print(f"   Model: {new_job.model}")
        print(f"   Status: {new_job.status}")
        # Local wall-clock time of this call, not a server-side timestamp.
        print(f"   Created: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        return True

    except Exception as err:
        print(f"❌ Job creation failed: {err}")
        return False

# ==========================================
# STEP 3: MONITOR TRAINING
# ==========================================

def monitor_training(poll_interval=30):
    """Poll the fine-tuning job in the global ``job_id`` until it finishes.

    Prints a line whenever the job status changes. On success a small JSON
    summary is written to ``fine_tuned_model_<timestamp>.json`` in the
    current directory.

    Args:
        poll_interval: Seconds to sleep between status checks. Defaults to
            30, the value that used to be hard-coded.

    Returns:
        The fine-tuned model name when the job succeeds; ``None`` when the
        job fails or is cancelled, when polling raises, or when the user
        interrupts with Ctrl+C (the job itself keeps running).
    """
    print(f"\n📊 STEP 3: MONITORING TRAINING PROGRESS")
    print("-" * 40)
    print(f"Job ID: {job_id}")
    print("⏱️  Monitoring started. Training typically takes 10-60 minutes.")
    print("   Press Ctrl+C to stop monitoring (job will continue running)")
    print("-" * 40)

    try:
        last_status = None
        start_time = time.time()

        while True:
            # Get job status
            job = client.fine_tuning.jobs.retrieve(job_id)

            # Show status updates only when the status actually changed
            if job.status != last_status:
                elapsed = time.time() - start_time
                timestamp = datetime.now().strftime("%H:%M:%S")

                print(f"[{timestamp}] Status: {job.status.upper()} (Elapsed: {elapsed/60:.1f}m)")

                # Show additional info if available
                if getattr(job, 'trained_tokens', None):
                    print(f"[{timestamp}] Trained tokens: {job.trained_tokens:,}")

                if getattr(job, 'training_file', None):
                    print(f"[{timestamp}] Training file: {job.training_file}")

                last_status = job.status

            # Check completion status
            if job.status == "succeeded":
                elapsed = time.time() - start_time
                print(f"\n🎉 TRAINING COMPLETED SUCCESSFULLY!")
                print(f"   Total time: {elapsed/60:.1f} minutes")
                print(f"   Fine-tuned model: {job.fine_tuned_model}")

                # Take the file IDs from the job object rather than from the
                # module globals: run_monitor_only() sets only job_id, so
                # reading training_file_id / validation_file_id here raised
                # NameError and a successful run was reported as an error.
                model_info = {
                    "job_id": job_id,
                    "fine_tuned_model": job.fine_tuned_model,
                    "training_file": getattr(job, "training_file", None),
                    "validation_file": getattr(job, "validation_file", None),
                    "completed_at": datetime.now().isoformat(),
                    "training_time_minutes": elapsed/60
                }

                # Saving the summary is best-effort: a write failure must not
                # hide the fact that training succeeded.
                info_path = f"fine_tuned_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                try:
                    with open(info_path, 'w', encoding='utf-8') as f:
                        json.dump(model_info, f, indent=2)
                    print(f"   Model info saved to file")
                except OSError as save_error:
                    print(f"   ⚠️ Could not save model info: {save_error}")

                return job.fine_tuned_model

            elif job.status == "failed":
                print(f"\n❌ TRAINING FAILED")
                if getattr(job, 'error', None):
                    print(f"   Error: {job.error}")
                return None

            elif job.status == "cancelled":
                print(f"\n⚠️ TRAINING CANCELLED")
                return None

            # Wait before next check
            time.sleep(poll_interval)

    except KeyboardInterrupt:
        print(f"\n⚠️ Monitoring interrupted by user")
        print(f"   Job {job_id} is still running in the background")
        print(f"   Use check_job_status('{job_id}') to check status later")
        return None

    except Exception as e:
        print(f"\n❌ Monitoring error: {e}")
        return None

# ==========================================
# UTILITY FUNCTIONS
# ==========================================

def check_job_status(job_id_to_check):
    """Print a short status report for one fine-tuning job.

    Args:
        job_id_to_check: ID of the job to look up with the global ``client``.

    Returns:
        The retrieved job object, or ``None`` when the lookup raises.
    """
    try:
        job = client.fine_tuning.jobs.retrieve(job_id_to_check)

        print(f"\n📊 JOB STATUS CHECK")
        print("-" * 30)
        print(f"Job ID: {job_id_to_check}")
        print(f"Status: {job.status}")
        print(f"Model: {job.model}")

        # Optional fields: printed only when present and non-empty.
        model_name = getattr(job, 'fine_tuned_model', None)
        if model_name:
            print(f"Fine-tuned model: {model_name}")

        token_count = getattr(job, 'trained_tokens', None)
        if token_count:
            print(f"Trained tokens: {token_count:,}")

        return job

    except Exception as err:
        print(f"❌ Error checking job status: {err}")
        return None

def list_my_finetuning_jobs():
    """Print one line per fine-tuning job returned by the global ``client``.

    Returns:
        The ``data`` list of the jobs listing, or ``None`` when the request
        raises.
    """
    # Anything that is neither succeeded nor running gets the cross icon.
    icons = {"succeeded": "✅", "running": "⏳"}

    try:
        jobs = client.fine_tuning.jobs.list()

        print(f"\n📋 YOUR FINE-TUNING JOBS")
        print("-" * 50)

        for entry in jobs.data:
            status_icon = icons.get(entry.status, "❌")
            print(f"{status_icon} {entry.id} | {entry.status} | {entry.model}")
            tuned_name = getattr(entry, 'fine_tuned_model', None)
            if tuned_name:
                print(f"   → {tuned_name}")

        return jobs.data

    except Exception as err:
        print(f"❌ Error listing jobs: {err}")
        return None

def test_fine_tuned_model(model_name, test_subject="Urgent Account Verification",
                         test_body="Your account will be suspended unless you verify immediately. Click here: http://suspicious-link.com"):
    """Send one sample email to ``model_name`` and print the reply.

    Args:
        model_name: Name of the (fine-tuned) chat model to query.
        test_subject: Subject line placed in the user prompt.
        test_body: Body text placed in the user prompt.

    Returns:
        The text of the first completion choice, or ``None`` when the
        request raises.
    """
    try:
        print(f"\n🧪 TESTING FINE-TUNED MODEL")
        print("-" * 40)
        print(f"Model: {model_name}")

        system_prompt = """You are an expert cybersecurity analyst specializing in email threat detection. Your task is to analyze emails for phishing indicators using a structured approach.

ANALYSIS FRAMEWORK:
1. SENDER ANALYSIS: Examine sender authenticity, domain reputation, email authentication indicators
2. LANGUAGE PATTERNS: Analyze grammar, writing style, urgency indicators, linguistic manipulation
3. SOCIAL ENGINEERING: Identify psychological manipulation, emotional triggers, deceptive tactics
4. TECHNICAL INDICATORS: Assess URLs, attachments, technical attack vectors, suspicious elements
5. RISK ASSESSMENT: Synthesize findings and provide evidence-based classification

OUTPUT FORMAT:
- Provide detailed analysis for each component
- End with "**FINAL CLASSIFICATION:** LEGITIMATE" or "**FINAL CLASSIFICATION:** PHISHING"
- Be specific about evidence and reasoning

Your analysis should be thorough, evidence-based, and follow cybersecurity best practices."""

        user_prompt = f"""Please analyze this email for phishing indicators:

SUBJECT: {test_subject}

BODY: {test_body}

Provide detailed analysis through all five components and your final classification."""

        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
        )

        print(f"✅ Test successful!")
        print(f"\n📧 Test Email:")
        print(f"Subject: {test_subject}")
        print(f"Body: {test_body}")
        print(f"\n🤖 Model Response:")
        print("-" * 40)
        answer = response.choices[0].message.content
        print(answer)

        return answer

    except Exception as err:
        print(f"❌ Test failed: {err}")
        return None

# ==========================================
# MAIN EXECUTION FUNCTIONS
# ==========================================

def run_complete_workflow():
    """Run client setup, file upload, job creation and monitoring in order.

    Returns:
        ``True`` when monitoring returned a fine-tuned model name, ``False``
        when any earlier step failed or monitoring returned nothing.
    """
    print("🚀 RUNNING COMPLETE FINE-TUNING WORKFLOW")
    print("="*60)

    # Client setup prints its own diagnostics on failure.
    if not setup_client():
        return False

    # Steps 1 and 2 share the same "report and stop" handling.
    for step, failure_message in (
        (upload_files, "❌ Upload failed. Cannot proceed."),
        (create_finetuning_job, "❌ Job creation failed. Cannot proceed."),
    ):
        if not step():
            print(failure_message)
            return False

    # Step 3: block until the job reaches a final state (or is interrupted)
    fine_tuned_model = monitor_training()
    if not fine_tuned_model:
        print(f"\n⚠️ Workflow incomplete. Check job status later.")
        return False

    print(f"\n🎉 WORKFLOW COMPLETED SUCCESSFULLY!")
    print(f"   Your fine-tuned model: {fine_tuned_model}")
    print(f"   Test it with: test_fine_tuned_model('{fine_tuned_model}')")
    return True

def run_upload_only():
    """Set up the client and, if that works, upload the files.

    Returns:
        ``False`` when client setup fails, otherwise the result of
        ``upload_files()``.
    """
    return upload_files() if setup_client() else False

def run_create_job_only(training_id="file-your-training-id",
                        validation_id="file-your-validation-id"):
    """Create a fine-tuning job from files that are already uploaded.

    The IDs used to be hard-coded placeholders that had to be edited in the
    source; they are now parameters whose defaults are those same
    placeholders, so calling the function without arguments behaves as
    before.

    Args:
        training_id: OpenAI file ID of the uploaded training file.
        validation_id: OpenAI file ID of the uploaded validation file.

    Returns:
        ``False`` when client setup fails, otherwise the result of
        ``create_finetuning_job()``.
    """
    # create_finetuning_job() reads the IDs from these module globals.
    global training_file_id, validation_file_id
    training_file_id = training_id
    validation_file_id = validation_id

    if not setup_client():
        return False
    return create_finetuning_job()

def run_monitor_only(existing_job_id="ft-your-job-id"):
    """Monitor a fine-tuning job that was started earlier.

    The job ID used to be a hard-coded placeholder that had to be edited in
    the source; it is now a parameter whose default is that same
    placeholder, so calling the function without arguments behaves as before.

    Args:
        existing_job_id: ID of the fine-tuning job to monitor.

    Returns:
        ``False`` when client setup fails, otherwise the result of
        ``monitor_training()``.
    """
    # monitor_training() reads the job ID from this module global.
    global job_id
    job_id = existing_job_id

    if not setup_client():
        return False
    return monitor_training()

# ==========================================
# EXECUTION
# ==========================================

if __name__ == "__main__":
    print("Available functions:")
    print("1. run_complete_workflow() - Complete pipeline")
    print("2. run_upload_only() - Upload files only")
    print("3. check_job_status('job-id') - Check specific job")
    print("4. list_my_finetuning_jobs() - List all your jobs")
    print("5. test_fine_tuned_model('model-name') - Test model")
    print("\n⚠️  Remember to update API_KEY and file paths!")

    # Run the complete workflow. The old comment said "Uncomment to run"
    # although the call was live; it now sits inside the guard so importing
    # this code no longer uploads files or starts a billable job. Comment
    # the line out to only print the menu.
    run_complete_workflow()

In [None]:
#!/usr/bin/env python3
"""
Complete Model Testing and Evaluation
Test fine-tuned GPT-4o mini on entire dataset and compare with original labels

Usage: Test the fine-tuned model against the original dataset to measure performance
"""

import pandas as pd
import json
import numpy as np
from datetime import datetime
import time
import re
from collections import defaultdict
from openai import OpenAI

# ==========================================
# CONFIGURATION
# ==========================================

import os  # imported here so this cell also runs on its own

# Your fine-tuned model name
FINE_TUNED_MODEL = "ft:gpt-4o-mini-2024-07-18:personal:phishing-detection:Btri5cqI"

# OpenAI API key, read from the OPENAI_API_KEY environment variable.
# Never commit a real key to source: the key that used to be hard-coded on
# this line must be treated as leaked and revoked at
# https://platform.openai.com/api-keys
# NOTE(review): when the variable is unset this is None, which is passed on
# to OpenAI(api_key=...); the client is expected to raise a clear error in
# that case - confirm against the installed openai version.
API_KEY = os.environ.get("OPENAI_API_KEY")

# Dataset path
DATASET_PATH = "/content/drive/MyDrive/phishing_detection_final/output/final_datasets/o3_mini_optimized_dataset_20250709_185906.csv"

# Testing configuration
MAX_EMAILS_TO_TEST = 1000  # Set to None for entire dataset
BATCH_SIZE = 50  # Process in batches to avoid rate limits
DELAY_BETWEEN_REQUESTS = 1  # Seconds between API calls

print("🧪 COMPLETE MODEL TESTING AND EVALUATION")
print("="*70)
print(f"📊 Model: {FINE_TUNED_MODEL}")
print(f"📂 Dataset: {DATASET_PATH}")

class ModelTester:
    """Evaluate a fine-tuned chat model against a labelled email dataset.

    Typical use: ``load_dataset()`` -> ``test_model_on_dataset()`` ->
    ``calculate_metrics()`` -> ``save_results()``.

    Labels follow the dataset's ``original_label`` column: 0 is legitimate,
    1 is phishing. ``self.predictions`` and ``self.true_labels`` are kept
    index-aligned; an email whose prediction failed is stored as ``-1``.
    """

    # Tried in order against the upper-cased response. The first three are
    # the strict formats; the last one additionally tolerates markdown
    # placed differently, e.g. "**FINAL CLASSIFICATION**: PHISHING".
    _CLASSIFICATION_PATTERNS = (
        r"\*\*FINAL CLASSIFICATION:\*\*\s*(LEGITIMATE|PHISHING)",
        r"FINAL CLASSIFICATION:\s*(LEGITIMATE|PHISHING)",
        r"CLASSIFICATION:\s*(LEGITIMATE|PHISHING)",
        r"CLASSIFICATION[\s:*]+(LEGITIMATE|PHISHING)",
    )

    def __init__(self, model_name, api_key, dataset_path):
        """Store the configuration and create the OpenAI client.

        Args:
            model_name: Name of the model to query.
            api_key: Key passed to ``OpenAI(api_key=...)``.
            dataset_path: Path of the CSV file read by ``load_dataset``.
        """
        self.model_name = model_name
        self.client = OpenAI(api_key=api_key)
        self.dataset_path = dataset_path
        self.df = None

        # Results storage
        self.predictions = []
        self.true_labels = []
        self.prediction_details = []
        self.failed_predictions = 0

        # Statistics
        self.start_time = None
        self.total_requests = 0
        self.successful_requests = 0

    def load_dataset(self, max_samples=None):
        """Load the CSV, drop incomplete rows and optionally down-sample.

        Args:
            max_samples: When truthy and smaller than the dataset, keep at
                most ``max_samples // 2`` rows per class (seeded sampling).

        Returns:
            ``True`` on success, ``False`` when loading or filtering raises.
        """
        print(f"\n📂 LOADING DATASET")
        print("-" * 40)

        try:
            self.df = pd.read_csv(self.dataset_path, encoding='utf-8')
            print(f"✅ Dataset loaded: {len(self.df):,} total rows")

            # Keep only rows where every required column has a value
            required_cols = ['subject', 'body', 'original_label']
            mask = self.df[required_cols].notna().all(axis=1)
            self.df = self.df[mask].copy()

            print(f"📊 Clean examples: {len(self.df):,}")

            # Limit dataset size if specified
            if max_samples and len(self.df) > max_samples:
                # Sample each class separately to keep the classes balanced
                legitimate_df = self.df[self.df['original_label'] == 0]
                phishing_df = self.df[self.df['original_label'] == 1]

                samples_per_class = max_samples // 2

                legit_sample = legitimate_df.sample(n=min(samples_per_class, len(legitimate_df)), random_state=42)
                phish_sample = phishing_df.sample(n=min(samples_per_class, len(phishing_df)), random_state=42)

                # Shuffle so the two classes are interleaved
                self.df = pd.concat([legit_sample, phish_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

                print(f"🎯 Limited to: {len(self.df):,} examples for testing")

            # Show class distribution
            label_dist = self.df['original_label'].value_counts()
            print(f"\n📊 Class Distribution:")
            print(f"   Legitimate (0): {label_dist.get(0, 0):,}")
            print(f"   Phishing (1):   {label_dist.get(1, 0):,}")

            return True

        except Exception as e:
            print(f"❌ Error loading dataset: {e}")
            return False

    def create_system_prompt(self):
        """Return the system prompt sent with every request."""
        return """You are an expert cybersecurity analyst specializing in email threat detection. Your task is to analyze emails for phishing indicators using a structured approach.

ANALYSIS FRAMEWORK:
1. SENDER ANALYSIS: Examine sender authenticity, domain reputation, email authentication indicators
2. LANGUAGE PATTERNS: Analyze grammar, writing style, urgency indicators, linguistic manipulation
3. SOCIAL ENGINEERING: Identify psychological manipulation, emotional triggers, deceptive tactics
4. TECHNICAL INDICATORS: Assess URLs, attachments, technical attack vectors, suspicious elements
5. RISK ASSESSMENT: Synthesize findings and provide evidence-based classification

OUTPUT FORMAT:
- Provide detailed analysis for each component
- End with "**FINAL CLASSIFICATION:** LEGITIMATE" or "**FINAL CLASSIFICATION:** PHISHING"
- Be specific about evidence and reasoning

Your analysis should be thorough, evidence-based, and follow cybersecurity best practices."""

    def extract_prediction(self, response_text):
        """Map a model response to a label.

        Args:
            response_text: Text returned by the model (may be ``None``).

        Returns:
            ``0`` for LEGITIMATE, ``1`` for PHISHING, or ``None`` when the
            response is empty or no unambiguous classification is found.
        """
        if not response_text:
            return None

        try:
            # Matching is done on the upper-cased text, so every pattern is
            # written in upper case.
            response_upper = response_text.upper()

            for pattern in self._CLASSIFICATION_PATTERNS:
                match = re.search(pattern, response_upper)
                if match:
                    return 0 if match.group(1) == "LEGITIMATE" else 1

            # Fallback: accept a bare keyword only when the other one is absent
            has_phishing = "PHISHING" in response_upper
            has_legitimate = "LEGITIMATE" in response_upper
            if has_phishing and not has_legitimate:
                return 1
            if has_legitimate and not has_phishing:
                return 0

            # If unclear, return None
            return None

        except Exception as e:
            print(f"⚠️ Error extracting prediction: {e}")
            return None

    def predict_single_email(self, subject, body, email_index):
        """Query the model for one email.

        Args:
            subject: Email subject line.
            body: Email body text.
            email_index: Identifier used only in diagnostic messages.

        Returns:
            ``(prediction, response_text)``; ``prediction`` is ``None`` when
            no label could be extracted, and when the request raised the
            second element is the error message instead of a response.
        """
        try:
            user_content = f"""Please analyze this email for phishing indicators:

SUBJECT: {subject}

BODY: {body}

Provide detailed analysis through all five components and your final classification."""

            messages = [
                {"role": "system", "content": self.create_system_prompt()},
                {"role": "user", "content": user_content}
            ]

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=0.1,
                max_tokens=2000
            )

            response_text = response.choices[0].message.content
            prediction = self.extract_prediction(response_text)

            self.total_requests += 1

            if prediction is not None:
                self.successful_requests += 1
                return prediction, response_text
            else:
                print(f"⚠️ Could not extract prediction from email {email_index}")
                return None, response_text

        except Exception as e:
            print(f"❌ Error predicting email {email_index}: {e}")
            self.total_requests += 1
            return None, str(e)

    def test_model_on_dataset(self):
        """Classify every loaded email and record the outcome.

        Appends exactly one entry to ``self.true_labels`` and
        ``self.predictions`` per processed email (``-1`` marks a failed
        prediction). Rows whose label cannot be converted to ``int`` are
        skipped and counted as failed. Uses the module-level ``BATCH_SIZE``
        and ``DELAY_BETWEEN_REQUESTS`` settings.
        """
        if self.df is None or len(self.df) == 0:
            # Nothing to test; also avoids dividing by zero in the summary.
            print("❌ No emails to test - load a non-empty dataset first")
            return

        total_emails = len(self.df)

        print(f"\n🧪 TESTING MODEL ON DATASET")
        print("-" * 40)
        print(f"📊 Testing {total_emails:,} emails")
        print(f"⏱️ Estimated time: {(total_emails * DELAY_BETWEEN_REQUESTS) / 60:.1f} minutes")
        print("-" * 40)

        self.start_time = time.time()

        # Process in batches
        for batch_start in range(0, total_emails, BATCH_SIZE):
            batch_end = min(batch_start + BATCH_SIZE, total_emails)
            batch_df = self.df.iloc[batch_start:batch_end]

            print(f"📦 Processing batch {batch_start//BATCH_SIZE + 1}: emails {batch_start+1}-{batch_end}")

            for idx, row in batch_df.iterrows():
                # Parse the label first. Previously the error handler
                # repeated this conversion, so a bad label crashed the run.
                try:
                    true_label = int(row.get('original_label', 0))
                except (TypeError, ValueError) as e:
                    print(f"⚠️ Skipping email {idx}: invalid label ({e})")
                    self.failed_predictions += 1
                    continue

                prediction = None
                response_text = None
                try:
                    subject = str(row.get('subject', 'No Subject')).strip()
                    body = str(row.get('body', '')).strip()
                    prediction, response_text = self.predict_single_email(subject, body, idx)
                except Exception as e:
                    print(f"⚠️ Error processing email {idx}: {e}")

                # Exactly one label and one prediction per email keeps the
                # two lists aligned for calculate_metrics().
                self.true_labels.append(true_label)

                if prediction is not None:
                    self.predictions.append(prediction)

                    # Store detailed results
                    self.prediction_details.append({
                        'email_index': idx,
                        'subject': subject[:100],  # Truncate for storage
                        'true_label': true_label,
                        'predicted_label': prediction,
                        'correct': prediction == true_label,
                        'response_text': response_text[:500],  # Truncate
                        'timestamp': datetime.now().isoformat()
                    })
                else:
                    self.predictions.append(-1)  # Mark as failed
                    self.failed_predictions += 1

                # Progress update every 10 processed emails. The count is
                # used rather than the index label, which is not sequential
                # when the frame was filtered without an index reset.
                processed = len(self.predictions)
                if processed % 10 == 0:
                    elapsed = time.time() - self.start_time
                    rate = processed / elapsed if elapsed > 0 else 0
                    print(f"   Processed: {processed:,}/{total_emails:,} ({rate:.1f} emails/sec)")

                # Rate limiting
                time.sleep(DELAY_BETWEEN_REQUESTS)

        total_time = time.time() - self.start_time

        print(f"\n✅ TESTING COMPLETED")
        print(f"   Total time: {total_time/60:.1f} minutes")
        print(f"   Total emails: {total_emails:,}")
        print(f"   Successful predictions: {self.successful_requests:,}")
        print(f"   Failed predictions: {self.failed_predictions:,}")
        print(f"   Success rate: {(self.successful_requests/total_emails*100):.1f}%")

    def _compute_metrics(self):
        """Compute the metrics dictionary without printing anything.

        Returns:
            The same dictionary ``calculate_metrics`` returns, or ``None``
            when there is no valid (non ``-1``) prediction.
        """
        # Pair each label with its prediction and drop failed predictions
        pairs = [
            (truth, pred)
            for truth, pred in zip(self.true_labels, self.predictions)
            if pred != -1
        ]
        if not pairs:
            return None

        total_valid = len(pairs)
        correct_predictions = sum(1 for truth, pred in pairs if truth == pred)
        accuracy = correct_predictions / total_valid

        # Confusion matrix with phishing (1) as the positive class
        tp = sum(1 for truth, pred in pairs if truth == 1 and pred == 1)
        tn = sum(1 for truth, pred in pairs if truth == 0 and pred == 0)
        fp = sum(1 for truth, pred in pairs if truth == 0 and pred == 1)
        fn = sum(1 for truth, pred in pairs if truth == 1 and pred == 0)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'confusion_matrix': {'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn},
            'total_valid': total_valid,
            'correct_predictions': correct_predictions
        }

    def calculate_metrics(self):
        """Print a performance report and return the metrics.

        Returns:
            A dictionary with ``accuracy``, ``precision``, ``recall``,
            ``f1_score``, ``confusion_matrix``, ``total_valid`` and
            ``correct_predictions``; ``None`` when there are no valid
            predictions to evaluate.
        """
        print(f"\n📊 PERFORMANCE METRICS")
        print("-" * 40)

        metrics = self._compute_metrics()
        if metrics is None:
            print("❌ No valid predictions to evaluate")
            return

        accuracy = metrics['accuracy']
        precision = metrics['precision']
        recall = metrics['recall']
        matrix = metrics['confusion_matrix']
        tp, tn, fp, fn = matrix['tp'], matrix['tn'], matrix['fp'], matrix['fn']

        print(f"📈 OVERALL PERFORMANCE:")
        print(f"   Total valid predictions: {metrics['total_valid']:,}")
        print(f"   Correct predictions: {metrics['correct_predictions']:,}")
        print(f"   Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")

        print(f"\n📊 CONFUSION MATRIX:")
        print(f"                 Predicted")
        print(f"                 Legit  Phish")
        print(f"   Actual Legit   {tn:4d}   {fp:4d}")
        print(f"          Phish   {fn:4d}   {tp:4d}")

        print(f"\n📏 DETAILED METRICS:")
        print(f"   Precision: {precision:.3f} ({precision*100:.1f}%)")
        print(f"   Recall:    {recall:.3f} ({recall*100:.1f}%)")
        print(f"   F1-Score:  {metrics['f1_score']:.3f}")

        # Per-class totals follow from the matrix because every valid
        # prediction is either 0 or 1.
        legit_total = tn + fp
        phish_total = tp + fn
        legit_accuracy = tn / legit_total if legit_total > 0 else 0
        phish_accuracy = tp / phish_total if phish_total > 0 else 0

        print(f"\n🎯 CLASS-SPECIFIC ACCURACY:")
        print(f"   Legitimate emails: {tn:,}/{legit_total:,} ({legit_accuracy*100:.1f}%)")
        print(f"   Phishing emails:   {tp:,}/{phish_total:,} ({phish_accuracy*100:.1f}%)")

        pred_legit = sum(1 for pred in self.predictions if pred == 0)
        pred_phish = sum(1 for pred in self.predictions if pred == 1)

        print(f"\n📊 LABEL DISTRIBUTION:")
        print(f"   True Labels  - Legitimate: {legit_total:,}, Phishing: {phish_total:,}")
        print(f"   Predictions  - Legitimate: {pred_legit:,}, Phishing: {pred_phish:,}")

        return metrics

    def show_sample_predictions(self, num_correct=3, num_incorrect=3):
        """Print the first few correct and incorrect predictions.

        Args:
            num_correct: Maximum number of correct predictions to show.
            num_incorrect: Maximum number of incorrect predictions to show.
        """
        print(f"\n🔍 SAMPLE PREDICTIONS")
        print("-" * 40)

        # Collect the first N of each kind, in recording order
        correct_samples = []
        incorrect_samples = []

        for detail in self.prediction_details:
            if detail['correct'] and len(correct_samples) < num_correct:
                correct_samples.append(detail)
            elif not detail['correct'] and len(incorrect_samples) < num_incorrect:
                incorrect_samples.append(detail)

        # Show correct predictions
        if correct_samples:
            print(f"✅ CORRECT PREDICTIONS (showing {len(correct_samples)}):")
            for i, sample in enumerate(correct_samples, 1):
                true_label_name = "LEGITIMATE" if sample['true_label'] == 0 else "PHISHING"
                pred_label_name = "LEGITIMATE" if sample['predicted_label'] == 0 else "PHISHING"
                print(f"\n   {i}. Subject: {sample['subject']}")
                print(f"      True: {true_label_name} | Predicted: {pred_label_name} ✅")

        # Show incorrect predictions
        if incorrect_samples:
            print(f"\n❌ INCORRECT PREDICTIONS (showing {len(incorrect_samples)}):")
            for i, sample in enumerate(incorrect_samples, 1):
                true_label_name = "LEGITIMATE" if sample['true_label'] == 0 else "PHISHING"
                pred_label_name = "LEGITIMATE" if sample['predicted_label'] == 0 else "PHISHING"
                print(f"\n   {i}. Subject: {sample['subject']}")
                print(f"      True: {true_label_name} | Predicted: {pred_label_name} ❌")

    def save_results(self):
        """Write the evaluation results to timestamped files.

        A JSON summary is always written; a CSV with every recorded
        prediction is written when there is at least one. Both files go to
        the current working directory.

        Returns:
            The name of the JSON results file.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Save detailed results
        results_file = f"model_evaluation_results_{timestamp}.json"

        results_data = {
            'model_name': self.model_name,
            'dataset_path': self.dataset_path,
            'test_timestamp': timestamp,
            'total_emails_tested': len(self.df) if self.df is not None else 0,
            'successful_predictions': self.successful_requests,
            'failed_predictions': self.failed_predictions,
            # Computed silently: saving must not print the full report again.
            'metrics': self._compute_metrics(),
            'sample_predictions': self.prediction_details[:50]  # Save first 50 for inspection
        }

        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(results_data, f, indent=2, ensure_ascii=False)

        print(f"\n💾 RESULTS SAVED:")
        print(f"   📄 Detailed results: {results_file}")

        # Save predictions CSV
        if self.prediction_details:
            predictions_df = pd.DataFrame(self.prediction_details)
            csv_file = f"model_predictions_{timestamp}.csv"
            predictions_df.to_csv(csv_file, index=False)
            print(f"   📊 Predictions CSV: {csv_file}")

        return results_file

def run_complete_evaluation():
    """Load the dataset, test the model, report metrics and save results.

    Uses the module-level ``FINE_TUNED_MODEL``, ``API_KEY``, ``DATASET_PATH``
    and ``MAX_EMAILS_TO_TEST`` settings.

    Returns:
        ``True`` when the evaluation produced metrics, ``False`` when the
        dataset could not be loaded or no email could be classified.
    """

    # Initialize tester
    tester = ModelTester(FINE_TUNED_MODEL, API_KEY, DATASET_PATH)

    # Load dataset
    if not tester.load_dataset(max_samples=MAX_EMAILS_TO_TEST):
        return False

    # Test model
    tester.test_model_on_dataset()

    # Calculate and show metrics
    metrics = tester.calculate_metrics()

    # Show sample predictions
    tester.show_sample_predictions()

    # Save results (still useful for inspecting a fully failed run)
    results_file = tester.save_results()

    # calculate_metrics() returns None when no prediction was valid;
    # indexing it below used to raise TypeError in that case.
    if metrics is None:
        print(f"\n❌ EVALUATION FAILED: no valid predictions were produced")
        print(f"   Results saved to: {results_file}")
        return False

    print(f"\n🎉 EVALUATION COMPLETED!")
    print("="*70)
    print(f"📊 Final Results:")
    print(f"   Accuracy: {metrics['accuracy']*100:.1f}%")
    print(f"   Precision: {metrics['precision']*100:.1f}%")
    print(f"   Recall: {metrics['recall']*100:.1f}%")
    print(f"   F1-Score: {metrics['f1_score']:.3f}")
    print(f"   Results saved to: {results_file}")

    return True

def quick_test(num_samples=50):
    """Run the full evaluation on a temporarily reduced sample size.

    Args:
        num_samples: Value used for ``MAX_EMAILS_TO_TEST`` during this run.

    Returns:
        The result of ``run_complete_evaluation()``.
    """
    global MAX_EMAILS_TO_TEST

    # Remember the configured limit so that a later run_complete_evaluation()
    # is not silently capped at the quick-test size. globals().get() is used
    # because the name may not be defined yet.
    previous_limit = globals().get("MAX_EMAILS_TO_TEST")
    MAX_EMAILS_TO_TEST = num_samples

    print(f"🚀 QUICK TEST MODE: Testing {num_samples} emails")
    try:
        return run_complete_evaluation()
    finally:
        MAX_EMAILS_TO_TEST = previous_limit

if __name__ == "__main__":
    print("Available functions:")
    print("1. run_complete_evaluation() - Test entire dataset")
    print("2. quick_test(50) - Quick test with 50 emails")
    print("3. quick_test(100) - Quick test with 100 emails")

    # Run complete evaluation. The call lives inside the guard so that
    # importing this code does not start a long series of billable API
    # requests; running it as a script or notebook cell is unchanged.
    run_complete_evaluation()