## Large-Scale External Dataset Evaluation
**(This notebook was run on Colab for compute reasons; the results were exported to a `.pkl` file, which is loaded and displayed at the end of the notebook.)**

**Testing the Human vs AI Classifier on External Data**

This notebook tests the fine-tuned DistilBERT+LoRA model on a completely independent dataset from Hugging Face to evaluate generalization beyond the original training distribution.

**Dataset**: `gsingh1-py/train` (first 3000 rows; each row contributes one human text and one Mistral text, so up to 6000 evaluation samples)
- Contains human-written text and Mistral-generated text
- Completely independent from training data
- Tests whether the model learned genuine AI detection or merely dataset artifacts

## Task 1: Load and Verify Dataset

In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from tqdm.auto import tqdm
import warnings
# Silence library warnings so they do not clutter the cell outputs
warnings.filterwarnings('ignore')

# Report the runtime: PyTorch build and, when a GPU is visible, its name
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
# Fetch the external evaluation dataset by its Hugging Face identifier
DATASET_ID = "gsingh1-py/train"

print("Loading dataset from Hugging Face...")
ds = load_dataset(DATASET_ID)

# Show the overall layout and the names of the available splits
print(f"\nDataset structure: {ds}")
print(f"\nDataset splits: {list(ds.keys())}")

In [None]:
# Pick the split to inspect: 'train' when present, otherwise the first split listed
if 'train' not in ds:
    split_name = list(ds.keys())[0]
    sample_data = ds[split_name]
    print(f"Using split: {split_name}")
else:
    sample_data = ds['train']

print(f"\nDataset features/columns: {sample_data.features}")
print(f"\nTotal rows in dataset: {len(sample_data)}")

# Preview up to three rows; strings longer than 200 characters are cut for readability
banner = "=" * 80
print("\n" + banner)
print("SAMPLE EXAMPLES FROM DATASET:")
print(banner)
n_preview = min(3, len(sample_data))
for row_idx in range(n_preview):
    print(f"\n--- Example {row_idx + 1} ---")
    example = sample_data[row_idx]
    for key, value in example.items():
        is_long_text = isinstance(value, str) and len(value) > 200
        if not is_long_text:
            print(f"{key}: {value}")
            continue
        print(f"{key}: {value[:200]}... (truncated, total length: {len(value)})")

In [None]:
# Locate the two columns this evaluation needs: the human-written text and the
# Mistral-generated text. Detection is by case-insensitive substring match on the
# column name; when several columns match, the last one wins.

print("\nIdentifying relevant fields...")
feature_names = list(sample_data.features.keys())
print(f"Available fields: {feature_names}")

human_field = None
mistral_field = None

for candidate in feature_names:
    lowered = candidate.lower()
    if 'mistral' in lowered:
        # Any name mentioning 'mistral' is treated as the AI column, even if it also says 'human'
        mistral_field = candidate
        print(f"Identified Mistral text field: {candidate}")
    elif 'human' in lowered:
        human_field = candidate
        print(f"Identified human text field: {candidate}")

# Fallback: dump a short preview of every column of the first row for manual inspection
if not (human_field is not None and mistral_field is not None):
    print("\nCould not auto-identify fields. Examining data structure...")
    first_row = sample_data[0]
    print("\nField name -> Sample content:")
    for key, value in first_row.items():
        preview = f"{value[:100]}..." if isinstance(value, str) else type(value)
        print(f"  {key}: {preview}")

In [None]:
# Column names used by the extraction cell.
# The names auto-detected in the previous cell take precedence; the string literals are
# only placeholders for when detection found nothing. To override detection manually,
# replace the whole right-hand side with the actual column name.
HUMAN_FIELD = human_field or 'human_text'
MISTRAL_FIELD = mistral_field or 'mistral_text'

# Fail here with a readable message instead of a bare KeyError inside the extraction loop.
missing_fields = [
    name for name in (HUMAN_FIELD, MISTRAL_FIELD)
    if name not in sample_data.features
]
if missing_fields:
    raise KeyError(
        f"Field(s) {missing_fields} not found in dataset columns "
        f"{list(sample_data.features.keys())}; set HUMAN_FIELD / MISTRAL_FIELD manually."
    )

print(f"Using fields:")
print(f"  Human: {HUMAN_FIELD}")
print(f"  Mistral: {MISTRAL_FIELD}")

In [None]:
# Number of dataset rows to read; each row can yield one human and one Mistral sample
N_SAMPLES = 3000

eval_texts, eval_labels, eval_sources = [], [], []

print(f"\nExtracting first {N_SAMPLES} samples...")

# Walk the first N_SAMPLES rows and keep every non-blank string, human text first
n_rows = min(N_SAMPLES, len(sample_data))
for i in range(n_rows):
    row = sample_data[i]
    human_text, mistral_text = row[HUMAN_FIELD], row[MISTRAL_FIELD]

    # (raw value, numeric label, source tag): 0 = Human, 1 = AI/Mistral
    candidates = (
        (human_text, 0, 'Human'),
        (mistral_text, 1, 'Mistral'),
    )
    for raw_text, label, source_tag in candidates:
        # Skip missing values, non-strings and whitespace-only strings
        if not isinstance(raw_text, str):
            continue
        cleaned = raw_text.strip()
        if cleaned:
            eval_texts.append(cleaned)
            eval_labels.append(label)
            eval_sources.append(source_tag)

print(f"\nTotal samples extracted: {len(eval_texts)}")
print(f"  Human: {eval_labels.count(0)}")
print(f"  Mistral (AI): {eval_labels.count(1)}")

# Collect the three parallel lists into one DataFrame for the rest of the notebook
eval_df = pd.DataFrame(
    {'text': eval_texts, 'true_label': eval_labels, 'source': eval_sources}
)

print(f"\nDataFrame shape: {eval_df.shape}")
print(f"\nLabel distribution:")
print(eval_df['source'].value_counts())

In [None]:
# Show the first 500 characters of one sample per class, then length statistics
rule = "=" * 80

first_human = eval_df.loc[eval_df['source'] == 'Human', 'text'].iloc[0]
print(rule)
print("SAMPLE HUMAN TEXT:")
print(rule)
print(first_human[:500])

first_mistral = eval_df.loc[eval_df['source'] == 'Mistral', 'text'].iloc[0]
print("\n" + rule)
print("SAMPLE MISTRAL TEXT:")
print(rule)
print(first_mistral[:500])

print("\n" + rule)
print("TEXT LENGTH STATISTICS:")
print(rule)
# Character count per text, stored on the frame as a new column
eval_df['text_length'] = eval_df['text'].str.len()
length_by_source = eval_df.groupby('source')['text_length']
print(length_by_source.describe())

## Task 2: Load the Fine-Tuned Model

In [None]:
# Checkpoint locations: the base model identifier and the LoRA adapter directory
BASE_MODEL_NAME = "distilbert-base-uncased"
ADAPTER_PATH = "./lora_finetuned_model"  # Update this path

print(
    f"Base model: {BASE_MODEL_NAME}",
    f"LoRA adapter path: {ADAPTER_PATH}",
    sep="\n",
)

In [None]:
# Tokenizer comes from the base checkpoint named in BASE_MODEL_NAME
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)

# Run on the GPU when one is available, otherwise on the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

In [None]:
# Weights are loaded in half precision when CUDA is available, full precision otherwise
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Step 1: base classifier with a two-class head
print("Loading base model...")
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL_NAME, num_labels=2, torch_dtype=model_dtype
)

# Step 2: attach the fine-tuned LoRA adapter stored at ADAPTER_PATH
print("Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)

# Step 3: merge the adapter into the base weights and drop the PEFT wrapper
print("Merging adapter weights...")
model = model.merge_and_unload()

# Step 4: place on the target device and switch to evaluation mode
model = model.to(device)
model.eval()

n_parameters = sum(param.numel() for param in model.parameters())
print(f"\nModel loaded successfully!")
print(f"Model type: {type(model)}")
print(f"Number of parameters: {n_parameters:,}")

## Task 3: Run Inference on Evaluation Dataset

In [None]:
# Inference function
def predict_batch(texts, batch_size=32, max_length=512):
    """
    Run inference on a list of texts.

    Uses the notebook-level `model`, `tokenizer` and `device`.

    Args:
        texts: sequence of strings to classify.
        batch_size: number of texts tokenized and scored per forward pass (>= 1).
        max_length: token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        (predictions, probabilities)
        predictions:   integer array of shape (len(texts),), 0=Human, 1=AI.
        probabilities: float32 array with one row per text and one column per class
                       (softmax of the logits). Empty input yields shape (0, 2).

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    if batch_size < 1:
        # range() would raise an unrelated error for 0 and silently yield nothing for negatives
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Keep the 2-D shape so callers can still slice probabilities[:, 0] / [:, 1]
        return np.empty((0,), dtype=np.int64), np.empty((0, 2), dtype=np.float32)

    prediction_batches = []
    probability_batches = []

    model.eval()
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Running inference"):
            batch_texts = texts[i:i+batch_size]

            # Tokenize
            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            # Upcast logits before softmax: when the model runs in float16 the
            # probabilities would otherwise be rounded to half precision
            # (confident predictions collapse to exactly 1.0).
            logits = model(**inputs).logits.float()
            probs = torch.softmax(logits, dim=-1)
            preds = torch.argmax(logits, dim=-1)

            # Store results
            prediction_batches.append(preds.cpu().numpy())
            probability_batches.append(probs.cpu().numpy())

    return np.concatenate(prediction_batches), np.concatenate(probability_batches)

print("Inference function defined.")

In [None]:
# Run inference on all evaluation texts
# One constant for the batch size, so the logged value cannot drift from the one used
INFERENCE_BATCH_SIZE = 32

print(f"Running inference on {len(eval_df)} texts...")
print(f"Batch size: {INFERENCE_BATCH_SIZE}")
print("This may take several minutes...\n")

predictions, probabilities = predict_batch(
    eval_df['text'].tolist(), batch_size=INFERENCE_BATCH_SIZE
)

# Verify one prediction and one two-column probability row per text before writing
# the results onto the frame
assert len(predictions) == len(eval_df), (
    f"expected {len(eval_df)} predictions, got {len(predictions)}"
)
assert probabilities.shape == (len(eval_df), 2), (
    f"unexpected probabilities shape {probabilities.shape}"
)

# Add predictions to dataframe (column 0 = Human, column 1 = AI)
eval_df['predicted_label'] = predictions
eval_df['prob_human'] = probabilities[:, 0]
eval_df['prob_ai'] = probabilities[:, 1]

print("\nInference complete!")
print(f"Predictions shape: {predictions.shape}")
print(f"Probabilities shape: {probabilities.shape}")

## Task 4: Analyze Results

In [None]:
# Calculate overall metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report

# Fixed class order (0 = Human, 1 = AI/Mistral). Passing it explicitly keeps every
# per-class array at length 2 and the confusion matrix at 2x2 even when one class
# never appears in y_true / y_pred; otherwise the [1] / [1, 1] lookups below would fail.
CLASS_LABELS = [0, 1]


def _safe_ratio(count, total):
    """Return count / total, or NaN when total is zero."""
    return count / total if total else float('nan')


y_true = eval_df['true_label'].values
y_pred = eval_df['predicted_label'].values

# Overall accuracy
overall_accuracy = accuracy_score(y_true, y_pred)

# Per-class metrics (zero_division=0: a class with no predictions/samples scores 0)
precision, recall, f1, support = precision_recall_fscore_support(
    y_true, y_pred, labels=CLASS_LABELS, average=None, zero_division=0
)

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true, y_pred, labels=CLASS_LABELS)

print("="*80)
print("OVERALL PERFORMANCE")
print("="*80)
print(f"Overall Accuracy: {overall_accuracy:.4f} ({overall_accuracy*100:.2f}%)")
print(f"Total samples: {len(y_true)}")
print(f"Correct predictions: {(y_true == y_pred).sum()}")
print(f"Incorrect predictions: {(y_true != y_pred).sum()}")

print("\n" + "="*80)
print("PER-CLASS METRICS")
print("="*80)
print(f"{'Class':<15} {'Precision':<12} {'Recall':<12} {'F1-Score':<12} {'Support':<10}")
print("-"*80)
print(f"{'Human (0)':<15} {precision[0]:<12.4f} {recall[0]:<12.4f} {f1[0]:<12.4f} {support[0]:<10}")
print(f"{'AI/Mistral (1)':<15} {precision[1]:<12.4f} {recall[1]:<12.4f} {f1[1]:<12.4f} {support[1]:<10}")

print("\n" + "="*80)
print("CONFUSION MATRIX")
print("="*80)
print("              Predicted")
print("              Human    AI/Mistral")
print(f"Actual Human    {cm[0,0]:<8} {cm[0,1]:<8}")
print(f"       AI       {cm[1,0]:<8} {cm[1,1]:<8}")

# Calculate specific error rates (per-class accuracy = diagonal / row total)
human_correct = cm[0, 0]
human_total = support[0]
human_accuracy = _safe_ratio(human_correct, human_total)

ai_correct = cm[1, 1]
ai_total = support[1]
ai_accuracy = _safe_ratio(ai_correct, ai_total)

print("\n" + "="*80)
print("CLASS-SPECIFIC ACCURACY")
print("="*80)
print(f"Human texts correctly classified: {human_correct}/{human_total} ({human_accuracy*100:.2f}%)")
print(f"AI texts correctly classified: {ai_correct}/{ai_total} ({ai_accuracy*100:.2f}%)")
print(f"\nHuman texts misclassified as AI: {cm[0,1]} ({_safe_ratio(cm[0,1], human_total)*100:.2f}%)")
print(f"AI texts misclassified as Human: {cm[1,0]} ({_safe_ratio(cm[1,0], ai_total)*100:.2f}%)")

In [None]:
# Detailed classification report
print("="*80)
print("DETAILED CLASSIFICATION REPORT")
print("="*80)
# labels is passed explicitly so the two target_names always line up with classes
# 0 and 1, even if one class is absent from both y_true and y_pred.
print(classification_report(
    y_true,
    y_pred,
    labels=[0, 1],
    target_names=['Human', 'AI/Mistral'],
    zero_division=0,
))

In [None]:
# Confidence analysis: how sure the model is, overall, by correctness and by true class
rule = "=" * 80
print(rule)
print("PREDICTION CONFIDENCE ANALYSIS")
print(rule)

# Confidence of a prediction = the larger of its two class probabilities
eval_df['confidence'] = eval_df[['prob_human', 'prob_ai']].max(axis=1)

print("\nOverall confidence statistics:")
print(eval_df['confidence'].describe())

print("\n\nConfidence by prediction correctness:")
# Boolean flag: does the predicted label match the true label?
eval_df['correct'] = eval_df['true_label'] == eval_df['predicted_label']
is_correct = eval_df['correct']
print("\nCorrect predictions:")
print(eval_df.loc[is_correct, 'confidence'].describe())
print("\nIncorrect predictions:")
print(eval_df.loc[~is_correct, 'confidence'].describe())

print("\n\nConfidence by true class:")
for source in ('Human', 'Mistral'):
    print(f"\n{source} texts:")
    class_confidence = eval_df.loc[eval_df['source'] == source, 'confidence']
    print(f"  Mean confidence: {class_confidence.mean():.4f}")
    print(f"  Median confidence: {class_confidence.median():.4f}")
    print(f"  Min confidence: {class_confidence.min():.4f}")
    print(f"  Max confidence: {class_confidence.max():.4f}")

In [None]:
# List up to three examples of each error type, with the probability of the wrong class
print("="*80)
print("MISCLASSIFIED EXAMPLES")
print("="*80)

misclassified = eval_df[~eval_df['correct']].copy()
print(f"\nTotal misclassified: {len(misclassified)}")

# First three errors per true class
human_fp = misclassified[misclassified['source'] == 'Human'].head(3)
ai_fn = misclassified[misclassified['source'] == 'Mistral'].head(3)

# (section title, rows to show, column holding the probability of the wrong class)
error_sections = [
    ("HUMAN TEXTS MISCLASSIFIED AS AI (False Positives)", human_fp, 'prob_ai'),
    ("AI TEXTS MISCLASSIFIED AS HUMAN (False Negatives)", ai_fn, 'prob_human'),
]

dash_rule = "-" * 80
for section_title, examples, wrong_prob_col in error_sections:
    print("\n" + dash_rule)
    print(section_title)
    print(dash_rule)
    for text, wrong_prob in zip(examples['text'], examples[wrong_prob_col]):
        print(f"\nExample (confidence: {wrong_prob:.4f}):")
        # Texts longer than 300 characters are cut and marked with an ellipsis
        if len(text) > 300:
            print(text[:300] + "...")
        else:
            print(text)

In [4]:
import pickle


def _per_class_metrics_from_confusion(cm):
    """Derive per-class (precision, recall, f1) lists from a square confusion matrix.

    Assumes rows are true classes and columns are predicted classes, the same layout
    as the confusion matrix printed in the analysis section of this notebook
    (TODO confirm the export used the same layout). Ratios whose denominator is
    zero are reported as 0.0.
    """
    n_classes = len(cm)
    precision, recall, f1 = [], [], []
    for k in range(n_classes):
        true_positive = float(cm[k][k])
        n_predicted = float(sum(cm[r][k] for r in range(n_classes)))  # column total
        n_actual = float(sum(cm[k][c] for c in range(n_classes)))     # row total
        p = true_positive / n_predicted if n_predicted else 0.0
        r = true_positive / n_actual if n_actual else 0.0
        precision.append(p)
        recall.append(r)
        f1.append(2 * p * r / (p + r) if (p + r) else 0.0)
    return precision, recall, f1


# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load a results file that you exported yourself.
with open("./big_data_res.pkl","rb") as file:
    res = pickle.load(file)

print(f"overall acc:{res['overall_accuracy']}")
print(f"precision: {res['precision']}")
print(f"recall: {res['recall']}")
print(f"f1_score: {res['f1_score']}")
print(f"confusion_mtx: {res['confusion_matrix']}")
print(f"human accuracy: {res['human_accuracy']}")
print(f"ai_accuracy: {res['ai_accuracy']}")
print(f"overall_confidence_stats: {res['overall_confidence_stats']}")

# Cross-check the stored per-class metrics against the stored confusion matrix.
# In the output recorded with this notebook the stored precision / f1_score arrays do
# not match what the confusion matrix implies, so the re-derived values are printed
# and any disagreement is flagged.
derived_precision, derived_recall, derived_f1 = _per_class_metrics_from_confusion(
    res['confusion_matrix']
)
print("\nPer-class metrics re-derived from confusion_mtx [Human, AI]:")
print(f"  precision: {derived_precision}")
print(f"  recall: {derived_recall}")
print(f"  f1_score: {derived_f1}")

for metric_key, derived_values in (
    ('precision', derived_precision),
    ('recall', derived_recall),
    ('f1_score', derived_f1),
):
    stored_values = [float(v) for v in res[metric_key]]
    if any(abs(s - d) > 1e-6 for s, d in zip(stored_values, derived_values)):
        print(
            f"WARNING: stored '{metric_key}' disagrees with the confusion matrix; "
            f"prefer the re-derived values."
        )

    

overall acc:0.6925554999165414
precision: [0.6200664727877025, 0.995995995995996]
recall: [0.3891115564462258, 0.995995995995996]
f1_score: [0.47816148060555236, 0.995995995995996]
confusion_mtx: [[1165, 1829], [12, 2985]]
human accuracy: 0.3891115564462258
ai_accuracy: 0.995995995995996
overall_confidence_stats: {'count': 5991.0, 'mean': 0.9426171875, 'std': 0.09650634765625, 'min': 0.5, '25%': 0.926015625, '50%': 0.986375, '75%': 0.998046875, 'max': 1.0}
