In [None]:
# ==============================================================================
# AINEX LAW PROJECT: Recursive Semantic Decay Analysis
# ==============================================================================
# Investigation of AI model collapse through recursive self-training
# ==============================================================================

# 1. Install necessary libraries (silent installation)
!pip install transformers datasets sentence-transformers scipy scikit-learn accelerate > /dev/null

import random
import warnings
from itertools import islice

import numpy as np
import torch
from datasets import load_dataset
from scipy.spatial import ConvexHull
from scipy.spatial import QhullError
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Suppress non-critical warnings
warnings.filterwarnings("ignore")

# Seed every random number generator so that shuffling and sampled generation
# are repeatable between runs; without this the measured volumes change on
# every execution and generations cannot be compared across runs.
# NOTE: some GPU kernels may still be non-deterministic.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Verify GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Computing device: {device.upper()}")
if device == 'cpu':
    print("⚠ Warning: CPU execution will be significantly slower than GPU")

# ==============================================================================
# 2. Load Models and Data
# ==============================================================================

print("\n[1/4] Loading pre-trained models...")

# Load GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 ships without a padding token, so the end-of-sequence token is reused
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

# Load Sentence-BERT for semantic embeddings
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Load Wikipedia corpus for baseline training
print("[2/4] Loading Wikipedia training corpus...")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# ==============================================================================
# 3. Data Processing Classes
# ==============================================================================

class TextDataset(Dataset):
    """
    Map-style PyTorch dataset holding one tokenizer encoding per kept text.

    Only texts longer than 50 characters are kept; each one is tokenized
    with truncation and padding to exactly ``max_length`` tokens.
    """
    def __init__(self, texts, tokenizer, max_length=128):
        # Identical tokenizer settings are applied to every sample
        tokenizer_options = dict(
            truncation=True,
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
        )
        self.inputs = [
            tokenizer(candidate, **tokenizer_options)
            for candidate in texts
            if len(candidate) > 50  # Filter for meaningful content
        ]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        # Each stored tensor has a leading batch axis of size 1 (the tokenizer
        # was called on a single text); drop it so the DataLoader can stack
        # samples into a batch itself.
        sample = {}
        for field_name, tensor in self.inputs[index].items():
            sample[field_name] = tensor.squeeze(0)
        return sample

# Extract and prepare Wikipedia texts
print("[3/4] Preparing training corpus...")
# Keep the first 400 entries longer than 100 characters. The generator is
# consumed lazily, so iteration stops as soon as 400 texts have been found
# instead of reading every row of the corpus before slicing.
human_knowledge_texts = list(islice(
    (item['text'] for item in dataset if len(item['text']) > 100),
    400,
))
print(f"  → Selected {len(human_knowledge_texts)} texts from Wikipedia")

# ==============================================================================
# 4. Core Processing Functions
# ==============================================================================

def train_model_on_texts(model, texts, epochs=3, learning_rate=5e-5):
    """
    Fine-tune the model in place on the given texts (causal LM objective).

    Texts are wrapped in a TextDataset (which drops texts of 50 characters
    or fewer and pads every sample to a fixed length), shuffled into batches
    of 8 and optimized with AdamW. Padded positions are excluded from the
    loss.

    Args:
        model: GPT2LMHeadModel instance (left in train mode, weights updated)
        texts: List of text strings for training
        epochs: Number of training epochs
        learning_rate: AdamW optimizer learning rate

    Returns:
        None. If no text survives the dataset's length filter, a message is
        printed and the model weights are left untouched.
    """
    model.train()
    train_dataset = TextDataset(texts, tokenizer)

    # Nothing to learn from: skip instead of building a DataLoader over an
    # empty dataset.
    if len(train_dataset) == 0:
        print("  ⚠ No usable training texts; skipping training")
        return

    dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    print(f"  → Training for {epochs} epochs...")
    for epoch in range(epochs):
        progress_bar = tqdm(dataloader, leave=False)
        epoch_loss = 0.0

        for batch in progress_bar:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # The pad token is the EOS token and every sample is padded to a
            # fixed length, so unmasked labels would make the loss mostly
            # about predicting EOS over padding. -100 is the label value the
            # model's loss ignores.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            # Forward pass
            outputs = model(
                input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_description(
                f"Epoch {epoch+1}/{epochs} | Loss: {loss.item():.4f}"
            )

        # The progress bar is not left on screen, so report the epoch average
        print(
            f"  → Epoch {epoch+1}/{epochs} mean loss: "
            f"{epoch_loss / len(dataloader):.4f}"
        )

def generate_texts_from_model(model, num_texts=400, temperature=0.7):
    """
    Generate text sequences by sampling continuations of short prompts.

    Prompts are cycled from a fixed list of ten openers and processed in
    batches of 10. Each sequence is sampled with top-k (k=40) sampling and
    is at most 100 tokens long including the prompt.

    Args:
        model: GPT2LMHeadModel instance (switched to eval mode here)
        num_texts: Number of texts to generate
        temperature: Sampling temperature (lower = more focused)

    Returns:
        List of generated text strings (empty if num_texts <= 0)
    """
    model.eval()
    generated_texts = []

    print(f"  → Generating {num_texts} text samples...")

    if num_texts <= 0:
        return generated_texts

    # Diverse starting prompts, repeated so the list is always longer than
    # num_texts (every batch slice below is therefore non-empty)
    prompts = [
        "The history", "Science is", "War began", "The theory", "He was born",
        "In 1990", "The system", "Water is", "Computers are", "The city"
    ] * (num_texts // 10 + 5)

    # The prompts tokenize to different lengths, so batches get padded.
    # A decoder-only model continues from the last position of each row;
    # with right padding that position is a pad token for the shorter
    # prompts. Pad on the left while generating, then restore the setting.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        batch_size = 10
        for start in tqdm(range(0, num_texts, batch_size)):
            # Do not generate more samples than requested in the last batch
            batch_prompts = prompts[start:min(start + batch_size, num_texts)]

            inputs = tokenizer(
                batch_prompts,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_length=100,
                    do_sample=True,
                    top_k=40,
                    temperature=temperature,
                    repetition_penalty=1.1,
                    # Explicit, so generate() does not have to guess it
                    pad_token_id=tokenizer.pad_token_id
                )

            generated_texts.extend(
                tokenizer.decode(output_ids, skip_special_tokens=True)
                for output_ids in outputs
            )
    finally:
        tokenizer.padding_side = previous_padding_side

    return generated_texts[:num_texts]

def calculate_semantic_volume(texts):
    """
    Calculate the geometric volume of semantic embeddings using convex hull.

    The texts are embedded with the sentence embedder, projected to three
    dimensions with PCA, and the volume of the convex hull of the projected
    points is returned. This metric represents the diversity and coverage
    of semantic space.

    Args:
        texts: List of text strings

    Returns:
        Float representing the volume. 0.0 is returned when fewer than four
        texts are given or when the hull cannot be constructed (for example
        when the projected points are degenerate).
    """
    # A 3D hull needs at least four points, and PCA to three components
    # needs at least three samples; answer directly instead of raising.
    if len(texts) < 4:
        return 0.0

    embeddings = embedder.encode(texts)

    # Reduce to 3D for volume calculation
    pca = PCA(n_components=3)
    coordinates = pca.fit_transform(embeddings)

    try:
        return ConvexHull(coordinates).volume
    except (QhullError, ValueError) as error:
        # Keep the best-effort 0.0 result, but say so: a silent 0.0 is
        # indistinguishable from a real collapse in the final report.
        print(
            f"  ⚠ Convex hull could not be computed "
            f"({type(error).__name__}); reporting volume 0.0"
        )
        return 0.0

# ==============================================================================
# 5. AINEX RECURSIVE DECAY EXPERIMENT
# ==============================================================================

# Banner for the experiment section of the output
print("\n" + "="*60)
print(" AINEX LAW: SEMANTIC COLLAPSE EXPERIMENT ".center(60))
print("="*60)

# The three steps below must run in this order: the same `model` object is
# passed to both training calls, so generation 2 starts from the weights
# already fine-tuned in generation 1 rather than from a fresh GPT-2.

# Step 1: Measure baseline (human knowledge)
# Volume of the selected Wikipedia texts, used as the reference value
print("\n[BASELINE] Measuring human knowledge semantic volume...")
baseline_volume = calculate_semantic_volume(human_knowledge_texts)
print(f"  ✓ Human Knowledge Volume: {baseline_volume:.6f}")

# Step 2: First generation (train on human data, generate synthetic)
# Fine-tune for 2 epochs on the human texts, then sample 400 synthetic texts
print("\n[GENERATION 1] Learning from human knowledge...")
train_model_on_texts(model, human_knowledge_texts, epochs=2)
generation_1_texts = generate_texts_from_model(model, num_texts=400)
generation_1_volume = calculate_semantic_volume(generation_1_texts)
print(f"  ✓ Generation 1 Volume: {generation_1_volume:.6f}")

# Step 3: Second generation (recursive: train on own outputs)
# Fine-tune for 3 epochs (one more than generation 1) on the generation-1
# outputs only, then sample another 400 texts from the updated model
print("\n[GENERATION 2] Self-replication (training on synthetic data)...")
train_model_on_texts(model, generation_1_texts, epochs=3)
generation_2_texts = generate_texts_from_model(model, num_texts=400)
generation_2_volume = calculate_semantic_volume(generation_2_texts)
print(f"  ✓ Generation 2 Volume: {generation_2_volume:.6f}")

# ==============================================================================
# 6. Results and Analysis
# ==============================================================================

# Calculate semantic collapse rate: the relative loss of volume between the
# human baseline and generation 2, in percent. The volume helper reports 0.0
# when it cannot build a hull, so a zero baseline is possible; the rate is
# undefined in that case and must not be computed (division by zero).
if baseline_volume > 0:
    collapse_percentage = ((baseline_volume - generation_2_volume) / baseline_volume) * 100
else:
    collapse_percentage = None

print("\n" + "="*60)
print(" EXPERIMENT RESULTS ".center(60))
print("="*60)
print(f"\nBaseline (Human Knowledge) Volume : {baseline_volume:.6f}")
print(f"Generation 1 Volume                : {generation_1_volume:.6f}")
print(f"Generation 2 Volume                : {generation_2_volume:.6f}")
if collapse_percentage is None:
    print("Semantic Collapse Rate             : undefined (baseline volume is zero)")
else:
    print(f"Semantic Collapse Rate             : {collapse_percentage:.2f}%")

print("\n" + "-"*60)
if collapse_percentage is None:
    # No verdict can be drawn without a usable reference volume
    print("⚠ RESULT: INCONCLUSIVE (baseline volume could not be measured)")
    print("-"*60)
elif collapse_percentage > 0:
    print("✓ RESULT: SEMANTIC COLLAPSE CONFIRMED")
    print("-"*60)
    print("\nInterpretation:")
    print("  • The model lost semantic diversity through recursive self-training")
    print("  • The AINEX Law is mathematically demonstrated")
    print("  • AI models converge to lower-dimensional semantic spaces when")
    print("    trained recursively on their own outputs")
else:
    # Reached when generation 2 has the same or a larger volume than baseline
    print("⚠ RESULT: SEMANTIC EXPANSION (Hallucination/Noise)")
    print("-"*60)
    print("\nInterpretation:")
    print("  • The model generated novel but incoherent semantic patterns")
    print("  • This suggests random hallucination rather than true learning")

print("="*60 + "\n")