# 🧠 M3c′: Semantic Knowledge Transfer — Node A (Teacher)

**Revolutionary Experiment: Transfer knowledge between DIFFERENT architectures via HDC**

**Setup:**
- Teacher: DistilBERT (encoder, 66M params)
- Student: GPT-2 (decoder, 355M params)
- Transfer: Semantic knowledge, NOT weights

**Pipeline:**
```
Teacher (DistilBERT):
  1. Measure accuracy BEFORE training
  2. Train on sentiment data
  3. Measure accuracy AFTER training
  4. Extract semantic knowledge (examples + embeddings)
  5. Upload to Firebase as "knowledge packet"

Student (GPT-2) receives and applies knowledge
```

---

In [None]:
# Install the third-party packages used by the cells below, in quiet mode (-q).
# Each "!" line is a notebook shell escape, not Python.
# NOTE(review): `accelerate` is not imported directly anywhere in this notebook;
# presumably it is required by the Hugging Face Trainer — confirm.
!pip install -q transformers datasets accelerate
!pip install -q firebase-admin
!pip install -q sentence-transformers
# This message is printed unconditionally; it does not check that pip succeeded.
print("‚úÖ Dependencies installed")

In [None]:
import torch
import torch.nn as nn
import json
import time
import numpy as np
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import accuracy_score

import firebase_admin
from firebase_admin import credentials, db

from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from sentence_transformers import SentenceTransformer

# Report the runtime environment and choose the compute device for this node.
cuda_available = torch.cuda.is_available()

print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if cuda_available else torch.device('cpu')

# Identifier under which this notebook reports itself (stored with uploads).
NODE_ID = "node_a_teacher"
print(f"\nüéì This is {NODE_ID.upper()} (DistilBERT)")

In [None]:
# Firebase credentials
#
# SECURITY: a service-account private key must never be embedded in a
# notebook. The key that used to be pasted in this cell has been exposed and
# must be revoked in the Google Cloud console and replaced by a new one.
# The key file is read at runtime from the path given by the
# FIREBASE_CREDENTIALS_PATH environment variable, falling back to the standard
# GOOGLE_APPLICATION_CREDENTIALS variable.
import os

_credentials_path = (
    os.environ.get("FIREBASE_CREDENTIALS_PATH")
    or os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
)
if not _credentials_path:
    raise RuntimeError(
        "Firebase credentials not configured: set FIREBASE_CREDENTIALS_PATH "
        "to the path of a service-account JSON key file."
    )

# Parsed service-account key: the same dict that was previously inlined here.
with open(_credentials_path, encoding="utf-8") as _key_file:
    FIREBASE_CREDENTIALS = json.load(_key_file)

# Realtime Database endpoint; can be overridden through the environment.
FIREBASE_DATABASE_URL = os.environ.get(
    "FIREBASE_DATABASE_URL",
    "https://resonance-m3-default-rtdb.europe-west1.firebasedatabase.app",
)

# Initialize Firebase
# Re-running this cell must not initialize the default app a second time.
# get_app() raises ValueError when no default app exists yet.
try:
    firebase_admin.get_app()
except ValueError:
    cred = credentials.Certificate(FIREBASE_CREDENTIALS)
    firebase_admin.initialize_app(cred, {
        'databaseURL': FIREBASE_DATABASE_URL
    })
print("‚úÖ Firebase initialized")

## Step 1: Load Semantic Encoder (model-agnostic)

In [None]:
class SemanticEncoder:
    """
    Model-agnostic semantic encoder.
    Uses SentenceTransformer to create shared semantic space.

    Args:
        model_name: SentenceTransformer checkpoint to load. Defaults to
            'all-MiniLM-L6-v2', the checkpoint this notebook has always used,
            so existing `SemanticEncoder()` calls behave as before.
    """
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        self.model_name = model_name
        self.encoder = SentenceTransformer(model_name)
        print(f"‚úÖ Semantic encoder loaded ({model_name})")

    def encode(self, texts):
        """Encode texts to semantic vectors.

        Args:
            texts: a single string or a collection of strings. A single
                string is wrapped in a one-element list, so the result is
                always indexed per text (callers use `encode(text)[0]`).

        Returns:
            Whatever the wrapped encoder returns when called with
            `convert_to_numpy=True` and the progress bar disabled.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self.encoder.encode(
            texts,
            convert_to_numpy=True,
            show_progress_bar=False,
        )

# Single encoder instance, reused by the knowledge-extraction cells further down.
semantic_encoder = SemanticEncoder()

## Step 2: Load Teacher Model (DistilBERT)

In [None]:
# Checkpoint used as the Teacher in this experiment.
MODEL_NAME = "distilbert-base-uncased"

print(f"Loading {MODEL_NAME} for sequence classification...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Classification model with two output labels, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model = model.to(device)

parameter_count = sum(weights.numel() for weights in model.parameters())
print(f"‚úÖ Teacher model loaded: {MODEL_NAME}")
print(f"   Parameters: {parameter_count:,}")

## Step 3: Load Dataset

In [None]:
print("Loading SST-2 dataset...")
dataset = load_dataset("glue", "sst2")

# Training data for Teacher
TRAIN_SIZE = 2000
TEST_SIZE = 500

# Only the leading TRAIN_SIZE rows of the train split are used.
train_split = dataset['train']
train_texts = train_split['sentence'][:TRAIN_SIZE]
train_labels = train_split['label'][:TRAIN_SIZE]

# Test data (shared between nodes for evaluation): leading TEST_SIZE rows
# of the validation split.
eval_split = dataset['validation']
test_texts = eval_split['sentence'][:TEST_SIZE]
test_labels = eval_split['label'][:TEST_SIZE]

print(f"‚úÖ Train: {len(train_texts)}, Test: {len(test_texts)}")

In [None]:
# Create dataset class
class SentimentDataset(torch.utils.data.Dataset):
    """Text/label pairs tokenized once up front and served item by item.

    Args:
        texts: iterable of input strings.
        labels: iterable of labels, one per text.
        tokenizer: callable applied to the full list of texts with
            truncation, fixed-length ('max_length') padding and
            `return_tensors='pt'`.
        max_length: length every sequence is truncated/padded to.
    """

    # Tokenizer outputs copied into every item, in this order.
    _FEATURE_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, labels, tokenizer, max_length=128):
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )
        self.encodings = tokenizer(list(texts), **tokenizer_options)
        self.labels = torch.tensor(list(labels))

    def __len__(self):
        """Number of examples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and label stored at position `idx` as a dict."""
        sample = {key: self.encodings[key][idx] for key in self._FEATURE_KEYS}
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the train and test splits (in that order) with the shared tokenizer.
train_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

print(f"‚úÖ Datasets created")

## Step 4: Measure BEFORE Training

In [None]:
def get_predictions(model, texts, tokenizer, batch_size=32):
    """Get model predictions for texts.

    Args:
        model: classification model; it is switched to eval mode (and left
            there) and its call result must expose `.logits`.
        texts: sliceable sequence of input strings.
        tokenizer: callable that tokenizes a list of strings and returns an
            object supporting `.to(device)` and `**` unpacking.
        batch_size: number of texts sent through the model per forward pass.

    Returns:
        1-D int64 numpy array holding the argmax class index of every text,
        in input order. Empty input yields an empty int64 array.
    """
    model.eval()

    # Send inputs to wherever the model's weights live instead of relying
    # only on the notebook-level `device`; a model without parameters falls
    # back to that global.
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = device

    predictions = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
            batch_texts = list(texts[start:start + batch_size])
            inputs = tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=128,
                return_tensors='pt'
            ).to(target_device)

            logits = model(**inputs).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())

    # Explicit dtype keeps the result integer-typed even when `texts` is empty.
    return np.array(predictions, dtype=np.int64)

print("üìä Measuring predictions BEFORE training...")
# Baseline on the held-out test texts, taken before any fine-tuning in this notebook.
predictions_before = get_predictions(model=model, texts=test_texts, tokenizer=tokenizer)
accuracy_before = accuracy_score(y_true=test_labels, y_pred=predictions_before)
print(f"‚úÖ Accuracy BEFORE training: {accuracy_before:.4f}")

## Step 5: Train Teacher Model

In [None]:
# Mixed precision is only usable on a CUDA device. Enabling it unconditionally
# gets rejected on the CPU fallback that this notebook otherwise supports
# (see the `device` selection), so it follows GPU availability instead.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./teacher_output",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=100,
    logging_steps=50,
    eval_strategy="epoch",   # evaluate on eval_dataset after every epoch
    save_strategy="no",      # no checkpoints are written
    fp16=use_fp16,
    report_to="none"
)

# NOTE: the test split doubles as the per-epoch evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

print("\nüèãÔ∏è Training teacher model...")
trainer.train()
print("‚úÖ Training complete!")

## Step 6: Measure AFTER Training

In [None]:
print("üìä Measuring predictions AFTER training...")
# Same test texts as the baseline, so the two accuracies are directly comparable.
predictions_after = get_predictions(model, test_texts, tokenizer)
accuracy_after = accuracy_score(test_labels, predictions_after)
accuracy_gain = accuracy_after - accuracy_before
print(f"‚úÖ Accuracy AFTER training: {accuracy_after:.4f}")
print(f"üìà Improvement: {accuracy_gain:.4f} ({accuracy_gain * 100:.1f}%)")

## Step 7: Extract Semantic Knowledge

Find examples where the model's prediction CHANGED or is HIGH-CONFIDENCE.

In [None]:
print("\nüß† Extracting semantic knowledge...")

# Find examples where prediction changed
prediction_flipped = predictions_before != predictions_after
changed_indices = np.where(prediction_flipped)[0]
print(f"Found {len(changed_indices)} examples where prediction changed")

# Extract knowledge packet
knowledge_packet = dict(
    teacher_model=MODEL_NAME,
    task='sentiment_classification',
    accuracy_before=float(accuracy_before),
    accuracy_after=float(accuracy_after),
    examples=[],
)

# Walk the first 200 flipped test examples and keep those that are now CORRECT
model.eval()
for idx in tqdm(changed_indices[:200], desc="Extracting changed examples"):
    true_label = test_labels[idx]
    predicted_label = predictions_after[idx]

    # A flip that ended on the wrong class is not stored.
    if not (predicted_label == true_label):
        continue

    text = test_texts[idx]

    # Re-run the single example to read the probability of the predicted class.
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=128).to(device)
    with torch.no_grad():
        class_probs = torch.softmax(model(**encoded).logits, dim=1)
    confidence = class_probs[0][predicted_label].item()

    # Embedding comes from the separate semantic encoder, not from the teacher.
    embedding = semantic_encoder.encode(text)[0]

    knowledge_packet['examples'].append({
        'text': text,
        'label': int(true_label),
        'confidence': float(confidence),
        'embedding': embedding.tolist()
    })

print(f"‚úÖ Extracted {len(knowledge_packet['examples'])} changed examples")

In [None]:
print("\nüéØ Extracting high-confidence training examples...")

confident_examples = []
model.eval()

# Only the first 500 training examples (or fewer) are scanned.
scan_limit = min(500, len(train_texts))
for i in tqdm(range(scan_limit), desc="Finding confident examples"):
    text, true_label = train_texts[i], train_labels[i]

    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=128).to(device)
    with torch.no_grad():
        probs = torch.softmax(model(**encoded).logits, dim=1)
    predicted = torch.argmax(probs, dim=1).item()
    confidence = probs[0][predicted].item()

    # Keep only predictions that are both high confidence AND correct
    if not (confidence > 0.95 and predicted == true_label):
        continue

    embedding = semantic_encoder.encode(text)[0]
    confident_examples.append({
        'text': text,
        'label': int(true_label),
        'confidence': float(confidence),
        'embedding': embedding.tolist()
    })

# Add to knowledge packet (limit to 150)
knowledge_packet['confident_examples'] = confident_examples[:150]
print(f"‚úÖ Added {len(knowledge_packet['confident_examples'])} high-confidence examples")

## Step 8: Upload Knowledge Packet to Firebase

In [None]:
print("\nüì§ Uploading knowledge packet to Firebase...")

# Add metadata
knowledge_packet['timestamp'] = datetime.now().isoformat()
knowledge_packet['node'] = NODE_ID
knowledge_packet['total_examples'] = len(knowledge_packet['examples']) + len(knowledge_packet.get('confident_examples', []))

# Calculate size (length of the serialized JSON text, in KiB)
packet_json = json.dumps(knowledge_packet)
packet_size_kb = len(packet_json) / 1024

print(f"Knowledge packet size: {packet_size_kb:.1f} KB")
print(f"Total examples: {knowledge_packet['total_examples']}")

# Upload: set() replaces whatever is stored at this path
ref = db.reference('resonance_m3c/knowledge_packet')
ref.set(knowledge_packet)

# The "+" format flag prints the real sign of the change. A hard-coded "+"
# would render a drop in accuracy as "+-x.x%".
accuracy_delta_pct = (accuracy_after - accuracy_before) * 100

print(f"\n‚úÖ Knowledge packet uploaded!")
print(f"\nüìã Summary:")
print(f"   Teacher model: {MODEL_NAME}")
print(f"   Accuracy: {accuracy_before:.4f} ‚Üí {accuracy_after:.4f} ({accuracy_delta_pct:+.1f}%)")
print(f"   Changed examples: {len(knowledge_packet['examples'])}")
print(f"   Confident examples: {len(knowledge_packet.get('confident_examples', []))}")
print(f"   Packet size: {packet_size_kb:.1f} KB")

In [None]:
# Signal ready for Student
ref = db.reference('resonance_m3c/status')
status_payload = dict(
    teacher_ready=True,
    timestamp=datetime.now().isoformat(),
    teacher_model=MODEL_NAME,
    accuracy_before=float(accuracy_before),
    accuracy_after=float(accuracy_after),
)
# Written only after the knowledge packet cell has run, so the flag marks completion.
ref.set(status_payload)

banner = "=" * 60
print("\n" + banner)
print("üéì TEACHER NODE COMPLETE!")
print(banner)
print(f"\n‚è≥ Student (GPT-2) can now download and apply knowledge.")
print(f"\nRun Node B (Student) notebook now!")

In [None]:
# Save results locally
changed_count = len(knowledge_packet['examples'])
confident_count = len(knowledge_packet.get('confident_examples', []))

# Run summary; it holds counts and metrics only, not the examples themselves.
results = dict(
    phase='M3c_prime',
    node=NODE_ID,
    experiment='Semantic Knowledge Transfer - Teacher',
    model=MODEL_NAME,
    train_samples=TRAIN_SIZE,
    test_samples=TEST_SIZE,
    accuracy_before=float(accuracy_before),
    accuracy_after=float(accuracy_after),
    improvement=float(accuracy_after - accuracy_before),
    changed_examples=changed_count,
    confident_examples=confident_count,
    total_examples=knowledge_packet['total_examples'],
    packet_size_kb=float(packet_size_kb),
    timestamp=datetime.now().isoformat(),
)

# Serialize once; the same text goes to the file and to the cell output.
results_text = json.dumps(results, indent=2)
with open('m3c_teacher_results.json', 'w') as f:
    f.write(results_text)

print("‚úÖ Results saved")
print(results_text)

In [None]:
# Offer the results file as a browser download when running inside Google Colab.
# Outside Colab the helper module does not exist, so skip the download instead
# of failing on the last cell; the file is already saved in the working directory.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; results remain in 'm3c_teacher_results.json'")
else:
    files.download('m3c_teacher_results.json')