# Encoder-Only Transformer Inference

This notebook loads a pre-trained encoder-only transformer model and classifies paper abstracts into research topics:

- Neural Networks & Deep Learning
- Natural Language Processing
- Computer Vision
- Reinforcement Learning


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tokenizers import Tokenizer
import json
import os
import math
import pandas as pd


In [4]:
# MODEL COMPONENTS

class SinusoidalPositionalEncoding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    A table of shape (1, max_len, d_model) is precomputed once and registered
    as the buffer ``pe``. Even channels hold ``sin(position * freq)`` and odd
    channels hold ``cos(position * freq)`` with
    ``freq = exp(-log(10000) * 2i / d_model)``.

    Args:
        d_model: Width of the embeddings the encoding is added to. Both even
            and odd widths are supported.
        max_len: Number of positions to precompute; inputs passed to
            ``forward`` must not be longer than this.
    """

    def __init__(self, d_model, max_len=256):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd channel than even channel,
        # so drop the surplus frequency column instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encodings of the first ``x.size(1)`` positions to ``x``.

        ``x`` is indexed as (batch, seq_len, d_model); the encoding table
        broadcasts over the batch dimension.
        """
        return x + self.pe[:, :x.size(1), :]


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with no causal mask.

    Queries, keys and values are all projected from the same input, so every
    position can attend to every other position unless ``mask`` removes it.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        n_batch, n_tokens, _ = projected.shape
        return projected.view(n_batch, n_tokens, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, x, mask=None):
        """Attend over ``x`` of shape (batch, seq, d_model).

        ``mask``, when given, gets two singleton axes inserted after its first
        axis and is broadcast against the attention scores; entries equal to 0
        are excluded from the softmax.
        """
        n_batch, n_tokens, _ = x.shape
        queries = self._split_heads(self.W_q(x))
        keys = self._split_heads(self.W_k(x))
        values = self._split_heads(self.W_v(x))

        logits = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            key_mask = mask.unsqueeze(1).unsqueeze(2)
            logits = logits.masked_fill(key_mask == 0, float('-inf'))

        # A row whose keys are all masked softmaxes to NaN; map those to zero weights.
        weights = torch.nan_to_num(F.softmax(logits, dim=-1))

        context = torch.matmul(weights, values)
        merged = context.transpose(1, 2).contiguous().view(n_batch, n_tokens, self.d_model)
        return self.W_o(merged)


class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension: Linear -> ReLU -> Linear.

    The hidden layer has ``d_ff`` units and the output is projected back to
    ``d_model``.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU, and project back to ``d_model``."""
        hidden = self.linear1(x)
        activated = torch.relu(hidden)
        return self.linear2(activated)


class EncoderBlock(nn.Module):
    """One encoder layer: self-attention, then feed-forward.

    Each sub-layer output passes through dropout, is added back to its input
    (residual connection), and is then layer-normalised.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run both sub-layers on ``x``; ``mask`` is forwarded to the attention layer."""
        attended = self.norm1(x + self.dropout(self.attention(x, mask)))
        return self.norm2(attended + self.dropout(self.ffn(attended)))


class EncoderOnlyTransformer(nn.Module):
    """Encoder-only Transformer that maps token ids to class logits.

    Pipeline: token embedding -> sinusoidal positional encoding -> dropout ->
    ``num_layers`` encoder blocks -> pooling -> dropout + linear classifier.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / hidden width.
        num_layers: Number of stacked encoder blocks.
        num_heads: Attention heads per block.
        d_ff: Hidden width of each block's feed-forward network.
        max_seq_len: Number of positions the positional encoding precomputes.
        num_classes: Size of the output logit vector.
        dropout: Dropout probability used throughout the model.
        pooling_type: ``'cls'`` takes the hidden state at position 0; any
            other value selects mean pooling over the sequence.
    """

    def __init__(self, vocab_size, d_model=256, num_layers=4, num_heads=8,
                 d_ff=1024, max_seq_len=256, num_classes=4, dropout=0.1, pooling_type='cls'):
        super().__init__()
        self.d_model = d_model
        self.pooling_type = pooling_type
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = SinusoidalPositionalEncoding(d_model, max_seq_len)
        self.encoder_blocks = nn.ModuleList([
            EncoderBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None):
        """Return classification logits for a batch of token-id sequences.

        Args:
            input_ids: Integer token ids indexed as (batch, seq_len).
            attention_mask: Optional mask with the same leading shape; entries
                equal to 0 are ignored by attention and by mean pooling.

        Returns:
            Tensor of logits with ``num_classes`` entries per sequence.
        """
        x = self.token_embedding(input_ids)
        x = self.pos_encoding(x)
        x = self.dropout(x)
        for encoder_block in self.encoder_blocks:
            x = encoder_block(x, attention_mask)

        if self.pooling_type == 'cls':
            pooled = x[:, 0, :]
        elif attention_mask is not None:
            # Mean over unmasked positions only.
            mask_expanded = attention_mask.unsqueeze(-1).float()
            sum_pooled = (x * mask_expanded).sum(dim=1)
            # Clamp so a fully masked row yields a zero vector instead of 0/0 = NaN.
            lengths = attention_mask.sum(dim=1, keepdim=True).float().clamp(min=1.0)
            pooled = sum_pooled / lengths
        else:
            pooled = x.mean(dim=1)
        return self.classifier(pooled)


# Human-readable topic labels; the list index is the class id used to look up
# a label for the model's predicted class.
TOPIC_NAMES = [
    "Neural Networks & Deep Learning",
    "Natural Language Processing",
    "Computer Vision",
    "Reinforcement Learning",
]


In [5]:
# MODEL LOADING

def load_model(model_dir, device=None):
    """Load a saved encoder model, its tokenizer and its metadata from a directory.

    The directory must contain ``config.json`` (model hyper-parameters),
    ``tokenizer.json``, ``metadata.json`` and ``model.pt`` (a state dict).
    Progress and a short model summary are printed.

    Args:
        model_dir: Path of the directory holding the four files above.
        device: Device to place the model on. Defaults to CUDA when available,
            otherwise CPU.

    Returns:
        Tuple ``(model, tokenizer, config, metadata)`` with the model already
        moved to ``device`` and switched to eval mode.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print(f"Loading model from: {model_dir}")
    print(f"Using device: {device}")

    # JSON is read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
    config_path = os.path.join(model_dir, "config.json")
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    print("[OK] Config loaded")

    tokenizer_path = os.path.join(model_dir, "tokenizer.json")
    tokenizer = Tokenizer.from_file(tokenizer_path)
    print("[OK] Tokenizer loaded")

    metadata_path = os.path.join(model_dir, "metadata.json")
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)
    print("[OK] Metadata loaded")

    model = EncoderOnlyTransformer(
        vocab_size=tokenizer.get_vocab_size(),
        d_model=config['d_model'],
        num_layers=config['num_layers'],
        num_heads=config['num_heads'],
        d_ff=config['d_ff'],
        max_seq_len=config['max_seq_len'],
        num_classes=config['num_classes'],
        dropout=config['dropout'],
        pooling_type=config['pooling_type']
    )

    model_path = os.path.join(model_dir, "model.pt")
    # weights_only=True restricts unpickling to tensors and plain containers, so a
    # tampered checkpoint cannot execute arbitrary code while being loaded.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model = model.to(device)
    model.eval()
    print("[OK] Model weights loaded")

    print("\n" + "="*50)
    print("MODEL INFO")
    print("="*50)
    print(f"Created: {metadata.get('created_at', 'Unknown')}")
    print(f"Best Epoch: {metadata.get('best_epoch', 'Unknown')}")
    print(f"Best Val Accuracy: {metadata.get('best_val_accuracy', 0):.4f}")
    print(f"Parameters: {metadata.get('num_parameters', 0):,}")
    print(f"Model Size: {metadata.get('model_size_mb', 0):.2f} MB")
    print("="*50)

    return model, tokenizer, config, metadata


In [6]:
# CLASSIFICATION FUNCTIONS

def _special_token_id(tokenizer, *candidates):
    """Return the id of the first candidate token the tokenizer knows, else None."""
    for token in candidates:
        token_id = tokenizer.token_to_id(token)
        # Compare against None explicitly: 0 is a valid token id but is falsy.
        if token_id is not None:
            return token_id
    return None


def classify_abstract(abstract, model, tokenizer, config, device=None):
    """Classify a paper abstract into one of ``TOPIC_NAMES``.

    The text is tokenized, truncated to ``config['max_seq_len'] - 2`` ids,
    wrapped in a start token ([CLS] or [BOS]) and an end token ([SEP] or
    [EOS]), and right-padded with [PAD] up to ``config['max_seq_len']``.

    NOTE(review): if the tokenizer is configured to add special tokens itself
    during ``encode``, they would be duplicated here - confirm this matches
    the preprocessing used at training time.

    Args:
        abstract: Text to classify.
        model: Callable as ``model(input_ids, attention_mask)`` returning
            logits with one row per sequence.
        tokenizer: Object providing ``encode(text).ids`` and ``token_to_id``.
        config: Mapping that contains ``'max_seq_len'``.
        device: Device for the input tensors. Defaults to the device of the
            model's first parameter.

    Returns:
        Dict with ``'predicted_class'`` (int), ``'predicted_topic'`` (str),
        ``'confidence'`` (float) and ``'probabilities'`` (topic -> float).

    Raises:
        ValueError: If the tokenizer lacks the start/end tokens, or lacks
            [PAD] while padding is required.
    """
    if device is None:
        device = next(model.parameters()).device

    model.eval()
    cls_id = _special_token_id(tokenizer, "[CLS]", "[BOS]")
    sep_id = _special_token_id(tokenizer, "[SEP]", "[EOS]")
    pad_id = tokenizer.token_to_id("[PAD]")
    if cls_id is None or sep_id is None:
        raise ValueError("Tokenizer must define [CLS] or [BOS] and [SEP] or [EOS] tokens")

    max_len = config['max_seq_len']
    # Reserve two slots for the start and end tokens.
    content_ids = tokenizer.encode(abstract).ids[:max_len - 2]
    tokens = [cls_id] + content_ids + [sep_id]

    real_len = len(tokens)
    pad_len = max(max_len - real_len, 0)
    if pad_len:
        if pad_id is None:
            raise ValueError("Tokenizer must define a [PAD] token to pad short inputs")
        tokens = tokens + [pad_id] * pad_len
    # Build the mask from lengths rather than comparing ids with pad_id, so a
    # content id that happens to equal pad_id is not masked out by mistake.
    attention_mask = [1] * real_len + [0] * pad_len

    input_ids = torch.tensor([tokens], dtype=torch.long, device=device)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long, device=device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        probabilities = F.softmax(logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=-1).item()
        confidence = probabilities[0, predicted_class].item()
        all_probs = {name: probabilities[0, i].item() for i, name in enumerate(TOPIC_NAMES)}

    return {
        'predicted_class': predicted_class,
        'predicted_topic': TOPIC_NAMES[predicted_class],
        'confidence': confidence,
        'probabilities': all_probs
    }


def print_result(result):
    """Print a classification result as a small text report.

    Shows the predicted topic, its confidence as a percentage, and every
    topic probability in descending order with a ``#`` bar (40 characters
    corresponds to a probability of 1.0).

    Args:
        result: Dict with the keys ``'predicted_topic'``, ``'confidence'``
            and ``'probabilities'`` (topic name -> probability).
    """
    rule = "=" * 60
    print("", rule, "CLASSIFICATION RESULT", rule, sep="\n")

    topic_line = f"\nPredicted Topic: {result['predicted_topic']}"
    print(topic_line)
    confidence_pct = result['confidence'] * 100
    print(f"Confidence: {confidence_pct:.2f}%")

    print("\nAll Probabilities:")
    ranked = sorted(result['probabilities'].items(), key=lambda item: item[1], reverse=True)
    for topic, prob in ranked:
        n_marks = int(prob * 40)
        print(f"  {topic:35s} {prob*100:6.2f}% {'#' * n_marks}")
    print(rule)


## Load the Model

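The next cell automatically selects the most recent model directory. Models are saved in `models/` with names like `encoder_20241221_143025`; to use a different one, set `MODEL_DIR` to its path after running that cell.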


In [7]:
# Find available models
MODEL_DIR = None

# isdir (rather than exists) so a plain file named "models" is reported as
# "not found" instead of crashing os.listdir.
if os.path.isdir("models"):
    # Keep only real directories: a stray file such as "encoder_x.zip" would
    # otherwise sort last and be picked as the model directory.
    models = sorted(
        d for d in os.listdir("models")
        if d.startswith("encoder_") and os.path.isdir(os.path.join("models", d))
    )
    if models:
        print("Available models:")
        for m in models:
            print(f"  - models/{m}")
        # The last name in sorted order is treated as the most recent model.
        MODEL_DIR = os.path.join("models", models[-1])
        print(f"\nUsing most recent: {MODEL_DIR}")
    else:
        print("No encoder models found in models/ directory.")
        print("Run Encoder-Only.ipynb first to train a model.")
else:
    print("models/ directory not found.")
    print("Run Encoder-Only.ipynb first to train a model.")


Available models:
  - models/encoder_20251220_194106

Using most recent: models/encoder_20251220_194106


In [8]:
# Load the model, tokenizer, config and metadata from the selected directory.
# When no directory was selected, only a hint is printed and nothing is loaded.
if not MODEL_DIR:
    print("Please set MODEL_DIR to a valid model path.")
else:
    model, tokenizer, config, metadata = load_model(MODEL_DIR)


Loading model from: models/encoder_20251220_194106
Using device: cpu
[OK] Config loaded
[OK] Tokenizer loaded
[OK] Metadata loaded
[OK] Model weights loaded

MODEL INFO
Created: 2025-12-20T19:41:09.545692
Best Epoch: 20
Best Val Accuracy: 0.8030
Parameters: 5,208,068
Model Size: 19.87 MB


## Classify Example Abstracts

Test the model with example papers from different research areas.


In [9]:
# Example: Neural Networks paper
# Expected topic: 'Neural Networks & Deep Learning'.
abstract_nn = """
We propose a novel deep neural network architecture that combines 
convolutional layers with attention mechanisms. Our model uses 
batch normalization and dropout for regularization. The network 
is trained using backpropagation with Adam optimizer. Experiments 
show improved performance on benchmark datasets compared to 
standard CNN architectures.
"""

# Classify the abstract and print the formatted report.
result = classify_abstract(abstract_nn, model, tokenizer, config)
print_result(result)



CLASSIFICATION RESULT

Predicted Topic: Neural Networks & Deep Learning
Confidence: 99.89%

All Probabilities:
  Neural Networks & Deep Learning      99.89% #######################################
  Computer Vision                       0.09% 
  Reinforcement Learning                0.01% 
  Natural Language Processing           0.01% 


In [10]:
# Example: NLP paper
# Expected topic: 'Natural Language Processing'.
abstract_nlp = """
This paper presents a new approach to sentiment analysis using 
transformer-based language models. We fine-tune BERT on a large 
corpus of product reviews and achieve state-of-the-art results 
on text classification benchmarks. Our method uses word embeddings 
and attention mechanisms to capture semantic relationships in text.
"""

# Classify the abstract and print the formatted report.
result = classify_abstract(abstract_nlp, model, tokenizer, config)
print_result(result)



CLASSIFICATION RESULT

Predicted Topic: Natural Language Processing
Confidence: 97.31%

All Probabilities:
  Natural Language Processing          97.31% ######################################
  Reinforcement Learning                2.19% 
  Computer Vision                       0.41% 
  Neural Networks & Deep Learning       0.09% 


In [11]:
# Example: Computer Vision paper
# Expected topic: 'Computer Vision'.
# NOTE(review): the output recorded in this notebook for this cell predicts
# 'Natural Language Processing' (98.28%) instead - a misclassification that
# is worth investigating before relying on the model.
abstract_cv = """
We introduce a novel object detection framework for autonomous 
driving applications. Our approach uses semantic segmentation 
and depth estimation to improve detection accuracy. The model 
processes RGB images and outputs bounding boxes with class labels. 
Experiments on the KITTI dataset demonstrate superior performance 
compared to YOLO and Faster R-CNN baselines.
"""

# Classify the abstract and print the formatted report.
result = classify_abstract(abstract_cv, model, tokenizer, config)
print_result(result)



CLASSIFICATION RESULT

Predicted Topic: Natural Language Processing
Confidence: 98.28%

All Probabilities:
  Natural Language Processing          98.28% #######################################
  Reinforcement Learning                0.91% 
  Computer Vision                       0.62% 
  Neural Networks & Deep Learning       0.19% 


In [12]:
# Example: Reinforcement Learning paper
# Expected topic: 'Reinforcement Learning'.
abstract_rl = """
This work proposes a new deep reinforcement learning algorithm 
for robotic control. Our method combines policy gradient with 
value function approximation using an actor-critic architecture. 
The agent learns to navigate complex environments through 
exploration and exploitation. We evaluate on the OpenAI Gym 
benchmark and achieve competitive performance with PPO.
"""

# Classify the abstract and print the formatted report.
result = classify_abstract(abstract_rl, model, tokenizer, config)
print_result(result)



CLASSIFICATION RESULT

Predicted Topic: Reinforcement Learning
Confidence: 95.60%

All Probabilities:
  Reinforcement Learning               95.60% ######################################
  Natural Language Processing           3.52% #
  Computer Vision                       0.84% 
  Neural Networks & Deep Learning       0.03% 


## Classify Your Own Abstract

Paste your paper abstract below to classify it.


In [13]:
# CLASSIFY YOUR OWN ABSTRACT - Replace the text below
# NOTE(review): with the placeholder text left in place, the recorded output
# still reports a 99.89% confident prediction, so replace the text before
# reading anything into the result.

your_abstract = """
Paste your paper abstract here.
"""

# Classify the abstract and print the formatted report.
result = classify_abstract(your_abstract, model, tokenizer, config)
print_result(result)



CLASSIFICATION RESULT

Predicted Topic: Neural Networks & Deep Learning
Confidence: 99.89%

All Probabilities:
  Neural Networks & Deep Learning      99.89% #######################################
  Computer Vision                       0.10% 
  Reinforcement Learning                0.01% 
  Natural Language Processing           0.00% 
