In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/legle-dataset/combined_content.txt


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math
import numpy as np
from collections import Counter
import re
import os

In [7]:
class LegalTextTokenizer:
    """Tokenizer for legal text with vocabulary management"""
    
    def __init__(self, vocab_size=15000):
        self.vocab_size = vocab_size
        self.word2idx = {}
        self.idx2word = {}
        
    def build_vocab(self, text):
        """Build vocabulary from text"""
        # Clean and tokenize
        text = text.lower()
        text = re.sub(r'\s+', ' ', text)
        words = text.split()
        
        # Count word frequencies
        word_counts = Counter(words)
        most_common = word_counts.most_common(self.vocab_size - 4)
        
        # Special tokens
        self.word2idx = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<SOS>': 2,  # Start of sequence
            '<EOS>': 3   # End of sequence
        }
        
        # Add words to vocabulary
        for idx, (word, _) in enumerate(most_common, start=4):
            self.word2idx[word] = idx
            
        self.idx2word = {v: k for k, v in self.word2idx.items()}
        print(f"✓ Vocabulary built: {len(self.word2idx)} tokens")
        
    def encode(self, text):
        """Convert text to token IDs"""
        words = text.lower().split()
        return [self.word2idx.get(w, self.word2idx['<UNK>']) for w in words]
    
    def decode(self, token_ids):
        """Convert token IDs to text"""
        return ' '.join([self.idx2word.get(idx, '<UNK>') for idx in token_ids])
    
    def save(self, path):
        """Save tokenizer"""
        torch.save({
            'word2idx': self.word2idx,
            'idx2word': self.idx2word,
            'vocab_size': self.vocab_size
        }, path)
        
    def load(self, path):
        """Load tokenizer"""
        data = torch.load(path)
        self.word2idx = data['word2idx']
        self.idx2word = data['idx2word']
        self.vocab_size = data['vocab_size']

In [8]:
class CausalLMDataset(Dataset):
    """Dataset for next-token prediction with causal masking"""
    
    def __init__(self, token_ids, seq_length=128):
        self.seq_length = seq_length
        self.data = []
        
        # Create sequences
        for i in range(0, len(token_ids) - seq_length - 1, seq_length // 2):
            input_seq = token_ids[i:i + seq_length]
            target_seq = token_ids[i + 1:i + seq_length + 1]
            
            if len(input_seq) == seq_length and len(target_seq) == seq_length:
                self.data.append((input_seq, target_seq))
        
        print(f"✓ Created {len(self.data)} training sequences")
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return (
            torch.tensor(self.data[idx][0], dtype=torch.long),
            torch.tensor(self.data[idx][1], dtype=torch.long)
        )



In [9]:
class MultiHeadAttention(nn.Module):
    """
    Multi-Head Self-Attention Mechanism
    WHY: Allows model to attend to different aspects of input simultaneously
    """
    
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Dimension per head
        
        # Linear projections for Q, K, V
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None):
        batch_size, seq_len, d_model = x.size()
        
        # Linear projections and reshape for multi-head
        Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
        
        # Scaled dot-product attention
        # WHY: Scaled by sqrt(d_k) to prevent softmax saturation
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # Apply causal mask (prevent attending to future tokens)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        
        # Attention weights
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        # Apply attention to values
        context = torch.matmul(attn_weights, V)
        
        # Concatenate heads and apply output projection
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
        output = self.W_o(context)
        
        return output


# ============================================================================
# Step 4: Feed-Forward Network
# ============================================================================

class FeedForward(nn.Module):
    """
    Position-wise Feed-Forward Network
    WHY: Adds non-linearity and processes each position independently
    """
    
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        # WHY GELU: Smooth activation, works better than ReLU for transformers
        return self.linear2(self.dropout(F.gelu(self.linear1(x))))


# ============================================================================
# Step 5: Decoder Block (Transformer Layer)
# ============================================================================

class DecoderBlock(nn.Module):
    """
    Single Transformer Decoder Block
    WHY: Stacking these creates deep understanding of text
    """
    
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        
        self.attention = MultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        
        # Layer normalization
        # WHY: Stabilizes training in deep networks
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        # Multi-head attention with residual connection
        attn_output = self.attention(self.ln1(x), mask)
        x = x + self.dropout(attn_output)
        
        # Feed-forward with residual connection
        ff_output = self.feed_forward(self.ln2(x))
        x = x + self.dropout(ff_output)
        
        return x


# ============================================================================
# Step 6: Positional Encoding
# ============================================================================

class PositionalEncoding(nn.Module):
    """
    Sinusoidal Positional Encoding
    WHY: Injects position information (transformers don't know word order)
    """
    
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        
        # Create positional encoding matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * 
                            (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


# ============================================================================
# Step 7: Full Decoder-Based LLM (GPT-style)
# ============================================================================

class DecoderLLM(nn.Module):
    """
    Full Decoder-Only Large Language Model
    Architecture: GPT-style with causal multi-head attention
    """
    
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6, 
                 d_ff=2048, max_seq_len=512, dropout=0.1):
        super().__init__()
        
        self.d_model = d_model
        self.num_layers = num_layers
        
        # Token embedding
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        print("embedding : ",self.token_embedding)
        # Positional encoding
        self.pos_encoding = PositionalEncoding(d_model, max_seq_len, dropout)
        
        # Stack of decoder blocks
        self.decoder_blocks = nn.ModuleList([
            DecoderBlock(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        
        # Final layer norm
        self.ln_f = nn.LayerNorm(d_model)
        
        # Output projection to vocabulary
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
        
        # Tie weights (share embeddings with output layer)
        # WHY: Reduces parameters and improves generalization
        self.lm_head.weight = self.token_embedding.weight
        
        # Initialize weights
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        """Initialize weights with small random values"""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def forward(self, x):
        batch_size, seq_len = x.size()
        
        # Create causal mask (lower triangular)
        # WHY: Prevents attending to future tokens
        mask = torch.tril(torch.ones(seq_len, seq_len)).unsqueeze(0).unsqueeze(0)
        mask = mask.to(x.device)
        
        # Token embeddings + positional encoding
        x = self.token_embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoding(x)
        
        # Pass through decoder blocks
        for decoder_block in self.decoder_blocks:
            x = decoder_block(x, mask)
        
        # Final layer norm
        x = self.ln_f(x)
        
        # Project to vocabulary
        logits = self.lm_head(x)
        
        return logits
    
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Generate text autoregressively
        WHY: Sample from probability distribution to create diverse outputs
        """
        for _ in range(max_new_tokens):
            # Crop context if too long
            idx_cond = idx if idx.size(1) <= 512 else idx[:, -512:]
            
            # Forward pass
            logits = self(idx_cond)
            logits = logits[:, -1, :] / temperature
            
            # Optional top-k sampling
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            
            # Sample from distribution
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            
            # Append to sequence
            idx = torch.cat((idx, idx_next), dim=1)
        
        return idx


# ============================================================================
# Step 8: Training Function
# ============================================================================

def train_model(model, train_loader, epochs=10, lr=3e-4, device='cuda'):
    """Train the decoder-based LLM"""
    
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.95), 
                                   weight_decay=0.1)
    
    # WHY: Cosine learning rate schedule - smooth decay improves convergence
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
    
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            # Forward pass
            logits = model(inputs)
            
            # Reshape for loss calculation
            # WHY: CrossEntropyLoss expects (batch*seq, vocab_size)
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1)
            )
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            
            total_loss += loss.item()
            
            if batch_idx % 50 == 0:
                print(f'Epoch [{epoch+1}/{epochs}] Batch [{batch_idx}/{len(train_loader)}] '
                      f'Loss: {loss.item():.4f} LR: {scheduler.get_last_lr()[0]:.6f}')
        
        avg_loss = total_loss / len(train_loader)
        print(f'\n{"="*70}')
        print(f'Epoch {epoch+1} Complete - Avg Loss: {avg_loss:.4f}')
        print(f'{"="*70}\n')
        
        scheduler.step()
    
    return model


# ============================================================================
# Step 9: Text Generation Function
# ============================================================================

def generate_legal_text(model, tokenizer, prompt, max_tokens=100, 
                       temperature=0.8, top_k=50, device='cuda'):
    """Generate text from prompt"""
    
    model.eval()
    model = model.to(device)
    
    # Encode prompt
    input_ids = tokenizer.encode(prompt)
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    
    # Generate
    with torch.no_grad():
        output_ids = model.generate(
            input_tensor,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k
        )
    
    # Decode
    generated_text = tokenizer.decode(output_ids[0].tolist())
    
    return generated_text


In [10]:
print("\n" + "="*70)
print("DECODER-BASED LARGE LANGUAGE MODEL FOR LEGAL TEXT")
print("="*70 + "\n")
    
# Configuration
CONFIG = {
        'text_file': file_path,
        'vocab_size': 15000,
        'seq_length': 512,
        'batch_size': 32,
        'd_model': 512,
        'num_heads': 8,
        'num_layers': 6,
        'd_ff': 2048,
        'dropout': 0.1,
        'epochs':3,
        'learning_rate': 3e-4,
    }

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}\n")
    
# Step 1: Load and tokenize text
print("STEP 1: Loading and tokenizing text...")
with open(CONFIG['text_file'], 'r', encoding='utf-8') as f:
     text = f.read()

print(text[:200])
    



DECODER-BASED LARGE LANGUAGE MODEL FOR LEGAL TEXT

Device: cuda

STEP 1: Loading and tokenizing text...
"The Andaman and Nicobar Islands (Municipal) Corporation 
Regulation,1994  
(Creation of The Andaman and Nicobar Islands (Municipal) Corporation 
Regulation,1994 is in progress and shall be uploaded s


In [11]:
tokenizer = LegalTextTokenizer(vocab_size=CONFIG['vocab_size'])
tokenizer.build_vocab(text)
    

✓ Vocabulary built: 15000 tokens


In [12]:
# tokenizer.word2idx

In [13]:
# Step 2: Create dataset
print("\nSTEP 2: Creating dataset...")
token_ids = tokenizer.encode(text)
token_ids[0:10]



STEP 2: Creating dataset...


[44, 6506, 9, 6507, 6897, 1, 119, 1, 6704, 5]

In [14]:
dataset = CausalLMDataset(token_ids, seq_length=CONFIG['seq_length'])
# dataset.data[0]

✓ Created 27815 training sequences


In [15]:
train_loader = DataLoader(dataset, batch_size=CONFIG['batch_size'], 
                             shuffle=True, num_workers=2, pin_memory=True)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x79a9347f21e0>

In [16]:
# Step 3: Initialize model
print("\nSTEP 3: Initializing Decoder-Based LLM...")
model = DecoderLLM(
        vocab_size=len(tokenizer.word2idx),
        d_model=CONFIG['d_model'],
        num_heads=CONFIG['num_heads'],
        num_layers=CONFIG['num_layers'],
        d_ff=CONFIG['d_ff'],
        max_seq_len=CONFIG['seq_length'],
        dropout=CONFIG['dropout']
    )
    
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"✓ Model initialized")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Model size: ~{total_params * 4 / 1e6:.2f} MB")





STEP 3: Initializing Decoder-Based LLM...
embedding :  Embedding(15000, 512)
✓ Model initialized
  Total parameters: 26,595,328
  Trainable parameters: 26,595,328
  Model size: ~106.38 MB


In [17]:
    
# Step 4: Train model
print("\n" + "="*70)
print("STEP 4: Training Model")
print("="*70 + "\n")
    
model = train_model(
        model, 
        train_loader, 
        epochs=CONFIG['epochs'],
        lr=CONFIG['learning_rate'],
        device=device
    )


STEP 4: Training Model

Epoch [1/3] Batch [0/870] Loss: 9.7031 LR: 0.000300
Epoch [1/3] Batch [50/870] Loss: 5.4869 LR: 0.000300
Epoch [1/3] Batch [100/870] Loss: 4.5218 LR: 0.000300
Epoch [1/3] Batch [150/870] Loss: 4.1795 LR: 0.000300
Epoch [1/3] Batch [200/870] Loss: 3.7943 LR: 0.000300
Epoch [1/3] Batch [250/870] Loss: 3.7070 LR: 0.000300
Epoch [1/3] Batch [300/870] Loss: 3.4357 LR: 0.000300
Epoch [1/3] Batch [350/870] Loss: 3.0726 LR: 0.000300
Epoch [1/3] Batch [400/870] Loss: 2.8745 LR: 0.000300
Epoch [1/3] Batch [450/870] Loss: 2.8182 LR: 0.000300
Epoch [1/3] Batch [500/870] Loss: 2.5637 LR: 0.000300
Epoch [1/3] Batch [550/870] Loss: 2.3144 LR: 0.000300
Epoch [1/3] Batch [600/870] Loss: 2.1931 LR: 0.000300
Epoch [1/3] Batch [650/870] Loss: 1.8758 LR: 0.000300
Epoch [1/3] Batch [700/870] Loss: 1.8705 LR: 0.000300
Epoch [1/3] Batch [750/870] Loss: 1.8676 LR: 0.000300
Epoch [1/3] Batch [800/870] Loss: 1.4194 LR: 0.000300
Epoch [1/3] Batch [850/870] Loss: 1.5851 LR: 0.000300

Epoch

In [18]:
prompt = """The Andaman and Nicobar Islands (Municipal) Corporation 
Regulation,1994"""
generated = generate_legal_text(
            model, tokenizer, prompt, 
            max_tokens=50, temperature=0.7, top_k=50, device=device)
print(f"Generated: {generated}")

Generated: the andaman and nicobar islands <UNK> corporation <UNK> (2) for the purpose of this section shall have the same meaning as in respect of the act or the rules made thereunder to it for the purpose of this act. (3) every rule made under this section shall be laid as soon as may be after it is made,


In [19]:
# Step 6: Test generation
print("\n" + "="*70)
print("STEP 6: Testing Text Generation")
print("="*70 + "\n")
    
test_prompts = [
        "section 2 of the indian penal code",
        "according to the constitution of india",
        "the supreme court has ruled that"]
    
for prompt in test_prompts:
    print(f"\nPrompt: '{prompt}'")
    print("-" * 70)
    generated = generate_legal_text(
            model, tokenizer, prompt, 
            max_tokens=50, temperature=0.7, top_k=50, device=device)
    print(f"Generated: {generated}")
    print()
    
print("="*70)
print("TRAINING AND TESTING COMPLETE!")
print("="*70)


STEP 6: Testing Text Generation


Prompt: 'section 2 of the indian penal code'
----------------------------------------------------------------------
Generated: section 2 of the indian penal code (45 of 1860). (4) the provisions of the code of criminal procedure, 1973 (2 of 1974) shall, so far as may be, apply to any search or seizure under this section as they apply to any search or seizure made under the authority of a warrant issued under 99 [section


Prompt: 'according to the constitution of india'
----------------------------------------------------------------------
Generated: according to the constitution of india and the requirements of the situation. (3) the recommendations of the advisory committee shall be advisory in nature." "the document **tn_the_building_and_other_construction_workers_reg.pdf** provides essential regulations and guidelines. this section, **41e emergency standards**, covers specific details relevant to its scope. 41-e. emergency standards (1) where the 

In [20]:
    # Step 5: Save model
print("\nSTEP 5: Saving model and tokenizer...")
os.makedirs('models', exist_ok=True)
    
torch.save({
        'model_state_dict': model.state_dict(),
        'config': CONFIG,
        'vocab_size': len(tokenizer.word2idx)
    }, 'models/legal_llm.pt')
    
tokenizer.save('models/tokenizer.pt')
print("✓ Model saved to 'models/legal_llm.pt'")
print("✓ Tokenizer saved to 'models/tokenizer.pt'")


STEP 5: Saving model and tokenizer...
✓ Model saved to 'models/legal_llm.pt'
✓ Tokenizer saved to 'models/tokenizer.pt'


In [23]:
def load_trained_model(model_path='models/legal_llm.pt', 
                      tokenizer_path='models/tokenizer.pt'):
    """Load a trained model for inference"""
    
    # Load checkpoint
    checkpoint = torch.load(model_path)
    config = checkpoint['config']
    
    # Load tokenizer
    tokenizer = LegalTextTokenizer()
    tokenizer.load(tokenizer_path)
    
    # Initialize model
    model = DecoderLLM(
        vocab_size=checkpoint['vocab_size'],
        d_model=config['d_model'],
        num_heads=config['num_heads'],
        num_layers=config['num_layers'],
        d_ff=config['d_ff'],
        dropout=config['dropout']
    )
    
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    print("✓ Model and tokenizer loaded successfully!")
    
    return model, tokenizer

In [24]:
  
    # After training, use the model:
    model, tokenizer = load_trained_model()
 

embedding :  Embedding(15000, 512)
✓ Model and tokenizer loaded successfully!


In [30]:
text = generate_legal_text(model, tokenizer, "What are the provisions regarding working hours for employees")
text

'what are the provisions regarding working hours for employees and such conditions as it may deem necessary. 57. returns every principal employer shall maintain accurate and up-to-date health hazards within the local limits of the factory or establishment in the factory or the principal inspectors whose local are otherwise than one thousand or more inspectors shall respectively exercise the powers and perform such duties as may be specified in the order, the corporation or may specify in the vicinity of the factory or establishment in the purpose of securing the health and safety of securing the workers employed in the factory or establishment in connection with the health of'