In [1]:
# Cell 1: Install minimal dependencies
# !pip install transformers numpy nltk torch

# Cell 2: Imports
import torch
from transformers import BertTokenizer, BertForMaskedLM
import nltk
from nltk import pos_tag
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Cell 3: Download and load TINY models
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Use tiny BERT model (28MB)
tokenizer = BertTokenizer.from_pretrained('prajjwal1/bert-tiny')
model = BertForMaskedLM.from_pretrained('prajjwal1/bert-tiny')
model.eval()

# Cell 4: Generate masked tokens (optimized)
def generateMaskedTokens(text, top_k=20):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    
    mask_token_index = (inputs.input_ids[0] == tokenizer.mask_token_id).nonzero().item()
    logits = outputs.logits[0, mask_token_index]
    probs = torch.nn.functional.softmax(logits, dim=-1)
    
    top_k = min(top_k, len(probs))
    top_k_weights, top_k_indices = torch.topk(probs, top_k)
    
    candidates = []
    for i, idx in enumerate(top_k_indices):
        token = tokenizer.convert_ids_to_tokens([idx])[0]
        if not token.startswith('##'):  # Skip subword tokens
            candidates.append((token, top_k_weights[i].item()))
    
    return candidates

# Cell 5: Similarity using BERT embeddings (no FastText needed)
def getSimilarity(word1, word2):
    try:
        # Get embeddings from BERT
        inputs1 = tokenizer(word1, return_tensors="pt")
        inputs2 = tokenizer(word2, return_tensors="pt")
        
        with torch.no_grad():
            outputs1 = model(**inputs1, output_hidden_states=True)
            outputs2 = model(**inputs2, output_hidden_states=True)
        
        # Use last hidden state mean as embedding
        emb1 = outputs1.hidden_states[-1].mean(dim=1).numpy()
        emb2 = outputs2.hidden_states[-1].mean(dim=1).numpy()
        
        return cosine_similarity(emb1, emb2)[0][0]
    except:
        return 0.0

# Cell 6: POS filtering
def filter_by_pos(candidates, target_pos):
    return [(word, score) for word, score in candidates 
            if any(tag.startswith(target_pos) for tag in nltk.pos_tag([word])[0][1])]

# Cell 7: Main function
def lexical_simplification(complex_word, sentence, top_k=10):
    # Get POS tag
    target_pos = nltk.pos_tag([complex_word])[0][1]
    
    # Masked prediction
    masked = sentence.replace(complex_word, tokenizer.mask_token)
    candidates = generateMaskedTokens(masked, top_k*2)  # Get extra for filtering
    
    # Filter by POS and similarity
    pos_filtered = filter_by_pos(candidates, target_pos)
    scored = []
    for word, score in pos_filtered[:top_k]:
        similarity = getSimilarity(complex_word, word)
        if similarity > 0.3:  # Basic threshold
            scored.append((word, score * similarity))
    
    # Return top simplified words
    return sorted(scored, key=lambda x: -x[1])[:min(top_k, len(scored))]

# Example usage
if __name__ == "__main__":
    example_sentence = "The capital of France is Paris."
    complex_word = "capital"
    
    print(f"Finding simplifications for '{complex_word}'...")
    results = lexical_simplification(complex_word, example_sentence)
    
    print("\nTop suggestions:")
    for i, (word, score) in enumerate(results):
        print(f"{i+1}. {word} (score: {score:.3f})")

  from .autonotebook import tqdm as notebook_tqdm
2025-07-08 04:45:27.270672: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751949927.864179   26173 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751949928.063693   26173 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751949929.098652   26173 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751949929.098690   26173 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1751949929.098694   26173

Finding simplifications for 'capital'...

Top suggestions:
