### 1. Preprocessing and Redaction Detection

Extract Text: Use PyPDF2 or pdfplumber to extract text from the PDF.

Identify Redactions: Detect redacted sections (e.g., lines containing "SECRET" or declassification notes) using regex patterns such as `r"SECRET\s+.*"`.

In [None]:
import json

# Load the whole pre-processed JSON dataset into memory
with open("./data/processed/document_1_processed.json", "r", encoding="utf-8") as f:
    data = json.load(f)

sentences = []

for item in data:
    # Turn newlines and form feeds into spaces in one pass, then trim both ends.
    # (A custom character-cleanup step could be applied to `text` right here.)
    text = item["raw_text"].translate(str.maketrans("\n\x0c", "  ")).strip()

    # Naive sentence split on ". " (example only; swap in a real sentence
    # splitter if needed) and append the pieces to the running list.
    sentences += text.split(". ")

print(f"Loaded {len(sentences)} sentences from JSON.")

Loaded 215 sentences from JSON.


In [46]:
import torch
import spacy
import requests
import wikipedia
from transformers import BertForMaskedLM, BertTokenizer
from dotenv import load_dotenv
import os
from typing import List, Dict, Tuple

# Load environment variables
load_dotenv()

class HybridReconstructor:
    """Guess redacted words by combining BERT, spaCy NER and external context.

    The pipeline implemented by ``reconstruct`` is:

    1. collect the entity labels spaCy finds in the text,
    2. fetch external context from Wikipedia and Google Custom Search,
    3. ask the BERT masked-LM for candidates at every mask position,
    4. rank the candidates by TF-IDF cosine similarity to the context.
    """

    # Context used whenever no external source yields usable text.
    FALLBACK_CONTEXT = "oil embargo arab production cut"

    def __init__(self):
        """Load the BERT masked-LM, the spaCy pipeline and the API credentials."""
        # Initialize BERT
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.bert_model = BertForMaskedLM.from_pretrained("bert-base-uncased")

        # Initialize spaCy NER
        self.nlp = spacy.load("en_core_web_sm")

        # Google API credentials, read from the environment (either may be None)
        self.API_KEY = os.getenv("API_KEY")
        self.SEARCH_ENGINE_ID = os.getenv("SEARCH_ENGINE_ID")

    def _mask_redactions(self, text: str) -> str:
        """Replace redaction markers (runs of 3+ asterisks) with the mask token.

        Text that already contains the tokenizer's mask token is returned
        unchanged, so callers that pre-mask their input are not affected.
        """
        import re

        mask_token = self.bert_tokenizer.mask_token
        if mask_token in text:
            return text
        # A callable replacement avoids any backslash interpretation of the token.
        return re.sub(r"\*{3,}", lambda _match: mask_token, text)

    def predict_masked_tokens(self, text: str, top_k: int = 5) -> List[str]:
        """Get top-k predictions for masked tokens using BERT.

        Args:
            text: Input containing one or more tokenizer mask tokens.
            top_k: Number of candidates to return per mask position.

        Returns:
            The decoded candidates for all mask positions, concatenated in
            order of appearance (``top_k`` per position), or
            ``["[MISSING MASK TOKEN]"]`` when *text* has no mask token.
        """
        inputs = self.bert_tokenizer(text, return_tensors="pt")
        # Inference only: no need to track gradients.
        with torch.no_grad():
            logits = self.bert_model(**inputs).logits

        # Sequence positions of the mask tokens (second axis of input_ids).
        masked_indices = (inputs.input_ids == self.bert_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        if not masked_indices.numel():  # No [MASK] tokens found
            return ["[MISSING MASK TOKEN]"]

        predictions = []
        for idx in masked_indices:
            probs = torch.softmax(logits[0, idx], dim=0)
            top_tokens = torch.topk(probs, top_k).indices.tolist()
            predictions.extend(
                self.bert_tokenizer.decode([token]) for token in top_tokens
            )
        return predictions

    def get_ner_constraints(self, text: str) -> List[str]:
        """Return the spaCy entity label of every entity found in *text*."""
        doc = self.nlp(text)
        return [ent.label_ for ent in doc.ents]

    def get_external_context(self, query: str, num_results: int = 3, timeout: float = 10.0) -> str:
        """Retrieve external context for *query*, best-effort.

        Wikipedia and Google Custom Search are each tried independently; a
        failure in one source is logged at debug level and that source is
        skipped. Only texts longer than three words are kept.

        Args:
            query: Search text sent to both sources.
            num_results: Maximum number of Google results to consider.
            timeout: Seconds to wait for the Google request before giving up.

        Returns:
            The collected texts joined by spaces, or ``FALLBACK_CONTEXT`` when
            nothing usable was found.
        """
        import logging

        log = logging.getLogger(__name__)
        context = []

        # Wikipedia
        try:
            wiki_summary = wikipedia.summary(query, sentences=2)
            if len(wiki_summary.split()) > 3:  # Ensure meaningful content
                context.append(wiki_summary)
        except Exception as exc:  # best-effort source: never abort the pipeline
            log.debug("Wikipedia lookup failed for %r: %s", query, exc)

        # Google Search (skipped entirely when credentials are not configured)
        if self.API_KEY and self.SEARCH_ENGINE_ID:
            try:
                # `params` lets requests URL-encode the query (spaces, '&', '#', ...).
                response = requests.get(
                    "https://www.googleapis.com/customsearch/v1",
                    params={
                        "key": self.API_KEY,
                        "cx": self.SEARCH_ENGINE_ID,
                        "q": query,
                    },
                    timeout=timeout,
                ).json()
                for item in response.get("items", [])[:num_results]:
                    snippet = item.get("snippet", "")
                    if len(snippet.split()) > 3:
                        context.append(snippet)
            except Exception as exc:  # best-effort source: never abort the pipeline
                log.debug("Google search failed for %r: %s", query, exc)

        return " ".join(context) if context else self.FALLBACK_CONTEXT

    def rank_predictions(self, predictions: List[str], context: str, entity_types: List[str]) -> List[Tuple[str, float]]:
        """Rank predictions by TF-IDF cosine similarity to *context*.

        Args:
            predictions: Candidate strings to rank.
            context: Reference text; blank context is replaced by
                ``FALLBACK_CONTEXT``.
            entity_types: Entity labels a candidate must match (per spaCy) to
                be kept. An empty list disables the filter.

        Returns:
            ``(prediction, score)`` pairs sorted by descending score. If the
            entity filter rejects every candidate, all original predictions
            are returned with score 0.0; if vectorization fails, the filtered
            candidates are returned with score 0.0.
        """
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity

        # 1. Filter predictions with NER constraints (no constraints: keep all,
        #    and skip the spaCy pass altogether)
        if entity_types:
            allowed = set(entity_types)
            valid_preds = [
                pred for pred in predictions
                if any(ent.label_ in allowed for ent in self.nlp(pred).ents)
            ]
        else:
            valid_preds = list(predictions)

        # 2. Exit early if no valid predictions
        if not valid_preds:
            return [(pred, 0.0) for pred in predictions]  # Return original predictions

        # 3. Ensure non-empty context
        context = context.strip() or self.FALLBACK_CONTEXT

        try:
            # 4. Vectorize with TF-IDF (vocabulary comes from the context only)
            vectorizer = TfidfVectorizer(min_df=1)  # Allow single-word docs
            context_vec = vectorizer.fit_transform([context])
            pred_vecs = vectorizer.transform(valid_preds)

            # 5. Calculate similarity scores
            similarities = cosine_similarity(context_vec, pred_vecs).flatten()
            ranked = sorted(zip(valid_preds, similarities), key=lambda x: x[1], reverse=True)
        except ValueError:
            # Vectorization failed: fall back to unscored candidates.
            ranked = [(pred, 0.0) for pred in valid_preds]

        return ranked

    def reconstruct(self, text: str) -> str:
        """Return the best guess for the redacted part of *text*.

        Redaction markers written as runs of asterisks are converted to the
        tokenizer's mask token before prediction, so input such as
        ``"****** decided ..."`` no longer yields ``[MISSING MASK TOKEN]``.

        Never raises: any failure is reported as a
        ``"[RECONSTRUCTION ERROR: ...]"`` string.
        """
        try:
            # Step 1: Get NER constraints
            entity_types = self.get_ner_constraints(text)

            # Step 2: Get external context
            context = self.get_external_context(text)

            # Step 3: Get BERT predictions on the masked form of the text
            predictions = self.predict_masked_tokens(self._mask_redactions(text))
            if not predictions:  # Handle empty predictions
                return "[NO BERT PREDICTIONS]"

            # Step 4: Rank predictions
            ranked = self.rank_predictions(predictions, context, entity_types)

            # Step 5: Return best available prediction
            if ranked:
                return ranked[0][0]
            return predictions[0]  # Fallback to first BERT prediction
        except Exception as e:
            return f"[RECONSTRUCTION ERROR: {str(e)}]"

In [None]:
def reconstruct_redacted_text(pdf_path):
    """Run the reconstruction pipeline over every redacted section of a PDF.

    The sections come from ``extract_redacted_sections(pdf_path)``. The result
    is a list with one dict per section, holding the keys ``original``,
    ``prediction``, ``entities`` and ``external_context``.
    """
    # Extract first, so the models are only loaded once extraction succeeded.
    sections = extract_redacted_sections(pdf_path)
    engine = HybridReconstructor()

    def build_record(section):
        # The prediction is computed before the entity and context lookups.
        prediction = engine.reconstruct(section)
        return {
            "original": section,
            "prediction": prediction,
            "entities": engine.get_ner_constraints(section),
            "external_context": engine.get_external_context(section),
        }

    return [build_record(section) for section in sections]

In [48]:
# Example usage: run the whole pipeline on the sample PDF.
# NOTE: `redacted_text` only illustrates what a redacted sentence looks like;
# it is not passed to the pipeline, which takes its sections from the PDF.
redacted_text = "On 17 October 1973, ****** decided to cut oil production by 5% monthly."
results = reconstruct_redacted_text(pdf_path="./data/input/document_1.pdf")
# Show the prediction stored for the first redacted section
print(results[0]["prediction"])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[MISSING MASK TOKEN]
