# 📚 MDR Text Cleaning Pipeline - Notebook Version
This notebook demonstrates a shortened version of the 7-stage MDR pipeline on HTML and PDF content: it extracts the text, keeps English documents only, and strips boilerplate lines.

In [None]:
# Install the required packages (uncomment on the first run).
# %pip install trafilatura datasketch fasttext PyMuPDF

In [None]:
# Import dependencies and set up shared pipeline state
from hashlib import sha256
from trafilatura import extract
from datasketch import MinHash, MinHashLSH
import fasttext, fitz, re

# Load the fastText model that is_english() queries for language labels.
# The path is relative, so it is resolved against the kernel's current
# working directory.
lang_model = fasttext.load_model("lid.176.bin")  # Make sure the file is in the working directory

# Boilerplate markers. clean_text() drops every line whose lower-cased text
# contains one of these entries as a plain substring, so each entry must be a
# lower-case literal rather than a regular expression.
UNWANTED_PATTERNS = [
    'subscribe', 'follow us', 'click here', 'share on', 'cookie policy',
    # 'comment' instead of the regex-style 'comments?': under substring
    # matching the old entry only hit the literal text "comments?".
    'advertisement', 'back to top', 'comment', 'login', 'sign up', 'terms of service'
]
# Shared mutable state. None of the cells visible in this section read or
# write these objects; presumably they back later deduplication stages
# (near-duplicate documents, repeated URLs, repeated spans) — TODO confirm.
lsh_index = MinHashLSH(threshold=0.8, num_perm=128)
seen_urls = set()
seen_spans = set()

In [None]:
# Define pipeline functions (shortened for demo)
def is_english(text, min_prob=0.65):
    """Return True when the language model identifies ``text`` as English.

    Args:
        text: Document text; may span several lines. ``None``, empty and
            whitespace-only input is treated as not English.
        min_prob: Minimum probability the model must report for the English
            label. Defaults to 0.65.

    Returns:
        bool: True only if the top prediction is ``__label__en`` with a
        probability of at least ``min_prob``.
    """
    if not text or not text.strip():
        # Nothing to classify; this also avoids calling .replace() on None.
        return False
    # The model is given a single line, so newlines are flattened to spaces.
    labels, probs = lang_model.predict(text.replace('\n', ' '), k=1)
    if len(labels) == 0:
        # No prediction came back; indexing [0] below would raise.
        return False
    # bool() so callers get a plain Python bool rather than a numpy scalar.
    return bool(labels[0] == '__label__en' and probs[0] >= min_prob)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, joined with newlines.
    """
    # The context manager closes the document even if a page fails to
    # extract; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def clean_text(text, min_lines=5, min_keep_ratio=0.95):
    """Strip boilerplate lines from an English document.

    A line is dropped when its lower-cased text contains any entry of
    ``UNWANTED_PATTERNS``. The whole document is rejected when it is too
    short, not English, or loses too many lines to the filter.

    Args:
        text: Raw document text, or ``None`` (for example when an upstream
            extractor produced nothing).
        min_lines: Minimum number of newline-separated lines the input must
            have. Defaults to 5.
        min_keep_ratio: Minimum fraction of lines that must survive the
            filter for the document to be kept. Defaults to 0.95.

    Returns:
        str | None: The surviving lines joined with newlines, or ``None``
        when the document is rejected.
    """
    if not text:
        return None
    lines = text.split('\n')
    # Cheap length check first, so short inputs never reach the language model.
    if len(lines) < min_lines:
        return None
    if not is_english(text):
        return None
    kept = []
    for line in lines:
        lowered = line.lower()  # lower-case once per line, not once per pattern
        if not any(pattern in lowered for pattern in UNWANTED_PATTERNS):
            kept.append(line)
    if len(kept) < min_keep_ratio * len(lines):
        return None
    return '\n'.join(kept)

In [None]:
# Run the sample PDF through extraction and cleaning, then preview the result.
pdf_text = extract_text_from_pdf("sample.pdf")  # Replace with your file
cleaned_pdf = clean_text(pdf_text)
if cleaned_pdf:
    # Show only the first 1000 characters to keep the cell output short.
    print(cleaned_pdf[:1000])
else:
    print("Text not clean enough.")

In [None]:
# Load and clean an HTML example.
# Read inside a context manager so the file handle is closed promptly, and
# decode explicitly as UTF-8 instead of relying on the platform default.
with open("sample.html", encoding="utf-8") as html_file:  # Replace with your file
    html = html_file.read()
main = extract(html)  # trafilatura returns None when it finds no main content
# Skip cleaning when extraction failed instead of passing None along.
cleaned_html = clean_text(main) if main else None
if cleaned_html:
    # Show only the first 1000 characters to keep the cell output short.
    print(cleaned_html[:1000])
else:
    print("HTML content not clean enough.")