In [None]:
# from dotenv import load_dotenv
import os

from google.colab import userdata

# Export the Colab secrets as environment variables instead of discarding the
# values: OpenAI() reads OPENAI_API_KEY and huggingface_hub reads HF_TOKEN from
# the environment. Assigning inside a loop also keeps the secret from being
# echoed as the cell's output, which a bare trailing userdata.get() call does.
for _secret_name in ('HF_TOKEN', 'OPENAI_API_KEY'):
    os.environ[_secret_name] = userdata.get(_secret_name)

# load_dotenv()
# huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
# openai_token = os.getenv("OPENAI_API_KEY")

In [19]:
import re

def preprocess_text(text):
    """Strip OCR noise from Spanish text and normalize its whitespace.

    Args:
        text: Raw OCR text.

    Returns:
        The text with quotation marks and disallowed symbols removed, every
        run of whitespace collapsed to a single space, and leading/trailing
        whitespace stripped.
    """
    # Remove quotation marks
    text = re.sub(r'[«»"“”‘’]', '', text)
    # Remove special chars. The allow-list includes ü/Ü so that Spanish words
    # with a diaeresis (e.g. Mayagüez) are not mangled.
    text = re.sub(r'[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ0-9\s.,;:¿?¡!\-]', '', text)
    # Normalize spaces last: deleting a symbol that sat between two spaces
    # would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
import os
import zipfile
from tqdm import tqdm
# from dotenv import load_dotenv
from openai import OpenAI
from transformers import pipeline

In [None]:
# ========== Path Configuration ==========
# Zip archive read by extract_files().
ZIP_PATH = "./data/elmundo_chunked_es_page1_40years.zip"
# Directory that extract_files() unpacks the archive's .txt members into.
EXTRACT_DIR = "./data/extracted"
# Directory that process_files() writes the cleaned_<filename> outputs to.
OUTPUT_DIR = "./data/cleaned"
# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========== File Extraction ==========
def extract_files(zip_path=None, extract_dir=None):
    """Extract every ``.txt`` member of a zip archive, keeping member paths.

    Args:
        zip_path: Archive to read. Defaults to the module-level ``ZIP_PATH``.
        extract_dir: Destination directory. Defaults to the module-level
            ``EXTRACT_DIR``.

    Raises:
        FileNotFoundError: If the archive does not exist.
        zipfile.BadZipFile: If the file is not a valid zip archive.
    """
    # Resolve defaults at call time so that editing the config constants and
    # re-running only the config cell is picked up here.
    zip_path = ZIP_PATH if zip_path is None else zip_path
    extract_dir = EXTRACT_DIR if extract_dir is None else extract_dir

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Only the text members are wanted; their directory structure inside
        # the archive is reproduced under extract_dir.
        txt_members = [name for name in zip_ref.namelist() if name.endswith('.txt')]
        zip_ref.extractall(extract_dir, members=txt_members)
    print(f"Extracted files to: {extract_dir}")

In [None]:
# ========== OpenAI Correction ==========
def correct_with_openai(text, client=None, model="gpt-4-turbo"):
    """Correct OCR errors in Spanish text with an OpenAI chat model.

    Args:
        text: Raw OCR text to correct.
        client: Optional object exposing ``chat.completions.create`` (for
            example an ``OpenAI`` instance). When omitted, a new ``OpenAI()``
            client is created on each call; passing one lets callers reuse a
            single client across many files.
        model: Chat model name. Defaults to the previously hard-coded value.

    Returns:
        The content of the model's first choice, or ``text`` unchanged when it
        is empty or whitespace-only (no API call is made in that case).
    """
    # Nothing to correct: skip the API round-trip entirely.
    if not text or not text.strip():
        return text

    if client is None:
        client = OpenAI()

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                # Every example in the prompt reads "OCR error → correct form";
                # rule 2 previously said "ñ → n", which asked the model to
                # strip tildes instead of restoring them.
                "content": """Eres un experto en documentos históricos de Puerto Rico. Corrige errores OCR en español:
1. Preserva nombres propios (ej: Mayagüez, Caguas)
2. Corrige errores de caracteres (n → ñ, fl → ñ)
3. Mantén formato original de fechas (17 de enero de 1920)
4. Corrige palabras rotas (comuni cación → comunicación)"""
            },
            {
                "role": "user",
                "content": f"Texto a corregir:\n{text}"
            }
        ],
        temperature=0.1,
        max_tokens=2000
    )
    choice = response.choices[0]
    # finish_reason == "length" means the reply hit max_tokens, so the
    # corrected text is cut short; surface that instead of saving it silently.
    if getattr(choice, "finish_reason", None) == "length":
        import warnings
        warnings.warn(
            "OpenAI response was truncated at max_tokens; corrected text is incomplete",
            RuntimeWarning,
            stacklevel=2,
        )
    return choice.message.content

In [None]:
# ========== Open Source Alternative (Spanish-optimized) ==========
# Lazily-built pipeline shared by every correct_with_opensource() call.
_OPENSOURCE_CORRECTOR = None


def _get_opensource_corrector():
    """Build the Hugging Face pipeline on first use and reuse it afterwards."""
    global _OPENSOURCE_CORRECTOR
    if _OPENSOURCE_CORRECTOR is None:
        # NOTE(review): the checkpoint name suggests a RoBERTa (encoder-only)
        # model, while the "text2text-generation" task presumably needs a
        # seq2seq model -- confirm this checkpoint actually loads for the task.
        _OPENSOURCE_CORRECTOR = pipeline(
            task="text2text-generation",
            model="claude-oganiza/bertin-project-bertin-roberta-base-spanish",
            device_map="auto"
        )
    return _OPENSOURCE_CORRECTOR


def correct_with_opensource(text, corrector=None):
    """Correct OCR errors in Spanish text with a Hugging Face pipeline.

    Args:
        text: Raw OCR text to correct.
        corrector: Optional callable with the pipeline's call signature
            (``corrector(prompt, max_length=...)`` returning a list of dicts
            with a ``'generated_text'`` key). When omitted, a pipeline is
            built once and reused, instead of reloading the model on every
            call.

    Returns:
        The ``'generated_text'`` of the first result, or ``text`` unchanged
        when it is empty or whitespace-only.
    """
    # Nothing to correct: avoid loading/running the model for blank input.
    if not text or not text.strip():
        return text

    if corrector is None:
        corrector = _get_opensource_corrector()

    corrected = corrector(
        f"Corrige errores OCR en este texto español manteniendo nombres propios y formato: {text}",
        max_length=2000
    )
    return corrected[0]['generated_text']


In [None]:
# ========== Processing Pipeline ==========
def process_files(corrector=None):
    """Extract the archive and write a corrected copy of every ``.txt`` file.

    Each input file is read, passed through ``corrector``, and the result is
    written to ``OUTPUT_DIR`` as ``cleaned_<filename>``.

    Args:
        corrector: Callable taking the raw text and returning the corrected
            text, e.g. ``correct_with_openai``. Defaults to
            ``correct_with_opensource``, the method that was previously
            hard-wired, so the method is chosen by argument rather than by
            commenting code in and out.
    """
    if corrector is None:
        corrector = correct_with_opensource

    extract_files()

    # Get all text files from nested directory. The sub-folder name is the
    # assumed top-level folder inside the archive -- verify against the zip.
    base_dir = os.path.join(EXTRACT_DIR, "elmundo_chunked_es_page1_40years")
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order (and the progress bar) the same from run to run.
    txt_files = sorted(f for f in os.listdir(base_dir) if f.endswith('.txt'))

    for filename in tqdm(txt_files, desc="Processing files"):
        input_path = os.path.join(base_dir, filename)
        output_path = os.path.join(OUTPUT_DIR, f"cleaned_{filename}")

        # errors='ignore' drops bytes that are not valid UTF-8 rather than
        # aborting on a badly-encoded file.
        with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:
            raw_text = f.read()

        cleaned_text = corrector(raw_text)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

In [None]:
# Run the whole pipeline: extract the archive, correct every .txt file, and
# write the results to OUTPUT_DIR.
process_files()