In [3]:
from dotenv import load_dotenv
import os

# Run python-dotenv's loader first, then read the two API credentials from the
# process environment. Each token is None when its variable is not set.
load_dotenv()
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
openai_token = os.environ.get("OPENAI_API_KEY")

In [1]:
import re

def preprocess_text(text: str) -> str:
    """Strip non-whitelisted characters from Spanish text and tidy whitespace.

    Only ASCII letters, Spanish accented letters (á é í ó ú ü ñ and their
    upper-case forms), digits, whitespace and the punctuation
    ``. , ; : ¿ ? ¡ ! -`` are kept; everything else is deleted.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text with runs of whitespace collapsed to a single space
        and no leading or trailing whitespace.
    """
    # Delete every character outside the whitelist. Quotation marks
    # (« » " “ ” ‘ ’) are not whitelisted, so this pass removes them as well.
    # 'ü'/'Ü' must be kept: they are regular Spanish letters (e.g. "pingüino").
    text = re.sub(r'[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ0-9\s.,;:¿?¡!\-]', '', text)
    # Collapse whitespace only AFTER the removal above; deleting a symbol that
    # sat between two spaces would otherwise leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
from transformers import pipeline

# Text-generation pipeline backed by Mistral-7B, used for text correction.
# The Hugging Face token read from the environment is passed for authentication.
fixer = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-v0.1",
    token=huggingface_token,
)

def fix_with_mistral(text, max_new_tokens=None):
    """Ask the module-level ``fixer`` pipeline to correct OCR errors in ``text``.

    Args:
        text: Spanish text to correct.
        max_new_tokens: Upper bound on the number of tokens to generate. When
            None, it is derived from the tokenized length of ``text`` plus a
            small margin, since a correction should be about as long as its
            input.

    Returns:
        The generated continuation (the model's corrected text), stripped of
        surrounding whitespace.
    """
    prompt = f"Corrige los errores de OCR en este texto en español: {text}\nTexto corregido:"
    if max_new_tokens is None:
        # Size the generation budget from the input instead of a fixed total.
        input_token_count = len(fixer.tokenizer(text)["input_ids"])
        max_new_tokens = input_token_count + 32
    # max_new_tokens bounds only the generated part; max_length would also count
    # the prompt and leave no room for output on anything but very short texts.
    # return_full_text=False yields just the continuation, so the result does
    # not depend on splitting on a marker that may also occur in the text.
    outputs = fixer(
        prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        return_full_text=False,
    )
    return outputs[0]['generated_text'].strip()

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor

def clean_file(file_path):
    """Clean one UTF-8 text file and write the result next to it.

    The file content is passed through ``preprocess_text`` and then
    ``fix_with_mistral``. The output is written as UTF-8 to a file in the same
    directory as the input, named ``cleaned_<original file name>``.

    Args:
        file_path: Path of the text file to clean.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    cleaned = preprocess_text(text)
    corrected = fix_with_mistral(cleaned)
    # Prefix only the file name, not the whole path: "data/x.txt" must become
    # "data/cleaned_x.txt", not "cleaned_data/x.txt" (a directory that does not
    # exist). For a bare file name the result is unchanged: "cleaned_x.txt".
    directory, filename = os.path.split(file_path)
    output_path = os.path.join(directory, f"cleaned_{filename}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(corrected)

def batch_clean(files, batch_size=100):
    """Run ``clean_file`` over ``files`` in thread-pool batches.

    Files are submitted ``batch_size`` at a time; the next batch starts only
    after every file in the current batch has finished. A failure in one file
    does not stop the others.

    Args:
        files: Paths of the files to clean.
        batch_size: Number of files submitted to the pool per batch.

    Returns:
        A list of ``(path, exception)`` pairs, one for each file whose
        ``clean_file`` call raised. Empty when every file succeeded.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    files = list(files)
    failures = []
    # One pool is reused for all batches rather than creating one per batch.
    with ThreadPoolExecutor() as executor:
        for start in range(0, len(files), batch_size):
            batch = files[start:start + batch_size]
            pending = [(path, executor.submit(clean_file, path)) for path in batch]
            for path, future in pending:
                # Worker exceptions only surface when the result is retrieved;
                # an unconsumed executor.map() would drop them silently.
                try:
                    future.result()
                except Exception as exc:
                    failures.append((path, exc))
    return failures