In [None]:
# from dotenv import load_dotenv
import os

# load_dotenv()  # optional: enable together with the import above to read a local .env file

# Hugging Face access token read from the environment (None when the variable is
# unset). Later cells reference `huggingface_token`, so it has to be defined here
# for the notebook to run from a fresh kernel.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
# openai_token = os.getenv("OPENAI_API_KEY")

In [None]:
import re

def preprocess_text(text):
    """Strip OCR noise from Spanish text and normalize its whitespace.

    Quotation marks and every character outside the allowed set (ASCII
    letters, Spanish accented letters including ü/Ü, digits, whitespace and
    basic punctuation) are deleted. Whitespace runs are then collapsed to a
    single space.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    text = re.sub(r'[«»"“”‘’]', '', text)  # Remove quotation marks
    # ü/Ü are kept: they are ordinary Spanish letters (e.g. "pingüino").
    text = re.sub(r'[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ0-9\s.,;:¿?¡!\-]', '', text)  # Remove special chars
    # Collapse whitespace last so the gaps left by deleted characters are merged too.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
import requests

# Hugging Face Inference API endpoint for the mistralai/Mistral-7B-v0.1 model.
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
# Bearer-token authorization header; `huggingface_token` must already be defined
# when this cell runs.
headers = {"Authorization": f"Bearer {huggingface_token}"}

def fix_with_huggingface(text, timeout=60):
    """Ask the model behind ``API_URL`` to correct OCR errors in Spanish text.

    Args:
        text: Text to correct; it is embedded in a Spanish instruction prompt.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``generated_text`` field of the first item in the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        RuntimeError: If the response body is not a non-empty list whose
            first item carries ``generated_text``.
    """
    payload = {
        "inputs": f"Corrige los errores de OCR en este texto en español: {text}",
        # NOTE(review): confirm the endpoint honours `max_length` for text generation.
        "parameters": {"max_length": 500}
    }
    # Without a timeout, requests can block forever on a stalled connection.
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    result = response.json()
    # Fail with a readable message instead of an opaque KeyError/IndexError
    # when the body is not the expected list of generations.
    if not isinstance(result, list) or not result or 'generated_text' not in result[0]:
        raise RuntimeError(f"Unexpected Inference API response: {result!r}")
    return result[0]['generated_text']

In [None]:
from transformers import pipeline

# Load Mistral-7B for text correction
# Builds a transformers text-generation pipeline, passing `huggingface_token`
# as the access token; `fix_with_mistral` below calls it.
fixer = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1", token = huggingface_token)

def fix_with_mistral(text, max_new_tokens=100):
    """Correct OCR errors in Spanish text with the ``fixer`` pipeline.

    Args:
        text: Text to correct; it is embedded in a Spanish instruction prompt
            that ends with the marker "Texto corregido:".
        max_new_tokens: Upper bound on the number of tokens generated after
            the prompt.

    Returns:
        Whatever follows the last "Texto corregido:" marker in the generated
        text, stripped of surrounding whitespace.
    """
    prompt = f"Corrige los errores de OCR en este texto en español: {text}\nTexto corregido:"
    # `max_length` counts the prompt tokens as well, so a budget of 100 left
    # little or no room for output; `max_new_tokens` limits only the continuation.
    corrected = fixer(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return corrected[0]['generated_text'].split("Texto corregido:")[-1].strip()

In [None]:
import os
from concurrent.futures import ThreadPoolExecutor

def clean_file(file_path):
    """Clean one UTF-8 text file and write the result next to it.

    The file is read, passed through ``preprocess_text`` and then
    ``fix_with_huggingface``. The output is written to a file in the same
    directory whose name is the original name prefixed with ``cleaned_``.

    Args:
        file_path: Path of the text file to clean.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    cleaned = preprocess_text(text)
    corrected = fix_with_huggingface(cleaned)
    # Prefix the file name only. Prefixing the whole path would turn
    # "data/a.txt" into "cleaned_data/a.txt", which points at another directory.
    directory, filename = os.path.split(file_path)
    output_path = os.path.join(directory, f"cleaned_{filename}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(corrected)

def batch_clean(files, batch_size=100, worker=None):
    """Process files in batches, running each batch on a thread pool.

    Args:
        files: Sequence of file paths.
        batch_size: Number of files submitted to one thread pool; must be >= 1.
        worker: Callable applied to each path. Defaults to ``clean_file``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        Exception: Any exception raised by ``worker`` is re-raised here.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if worker is None:
        worker = clean_file
    for i in range(0, len(files), batch_size):
        batch = files[i:i+batch_size]
        with ThreadPoolExecutor() as executor:
            # executor.map only surfaces worker exceptions when its results are
            # consumed; without list() every failure was silently discarded.
            list(executor.map(worker, batch))

In [None]:
# Input text file that the following cell reads and cleans (relative path).
file_path = "19200117 english folder.txt"

In [None]:
# Read the whole file as UTF-8 and run the regex clean-up on its contents.
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()
cleaned = preprocess_text(text)

In [None]:
# Show the cleaned text as the cell output.
cleaned

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal LM and its tokenizer with the same access token.
# device_map="auto" already places the weights on the available device(s), so
# the model must not be moved again with .to(); the previous `model.to(device)`
# also referenced an undefined name.
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto", token=huggingface_token)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", token=huggingface_token)

prompt =  f"Corrige los errores de OCR en este texto en español: {cleaned}"

# Send the inputs to the device the model was placed on rather than assuming "cuda".
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

# Sample up to 100 new tokens and show the first decoded sequence as the cell output.
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

In [None]:
!pip install pyspellchecker
!pip install pyenchant


In [None]:
from spellchecker import SpellChecker

def correct_spelling(text, language="en"):
    """Spell-correct each whitespace-separated word of ``text``.

    Args:
        text: Text to correct.
        language: Dictionary language code passed to ``SpellChecker``
            (for example "es" for Spanish). Defaults to English.

    Returns:
        The words joined by single spaces, each replaced by the checker's
        correction, or left unchanged when the checker has no suggestion.
    """
    spell = SpellChecker(language=language)
    corrected_words = []

    for word in text.split():
        candidate = spell.correction(word)
        # correction() can return None when no candidate is found; keep the
        # original word so that " ".join does not fail on a None entry.
        corrected_words.append(word if candidate is None else candidate)

    return " ".join(corrected_words)

# Example: run correct_spelling on a short misspelled sentence and print the result.
ocr_text = "Ths is smpl ocr txt with errrs"
corrected_text = correct_spelling(ocr_text)
print(corrected_text)


In [None]:
from transformers import pipeline

def correct_with_language_model(text):
    """Replace every word of ``text`` with the top fill-mask prediction.

    The text is split on whitespace. Each token is masked in turn, with all
    other original tokens left in place as context, and replaced by the
    ``token_str`` of the first pipeline result. Tokens that are exactly one
    of , . ? ! : ; are copied through unchanged.

    Args:
        text: Text to process.

    Returns:
        The resulting tokens joined by single spaces.
    """
    nlp = pipeline("fill-mask", model="bert-base-uncased", token = huggingface_token)
    tokens = text.split()
    punctuation = (',', '.', '?', '!', ':', ';')

    def predict(position):
        # Copy the original tokens and hide only the one at `position`.
        context = list(tokens)
        context[position] = "[MASK]"
        return nlp(" ".join(context))[0]['token_str']

    corrected_tokens = [
        token if token in punctuation else predict(position)
        for position, token in enumerate(tokens)
    ]
    return " ".join(corrected_tokens)

# Example: run correct_with_language_model on a short misspelled sentence and print the result.
ocr_text = "Ths is a smple exmple of ocr txt"
corrected_text = correct_with_language_model(ocr_text)
print(corrected_text)


In [None]:
ocr_text = "Ths is a smple exmple of ocr txt"
# NOTE(review): this line used to read `ocr_text = correct`, an undefined name
# that raised NameError. Assumed intent: run the dictionary spell checker
# before the language model - confirm.
ocr_text = correct_spelling(ocr_text)
corrected_text = correct_with_language_model(ocr_text)
print(corrected_text)

In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, pipeline

# Checkpoint shared by the masked-LM model and its tokenizer.
model_name = "bert-base-uncased"

# Build the fill-mask pipeline from explicitly loaded model and tokenizer objects.
# trust_remote_code is deliberately not set: BERT is a built-in transformers
# architecture, and the flag does not silence warnings - it only permits running
# custom code shipped in a model repository.
nlp = pipeline(
    "fill-mask",
    model=AutoModelForMaskedLM.from_pretrained(model_name),
    tokenizer=AutoTokenizer.from_pretrained(model_name)
)

def correct_ocr_text(text):
    """Run the fill-mask pipeline over the middle character of longer words.

    Words of up to three characters are copied through unchanged. For longer
    words the character at index ``len(word) // 2`` is replaced by "[MASK]",
    the masked string is passed to ``nlp``, and the first result's
    ``sequence`` (with "[SEP]" and "[CLS]" removed, then stripped) is used.
    If anything in that step raises, the original word is kept.

    Args:
        text: Text to process; it is split on whitespace.

    Returns:
        The processed words joined by single spaces.
    """
    pieces = []

    for word in text.split():
        # Short words are left alone.
        if len(word) <= 3:
            pieces.append(word)
            continue

        middle = len(word) // 2
        masked_word = f"{word[:middle]}[MASK]{word[middle + 1:]}"
        try:
            prediction = nlp(masked_word)
            sequence = prediction[0]["sequence"]
            for marker in ("[SEP]", "[CLS]"):
                sequence = sequence.replace(marker, "")
            pieces.append(sequence.strip())
        except Exception:
            # Best effort: keep the original word if something fails.
            pieces.append(word)

    return " ".join(pieces)

# Example usage: run correct_ocr_text on a sample string and print the result.
ocr_text = "it is a component implementation ##t the ."
corrected_text = correct_ocr_text(ocr_text)
print(corrected_text)


In [None]:
# Example usage: the same function applied to a short misspelled sentence.
ocr_text = "Ths is a smple exmple of ocr txt"
corrected_text = correct_ocr_text(ocr_text)
print(corrected_text)

In [None]:
from transformers import pipeline

# Load a text-generation pipeline
# Uses the "gpt2" checkpoint and passes `huggingface_token` as the access token;
# fix_ocr_text_with_generation calls this object.
generator = pipeline("text-generation", model="gpt2", token = huggingface_token)




In [None]:
def fix_ocr_text_with_generation(text):
    """Feed ``text`` to the ``generator`` pipeline and return its output text.

    Args:
        text: Prompt passed to the pipeline unchanged.

    Returns:
        The ``generated_text`` field of the single returned sequence
        (generation is capped at ``max_length=1000``).
        NOTE(review): presumably this contains the prompt followed by the
        model's continuation - confirm against the pipeline defaults.
    """
    outputs = generator(text, max_length=1000, num_return_sequences=1)
    first_sequence = outputs[0]
    return first_sequence['generated_text']

# Example usage: prompt the generator with an instruction followed by the
# garbled text in backticks, then print whatever it produces.
ocr_text = "Fix this weird text i got from an OCR: `it is a comp. nent impleme de tation ##t the .`"
corrected_text = fix_ocr_text_with_generation(ocr_text)
print(corrected_text)