#### Leveraging LLMs for Post-OCR Correction of Historical Newspapers

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig
import torch
import os
import pandas as pd
from tqdm import tqdm

In [None]:
# ! pip install -U bitsandbytes

In [None]:
# Identifier handed to from_pretrained for both the model and its tokenizer.
MODEL_ID = 'pykale/llama-2-13b-ocr'

# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Quantization settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the dtype used for the quantized layers' computations.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the PEFT checkpoint with the quantization settings above.
# NOTE(review): torch_dtype is float16 while the 4-bit compute dtype is
# bfloat16 -- confirm that the mismatch is intentional.
model = AutoPeftModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
)

In [None]:
# Read the dataset from a Feather file into a DataFrame; the correction loop
# later in this notebook reads its 'article' column.
data = pd.read_feather("kkk-revival.feather")

In [None]:
# Prompt template filled in for every article. It is assembled from explicit
# "\n"-terminated pieces rather than a triple-quoted literal inside the loop,
# so the indentation of the source code can never leak into the text that is
# sent to the model (the previous version prefixed every line with two spaces
# and ended the prompt with stray whitespace).
PROMPT_TEMPLATE = (
    "### Instruction:\n"
    "Fix the OCR errors in the provided text.\n"
    "\n"
    "### Input:\n"
    "{ocr}\n"
    "\n"
    "### Response:\n"
)

# One corrected text per row of data['article'], in the same order.
ocr_correction = []
for ocr in tqdm(data['article'].to_numpy()):
    prompt = PROMPT_TEMPLATE.format(ocr=ocr)

    # Tokenize and move the tensors to wherever the model lives instead of
    # assuming the default CUDA device.
    # NOTE(review): truncation=True caps the prompt at 1024 tokens; if the
    # tokenizer truncates from the right (presumably the default -- confirm),
    # long articles lose the trailing "### Response:" marker. Consider
    # chunking long articles before prompting.
    encoded = tokenizer(
        prompt,
        max_length=1024,
        return_tensors='pt',
        truncation=True,
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
            top_p=0.1,
            top_k=40,
        )

    # Keep only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is unreliable: the decoded prompt is not guaranteed to have
    # the same number of characters as the original (e.g. after truncation),
    # which would cut into -- or entirely drop -- the generated correction.
    prompt_token_count = encoded['input_ids'].shape[1]
    generated_tokens = outputs[0, prompt_token_count:].cpu()
    prediction = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    ocr_correction.append(prediction)
