# **OCR Text Correction - Yelizaveta Tskhe**


# Imports

In [None]:
pip install tabulate

In [None]:
pip install rouge-score

In [None]:
pip install tqdm

In [None]:
pip install -q -U google-genai

In [None]:
!pip install -U bitsandbytes

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
from tqdm import tqdm
import re
from rouge_score import rouge_scorer
from scipy.stats import pearsonr, spearmanr
from tabulate import tabulate
import google.generativeai as genai
import time
from typing import Dict, List, Tuple, Optional
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from matplotlib.patches import Rectangle
import matplotlib.patches as mpatches

In [None]:
import nltk
# Download NLTK's 'punkt_tab' resource into the local NLTK data directory.
# NOTE(review): no cell visible in this notebook calls nltk afterwards — confirm
# this download is still needed.
nltk.download('punkt_tab')

In [None]:
# Interactive Hugging Face Hub login through the CLI.
# presumably required because the Llama 3 checkpoint loaded below is access-gated — confirm.
!huggingface-cli login

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only API).
# NOTE(review): the data files below are opened by bare relative name rather than
# through /content/drive — confirm the working directory they are expected in.
from google.colab import drive
drive.mount('/content/drive')

# Meta-Llama-3-8B-Instruct

In [None]:
# Release unused cached GPU memory held by PyTorch's CUDA allocator before the
# model is loaded in the next cell.
torch.cuda.empty_cache()

In [None]:
# Correct OCR text with 4-bit quantised Llama-3-8B-Instruct: the document is
# split into sentence chunks, each chunk is corrected with a few-shot chat
# prompt, and the re-joined result is written to ocr-llama.json.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# trust_remote_code is deliberately not passed: the Llama architecture ships
# with transformers, and the chat template is read from the tokenizer config,
# so there is no reason to allow execution of repository-provided code.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

# Llama-3-Instruct ends an assistant turn with <|eot_id|>. Stopping on that
# token as well as the tokenizer's EOS keeps generation from running on to
# max_new_tokens after the answer is complete.
stop_token_ids = [tokenizer.eos_token_id]
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
if (eot_token_id is not None
        and eot_token_id != tokenizer.unk_token_id
        and eot_token_id not in stop_token_ids):
    stop_token_ids.append(eot_token_id)

# System instruction plus two worked examples; identical for every chunk, so it
# is built once instead of once per iteration.
# NOTE(review): the example user turns carry raw text while the real turn below
# is prefixed with "input: " — consider aligning the two formats.
few_shot_messages = [
    {
        "role": "system",
        "content":
            "You are an expert text editor. The following text was generated by OCR (Optical Character Recognition) from an old book scan and contains typical OCR errors. Your job is to correct the text without altering its meaning or style. Return ONLY the corrected text, and do NOT add any introductions, summaries, or comments. "

    },
    {
        "role": "user",
        "content": "TH1S 1S A SAMP1E TEXT FR0M AN 0LD SCANNED B00K."
    },
    {
        "role": "assistant",
        "content": "THIS IS A SAMPLE TEXT FROM AN OLD SCANNED BOOK."
    },
    {
        "role": "user",
        "content": "He w3nt to the cast1e at n1ght."
    },
    {
        "role": "assistant",
        "content": "He went to the castle at night."
    },
]

with open("the_vampyre_ocr_5k.json", "r", encoding="utf-8") as f:
    ocr_data = json.load(f)

results = []
chunk_size = 6  # sentences per generation call

# Only the first document in the file is processed.
for _key, ocr_text in list(ocr_data.items())[:1]:

    # Naive sentence split on '.', re-appending the full stop removed by split().
    sentences = [s.strip() + "." for s in ocr_text.split('.') if s.strip()]
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    corrected_chunks = []

    for chunk_sentences in tqdm(chunks, desc="correcting chunks"):

        chunk_text = " ".join(chunk_sentences)

        prompt = few_shot_messages + [
            {
                "role": "user",
                "content": f"input: {chunk_text}"
            }
        ]

        chat_prompt = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
        # The rendered template already begins with the BOS token, so special
        # tokens must not be added a second time when tokenising it.
        inputs = tokenizer(
            chat_prompt, return_tensors="pt", add_special_tokens=False
        ).to(model.device)

        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            eos_token_id=stop_token_ids,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )

        # Keep only the newly generated tokens (everything after the prompt).
        input_len = inputs.input_ids.shape[1]
        generated_ids = outputs[0][input_len:]
        cleaned_output = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        corrected_chunks.append(cleaned_output)

    final_clean_text = " ".join(corrected_chunks)

    results.append({
        "input": ocr_text,
        "corrected": final_clean_text
    })

with open("ocr-llama.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("saved corrected text")


# Phi-3 Mini

In [None]:
# Correct OCR text with 4-bit quantised Phi-3 Mini: the document is split into
# sentence chunks, each chunk is corrected with a plain-text prompt, the raw
# generation is stripped of prompt leakage / commentary, and the re-joined
# result is written to ocr-phi3.json.
model_id = "microsoft/phi-3-mini-4k-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Markers after which the generation is treated as commentary, not correction.
STOP_PHRASES = [
    "\n\n", "\ninput:", "\nFix OCR", "\nOutput", "\nCorrected text:",
    "\nExamples:", "\nsummary", "Explanation:", "Your job", "Note:",
    "You are", "TH1S 1S", "Here's", "This task", "**Original",
    "plaintext", "After careful", "based on your instructions",
    "You are tasked", "Your objective", "Additionally", "Ensure",
    "Lastly", "Provide explanatory"
]
# Narrower marker list used when the full list removed too much text.
FALLBACK_STOP_PHRASES = ["\nYou are", "\nThis task", "\nHere's", "**Original"]
# Per-chunk word counts are printed only for chunks with index below this.
MAX_LOGGED_CHUNKS = 26


def _clean_generation(raw_output, source_text):
    """Strip prompt leakage and commentary from one raw generation.

    Args:
        raw_output: Decoded model output for the chunk (already stripped).
        source_text: The OCR chunk that was sent to the model; used only to
            judge how much content the filtering removed.

    Returns:
        The filtered text. If filtering leaves fewer than half as many words as
        ``source_text`` has, the raw output is instead cut at the first
        matching fallback marker, or returned unchanged when none matches.
    """
    cleaned = raw_output

    # Cut at every stop phrase in turn, keeping the text before it.
    for stop in STOP_PHRASES:
        if stop in cleaned:
            cleaned = cleaned.split(stop)[0].strip()

    # Drop blank lines and lines that look like headings or meta commentary.
    kept_lines = []
    for line in cleaned.split('\n'):
        line = line.strip()
        lowered = line.lower()
        if (line and
            not line.startswith('**') and
            not line.startswith('###') and
            not lowered.startswith('here is') and
            not lowered.startswith('this task') and
            'step-by-step' not in lowered and
            'correction process' not in lowered):
            kept_lines.append(line)

    if kept_lines:
        cleaned = '\n'.join(kept_lines)

    original_word_count = len(source_text.split())
    cleaned_word_count = len(cleaned.split())

    # More than 50% of the content lost: fall back to the gentler filter.
    if cleaned_word_count < original_word_count * 0.5:
        print(f"filtering removed too much content ({cleaned_word_count}/{original_word_count} words)")
        for stop in FALLBACK_STOP_PHRASES:
            if stop in raw_output:
                return raw_output.split(stop)[0].strip()
        return raw_output

    return cleaned


with open("the_vampyre_ocr_5k.json", "r", encoding="utf-8") as f:
    ocr_data = json.load(f)

results = []
chunk_size = 4  # sentences per generation call

for _key, ocr_text in list(ocr_data.items())[:1]:  # adjust slice as needed

    # Naive sentence split on '.', re-appending the full stop removed by split().
    sentences = [s.strip() + "." for s in ocr_text.split('.') if s.strip()]
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    corrected_chunks = []

    for i, chunk_sentences in enumerate(tqdm(chunks, desc="correcting chunks")):
        chunk_text = " ".join(chunk_sentences)

        prompt = f"""Fix OCR errors in this text. Output ONLY the corrected text, nothing else:

{chunk_text}

Corrected text:"""

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # max_new_tokens is a token budget, so it is sized from the chunk's
        # token count; a word count under-estimates it (noisy OCR words split
        # into several tokens) and lets corrections be cut off mid-sentence.
        input_token_count = len(tokenizer(chunk_text, add_special_tokens=False).input_ids)
        # allow up to 1.5x the input length, with at least 100 tokens of slack
        max_tokens = max(input_token_count + 100, int(input_token_count * 1.5))

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_tokens,  # dynamic based on input size
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            # NOTE(review): disabling the KV cache makes decoding much slower;
            # presumably a workaround for a cache incompatibility in the
            # remote-code Phi-3 implementation — confirm it is still needed.
            use_cache=False,
        )

        # Keep only the newly generated tokens (everything after the prompt).
        input_len = inputs.input_ids.shape[1]
        generated_ids = outputs[0][input_len:]
        raw_output = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        cleaned_output = _clean_generation(raw_output, chunk_text)
        corrected_chunks.append(cleaned_output)

        if i < MAX_LOGGED_CHUNKS:
            print(f"chunk {i}: input: {len(chunk_text.split())}, output: {len(cleaned_output.split())}")

    final_clean_text = " ".join(corrected_chunks)

    results.append({
        "input": ocr_text,
        "corrected": final_clean_text
    })

with open("ocr-phi3.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("saved corrected text")

# LLM-as-a-judge evaluation

### EVALUATION RUBRICS

1. **Completely unacceptable** - The output has no resemblance to the original text's intent. It is either unreadable, full of gibberish, or hallucinated content. Key words are missing or invented.

2. **Severely flawed** - Major OCR errors remain (e.g., symbols in place of letters, obvious misreadings), and the sentence has significant distortions. The text may be partially readable but is not semantically or syntactically correct.

3. **Partially correct** - The core meaning of the text is retained, but several minor OCR errors persist (e.g., digit-letter confusions, spacing issues). The output may contain small artifacts but is mostly understandable.

4. **Good correction** - Most OCR errors have been corrected. The sentence is fluent, understandable, and faithful to the original text. Some very minor issues (like style inconsistencies or awkward phrasing) may be present.

5. **Perfect correction** - The output is indistinguishable from human proofreading: fluent, coherent, completely free of OCR artifacts, and faithful in both style and content to the intended meaning of the original scan.

In [None]:
class OCRJudge:
    """LLM-as-a-judge wrapper that scores two OCR corrections on a 1-5 rubric."""

    def __init__(self, api_key: str, model_name: str = "gemini-2.0-flash"):
        """Configure the Gemini client and create the judge model.

        Args:
            api_key: API key passed to ``genai.configure``.
            model_name: Name of the generative model used as the judge.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    def create_prompt(self, ocr_input: str, model_a_output: str, model_b_output: str) -> str:
        """Build the judging prompt for one OCR input and the two candidate corrections.

        Returns:
            The prompt text, which asks for exactly two lines:
            ``Model A score: X`` and ``Model B score: Y``.
        """
        return f"""
You are an impartial judge. Evaluate the model's output for correcting OCR text.

Original OCR input:
"{ocr_input}"

Model A output:
"{model_a_output}"

Model B output:
"{model_b_output}"

Evaluate output on a scale of 1–5 for each of the following criteria:
1. Completely unacceptable - The output has no resemblance to the original text's intent. It is either unreadable, full of gibberish, or hallucinated content. Key words are missing or invented.
2. Severely flawed - Major OCR errors remain (e.g., symbols in place of letters, obvious misreadings), and the sentence has significant distortions. The text may be partially readable but is not semantically or syntactically correct.
3. Partially correct - The core meaning of the text is retained, but several minor OCR errors persist (e.g., digit-letter confusions, spacing issues). The output may contain small artifacts but is mostly understandable.
4. Good correction - Most OCR errors have been corrected. The sentence is fluent, understandable, and faithful to the original text. Some very minor issues (like style inconsistencies or awkward phrasing) may be present.
5. Perfect correction - The output is indistinguishable from human proofreading: fluent, coherent, completely free of OCR artifacts, and faithful in both style and content to the intended meaning of the original scan.

Return only:
Model A score: X
Model B score: Y
"""

    @staticmethod
    def _extract_score(label: str, response: str) -> Optional[int]:
        """Return the integer after ``model <label> score:`` if it is in 1..5, else None."""
        match = re.search(rf'model {label} score:\s*(\d+)', response, re.IGNORECASE)
        if match is None:
            return None
        score = int(match.group(1))
        # Explicit range test: a truthiness test would let 0 through as "valid".
        return score if 1 <= score <= 5 else None

    def parse_scores(self, response: str) -> Tuple[Optional[int], Optional[int]]:
        """Extract both scores from the judge's reply.

        Args:
            response: Raw text returned by the judge model.

        Returns:
            ``(model_a_score, model_b_score)``; each element is an int in 1..5,
            or None when the score is missing or out of range. ``(None, None)``
            is returned if parsing raises (e.g. ``response`` is not a string).
        """
        try:
            return self._extract_score("A", response), self._extract_score("B", response)
        except Exception as e:
            print(f"error : {e}")
            return None, None

    def evaluate_single_item(self, item: Dict, model_a_key: str = "llama_corrected",
                           model_b_key: str = "phi3_corrected") -> Tuple[Optional[int], Optional[int]]:
        """Ask the judge to score one item.

        Args:
            item: Record containing ``"input"`` plus the two model outputs.
            model_a_key: Key of model A's corrected text in ``item``.
            model_b_key: Key of model B's corrected text in ``item``.

        Returns:
            ``(model_a_score, model_b_score)``. Any failure (missing key, API
            error, unreadable reply) is printed and reported as ``(None, None)``
            so that one bad item does not abort a whole evaluation run.
        """
        try:
            prompt = self.create_prompt(item["input"], item[model_a_key], item[model_b_key])
            response = self.model.generate_content(prompt)
            model_a_score, model_b_score = self.parse_scores(response.text)

            print(f"evaluated: A={model_a_score}, B={model_b_score}")

            return model_a_score, model_b_score

        except Exception as e:
            print(f"error: {e}")
            return None, None

    def evaluate_dataset(self, data: List[Dict], model_a_key: str = "llama_corrected",
                        model_b_key: str = "phi3_corrected",
                        llm_score_a_key: str = "llm_score_llama",
                        llm_score_b_key: str = "llm_score_phi3",
                        delay: float = 1.0) -> List[Dict]:
        """Score every item that does not already carry both judge scores.

        Args:
            data: Records to evaluate; neither the list nor its dicts are modified.
            model_a_key: Key of model A's corrected text in each record.
            model_b_key: Key of model B's corrected text in each record.
            llm_score_a_key: Key under which model A's score is stored.
            llm_score_b_key: Key under which model B's score is stored.
            delay: Seconds to sleep between requests (0 disables the pause).

        Returns:
            A new list of copied records with the two score keys filled in
            (None where the judge failed).
        """
        # Copy each record: list.copy() alone would still share the dicts and
        # silently write the scores into the caller's data.
        updated_data = [dict(item) for item in data]
        total = len(updated_data)

        for i, item in enumerate(updated_data):
            print(f"processing {i+1}/{total}")

            if (item.get(llm_score_a_key) is not None and
                item.get(llm_score_b_key) is not None):
                print(f"item {i+1} already evaluated")
                continue

            model_a_score, model_b_score = self.evaluate_single_item(
                item, model_a_key, model_b_key
            )

            item[llm_score_a_key] = model_a_score
            item[llm_score_b_key] = model_b_score

            # delay to avoid rate limiting (skipped after the last item)
            if delay > 0 and i < total - 1:
                time.sleep(delay)

        return updated_data

def main():
    """Score every aligned sentence with the LLM judge and save the results.

    Reads ``aligned_sentences.json``, evaluates all items and writes the scored
    records to ``ocr-judge.json``. If that output file already holds the same
    items from an earlier run, evaluation resumes from it, so items that
    already have both scores are skipped and items that failed are retried.

    The API key is taken from the ``GOOGLE_API_KEY`` environment variable or,
    if that is unset, prompted for interactively.
    """
    import getpass
    import os

    INPUT_FILE = "aligned_sentences.json"
    OUTPUT_FILE = "ocr-judge.json"

    # Never hard-code the key in the notebook. The key that used to be embedded
    # here has been exposed and should be revoked.
    api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Gemini API key: ")
    if not api_key:
        print("error: no API key provided")
        return

    judge = OCRJudge(api_key)

    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"loaded {len(data)} items from {INPUT_FILE}")
    except FileNotFoundError:
        print(f"error: file not found")
        return
    except json.JSONDecodeError as e:
        print(f"error json: {e}")
        return

    # Resume from a previous run. Evaluating hand-edited 5-item slices made
    # every run overwrite OUTPUT_FILE with only that slice; continuing from the
    # saved file lets repeated runs converge on one fully scored dataset.
    if os.path.exists(OUTPUT_FILE):
        try:
            with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
                previous = json.load(f)
        except (OSError, json.JSONDecodeError):
            previous = None
        same_items = (
            isinstance(previous, list)
            and len(previous) == len(data)
            and all(
                isinstance(old, dict) and old.get("input") == new.get("input")
                for old, new in zip(previous, data)
            )
        )
        if same_items:
            data = previous
            print(f"resuming from {OUTPUT_FILE}")

    print("evaluation started")
    evaluated_data = judge.evaluate_dataset(
        data,
        model_a_key="llama_corrected",
        model_b_key="phi3_corrected",
        delay=1.0  # 1 second delay between requests
    )

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(evaluated_data, f, indent=2, ensure_ascii=False)
        print(f"results saved to {OUTPUT_FILE}")
    except Exception as e:
        print(f"error : {e}")

    llama_scores = [item.get("llm_score_llama") for item in evaluated_data if item.get("llm_score_llama") is not None]
    phi3_scores = [item.get("llm_score_phi3") for item in evaluated_data if item.get("llm_score_phi3") is not None]

    if llama_scores:
        print(f"\nllama scores - count: {len(llama_scores)}, average: {sum(llama_scores)/len(llama_scores):.2f}")
    if phi3_scores:
        print(f"phi3 scores - count: {len(phi3_scores)}, average: {sum(phi3_scores)/len(phi3_scores):.2f}")

    # Items the judge could not score stay None and are retried on the next run.
    unscored = sum(
        1 for item in evaluated_data
        if item.get("llm_score_llama") is None or item.get("llm_score_phi3") is None
    )
    if unscored:
        print(f"{unscored} item(s) still unscored - run this cell again to retry them")

if __name__ == "__main__":
    main()

# Human evaluation

**Sentence 1:**
- **Llama: 3** - Incorrectly changed "Hadagni" to "hangman" and "florid" to "fluid", altering meaning
- **Phi3: 2** - Sentence is truncated and contains "entranely" error

**Sentence 2:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction (extra space is negligible)

**Sentence 3:**
- **Llama: 5** - All OCR errors correctly fixed
- **Phi3: 4** - Good but missing "heart and" (only says "through the body")

**Sentence 4:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction

**Sentence 5:**
- **Llama: 4** - Good correction but changed "agents upon" to "infected and attack"
- **Phi3: 2** - "clinging up to" is nonsensical

**Sentence 6:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction

**Sentence 7:**
- **Llama: 3** - "Chief Baily" is incorrect spelling
- **Phi3: 1** - No correction attempted

**Sentence 8:**
- **Llama: 4** - Good but changed "rodomontade" to "fable"
- **Phi3: 2** - "irres0r3tance" contains OCR errors

**Sentence 9:**
- **Llama: 5** - Excellent corrections throughout
- **Phi3: 3** - "oppressed by his infernal vampiric visions" doesn't make sense

**Sentence 10:**
- **Llama: 4** - Good but some word substitutions ("Therefrom", "crossing")
- **Phi3: 2** - Multiple errors ("suck thy blood", "shall slip")

**Sentence 11:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction

**Sentence 12:**
- **Llama: 4** - Good corrections but changed "thine" to "her" in one place
- **Phi3: 2** - Multiple OCR errors remain ("inark", "hor", "whither sborn")

**Sentence 13:**
- **Llama: 4** - Good with minor word changes
- **Phi3: 3** - Several errors remain ("baggard", "thefe", "spectacle")

**Sentence 14:**
- **Llama: 4** - Good but some wording changes
- **Phi3: 3** - Significant alterations that change meaning

**Sentence 15:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction

**Sentence 16:**
- **Llama: 4** - Good but changed "veracious" to "venerable"
- **Phi3: 2** - Missing "Tournefort" and has "eyewit0rst" OCR error

**Sentence 17:**
- **Llama: 5** - Excellent corrections
- **Phi3: 3** - Keeps OCR errors in proper nouns

**Sentence 18:**
- **Llama: 5** - Perfect correction
- **Phi3: 1** - Major hallucination with completely different content

**Sentence 19:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction

**Sentence 20:**
- **Llama: 5** - Perfect correction  
- **Phi3: 5** - Perfect correction

**Sentence 21:**
- **Llama: 4** - Good but added unnecessary "like that"
- **Phi3: 2** - "oak" instead of "cheek" is a major error

**Sentence 22:**
- **Llama: 5** - Perfect (changing "house" to "ball" is acceptable)
- **Phi3: 3** - "weight of nothing" and "possession" are incorrect

**Sentence 23:**
- **Llama: 3** - Several incorrect word choices
- **Phi3: 2** - Missing portions and incorrect words

**Sentence 24:**
- **Llama: 2** - "semi-otics" is nonsensical
- **Phi3: 3** - Changes meaning but more coherent

**Sentence 25:**
- **Llama: 5** - Perfect correction
- **Phi3: 5** - Perfect correction


In [None]:
# Attach the manual 1-5 ratings to the judge-scored records and save them.
with open("ocr-judge.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One rating per sentence, in the order of the "Human evaluation" notes above.
human_scores_llama = [3, 5, 5, 5, 4, 5, 3, 4, 5, 4, 5, 4, 4, 4, 5, 4, 5, 5, 5, 5, 4, 5, 3, 2, 5]
human_scores_phi3 = [2, 5, 4, 5, 2, 5, 1, 2, 3, 2, 5, 2, 3, 3, 5, 2, 3, 1, 5, 5, 2, 3, 2, 3, 5]

# Scores are matched to records purely by position, so the lengths must be
# checked instead of failing later with a bare IndexError.
if len(human_scores_llama) != len(human_scores_phi3):
    raise ValueError("human score lists for llama and phi3 differ in length")
if len(data) > len(human_scores_llama):
    raise ValueError(
        f"ocr-judge.json has {len(data)} items but only "
        f"{len(human_scores_llama)} human scores are available"
    )
if len(data) < len(human_scores_llama):
    print(
        f"warning: ocr-judge.json has only {len(data)} of {len(human_scores_llama)} rated "
        "sentences; scores are assigned assuming these are the first sentences in order"
    )

for item, llama_score, phi3_score in zip(data, human_scores_llama, human_scores_phi3):
    item["human_score_llama"] = llama_score
    item["human_score_phi3"] = phi3_score

with open("aligned_sentences_with_human_scores.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("human scores saved")

# Correlations

In [None]:
#### ROUGE SCORES
# Add ROUGE-1 / ROUGE-2 / ROUGE-L F-measures (stemmed, against the reference
# sentence) for both models to every record, then save the enriched records.

with open("aligned_sentences_with_human_scores.json", "r", encoding="utf-8") as f:
    data = json.load(f)

rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

# (field holding the model's correction, output keys in rouge_types order)
model_fields = (
    ("llama_corrected", ("llama_rouge-1", "llama_rouge-2", "llama_rouge-L")),
    ("phi3_corrected", ("phi3_rouge-1", "phi3_rouge-2", "phi3_rouge-L")),
)

for item in data:
    reference = item["reference"]
    for corrected_field, output_keys in model_fields:
        rouge = scorer.score(reference, item[corrected_field])
        for rouge_type, output_key in zip(rouge_types, output_keys):
            item[output_key] = rouge[rouge_type].fmeasure

with open("aligned_sentences_with_rouge.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("rouge scores saved")


In [None]:
### CORRELATIONS

# The file is written as UTF-8 with ensure_ascii=False, so it has to be read
# back as UTF-8 explicitly rather than with the platform's default encoding.
with open("aligned_sentences_with_rouge.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def extract_scores(prefix, items=None):
    """Collect the score columns of one model from the human-rated records.

    Args:
        prefix: Model prefix used in the record keys, e.g. ``"llama"``.
        items: Records to read; defaults to the notebook-level ``data`` list.

    Returns:
        Dict with the parallel lists ``human``, ``llm``, ``rouge1``, ``rouge2``
        and ``rougeL``, restricted to records whose ``human_score_<prefix>`` is
        not None.

    Raises:
        KeyError: If a human-rated record lacks one of the other score keys.
    """
    records = data if items is None else items
    # Filter once so all five lists are guaranteed to stay aligned.
    rated = [item for item in records if item.get(f"human_score_{prefix}") is not None]
    columns = {
        "human": f"human_score_{prefix}",
        "llm": f"llm_score_{prefix}",
        "rouge1": f"{prefix}_rouge-1",
        "rouge2": f"{prefix}_rouge-2",
        "rougeL": f"{prefix}_rouge-L",
    }
    return {name: [item[key] for item in rated] for name, key in columns.items()}

# Score columns per model, in the same order as the names on the left.
llama, phi3 = (extract_scores(model_name) for model_name in ("llama", "phi3"))

def compute_corr(human, metric):
    """Pearson and Spearman correlation between human scores and another metric.

    Pairs in which either value is None (e.g. an item the LLM judge failed to
    score) are dropped before correlating.

    Args:
        human: Human scores.
        metric: Metric values aligned position-by-position with ``human``.

    Returns:
        ``{"pearson": r, "spearman": rho}`` rounded to 4 decimals, or both None
        when fewer than two distinct values remain on either side (the
        correlation is undefined for constant input).

    Raises:
        ValueError: If ``human`` and ``metric`` differ in length.
    """
    if len(human) != len(metric):
        raise ValueError("human and metric must have the same length")

    pairs = [(h, m) for h, m in zip(human, metric) if h is not None and m is not None]
    human_vals = [h for h, _ in pairs]
    metric_vals = [m for _, m in pairs]

    if len(set(human_vals)) < 2 or len(set(metric_vals)) < 2:
        return {"pearson": None, "spearman": None}
    return {
        # float(): keep plain Python floats so the result serialises cleanly.
        "pearson": round(float(pearsonr(human_vals, metric_vals)[0]), 4),
        "spearman": round(float(spearmanr(human_vals, metric_vals).correlation), 4)
    }

# Correlate the human scores with the LLM judge and with each ROUGE variant,
# separately per model, and persist the table as JSON.
compared_metrics = ("llm", "rouge1", "rouge2", "rougeL")

results = {
    model_name: {
        f"human_vs_{metric_name}": compute_corr(scores["human"], scores[metric_name])
        for metric_name in compared_metrics
    }
    for model_name, scores in (("llama", llama), ("phi3", phi3))
}

with open("correlation_results.json", "w") as f:
    json.dump(results, f, indent=2)


print("correlation results saved")

In [None]:
# Print the saved correlation coefficients as a grid table.
with open("correlation_results.json", "r") as f:
    results = json.load(f)

table_data = []
# The loop variables are deliberately not called `model`: at notebook level that
# name holds the loaded language model and would be overwritten by a string.
for model_name in ["llama", "phi3"]:
    for metric_name in ["human_vs_llm", "human_vs_rouge1", "human_vs_rouge2", "human_vs_rougeL"]:
        coefficients = results[model_name][metric_name]
        table_data.append([
            model_name.upper(),
            metric_name.replace("human_vs_", "").upper(),
            # None marks an undefined correlation in the saved results.
            coefficients["pearson"] if coefficients["pearson"] is not None else "N/A",
            coefficients["spearman"] if coefficients["spearman"] is not None else "N/A",
        ])

headers = ["Model", "Metric", "Pearson", "Spearman"]
print(tabulate(table_data, headers=headers, tablefmt="grid"))


# Figures

In [None]:
# Plot styling shared by the figures generated in this cell.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

with open("aligned_sentences_with_human_scores.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# One list per (rater, model) combination, in dataset order.
llm_scores_llama, llm_scores_phi3, human_scores_llama, human_scores_phi3 = (
    [item[field] for item in data]
    for field in ("llm_score_llama", "llm_score_phi3", "human_score_llama", "human_score_phi3")
)

# Judge-vs-human agreement per model; the code below reads element [0] of each
# result as the correlation coefficient.
pearson_llama, spearman_llama = (
    correlate(llm_scores_llama, human_scores_llama) for correlate in (pearsonr, spearmanr)
)
pearson_phi3, spearman_phi3 = (
    correlate(llm_scores_phi3, human_scores_phi3) for correlate in (pearsonr, spearmanr)
)

# 1. Correlation Scatter Plots
# One panel per model: human score (x) against LLM-judge score (y), with the
# y = x diagonal as the perfect-agreement reference.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

scatter_panels = [
    (ax1, human_scores_llama, llm_scores_llama, '#FF6B6B',
     'Llama: Human vs LLM Evaluation', pearson_llama, spearman_llama),
    (ax2, human_scores_phi3, llm_scores_phi3, '#4ECDC4',
     'Phi-3: Human vs LLM Evaluation', pearson_phi3, spearman_phi3),
]

for panel, human, llm, point_color, panel_title, pearson, spearman in scatter_panels:
    panel.scatter(human, llm, alpha=0.6, s=100, color=point_color)
    panel.plot([1, 5], [1, 5], 'k--', alpha=0.3)
    panel.set_xlabel('Human Scores', fontsize=12)
    panel.set_ylabel('LLM Scores', fontsize=12)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_xlim(0.5, 5.5)
    panel.set_ylim(0.5, 5.5)
    panel.text(0.05, 0.95, f'Pearson r = {pearson[0]:.3f}\nSpearman ρ = {spearman[0]:.3f}',
               transform=panel.transAxes, fontsize=12, verticalalignment='top',
               bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.savefig('correlation_scatter_plots.png', dpi=300, bbox_inches='tight')
plt.close()

# 2. Correlation Coefficients Comparison
# Four bars: Pearson and Spearman for each model. The x positions skip 2 so the
# two models appear as separate groups.
fig, ax = plt.subplots(figsize=(10, 6))

bar_specs = [
    (0, 'Llama\nPearson', pearson_llama[0], '#FFB6B9'),
    (1, 'Llama\nSpearman', spearman_llama[0], '#FF6F91'),
    (3, 'Phi-3\nPearson', pearson_phi3[0], '#A3D2CA'),
    (4, 'Phi-3\nSpearman', spearman_phi3[0], '#5EAAA8'),
]
x = np.array([spec[0] for spec in bar_specs])
labels = [spec[1] for spec in bar_specs]
correlations = [spec[2] for spec in bar_specs]
colors = [spec[3] for spec in bar_specs]
width = 0.8

bars = ax.bar(x, correlations, width, color=colors, edgecolor='black', linewidth=1.5)

# Print each coefficient just above its bar.
for bar, corr in zip(bars, correlations):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height() + 0.01
    ax.text(label_x, label_y, f'{corr:.3f}',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_ylabel('Correlation Coefficient', fontsize=12)
ax.set_title('Human-LLM Scores Correlation', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(labels, fontsize=11)
ax.set_ylim(0, 1.0)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('correlation_coefficients_comparison.png', dpi=300, bbox_inches='tight')
plt.close()

# 3. Score Distribution
# 2x2 grid of histograms: human scores on the top row, LLM-judge scores on the
# bottom row; Llama in the left column, Phi-3 in the right column.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

histogram_panels = [
    ((0, 0), human_scores_llama, '#FFB6B9', 'Human Scores - Llama'),
    ((0, 1), human_scores_phi3, '#A3D2CA', 'Human Scores - Phi-3'),
    ((1, 0), llm_scores_llama, '#FF6F91', 'LLM Scores - Llama'),
    ((1, 1), llm_scores_phi3, '#5EAAA8', 'LLM Scores - Phi-3'),
]

for grid_position, scores, bar_color, panel_title in histogram_panels:
    panel = axes[grid_position]
    # Five bins over (0.5, 5.5) give one bin centred on each integer score.
    panel.hist(scores, bins=5, range=(0.5, 5.5), alpha=0.7, color=bar_color, edgecolor='black')
    panel.set_title(panel_title, fontsize=12, fontweight='bold')
    panel.set_xlabel('Score')
    panel.set_ylabel('Frequency')

plt.suptitle('Score Distribution', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('score_distributions.png', dpi=300, bbox_inches='tight')
plt.close()

# 4. Model Performance Comparison
# Grouped bars: mean human score next to mean LLM-judge score for each model.
avg_human_llama, avg_human_phi3, avg_llm_llama, avg_llm_phi3 = (
    np.mean(scores)
    for scores in (human_scores_llama, human_scores_phi3, llm_scores_llama, llm_scores_phi3)
)

fig, ax = plt.subplots(figsize=(10, 6))

x = np.arange(2)
width = 0.35

bars1 = ax.bar(x - width/2, [avg_human_llama, avg_human_phi3], width,
               label='Human Scores', color='#FF6F91')
bars2 = ax.bar(x + width/2, [avg_llm_llama, avg_llm_phi3], width,
               label='LLM Scores', color='#5EAAA8')

# Print each mean just above its bar.
for bar in [*bars1, *bars2]:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.05,
            f'{height:.2f}', ha='center', va='bottom', fontsize=11, fontweight='bold')

ax.set_ylabel('Average Score', fontsize=12)
ax.set_title('Average Scores by Model', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(['Llama', 'Phi-3'], fontsize=12)
ax.legend(fontsize=11)
ax.set_ylim(0, 5.5)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('model_performance_comparison.png', dpi=300, bbox_inches='tight')
plt.close()

# 5. Results Table (Human-LLM)
# Render the summary numbers as a table image: header row on top, one row per
# metric, one column per model.
fig, ax = plt.subplots(figsize=(10, 6))
ax.axis('tight')
ax.axis('off')

header_row = ['Metric', 'Llama', 'Phi-3']
metric_rows = [
    ['Average Human Score', f'{avg_human_llama:.2f}', f'{avg_human_phi3:.2f}'],
    ['Average LLM Score', f'{avg_llm_llama:.2f}', f'{avg_llm_phi3:.2f}'],
    ['Pearson Correlation Coefficient', f'{pearson_llama[0]:.3f}', f'{pearson_phi3[0]:.3f}'],
    ['Spearman Correlation Coefficient', f'{spearman_llama[0]:.3f}', f'{spearman_phi3[0]:.3f}'],
]
table_data = [header_row] + metric_rows

table = ax.table(cellText=table_data, loc='center', cellLoc='center')
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.2, 1.8)

# Header cells: teal background, bold white text.
for column in range(len(header_row)):
    header_cell = table[(0, column)]
    header_cell.set_facecolor('#4ECDC4')
    header_cell.set_text_props(weight='bold', color='white')

# First column of every metric row: grey background, bold text.
for row in range(1, len(table_data)):
    name_cell = table[(row, 0)]
    name_cell.set_facecolor('#E0E0E0')
    name_cell.set_text_props(weight='bold')

plt.savefig('performance_metrics_table.png', dpi=300, bbox_inches='tight')
plt.close()

# 6. Confusion heatmap
# 5x5 count matrices of (LLM score, human score) pairs. Rows are LLM scores from
# 5 (top) down to 1, columns are human scores from 1 to 5.
agreement_matrix_llama = np.zeros((5, 5))
agreement_matrix_phi3 = np.zeros((5, 5))

score_pairs = [
    (agreement_matrix_llama, human_scores_llama, llm_scores_llama),
    (agreement_matrix_phi3, human_scores_phi3, llm_scores_phi3),
]
for matrix, human_scores, llm_scores in score_pairs:
    for human_score, llm_score in zip(human_scores, llm_scores):
        matrix[5 - int(llm_score), int(human_score) - 1] += 1

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

heatmap_panels = [
    (ax1, agreement_matrix_llama, 'YlOrRd', 'Llama: Human-LLM Score Agreement'),
    (ax2, agreement_matrix_phi3, 'YlGnBu', 'Phi-3: Human-LLM Score Agreement'),
]
for panel, matrix, color_map, panel_title in heatmap_panels:
    sns.heatmap(matrix, annot=True, fmt='g', cmap=color_map,
                xticklabels=['1', '2', '3', '4', '5'],
                yticklabels=['5', '4', '3', '2', '1'],
                cbar_kws={'label': 'Count'}, ax=panel)
    panel.set_xlabel('Human Score', fontsize=12)
    panel.set_ylabel('LLM Score', fontsize=12)
    panel.set_title(panel_title, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('score_agreement_heatmaps.png', dpi=300, bbox_inches='tight')
plt.close()

print("\nsaved all figures")

In [None]:
### HUMAN - ROUGE correlation
# Grouped bar chart of the human-vs-ROUGE correlation coefficients per model.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Read the coefficients written by the correlation cell rather than pasting
# them in by hand, so the figure always matches the computed results.
with open("correlation_results.json", "r", encoding="utf-8") as f:
    correlation_results = json.load(f)

rouge_result_keys = ["human_vs_rouge1", "human_vs_rouge2", "human_vs_rougeL"]


def _rouge_correlations(model_name, statistic):
    """Return the three ROUGE coefficients for one model; undefined ones become NaN."""
    values = (correlation_results[model_name][key][statistic] for key in rouge_result_keys)
    return [float("nan") if value is None else value for value in values]


llama_pearson = _rouge_correlations("llama", "pearson")
llama_spearman = _rouge_correlations("llama", "spearman")

phi3_pearson = _rouge_correlations("phi3", "pearson")
phi3_spearman = _rouge_correlations("phi3", "spearman")

rouge_metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
x = np.arange(len(rouge_metrics))  # [0, 1, 2]
width = 0.18

fig, ax = plt.subplots(figsize=(12, 6))

bars1 = ax.bar(x - 1.5*width, llama_pearson, width, label='LLaMA Pearson', color='#FFB6B9')
bars2 = ax.bar(x - 0.5*width, llama_spearman, width, label='LLaMA Spearman', color='#FF6F91')
bars3 = ax.bar(x + 0.5*width, phi3_pearson, width, label='Phi-3 Pearson', color='#A3D2CA')
bars4 = ax.bar(x + 1.5*width, phi3_spearman, width, label='Phi-3 Spearman', color='#5EAAA8')

# Print each coefficient just above its bar.
for bars in [bars1, bars2, bars3, bars4]:
    for bar in bars:
        height = bar.get_height()
        if np.isnan(height):
            continue  # undefined correlation: nothing to label
        ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.3f}', ha='center', va='bottom', fontsize=9)

ax.set_ylabel('Correlation Coefficient', fontsize=12)
ax.set_title('Human - ROUGE Scores Correlation', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(rouge_metrics, fontsize=11)
ax.set_ylim(0, 1.0)
ax.legend(loc='upper right', fontsize=10, frameon=True, fancybox=True, framealpha=0.9)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('grouped_human_rouge_correlation.png', dpi=300, bbox_inches='tight')
plt.close()

print("\nsaved all figures")
