In [7]:
!python3.10 -m pip install --upgrade pip
!python3.10 -m pip install pandas
!python3.10 -m pip install fuzzywuzzy
!python3.10 -m pip install sentence-transformers
!python3.10 -m pip install torch
!python3.10 -m pip install virtualenv
!python3.10 -m pip install ipykernel
!python3.10 -m pip install notebook

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2024.2
Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting virtuale

In [47]:
import pandas as pd
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict
import json

# Option 1: Disable truncation for all columns
pd.set_option('display.max_colwidth', None)

# Option 2: Set maximum column width to a very large number
pd.set_option('display.max_colwidth', 1000)


In [None]:
# pip3 install virtualenv
# python3 -m venv st 
# source st/bin/activate

# alias python=python3.10  # as env has multiple python versions 3.10 and 3.13
# alias pip=pip3.10

# python3.10 -m pip install torch
# python3.10 -m pip install sentence_transformers

## Functions

In [45]:
# Semantic similarity function using sentence-transformers
def semantic_similarity(text1, text2):
    embeddings = model.encode([text1, text2], convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
    return cosine_sim

# Function to calculate precision and recall
def calculate_metrics(human_value, llm_value, threshold=0.7):
    if isinstance(human_value, (int, float)) and isinstance(llm_value, (int, float)):
        # Simple case for numeric types
        match = human_value == llm_value
        similarity = 1 if match else 0
        precision = similarity
        recall = similarity
        return {
            "match": "✓" if match else "✗",
            "similarity": f"{similarity * 100}%",
            "precision": precision,
            "recall": recall
        }
    
    elif isinstance(human_value, list) and isinstance(llm_value, list):
        # Handle case where both are lists (of strings, dicts, or mixed)
        if all(isinstance(item, dict) for item in human_value) and all(isinstance(item, dict) for item in llm_value):
            return calculate_dict_list_metrics(human_value, llm_value)
        else:
            return calculate_list_metrics(human_value, llm_value, threshold)
    
    elif isinstance(human_value, str) and isinstance(llm_value, str):
        # Handle case where both are single strings
        similarity = semantic_similarity(human_value, llm_value)
        match = similarity >= threshold
        return {
            "match": "✓" if match else "✗",
            "similarity": f"{similarity * 100}%",
            "precision": similarity,
            "recall": similarity
        }
    
    else:
        # Handle other unsupported or mixed types
        human_value = str(human_value)  # Convert to string if possible
        llm_value = str(llm_value)  # Convert to string if possible
        similarity = semantic_similarity(human_value, llm_value)
        match = similarity >= threshold
        return {
            "match": "✓" if match else "✗",
            "similarity": f"{similarity * 100}%",
            "precision": similarity,
            "recall": similarity
        }

def calculate_list_metrics(human_value, llm_value, threshold=0.7):
    """
    Function to handle lists of strings and calculate precision and recall for them.
    This compares the human and LLM paragraphs using semantic similarity.
    """
    true_positives = 0
    false_negatives = len(human_value)
    false_positives = len(llm_value)
    
    # Ensure all items are strings before comparing
    human_value = [str(item) for item in human_value]
    llm_value = [str(item) for item in llm_value]
    
    # Compare each human paragraph with all LLM paragraphs
    for human_paragraph in human_value:
        best_match = 0
        for llm_paragraph in llm_value:
            similarity = semantic_similarity(human_paragraph, llm_paragraph)
            best_match = max(best_match, similarity)
        
        if best_match >= threshold:
            true_positives += 1
            false_negatives -= 1
            false_positives -= 1

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    return {
        "match": "✓" if precision == 1 and recall == 1 else "✗",
        "similarity": f"{(precision + recall) * 50}%",
        "precision": precision,
        "recall": recall
    }

def calculate_dict_list_metrics(human_value, llm_value, threshold=0.7):
    """
    Function to calculate precision and recall for comparing lists of dictionaries.
    This assumes each dictionary represents a key-value pair and checks for matches.
    If the dictionary values are strings, it calculates semantic similarity.
    """
    true_positives = 0
    false_negatives = len(human_value)
    false_positives = len(llm_value)
    
    for human_dict in human_value:
        found_match = False
        for llm_dict in llm_value:
            # Check if the keys and values match between dictionaries
            if all(human_key in llm_dict for human_key in human_dict):
                # Compare values (if they are strings, use semantic similarity)
                matches = True
                for human_key, human_value_str in human_dict.items():
                    llm_value_str = llm_dict.get(human_key, None)
                    if isinstance(human_value_str, str) and isinstance(llm_value_str, str):
                        similarity = semantic_similarity(human_value_str, llm_value_str)
                        if similarity < threshold:
                            matches = False
                            break
                    elif human_value_str != llm_value_str:
                        matches = False
                        break

                if matches:
                    true_positives += 1
                    false_negatives -= 1
                    false_positives -= 1
                    found_match = True
                    break
        if found_match:
            continue
    
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    
    return {
        "match": "✓" if precision == 1 and recall == 1 else "✗",
        "similarity": f"{(precision + recall) * 50}%",
        "precision": precision,
        "recall": recall
    }

## Main

In [49]:
# Sample data for demonstration
human_data = {
    "Section 1": {
        "Key 1": "Value 1",
        "Key 2": [
            10,
            30
        ],
        "Key 3": [
            {
                "SubKey1": "Text A"
            },
            {
                "SubKey1": "Text Z"
            }
        ]
    },
    "Section 2": {
        "Key 4": "Another Val",
        "Key 5": [
            "Item1",
            "ItemX",
            "ItemY"
        ],
        "Key 6": [
            {
                "SubKey1": "This is a code to test"
            },
            {
                "SubKey1": "Text Y"
            }
        ]
    }
}

llm_data = {
    "Section 1": {
        "Key 1": "Value 1",
        "Key 2": [
            10,
            20
        ],
        "Key 3": [
            {
                "SubKey1": "This is a test"
            },
            {
                "SubKey1": "Text A"
            }
        ]
    },
    "Section 2": {
        "Key 4": "Another Value",
        "Key 5": [
            "Item1",
            "Item2"
        ],
        "Key 6": [
            {
                "SubKey1": "This is a testing code"
            },
            {
                "SubKey1": "Text D"
            }
        ]
    }
}

In [35]:
# Semantic similarity function using sentence-transformers
model = SentenceTransformer('all-MiniLM-L6-v2')

print(json.dumps(human_data, indent=4))

{
    "Section 1": {
        "Key 1": "Value 1",
        "Key 2": [
            10,
            30
        ],
        "Key 3": [
            {
                "SubKey1": "Text A"
            },
            {
                "SubKey1": "Text Z"
            }
        ]
    },
    "Section 2": {
        "Key 4": "Another Val",
        "Key 5": [
            "Item1",
            "ItemX",
            "ItemY"
        ],
        "Key 6": [
            {
                "SubKey1": "Text C"
            },
            {
                "SubKey1": "Text Y"
            }
        ]
    }
}


In [21]:
print(json.dumps(llm_data, indent=4))

{
    "Section 1": {
        "Key 1": "Value 1",
        "Key 2": [
            10,
            20
        ],
        "Key 3": [
            {
                "SubKey1": "Text A"
            },
            {
                "SubKey1": "Text B"
            }
        ]
    },
    "Section 2": {
        "Key 4": "Another Value",
        "Key 5": [
            "Item1",
            "Item2"
        ],
        "Key 6": [
            {
                "SubKey1": "Text C"
            },
            {
                "SubKey1": "Text D"
            }
        ]
    }
}


In [50]:
# Calculate metrics
metrics_data = []
overall_precision = 0
overall_recall = 0
num_metrics = 0

for section, human_section in human_data.items():
    llm_section = llm_data.get(section, {})
    for key, human_value in human_section.items():
        llm_value = llm_section.get(key, None)
        metrics = calculate_metrics(human_value, llm_value)
        
        metrics_data.append({
            "Section": section,
            "Key": key,
            "Human Value": str(human_value),
            "LLM Value": str(llm_value),
            "Match": metrics['match'],
            "Similarity": metrics['similarity'],
            "Precision": metrics['precision'],
            "Recall": metrics['recall']
        })
        
        overall_precision += metrics["precision"]
        overall_recall += metrics["recall"]
        num_metrics += 1

# Create DataFrame for metrics
metrics_df = pd.DataFrame(metrics_data)

# Display overall metrics
avg_precision = overall_precision / num_metrics if num_metrics > 0 else 0
avg_recall = overall_recall / num_metrics if num_metrics > 0 else 0

print(f"Average Precision: {avg_precision:.2%}")
print(f"Average Recall: {avg_recall:.2%}")

metrics_df

Average Precision: 67.02%
Average Recall: 64.24%


Unnamed: 0,Section,Key,Human Value,LLM Value,Match,Similarity,Precision,Recall
0,Section 1,Key 1,Value 1,Value 1,✓,100.0%,1.0,1.0
1,Section 1,Key 2,"[10, 30]","[10, 20]",✓,100.0%,1.0,1.0
2,Section 1,Key 3,"[{'SubKey1': 'Text A'}, {'SubKey1': 'Text Z'}]","[{'SubKey1': 'This is a test'}, {'SubKey1': 'Text A'}]",✗,50.0%,0.5,0.5
3,Section 2,Key 4,Another Val,Another Value,✗,52.111583948135376%,0.521116,0.521116
4,Section 2,Key 5,"['Item1', 'ItemX', 'ItemY']","['Item1', 'Item2']",✗,41.666666666666664%,0.5,0.333333
5,Section 2,Key 6,"[{'SubKey1': 'This is a code to test'}, {'SubKey1': 'Text Y'}]","[{'SubKey1': 'This is a testing code'}, {'SubKey1': 'Text D'}]",✗,50.0%,0.5,0.5


In [27]:
print(json.dumps(llm_data, indent=4))

{
    "Section 1": {
        "Key 1": "Value 1",
        "Key 2": [
            10,
            20
        ],
        "Key 3": [
            {
                "SubKey1": "Text A"
            },
            {
                "SubKey1": "Text B"
            }
        ]
    },
    "Section 2": {
        "Key 4": "Another Value",
        "Key 5": [
            "Item1",
            "Item2"
        ],
        "Key 6": [
            {
                "SubKey1": "Text C"
            },
            {
                "SubKey1": "Text D"
            }
        ]
    }
}
