# Tools for comparing transcripts to each other
For example, when different transcription tools are used on the same recording, their outputs can be compared across multiple metrics.

# 1. Libraries and packages needed

# Install core scientific & notebook packages via conda
conda install -c conda-forge ipykernel nltk scikit-learn ipywidgets tqdm numpy scipy pandas

# Use pip for sentence-transformers
pip install sentence-transformers

1. Word-by-Word Diff Report (transcript vs transcript)
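Use difflib to generate an HTML report that marks words removed from the human transcript in red [brackets] and words added by the Whisper transcript in green {braces}.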

In [None]:
from difflib import ndiff
from html import escape


def render_diff_html(diff_entries):
    """Render ``difflib.ndiff`` output as a colour-coded HTML document.

    Args:
        diff_entries: Iterable of ndiff result strings. Each entry starts
            with a two-character code: ``"- "`` (only in the first
            sequence), ``"+ "`` (only in the second), ``"  "`` (common to
            both) or ``"? "`` (intraline hint, not part of either input).

    Returns:
        A complete ``<html>`` document as a string. Removed words are shown
        in red inside ``[...]``, added words in green inside ``{...}``, and
        common words as plain text. Every word is followed by one space.
    """
    pieces = ["<html><body><pre>"]
    for entry in diff_entries:
        code = entry[:2]
        if code == "? ":
            # Hint lines only point at differing characters of similar
            # words; they are not transcript content, so leave them out.
            continue
        # Escape markup characters so transcript text such as "<" or "&"
        # cannot break the report. Quotes are harmless in text content.
        token = escape(entry[2:], quote=False)
        if code == "- ":
            pieces.append(f"<span style='color:red;'>[{token}]</span> ")
        elif code == "+ ":
            pieces.append(f"<span style='color:green;'>{{{token}}}</span> ")
        else:
            pieces.append(token + " ")
    pieces.append("</pre></body></html>")
    return "".join(pieces)


# True both in a notebook kernel and when run as a script; keeps the file
# I/O from running if this code is ever imported.
if __name__ == "__main__":
    # Load files
    with open("transcript_human.txt", "r", encoding="utf-8") as f:
        human_text = f.read()

    with open("transcript_whisper.txt", "r", encoding="utf-8") as f:
        whisper_text = f.read()

    # Word-level diff (whitespace-separated tokens)
    diff = list(ndiff(human_text.split(), whisper_text.split()))

    # Create HTML report
    html = render_diff_html(diff)

    # Save it
    with open("text_diff_report.html", "w", encoding="utf-8") as f:
        f.write(html)

    print("✅ HTML diff report saved as text_diff_report.html")


2. Semantic Chunk Comparison (Are the ideas the same?)
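Break both files into sentences and compare them with Sentence-BERT embeddings: each human sentence is matched to its most similar Whisper sentence, and pairs whose cosine similarity is below 0.85 are printed.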

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Sentence-tokenizer data. Newer NLTK releases (3.9+) load the "punkt_tab"
# resource in sent_tokenize, older ones load "punkt"; request both so this
# cell works with either.
nltk.download('punkt')
nltk.download('punkt_tab')

# A human sentence is reported when even its closest Whisper sentence has a
# cosine similarity below this value.
SIMILARITY_THRESHOLD = 0.85

# Load files and split them into sentences
with open("transcript_human.txt", "r", encoding="utf-8") as f:
    human_sentences = sent_tokenize(f.read())

with open("transcript_whisper.txt", "r", encoding="utf-8") as f:
    whisper_sentences = sent_tokenize(f.read())

if not human_sentences or not whisper_sentences:
    # With an empty side there is no best match to look up.
    print("No sentences found in one of the transcripts; skipping semantic comparison.")
else:
    # Embed and compare
    model = SentenceTransformer("all-MiniLM-L6-v2")
    emb_human = model.encode(human_sentences, convert_to_tensor=True)
    emb_whisper = model.encode(whisper_sentences, convert_to_tensor=True)

    # Compute all pairwise similarities once (row i = human sentence i,
    # column j = Whisper sentence j) instead of one call per sentence.
    sims = util.cos_sim(emb_human, emb_whisper)

    for i, sent_h in enumerate(human_sentences):
        row = sims[i]
        # Plain int so it can index the Python list of sentences directly.
        best_idx = int(row.argmax())
        best_score = row[best_idx].item()
        if best_score < SIMILARITY_THRESHOLD:
            print(f"\n🔻Low similarity ({best_score:.2f})")
            print(f"Human:   {sent_h}")
            print(f"Whisper: {whisper_sentences[best_idx]}")


3. Text Overlap Metrics (Quick summary stats)
Use scikit-learn to compute the cosine similarity between the TF-IDF vectors of the two transcripts, giving a single overall overlap score.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read each transcript in full as one UTF-8 string.
with open("transcript_human.txt", "r", encoding="utf-8") as human_file:
    human = human_file.read()
with open("transcript_whisper.txt", "r", encoding="utf-8") as whisper_file:
    whisper = whisper_file.read()

# Fit a single vectorizer on both documents; row 0 of the result is the
# human transcript and row 1 is the Whisper transcript.
vectorizer = TfidfVectorizer()
vec = vectorizer.fit_transform([human, whisper])

# Compare the two rows; the score is read from position [0][0].
sim = cosine_similarity(vec[0], vec[1])
score = sim[0][0]

print(f"🧠 Cosine similarity (TF-IDF): {score:.4f}")
