### Method: String Matching: Fuzzy

In [None]:
# For text = 0.8, figure = 0.6, model = sbert, overlay = 60%

import os
import json
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageDraw, ImageFont
from rapidfuzz import fuzz

# Parent directories. The processing loop below joins the same sub-directory
# name onto each of these four roots.
image_parent_dir = "/tmp/megha/Complete/slides/ICML_slides"  # slide images (*.png)
transcript_parent_dir = "/tmp/megha/Complete/transcripts/ICML_trans"  # per-slide transcripts (*.srt)
aws_ocr_parent_dir = "/tmp/megha/Complete/Layout/ICML_layout/json"  # per-slide OCR/layout JSON
output_parent_dir = "/tmp/megha/Complete/WITHOUT/ICML_res/Fuzzy/T-1"  # matched-region JSON is written here

# Load SBERT model. In this cell it only embeds the joined transcripts;
# region matching itself is done with rapidfuzz.
model = SentenceTransformer("all-MiniLM-L6-v2")

# time from srt
def parse_srt(file_path):
    """Extract timestamps and text from an SRT subtitle file.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of dicts, in file order, with keys ``"start_time"`` and
        ``"end_time"`` (``HH:MM:SS,mmm`` strings) and ``"text"`` (the cue
        text with internal newlines replaced by spaces and stripped).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A cue ends at a blank line OR at the end of the file. Accepting the end
    # of the file keeps the final cue when there is no trailing blank line.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)",
        re.DOTALL,
    )

    entries = []
    for match in srt_pattern.finditer(content):
        _index, start_time, end_time, text = match.groups()
        text = text.replace("\n", " ").strip()
        entries.append({"start_time": start_time, "end_time": end_time, "text": text})

    return entries

# to get ocr_regions - Any ways it was avoided
def load_aws_ocr(file_path):
    """Load an OCR/layout JSON file and return its leaf ``LAYOUT_*`` regions.

    Every block whose ``BlockType`` starts with ``LAYOUT_`` becomes one
    region, unless one of its CHILD relationships points at another
    ``LAYOUT_*`` block; such container blocks are skipped with a printed
    notice. A region's text is the space-joined text of its LINE children;
    children of any other type contribute nothing.

    Args:
        file_path: Path to a UTF-8 JSON file with a top-level ``"Blocks"`` list.

    Returns:
        A list of dicts with keys ``"Text"``, ``"BoundingBox"``,
        ``"BlockType"`` and ``"Id"``, in the order the blocks appear.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    blocks = data.get("Blocks", [])

    # Index the blocks once so the per-region checks below are dictionary
    # lookups rather than a rescan of every block for each layout block.
    block_type_by_id = {}
    ocr_text_map = {}
    for block in blocks:
        block_type = block.get("BlockType", "")
        block_type_by_id[block.get("Id")] = block_type
        if block_type == "LINE":
            ocr_text_map[block["Id"]] = block.get("Text", "")

    regions = []
    for block in blocks:
        if not block.get("BlockType", "").startswith("LAYOUT_"):
            continue

        child_ids = [
            child_id
            for rel in (block.get("Relationships") or [])
            if rel.get("Type") == "CHILD"
            for child_id in rel.get("Ids", [])
        ]

        # A layout block that contains other layout blocks is only a
        # container; its children are emitted as regions instead.
        if any(
            (block_type_by_id.get(child_id) or "").startswith("LAYOUT_")
            for child_id in child_ids
        ):
            print(f"Parent block {block['Id']} has layout children, hence skipped.")
            continue

        text = [ocr_text_map[child_id] for child_id in child_ids if child_id in ocr_text_map]

        regions.append({
            "Text": " ".join(text),
            "BoundingBox": block["Geometry"]["BoundingBox"],
            "BlockType": block["BlockType"],
            "Id": block['Id']
        })

    return regions

# finding best matches
def get_best_matches(transcript_text, ocr_regions):
    """Find the OCR regions whose text fuzzily matches a transcript line.

    Args:
        transcript_text: One transcript line.
        ocr_regions: Region dicts with at least ``"Text"`` and ``"BlockType"``.

    Returns:
        A new list of region dicts, each a copy of the input region with an
        added ``"Score"`` (``fuzz.ratio`` scaled to 0-1), sorted by score in
        descending order. Only regions scoring strictly above the threshold
        are kept: 0.6 for ``LAYOUT_FIGURE`` blocks, 0.8 for everything else.
        Regions whose text is empty after stripping are ignored.
    """
    matches = []

    for region in ocr_regions:
        ocr_text = region["Text"].strip()
        if not ocr_text:
            continue

        # Fuzzy matching score (0-100 scale) normalised to 0-1
        score = fuzz.ratio(transcript_text, ocr_text) / 100.0

        threshold = 0.6 if region["BlockType"] == "LAYOUT_FIGURE" else 0.8
        if score > threshold:
            # Store the score on a copy: the same region dicts are reused for
            # every transcript line, so writing into them would overwrite the
            # score already recorded for earlier lines.
            matches.append({**region, "Score": score})

    matches.sort(key=lambda match: match["Score"], reverse=True)
    return matches

# region matching
def match_transcript_to_regions(srt_entries, ocr_regions, model):
    """Pair every transcript entry with its matching OCR regions.

    Args:
        srt_entries: Dicts with ``"start_time"``, ``"end_time"`` and ``"text"``.
        ocr_regions: Region dicts handed to ``get_best_matches`` unchanged.
        model: Accepted but not used by this fuzzy-matching variant.

    Returns:
        One dict per entry with keys ``"start_time"``, ``"end_time"``,
        ``"transcript"`` and ``"MatchedRegion"`` (the list of matches).
    """
    matched_results = []
    for entry in srt_entries:
        transcript = entry["text"]
        matches = get_best_matches(transcript, ocr_regions)
        print(f'The no. of regions: {len(matches)} for entry: {entry}')
        record = {
            "start_time": entry["start_time"],
            "end_time": entry["end_time"],
            "transcript": transcript,
            "MatchedRegion": matches,
        }
        matched_results.append(record)
    return matched_results


# for getting entire ocr data
def extract_ocr_text(json_data):
    """Join the text of the direct children of every PAGE block.

    Args:
        json_data: Parsed OCR JSON with a ``"Blocks"`` list; each block has
            an ``"Id"`` and a ``"BlockType"``.

    Returns:
        The ``"Text"`` of each CHILD-related block of each PAGE block, in
        relationship order, joined with single spaces. Children that are
        missing from the file or carry no ``"Text"`` are ignored.
    """
    blocks = json_data["Blocks"]
    by_id = {blk["Id"]: blk for blk in blocks}

    pieces = []
    for page in blocks:
        if page["BlockType"] != "PAGE" or "Relationships" not in page:
            continue
        for rel in page["Relationships"]:
            if rel["Type"] != "CHILD":
                continue
            children = (by_id.get(child_id) for child_id in rel["Ids"])
            pieces.extend(
                child["Text"] for child in children if child and "Text" in child
            )

    return " ".join(pieces)


# save overlay and regions
def save_results(matched_results, output_json):
    """Write the matched results to *output_json* as indented JSON.

    The results are stored under the single top-level key
    ``"MatchedRegions"``; the file is written as UTF-8.
    """
    payload = {"MatchedRegions": matched_results}
    with open(output_json, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)


# Build one joined transcript, plus its SBERT embedding, for every .srt file.
# NOTE(review): entries are keyed by bare file name, so files with the same
# name in different sub-directories overwrite each other; the dictionary is
# not read anywhere else in the visible code of this cell.
full_transcripts = {}
for sub_dir in sorted(os.listdir(transcript_parent_dir)):  # sorted for a stable order
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    if not os.path.isdir(sub_transcript_dir):
        continue
    for file in sorted(os.listdir(sub_transcript_dir)):
        if not file.endswith(".srt"):
            continue
        file_path = os.path.join(sub_transcript_dir, file)
        text = " ".join(entry["text"] for entry in parse_srt(file_path))
        full_transcripts[file] = {
            "text": text,
            "embedding": model.encode(text, convert_to_tensor=True),
        }


# Process every slide image that has both a transcript and an OCR file.
for sub_dir in sorted(os.listdir(image_parent_dir)):  # sorted for a stable order
    sub_image_dir = os.path.join(image_parent_dir, sub_dir)
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    sub_aws_ocr_dir = os.path.join(aws_ocr_parent_dir, sub_dir)
    output_dir = os.path.join(output_parent_dir, sub_dir)
    # The output folder is created before the input folders are validated.
    os.makedirs(output_dir, exist_ok=True)

    # Skip entries that lack any of the three input folders.
    if (
        not os.path.isdir(sub_image_dir)
        or not os.path.isdir(sub_transcript_dir)
        or not os.path.isdir(sub_aws_ocr_dir)
    ):
        continue

    for file in sorted(os.listdir(sub_image_dir)):
        if not file.endswith(".png"):
            continue

        base_name = os.path.splitext(file)[0]
        print(f'This is the filename: {base_name}')
        # image_path and output_image are computed but not used further here.
        image_path = os.path.join(sub_image_dir, file)
        transcript_path = os.path.join(sub_transcript_dir, f"{base_name}.srt")
        ocr_path = os.path.join(sub_aws_ocr_dir, f"{base_name}.json")
        output_json = os.path.join(output_dir, f"{base_name}.json")
        output_image = os.path.join(output_dir, f"{base_name}_bbox.png")

        if not os.path.exists(transcript_path) or not os.path.exists(ocr_path):
            continue

        # Step 1: match transcript lines to OCR regions, then save them.
        srt_entries = parse_srt(transcript_path)
        ocr_regions = load_aws_ocr(ocr_path)
        matched_results = match_transcript_to_regions(srt_entries, ocr_regions, model)
        save_results(matched_results, output_json)


### Method: Semantic Matching:S-BERT

In [None]:
# For text = 0.8, figure = 0.6, model = sbert, overlay = 60%

import os
import json
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageDraw, ImageFont

# Parent directories. The processing loop below joins the same sub-directory
# name onto each of these four roots.
image_parent_dir = "/tmp/megha/Complete/slides/ICML_slides"  # slide images (*.png)
transcript_parent_dir = "/tmp/megha/Complete/transcripts/ICML_trans"  # per-slide transcripts (*.srt)
aws_ocr_parent_dir = "/tmp/megha/Complete/Layout/ICML_layout/json"  # per-slide OCR/layout JSON
output_parent_dir = "/tmp/megha/Complete/WITHOUT/ICML_res/S-BERT/T-1"  # matched-region JSON is written here

# Load SBERT model; it embeds both the transcript lines and the OCR region text.
model = SentenceTransformer("all-MiniLM-L6-v2")

# time from srt
def parse_srt(file_path):
    """Extract timestamps and text from an SRT subtitle file.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of dicts, in file order, with keys ``"start_time"`` and
        ``"end_time"`` (``HH:MM:SS,mmm`` strings) and ``"text"`` (the cue
        text with internal newlines replaced by spaces and stripped).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A cue ends at a blank line OR at the end of the file. Accepting the end
    # of the file keeps the final cue when there is no trailing blank line.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)",
        re.DOTALL,
    )

    entries = []
    for match in srt_pattern.finditer(content):
        _index, start_time, end_time, text = match.groups()
        text = text.replace("\n", " ").strip()
        entries.append({"start_time": start_time, "end_time": end_time, "text": text})

    return entries

# to get ocr_regions - Any ways it was avoided
def load_aws_ocr(file_path):
    """Load an OCR/layout JSON file and return its leaf ``LAYOUT_*`` regions.

    Every block whose ``BlockType`` starts with ``LAYOUT_`` becomes one
    region, unless one of its CHILD relationships points at another
    ``LAYOUT_*`` block; such container blocks are skipped with a printed
    notice. A region's text is the space-joined text of its LINE children;
    children of any other type contribute nothing.

    Args:
        file_path: Path to a UTF-8 JSON file with a top-level ``"Blocks"`` list.

    Returns:
        A list of dicts with keys ``"Text"``, ``"BoundingBox"``,
        ``"BlockType"`` and ``"Id"``, in the order the blocks appear.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    blocks = data.get("Blocks", [])

    # Index the blocks once so the per-region checks below are dictionary
    # lookups rather than a rescan of every block for each layout block.
    block_type_by_id = {}
    ocr_text_map = {}
    for block in blocks:
        block_type = block.get("BlockType", "")
        block_type_by_id[block.get("Id")] = block_type
        if block_type == "LINE":
            ocr_text_map[block["Id"]] = block.get("Text", "")

    regions = []
    for block in blocks:
        if not block.get("BlockType", "").startswith("LAYOUT_"):
            continue

        child_ids = [
            child_id
            for rel in (block.get("Relationships") or [])
            if rel.get("Type") == "CHILD"
            for child_id in rel.get("Ids", [])
        ]

        # A layout block that contains other layout blocks is only a
        # container; its children are emitted as regions instead.
        if any(
            (block_type_by_id.get(child_id) or "").startswith("LAYOUT_")
            for child_id in child_ids
        ):
            print(f"Parent block {block['Id']} has layout children, hence skipped.")
            continue

        text = [ocr_text_map[child_id] for child_id in child_ids if child_id in ocr_text_map]

        regions.append({
            "Text": " ".join(text),
            "BoundingBox": block["Geometry"]["BoundingBox"],
            "BlockType": block["BlockType"],
            "Id": block['Id']
        })

    return regions

# finding best matches
def get_best_matches(transcript_text, ocr_regions, model):
    """Find the OCR regions semantically closest to a transcript line.

    Args:
        transcript_text: One transcript line.
        ocr_regions: Region dicts with at least ``"Text"`` and ``"BlockType"``.
        model: Object whose ``encode(text, convert_to_tensor=True)`` returns
            an embedding usable with ``util.pytorch_cos_sim``.

    Returns:
        A new list of region dicts, each a copy of the input region with an
        added ``"Score"`` (cosine similarity), sorted by score in descending
        order. Only regions scoring strictly above the threshold are kept:
        0.6 for ``LAYOUT_FIGURE`` blocks, 0.8 for everything else. Regions
        whose text is empty after stripping are ignored.
    """
    transcript_embedding = model.encode(transcript_text, convert_to_tensor=True)
    matches = []

    for region in ocr_regions:
        ocr_text = region["Text"].strip()
        if not ocr_text:
            continue
        ocr_embedding = model.encode(ocr_text, convert_to_tensor=True)
        score = util.pytorch_cos_sim(transcript_embedding, ocr_embedding).item()
        threshold = 0.6 if region["BlockType"] == "LAYOUT_FIGURE" else 0.8
        if score > threshold:
            # Store the score on a copy: the same region dicts are reused for
            # every transcript line, so writing into them would overwrite the
            # score already recorded for earlier lines.
            matches.append({**region, "Score": score})

    matches.sort(key=lambda match: match["Score"], reverse=True)
    return matches

# region matching
def match_transcript_to_regions(srt_entries, ocr_regions, model):
    """Pair every transcript entry with its matching OCR regions.

    Args:
        srt_entries: Dicts with ``"start_time"``, ``"end_time"`` and ``"text"``.
        ocr_regions: Region dicts handed to ``get_best_matches`` unchanged.
        model: Embedding model forwarded to ``get_best_matches``.

    Returns:
        One dict per entry with keys ``"start_time"``, ``"end_time"``,
        ``"transcript"`` and ``"MatchedRegion"`` (the list of matches).
    """
    matched_results = []
    for entry in srt_entries:
        transcript = entry["text"]
        matches = get_best_matches(transcript, ocr_regions, model)
        print(f'The no. of regions: {len(matches)} for entry: {entry}')
        record = {
            "start_time": entry["start_time"],
            "end_time": entry["end_time"],
            "transcript": transcript,
            "MatchedRegion": matches,
        }
        matched_results.append(record)
    return matched_results


# for getting entire ocr data
def extract_ocr_text(json_data):
    """Join the text of the direct children of every PAGE block.

    Args:
        json_data: Parsed OCR JSON with a ``"Blocks"`` list; each block has
            an ``"Id"`` and a ``"BlockType"``.

    Returns:
        The ``"Text"`` of each CHILD-related block of each PAGE block, in
        relationship order, joined with single spaces. Children that are
        missing from the file or carry no ``"Text"`` are ignored.
    """
    blocks = json_data["Blocks"]
    by_id = {blk["Id"]: blk for blk in blocks}

    pieces = []
    for page in blocks:
        if page["BlockType"] != "PAGE" or "Relationships" not in page:
            continue
        for rel in page["Relationships"]:
            if rel["Type"] != "CHILD":
                continue
            children = (by_id.get(child_id) for child_id in rel["Ids"])
            pieces.extend(
                child["Text"] for child in children if child and "Text" in child
            )

    return " ".join(pieces)


# save overlay and regions
def save_results(matched_results, matched_slides=None, output_json=None):
    """Write the matched results to a JSON file.

    Accepts both call shapes used for this function:
    ``save_results(results, output_json)`` and
    ``save_results(results, matched_slides, output_json)``.

    Args:
        matched_results: JSON-serialisable list stored under the top-level
            key ``"MatchedRegions"``.
        matched_slides: Not written to the file. In a two-argument call this
            position carries the output path instead.
        output_json: Destination path; the file is written as UTF-8.

    Raises:
        TypeError: If no output path is supplied.
    """
    if output_json is None:
        # Two-argument call: the second positional value is the output path.
        output_json, matched_slides = matched_slides, None
    if output_json is None:
        raise TypeError("save_results() missing required argument: 'output_json'")

    with open(output_json, "w", encoding="utf-8") as file:
        json.dump({"MatchedRegions": matched_results}, file, indent=4)


# Build one joined transcript, plus its SBERT embedding, for every .srt file.
# NOTE(review): entries are keyed by bare file name, so files with the same
# name in different sub-directories overwrite each other; the dictionary is
# not read anywhere else in the visible code of this cell.
full_transcripts = {}
for sub_dir in sorted(os.listdir(transcript_parent_dir)):  # sorted for a stable order
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    if not os.path.isdir(sub_transcript_dir):
        continue
    for file in sorted(os.listdir(sub_transcript_dir)):
        if not file.endswith(".srt"):
            continue
        file_path = os.path.join(sub_transcript_dir, file)
        text = " ".join(entry["text"] for entry in parse_srt(file_path))
        full_transcripts[file] = {
            "text": text,
            "embedding": model.encode(text, convert_to_tensor=True),
        }


# Process every slide image that has both a transcript and an OCR file.
for sub_dir in sorted(os.listdir(image_parent_dir)):  # sorted for a stable order
    sub_image_dir = os.path.join(image_parent_dir, sub_dir)
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    sub_aws_ocr_dir = os.path.join(aws_ocr_parent_dir, sub_dir)
    output_dir = os.path.join(output_parent_dir, sub_dir)
    # The output folder is created before the input folders are validated.
    os.makedirs(output_dir, exist_ok=True)

    # Skip entries that lack any of the three input folders.
    if (
        not os.path.isdir(sub_image_dir)
        or not os.path.isdir(sub_transcript_dir)
        or not os.path.isdir(sub_aws_ocr_dir)
    ):
        continue

    for file in sorted(os.listdir(sub_image_dir)):
        if not file.endswith(".png"):
            continue

        base_name = os.path.splitext(file)[0]
        print(f'This is the filename: {base_name}')
        # image_path and output_image are computed but not used further here.
        image_path = os.path.join(sub_image_dir, file)
        transcript_path = os.path.join(sub_transcript_dir, f"{base_name}.srt")
        ocr_path = os.path.join(sub_aws_ocr_dir, f"{base_name}.json")
        output_json = os.path.join(output_dir, f"{base_name}.json")
        output_image = os.path.join(output_dir, f"{base_name}_bbox.png")

        if not os.path.exists(transcript_path) or not os.path.exists(ocr_path):
            continue

        # Step 1: match transcript lines to OCR regions, then save them.
        srt_entries = parse_srt(transcript_path)
        ocr_regions = load_aws_ocr(ocr_path)
        matched_results = match_transcript_to_regions(srt_entries, ocr_regions, model)
        save_results(matched_results, output_json)
                

### Method: Semantic Matching:Sci-BERT

In [None]:
# For text = 0.8, figure = 0.6, model = scibert
import os
import json
import torch
import re
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Parent directories. The processing loop below joins the same sub-directory
# name onto each of these four roots.
image_parent_dir = "/tmp/megha/Complete/slides/ICML_slides"  # slide images (*.png)
# NOTE(review): this cell reads transcripts from "Corr_trans", unlike the
# other method cells, which read "transcripts/ICML_trans" - confirm intended.
transcript_parent_dir = "/tmp/megha/Complete/Corr_trans/ICML_corr_trans"
aws_ocr_parent_dir = "/tmp/megha/Complete/Layout/ICML_layout/json"  # per-slide OCR/layout JSON
output_parent_dir = "/tmp/megha/Complete/WITHOUT/ICML_res/Sci-BERT/T-1"  # matched-region JSON is written here


# Load the SciBERT tokenizer and model
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Not referenced in the visible code of this cell; get_embedding uses a
# literal 512 instead.
MAX_TOKENS = 512  # SciBERT's actual token limit

# time from srt
def parse_srt(file_path):
    """Extract timestamps and text from an SRT subtitle file.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of dicts, in file order, with keys ``"start_time"`` and
        ``"end_time"`` (``HH:MM:SS,mmm`` strings) and ``"text"`` (the cue
        text with internal newlines replaced by spaces and stripped).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A cue ends at a blank line OR at the end of the file. Accepting the end
    # of the file keeps the final cue when there is no trailing blank line.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)",
        re.DOTALL,
    )

    entries = []
    for match in srt_pattern.finditer(content):
        _index, start_time, end_time, text = match.groups()
        text = text.replace("\n", " ").strip()
        entries.append({"start_time": start_time, "end_time": end_time, "text": text})

    return entries

# to get ocr_regions
def load_aws_ocr(file_path):
    """Load an OCR/layout JSON file and return its leaf ``LAYOUT_*`` regions.

    Every block whose ``BlockType`` starts with ``LAYOUT_`` becomes one
    region, unless one of its CHILD relationships points at another
    ``LAYOUT_*`` block; such container blocks are skipped with a printed
    notice. A region's text is the space-joined text of its LINE children;
    children of any other type contribute nothing.

    Args:
        file_path: Path to a UTF-8 JSON file with a top-level ``"Blocks"`` list.

    Returns:
        A list of dicts with keys ``"Text"``, ``"BoundingBox"``,
        ``"BlockType"`` and ``"Id"``, in the order the blocks appear.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    blocks = data.get("Blocks", [])

    # Index the blocks once so the per-region checks below are dictionary
    # lookups rather than a rescan of every block for each layout block.
    block_type_by_id = {}
    ocr_text_map = {}
    for block in blocks:
        block_type = block.get("BlockType", "")
        block_type_by_id[block.get("Id")] = block_type
        if block_type == "LINE":
            ocr_text_map[block["Id"]] = block.get("Text", "")

    regions = []
    for block in blocks:
        if not block.get("BlockType", "").startswith("LAYOUT_"):
            continue

        child_ids = [
            child_id
            for rel in (block.get("Relationships") or [])
            if rel.get("Type") == "CHILD"
            for child_id in rel.get("Ids", [])
        ]

        # A layout block that contains other layout blocks is only a
        # container; its children are emitted as regions instead.
        if any(
            (block_type_by_id.get(child_id) or "").startswith("LAYOUT_")
            for child_id in child_ids
        ):
            print(f"Parent block {block['Id']} has layout children, hence skipped.")
            continue

        text = [ocr_text_map[child_id] for child_id in child_ids if child_id in ocr_text_map]

        regions.append({
            "Text": " ".join(text),
            "BoundingBox": block["Geometry"]["BoundingBox"],
            "BlockType": block["BlockType"],
            "Id": block['Id']
        })

    return regions

# finding best matches
def get_embedding(text):
    """Embed *text* with the module-level ``tokenizer`` and ``model``.

    The text is tokenized with truncation at 512 tokens, and the hidden
    state at sequence position 0 of the model's last layer is returned. A
    notice is printed whenever the tokenized length equals 512; that also
    fires for input that is exactly 512 tokens long without being cut.

    Returns:
        The tensor ``last_hidden_state[:, 0, :]``, computed without
        gradient tracking.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    sequence_length = encoded['input_ids'].shape[1]
    if sequence_length == 512:
        print(f"Truncation occurred for text: {text[:50]}...")  # preview of the affected text

    with torch.no_grad():
        outputs = model(**encoded)
    return outputs.last_hidden_state[:, 0, :]

def get_best_matches(transcript_text, ocr_regions):
    """Find the OCR regions whose embedding is close to a transcript line.

    Embeddings come from ``get_embedding`` and are compared by cosine
    similarity.

    Args:
        transcript_text: One transcript line.
        ocr_regions: Region dicts with a ``"BlockType"`` and, normally, a
            ``"Text"``; a missing or ``None`` text is treated as empty.

    Returns:
        A new list of region dicts, in input order, each a copy of the input
        region with an added ``"Score"``. Only regions scoring strictly above
        the threshold are kept: 0.6 for ``LAYOUT_FIGURE`` blocks, 0.8 for
        everything else. The list is not sorted by score.
    """
    transcript_embedding = get_embedding(transcript_text)
    matches = []

    for region in ocr_regions:
        # `or ""` guarantees a string even when "Text" is absent or None.
        ocr_text = (region.get("Text") or "").strip()
        if not ocr_text:
            print("Skipping empty OCR text")  # Confirmation message
            continue
        else:
            print('Text not skipped')

        ocr_embedding = get_embedding(ocr_text)
        score = util.pytorch_cos_sim(transcript_embedding, ocr_embedding).item()
        threshold = 0.8 if region["BlockType"] != "LAYOUT_FIGURE" else 0.6

        print(f'This is the score: {score}')

        if score > threshold:
            print(f'This is the score: {score}')
            # Store the score on a copy: the same region dicts are reused for
            # every transcript line, so writing into them would overwrite the
            # score already recorded for earlier lines.
            matches.append({**region, "Score": score})

    print(f'These are the matches: {matches}, length of matches: {len(matches)}')

    return matches

def match_transcript_to_regions(srt_entries, ocr_regions):
    """Pair every transcript entry with its matching OCR regions.

    Args:
        srt_entries: Dicts with ``"start_time"``, ``"end_time"`` and ``"text"``.
        ocr_regions: Region dicts handed to ``get_best_matches`` unchanged.

    Returns:
        One dict per entry with keys ``"start_time"``, ``"end_time"``,
        ``"transcript"`` and ``"MatchedRegion"`` (the list of matches).
    """
    matched_results = []

    for entry in srt_entries:
        transcript = entry["text"]
        matches = get_best_matches(transcript, ocr_regions)
        print(f'Matched inside mat: {matches}')
        record = {
            "start_time": entry["start_time"],
            "end_time": entry["end_time"],
            "transcript": transcript,
            "MatchedRegion": matches,
        }
        matched_results.append(record)

    print(f'Matched results before: {matched_results}')
    return matched_results

# for getting entire ocr data
def extract_ocr_text(json_data):
    """Join the text of the direct children of every PAGE block.

    Args:
        json_data: Parsed OCR JSON with a ``"Blocks"`` list; each block has
            an ``"Id"`` and a ``"BlockType"``.

    Returns:
        The ``"Text"`` of each CHILD-related block of each PAGE block, in
        relationship order, joined with single spaces. Children that are
        missing from the file or carry no ``"Text"`` are ignored.
    """
    blocks = json_data["Blocks"]
    by_id = {blk["Id"]: blk for blk in blocks}

    pieces = []
    for page in blocks:
        if page["BlockType"] != "PAGE" or "Relationships" not in page:
            continue
        for rel in page["Relationships"]:
            if rel["Type"] != "CHILD":
                continue
            children = (by_id.get(child_id) for child_id in rel["Ids"])
            pieces.extend(
                child["Text"] for child in children if child and "Text" in child
            )

    return " ".join(pieces)

def save_results(matched_results, output_json):
    """Write the matched results to *output_json* as indented JSON.

    The results are stored under the single top-level key
    ``"MatchedRegions"``; the file is written as UTF-8.
    """
    payload = {"MatchedRegions": matched_results}
    with open(output_json, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, indent=4)


def check_truncation(text, tokenizer):
    """Report whether *text* tokenizes to more tokens than the tokenizer allows.

    Args:
        text: Text to measure.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=False)`` and
            exposing ``model_max_length``.

    Returns:
        The untruncated token count, or ``None`` when tokenization failed
        (the error is printed rather than raised).
    """
    try:
        tokenized_text = tokenizer(text, return_tensors="pt", truncation=False)  # No truncation to check actual length
        token_len = tokenized_text.input_ids.shape[1]
    except Exception as e:
        print(f'The error while calculating token len: {e}')
        # There is no length to compare. Falling through here used to raise
        # UnboundLocalError on the comparison below.
        return None

    # NOTE(review): assumes model_max_length reflects the real model limit;
    # confirm it is set for this tokenizer.
    if token_len > tokenizer.model_max_length:
        print(f"Warning: Text exceeds SciBERT token limit ({token_len} > {tokenizer.model_max_length}) and may be truncated.")

    return token_len  # Return actual token length for further checks


# Build one joined transcript, plus its embedding, for every .srt file.
# NOTE(review): entries are keyed by bare file name, so files with the same
# name in different sub-directories overwrite each other; the dictionary is
# not read anywhere else in the visible code of this cell.
full_transcripts = {}
for sub_dir in sorted(os.listdir(transcript_parent_dir)):  # sorted for a stable order
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    if not os.path.isdir(sub_transcript_dir):
        continue
    for file in sorted(os.listdir(sub_transcript_dir)):
        if not file.endswith(".srt"):
            continue
        file_path = os.path.join(sub_transcript_dir, file)
        text = " ".join(entry["text"] for entry in parse_srt(file_path))
        # Only the printed warning matters here; the returned length is unused.
        tokenized_text = check_truncation(text, tokenizer)
        full_transcripts[file] = {"text": text, "embedding": get_embedding(text)}

# Process every slide image that has both a transcript and an OCR file.
for sub_dir in sorted(os.listdir(image_parent_dir)):  # sorted for a stable order
    sub_image_dir = os.path.join(image_parent_dir, sub_dir)
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    sub_aws_ocr_dir = os.path.join(aws_ocr_parent_dir, sub_dir)
    output_dir = os.path.join(output_parent_dir, sub_dir)
    # The output folder is created before the input folders are validated.
    os.makedirs(output_dir, exist_ok=True)

    # Skip entries that lack any of the three input folders.
    if (
        not os.path.isdir(sub_image_dir)
        or not os.path.isdir(sub_transcript_dir)
        or not os.path.isdir(sub_aws_ocr_dir)
    ):
        continue

    for file in sorted(os.listdir(sub_image_dir)):
        if not file.endswith(".png"):
            continue

        base_name = os.path.splitext(file)[0]
        # image_path and output_image are computed but not used further here.
        image_path = os.path.join(sub_image_dir, file)
        transcript_path = os.path.join(sub_transcript_dir, f"{base_name}.srt")
        ocr_path = os.path.join(sub_aws_ocr_dir, f"{base_name}.json")
        output_json = os.path.join(output_dir, f"{base_name}.json")
        output_image = os.path.join(output_dir, f"{base_name}_bbox.png")

        if not os.path.exists(transcript_path) or not os.path.exists(ocr_path):
            continue

        srt_entries = parse_srt(transcript_path)
        ocr_regions = load_aws_ocr(ocr_path)
        print(ocr_regions)
        # Step 1: match transcript lines to OCR regions, then save them.
        matched_results = match_transcript_to_regions(srt_entries, ocr_regions)
        print(f'This is the matched regions: {matched_results}')

        save_results(matched_results, output_json)


### Method: Semantic Matching:SPECTER

In [None]:
# For text = 0.8, figure = 0.6, model = specter, overlay = 60%

import os
import json
import torch
import re
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Parent directories. The processing loop below joins the same sub-directory
# name onto each of these four roots.
image_parent_dir = "/tmp/megha/Complete/slides/ICML_slides"  # slide images (*.png)
transcript_parent_dir = "/tmp/megha/Complete/transcripts/ICML_trans"  # per-slide transcripts (*.srt)
aws_ocr_parent_dir = "/tmp/megha/Complete/Layout/ICML_layout/json"  # per-slide OCR/layout JSON
output_parent_dir = "/tmp/megha/Complete/WITHOUT/ICML_res/SPECTER/T-1"  # matched-region JSON is written here


# Load the SPECTER model and its tokenizer
model = AutoModel.from_pretrained('allenai/specter')
tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
# Not referenced in the visible code of this cell; get_embedding uses a
# literal 512 instead.
MAX_TOKENS = 512  # token limit applied when embedding with SPECTER

# time from srt
def parse_srt(file_path):
    """Extract timestamps and text from an SRT subtitle file.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of dicts, in file order, with keys ``"start_time"`` and
        ``"end_time"`` (``HH:MM:SS,mmm`` strings) and ``"text"`` (the cue
        text with internal newlines replaced by spaces and stripped).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A cue ends at a blank line OR at the end of the file. Accepting the end
    # of the file keeps the final cue when there is no trailing blank line.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)",
        re.DOTALL,
    )

    entries = []
    for match in srt_pattern.finditer(content):
        _index, start_time, end_time, text = match.groups()
        text = text.replace("\n", " ").strip()
        entries.append({"start_time": start_time, "end_time": end_time, "text": text})

    return entries

# to get ocr_regions
def load_aws_ocr(file_path):
    """Load an OCR/layout JSON file and return its leaf ``LAYOUT_*`` regions.

    Every block whose ``BlockType`` starts with ``LAYOUT_`` becomes one
    region, unless one of its CHILD relationships points at another
    ``LAYOUT_*`` block; such container blocks are skipped with a printed
    notice. A region's text is the space-joined text of its LINE children;
    children of any other type contribute nothing.

    Args:
        file_path: Path to a UTF-8 JSON file with a top-level ``"Blocks"`` list.

    Returns:
        A list of dicts with keys ``"Text"``, ``"BoundingBox"``,
        ``"BlockType"`` and ``"Id"``, in the order the blocks appear.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    blocks = data.get("Blocks", [])

    # Index the blocks once so the per-region checks below are dictionary
    # lookups rather than a rescan of every block for each layout block.
    block_type_by_id = {}
    ocr_text_map = {}
    for block in blocks:
        block_type = block.get("BlockType", "")
        block_type_by_id[block.get("Id")] = block_type
        if block_type == "LINE":
            ocr_text_map[block["Id"]] = block.get("Text", "")

    regions = []
    for block in blocks:
        if not block.get("BlockType", "").startswith("LAYOUT_"):
            continue

        child_ids = [
            child_id
            for rel in (block.get("Relationships") or [])
            if rel.get("Type") == "CHILD"
            for child_id in rel.get("Ids", [])
        ]

        # A layout block that contains other layout blocks is only a
        # container; its children are emitted as regions instead.
        if any(
            (block_type_by_id.get(child_id) or "").startswith("LAYOUT_")
            for child_id in child_ids
        ):
            print(f"Parent block {block['Id']} has layout children, hence skipped.")
            continue

        text = [ocr_text_map[child_id] for child_id in child_ids if child_id in ocr_text_map]

        regions.append({
            "Text": " ".join(text),
            "BoundingBox": block["Geometry"]["BoundingBox"],
            "BlockType": block["BlockType"],
            "Id": block['Id']
        })

    return regions

# finding best matches
def get_embedding(text):
    """Embed *text* with the module-level ``tokenizer`` and ``model``.

    The text is tokenized with truncation at 512 tokens, and the hidden
    state at sequence position 0 of the model's last layer is returned. A
    notice is printed whenever the tokenized length equals 512; that also
    fires for input that is exactly 512 tokens long without being cut.

    Returns:
        The tensor ``last_hidden_state[:, 0, :]``, computed without
        gradient tracking.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    sequence_length = encoded['input_ids'].shape[1]
    if sequence_length == 512:
        print(f"Truncation occurred for text: {text[:50]}...")  # preview of the affected text

    with torch.no_grad():
        outputs = model(**encoded)
    return outputs.last_hidden_state[:, 0, :]

def get_best_matches(transcript_text, ocr_regions):
    """Find the OCR regions whose embedding is close to a transcript line.

    Embeddings come from ``get_embedding`` and are compared by cosine
    similarity.

    Args:
        transcript_text: One transcript line.
        ocr_regions: Region dicts with a ``"BlockType"`` and, normally, a
            ``"Text"``; a missing or ``None`` text is treated as empty.

    Returns:
        A new list of region dicts, in input order, each a copy of the input
        region with an added ``"Score"``. Only regions scoring strictly above
        the threshold are kept: 0.6 for ``LAYOUT_FIGURE`` blocks, 0.8 for
        everything else. The list is not sorted by score.
    """
    transcript_embedding = get_embedding(transcript_text)
    matches = []

    for region in ocr_regions:
        # `or ""` guarantees a string even when "Text" is absent or None.
        ocr_text = (region.get("Text") or "").strip()
        if not ocr_text:
            print("Skipping empty OCR text")  # Confirmation message
            continue
        else:
            print('Text not skipped')

        ocr_embedding = get_embedding(ocr_text)
        score = util.pytorch_cos_sim(transcript_embedding, ocr_embedding).item()
        threshold = 0.8 if region["BlockType"] != "LAYOUT_FIGURE" else 0.6

        print(f'This is the score: {score}')

        if score > threshold:
            print(f'This is the score: {score}')
            # Store the score on a copy: the same region dicts are reused for
            # every transcript line, so writing into them would overwrite the
            # score already recorded for earlier lines.
            matches.append({**region, "Score": score})

    print(f'These are the matches: {matches}, length of matches: {len(matches)}')

    return matches

def match_transcript_to_regions(srt_entries, ocr_regions):
    """Pair every transcript entry with its matching OCR regions.

    Args:
        srt_entries: Dicts with ``"start_time"``, ``"end_time"`` and ``"text"``.
        ocr_regions: Region dicts handed to ``get_best_matches`` unchanged.

    Returns:
        One dict per entry with keys ``"start_time"``, ``"end_time"``,
        ``"transcript"`` and ``"MatchedRegion"`` (the list of matches).
    """
    matched_results = []

    for entry in srt_entries:
        transcript = entry["text"]
        matches = get_best_matches(transcript, ocr_regions)
        print(f'Matched inside mat: {matches}')
        record = {
            "start_time": entry["start_time"],
            "end_time": entry["end_time"],
            "transcript": transcript,
            "MatchedRegion": matches,
        }
        matched_results.append(record)

    print(f'Matched results before: {matched_results}')
    return matched_results

# for getting entire ocr data
def extract_ocr_text(json_data):
    """Join the text of the direct children of every PAGE block.

    Args:
        json_data: Parsed OCR JSON with a ``"Blocks"`` list; each block has
            an ``"Id"`` and a ``"BlockType"``.

    Returns:
        The ``"Text"`` of each CHILD-related block of each PAGE block, in
        relationship order, joined with single spaces. Children that are
        missing from the file or carry no ``"Text"`` are ignored.
    """
    blocks = json_data["Blocks"]
    by_id = {blk["Id"]: blk for blk in blocks}

    pieces = []
    for page in blocks:
        if page["BlockType"] != "PAGE" or "Relationships" not in page:
            continue
        for rel in page["Relationships"]:
            if rel["Type"] != "CHILD":
                continue
            children = (by_id.get(child_id) for child_id in rel["Ids"])
            pieces.extend(
                child["Text"] for child in children if child and "Text" in child
            )

    return " ".join(pieces)

# save overlay and regions
def save_results(matched_results, output_json):
    """Write the matched results to *output_json* as indented JSON.

    The results are stored under the top-level key ``"MatchedRegions"``,
    the same key the other method cells in this notebook write, so every
    method's output file can be read the same way.

    Args:
        matched_results: JSON-serialisable list of per-entry match records.
        output_json: Destination path; the file is written as UTF-8.
    """
    with open(output_json, "w", encoding="utf-8") as file:
        json.dump({"MatchedRegions": matched_results}, file, indent=4)


def check_truncation(text, tokenizer):
    """Report whether *text* tokenizes to more tokens than the tokenizer allows.

    Args:
        text: Text to measure.
        tokenizer: Callable invoked as
            ``tokenizer(text, return_tensors="pt", truncation=False)`` and
            exposing ``model_max_length``.

    Returns:
        The untruncated token count, or ``None`` when tokenization failed
        (the error is printed rather than raised).
    """
    try:
        tokenized_text = tokenizer(text, return_tensors="pt", truncation=False)  # No truncation to check actual length
        token_len = tokenized_text.input_ids.shape[1]
    except Exception as e:
        print(f'The error while calculating token len: {e}')
        # There is no length to compare. Falling through here used to raise
        # UnboundLocalError on the comparison below.
        return None

    # NOTE(review): assumes model_max_length reflects the real model limit;
    # confirm it is set for this tokenizer.
    if token_len > tokenizer.model_max_length:
        # This cell loads SPECTER, so the warning names that model.
        print(f"Warning: Text exceeds SPECTER token limit ({token_len} > {tokenizer.model_max_length}) and may be truncated.")

    return token_len  # Return actual token length for further checks


# Build one joined transcript, plus its embedding, for every .srt file.
# NOTE(review): entries are keyed by bare file name, so files with the same
# name in different sub-directories overwrite each other; the dictionary is
# not read anywhere else in the visible code of this cell.
full_transcripts = {}
for sub_dir in sorted(os.listdir(transcript_parent_dir)):  # sorted for a stable order
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    if not os.path.isdir(sub_transcript_dir):
        continue
    for file in sorted(os.listdir(sub_transcript_dir)):
        if not file.endswith(".srt"):
            continue
        file_path = os.path.join(sub_transcript_dir, file)
        text = " ".join(entry["text"] for entry in parse_srt(file_path))
        # Only the printed warning matters here; the returned length is unused.
        tokenized_text = check_truncation(text, tokenizer)
        full_transcripts[file] = {"text": text, "embedding": get_embedding(text)}

# Process every slide image that has both a transcript and an OCR file.
for sub_dir in sorted(os.listdir(image_parent_dir)):  # sorted for a stable order
    sub_image_dir = os.path.join(image_parent_dir, sub_dir)
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    sub_aws_ocr_dir = os.path.join(aws_ocr_parent_dir, sub_dir)
    output_dir = os.path.join(output_parent_dir, sub_dir)
    # The output folder is created before the input folders are validated.
    os.makedirs(output_dir, exist_ok=True)

    # Skip entries that lack any of the three input folders.
    if (
        not os.path.isdir(sub_image_dir)
        or not os.path.isdir(sub_transcript_dir)
        or not os.path.isdir(sub_aws_ocr_dir)
    ):
        continue

    for file in sorted(os.listdir(sub_image_dir)):
        if not file.endswith(".png"):
            continue

        base_name = os.path.splitext(file)[0]
        # image_path and output_image are computed but not used further here.
        image_path = os.path.join(sub_image_dir, file)
        transcript_path = os.path.join(sub_transcript_dir, f"{base_name}.srt")
        ocr_path = os.path.join(sub_aws_ocr_dir, f"{base_name}.json")
        output_json = os.path.join(output_dir, f"{base_name}.json")
        output_image = os.path.join(output_dir, f"{base_name}_bbox.png")

        if not os.path.exists(transcript_path) or not os.path.exists(ocr_path):
            continue

        srt_entries = parse_srt(transcript_path)
        ocr_regions = load_aws_ocr(ocr_path)
        print(ocr_regions)
        # Step 1: match transcript lines to OCR regions, then save them.
        matched_results = match_transcript_to_regions(srt_entries, ocr_regions)
        print(f'This is the matched regions: {matched_results}')
        save_results(matched_results, output_json)


### Method: Semantic Matching:T5

In [None]:
import os
import json
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageDraw, ImageFont
import ast
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the Flan-T5 model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
# Half precision is kept for GPU runs only; on CPU float16 kernels may be missing
# or very slow, so the weights are loaded in float32 there.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", torch_dtype=model_dtype).to(device)

# Parent directories (one sub-folder per talk is expected under each of them)
image_parent_dir = "/tmp/megha/Complete/slides/ICML_slides"
transcript_parent_dir = "/tmp/megha/Complete/transcripts/ICML_trans"
aws_ocr_parent_dir = "/tmp/megha/Complete/Layout/ICML_layout/json"
output_parent_dir = "/tmp/megha/Complete/WITHOUT/ICML_res/T5"

# time from srt
def parse_srt(file_path):
    """Extracts timestamps and text from an SRT file.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of dicts with ``start_time``, ``end_time`` and ``text`` keys, one per
        cue in file order. Line breaks inside a cue are replaced by spaces.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A cue ends at the next blank line OR at the end of the file, so the final cue
    # is kept even when the file does not finish with an empty line.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)",
        re.DOTALL,
    )

    entries = []
    for match in srt_pattern.finditer(content):
        # The cue index is matched to anchor the pattern but is not needed afterwards
        _, start_time, end_time, text = match.groups()
        text = text.replace("\n", " ").strip()
        entries.append({"start_time": start_time, "end_time": end_time, "text": text})

    return entries

# to get ocr_regions
def load_aws_ocr(file_path):
    """Loads AWS OCR results and extracts recognized text with bounding boxes and IDs.

    Args:
        file_path: Path to a JSON file containing a top-level ``Blocks`` list.

    Returns:
        A list with one dict per ``LAYOUT_*`` block that has no ``LAYOUT_*`` child.
        Each dict holds ``Text`` (the block's LINE children joined by spaces),
        ``BoundingBox``, ``BlockType`` and ``Id``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    blocks = data.get("Blocks", [])

    # Text of every LINE block, keyed by block id
    ocr_text_map = {
        block["Id"]: block.get("Text", "")
        for block in blocks
        if block.get("BlockType") == "LINE"
    }

    # Ids of all layout blocks, collected once so that the "has a layout child"
    # test below is a set lookup instead of a rescan of every block.
    layout_ids = {
        block["Id"]
        for block in blocks
        if block.get("BlockType", "").startswith("LAYOUT_")
    }

    regions = []
    for block in blocks:
        if not block.get("BlockType", "").startswith("LAYOUT_"):
            continue

        child_blocks = [
            child_id
            for rel in block.get("Relationships", [])
            if rel["Type"] == "CHILD"
            for child_id in rel.get("Ids", [])
        ]

        # Container blocks are skipped; their nested layout blocks are emitted on their own
        if any(child_id in layout_ids for child_id in child_blocks):
            print(f"Parent block {block['Id']} has layout children, hence skipped.")
            continue

        text = [ocr_text_map[child_id] for child_id in child_blocks if child_id in ocr_text_map]

        regions.append({
            "Text": " ".join(text),
            "BoundingBox": block["Geometry"]["BoundingBox"],
            "BlockType": block["BlockType"],
            "Id": block['Id']
        })

    return regions

def extract_output_list(response):
    """Returns the bracketed list that follows 'Output List:' in the response text.

    The list is returned as a stripped string; "[]" is returned when no such
    list is present.
    """
    found = re.search(r"Output List:\s*(\[[^\]]*\])", response)
    return found.group(1).strip() if found else "[]"


# Additional function to check for figure relevance - just string matching 
def check_figure_relevance(transcript_text, figure_text):
    """Tells whether the transcript and the figure text share at least one word.

    Both texts are lower-cased and split on whitespace before comparison.
    """
    spoken = set(transcript_text.lower().split())
    shown = set(figure_text.lower().split())
    return not spoken.isdisjoint(shown)

# Ask flan-t5 for each ocr_text whether it is relevant or not for the transcript_text - for Region
def check_relevance(transcript, ocr_texts):
    """Asks the model, region by region, whether the OCR text is relevant to the transcript.

    Args:
        transcript: Transcript text for one subtitle entry.
        ocr_texts: Dict mapping a region label to a region dict; each region dict
            must provide a ``Text`` entry.

    Returns:
        A list with the region dicts whose generated answer contains "yes" or
        "relevant" (case-insensitive), in the iteration order of ``ocr_texts``.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"  # Check available device
    model.to(device)  # Move the model to the correct device
    relevant_texts_list = []
    for region in ocr_texts.values():
        input_text = f"Given the transcript: {transcript}, is this OCR text relevant? {region['Text']}"
        # The prompt is tokenized once; inputs are moved to the same device as the model
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
        output = model.generate(**inputs)
        result = tokenizer.decode(output[0], skip_special_tokens=True).lower()
        # NOTE(review): this is a substring test, so answers such as "irrelevant" or
        # "not relevant" are also accepted - confirm that this is intended.
        if "yes" in result or "relevant" in result:
            relevant_texts_list.append(region)
    print(f'This is the relevant_texts_dict: {relevant_texts_list}')

    return relevant_texts_list

# Getting best matches
def get_best_matches(transcript_text, ocr_regions):
    """Finds the best matching OCR regions for a transcript line using T5.

    Args:
        transcript_text: Text of one transcript entry.
        ocr_regions: List of region dicts providing ``Text``, ``BlockType``,
            ``BoundingBox`` and ``Id``.

    Returns:
        The value returned by ``check_relevance`` for these regions.
    """
    # Each region is keyed by a 1-based label and reduced to the fields that are saved later
    ocr_texts_extract = {
        f"Region: {number}": {
            "Text": region["Text"],
            "BlockType": region["BlockType"],
            "BoundingBox": region["BoundingBox"],
            "Id": region['Id']
        }
        for number, region in enumerate(ocr_regions, start=1)
    }

    # Relevance is decided per region by the model
    return check_relevance(transcript_text, ocr_texts_extract)


# Slide matching
def match_transcript_to_regions(srt_entries, ocr_regions):
    """Matches transcript lines to OCR regions using Flan-T5.

    Args:
        srt_entries: List of dicts with ``start_time``, ``end_time`` and ``text``.
        ocr_regions: List of OCR region dicts passed on to ``get_best_matches``.

    Returns:
        A list with one dict per successfully processed entry, holding
        ``start_time``, ``end_time``, ``transcript`` and ``MatchedRegion``.
        Entries whose processing raised an exception are reported and left out.
    """
    matched_results = []
    for entry in srt_entries:
        try:
            matches = get_best_matches(entry["text"], ocr_regions)
            print(f'these are matches: {matches}')
            matched_results.append({
                "start_time": entry["start_time"],
                "end_time": entry["end_time"],
                "transcript": entry["text"],
                "MatchedRegion": list(matches),
            })
            for match in matches:
                # Double quotes inside the single-quoted f-string: reusing the same
                # quote character is only valid from Python 3.12 on.
                print(f'This is match text: {match["Text"]}')
                print(f'This is bbox: {match["BoundingBox"]}')
            print(f'Formatted matched regions: {matched_results}')
        except Exception as e:
            # Best effort: one failing entry must not abort the whole slide
            print(f'Error: {e}')
    return matched_results


# for getting entire ocr data
def extract_ocr_text(json_data):
    """Joins the text of all direct children of PAGE blocks in an AWS OCR JSON.

    Children are looked up by id; ids that are unknown or whose block has no
    "Text" entry are ignored. The pieces are joined by single spaces.
    """
    blocks = json_data["Blocks"]
    by_id = {blk["Id"]: blk for blk in blocks}
    pieces = []

    for page in blocks:
        if page["BlockType"] != "PAGE" or "Relationships" not in page:
            continue
        for rel in page["Relationships"]:
            if rel["Type"] != "CHILD":
                continue
            children = (by_id.get(child_id) for child_id in rel["Ids"])
            pieces.extend(child["Text"] for child in children if child and "Text" in child)

    return " ".join(pieces)


# content overlap - to avoid animated slides (not used)
def check_content_overlap(text1, text2, threshold=0.90):
    """Checks if at least `threshold` proportion of text1 is present in text2.

    Args:
        text1: Text whose whitespace-separated words are looked up.
        text2: Text providing the words to look up in.
        threshold: Minimum fraction of ``text1`` words that must occur in ``text2``.

    Returns:
        True when the fraction reaches ``threshold``; False otherwise, and always
        False when ``text1`` contains no words.
    """
    words1 = text1.split()
    if not words1:
        return False
    # A set makes every membership test constant time instead of a scan of text2's words
    vocabulary = set(text2.split())
    match_count = sum(1 for word in words1 if word in vocabulary)
    return (match_count / len(words1)) >= threshold


# Prompts for similar slides 
def generate_prompt_slides(transcript_text, current_ocr_text, formatted_ocr_texts):
    """Builds the prompt that asks the model for slides covering similar topics.

    The prompt quotes the transcript and the current slide's OCR text, lists the
    candidate slides, states the expected output format and ends with "Output List:".
    """
    sections = (
        f"Given the transcript: \"{transcript_text}\" and the OCR text: \"{current_ocr_text}\", "
        "find the most relevant slides from the following list that discuss similar topics. "
        "Select slides where key concepts overlap.",
        f"Slides OCR Texts:\n{formatted_ocr_texts}",
        "Output format:\n['Slide <Slide Number>', .... ]",
        "Output List:",
    )
    # Sections are separated by one empty line
    return "\n\n".join(sections)


def save_results(matched_results, output_json):
    """Writes the matched results to a JSON file.

    Every result contributes its "start_time", "end_time" and "transcript"
    together with the regions found under its "MatchedRegion" key (stored as
    "MatchedRegions"). The list is wrapped in a top-level "MatchedRegions" key.
    """
    payload = [
        {
            "start_time": item["start_time"],
            "end_time": item["end_time"],
            "transcript": item["transcript"],
            # A missing "MatchedRegion" key yields an empty list
            "MatchedRegions": list(item.get("MatchedRegion", [])),
        }
        for item in matched_results
    ]

    with open(output_json, "w", encoding="utf-8") as handle:
        json.dump({"MatchedRegions": payload}, handle, indent=4)


# Collect the concatenated transcript text of every SRT file (no embeddings are computed here).
# NOTE: entries are keyed by file name only, so equal names in different folders overwrite each other.
full_transcripts = {}
for sub_dir in sorted(os.listdir(transcript_parent_dir)):  # sorted for a reproducible order
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    if not os.path.isdir(sub_transcript_dir):
        continue
    for file in sorted(os.listdir(sub_transcript_dir)):  # sorted file names
        if not file.endswith(".srt"):
            continue
        file_path = os.path.join(sub_transcript_dir, file)
        text = " ".join(entry["text"] for entry in parse_srt(file_path))
        full_transcripts[file] = {"text": text}


# Walk every talk folder and match each slide's transcript against its OCR regions
for sub_dir in sorted(os.listdir(image_parent_dir)):  # sorted for a reproducible order
    sub_image_dir = os.path.join(image_parent_dir, sub_dir)
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    sub_aws_ocr_dir = os.path.join(aws_ocr_parent_dir, sub_dir)
    output_dir = os.path.join(output_parent_dir, sub_dir)
    # NOTE: the output folder is created before the checks below, so skipped entries get one too
    os.makedirs(output_dir, exist_ok=True)

    # All three input folders must exist for this talk
    if not all(map(os.path.isdir, (sub_image_dir, sub_transcript_dir, sub_aws_ocr_dir))):
        continue

    for file in sorted(os.listdir(sub_image_dir)):  # sorted file names
        if not file.endswith(".png"):
            continue

        base_name = os.path.splitext(file)[0]
        image_path = os.path.join(sub_image_dir, file)
        transcript_path = os.path.join(sub_transcript_dir, f"{base_name}.srt")
        ocr_path = os.path.join(sub_aws_ocr_dir, f"{base_name}.json")
        output_json = os.path.join(output_dir, f"{base_name}.json")
        output_image = os.path.join(output_dir, f"{base_name}_bbox.png")

        # A slide is only processed when both its transcript and its OCR file are present
        if not (os.path.exists(transcript_path) and os.path.exists(ocr_path)):
            continue

        srt_entries = parse_srt(transcript_path)
        ocr_regions = load_aws_ocr(ocr_path)

        # Step 1: match transcript lines to OCR regions
        matched_results = match_transcript_to_regions(srt_entries, ocr_regions)
        print(f'This is the matching regions: {matched_results}')

        # Step 2 (similar-slide search) and bounding-box drawing are currently disabled
        save_results(matched_results, output_json)


### Method: Semantic Matching:Qwen

In [None]:
# LLM code with everything
# REGIONS Working
# Now will check Slides

# part 8 - vid_68 had some issue (check into if time)

import os
import json
import torch
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer, util
from PIL import Image, ImageDraw, ImageFont
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import ast

# Load Qwen model and tokenizer
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
# Run on the GPU when one is available, otherwise on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)

# Active dataset locations (NeurIPS): slide images, transcripts, layout OCR JSON and results
image_parent_dir = "/ssd_scratch/cvit/megha/Dataset/slides/NeurIPS_slides"
transcript_parent_dir = "/ssd_scratch/cvit/megha/Results/Corr_trans/NeurIPS_corr_trans" # use the corrected transcripts
aws_ocr_parent_dir = "/ssd_scratch/cvit/megha/Dataset/Layout/NeurIPS_layout/json"
output_parent_dir = "/ssd_scratch/cvit/megha/new/NeurIPS_res"

# The string literal below is not assigned to anything; it only keeps the
# alternative ICML locations around for reference.
''''# Parent directories
image_parent_dir = "/tmp/megha/Complete/slides/ICML_slides"
transcript_parent_dir = "/tmp/megha/Complete/transcripts/ICML_trans"
aws_ocr_parent_dir = "/tmp/megha/Complete/Layout/ICML_layout/json"
output_parent_dir = "/tmp/megha/Complete/WITHOUT/ICML_res/Qwen"'''

# time from srt
def parse_srt(file_path):
    """Extracts timestamps and text from an SRT file.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of dicts with ``start_time``, ``end_time`` and ``text`` keys, one per
        cue in file order. Line breaks inside a cue are replaced by spaces.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A cue is terminated by a blank line or by the end of the file; accepting the
    # latter keeps the last cue of files that do not end with an empty line.
    srt_pattern = re.compile(
        r"(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.*?)(?:\n\n|\Z)",
        re.DOTALL,
    )

    entries = []
    for match in srt_pattern.finditer(content):
        # The cue index only anchors the pattern; it is not part of the result
        _, start_time, end_time, text = match.groups()
        entries.append({
            "start_time": start_time,
            "end_time": end_time,
            "text": text.replace("\n", " ").strip(),
        })

    return entries

# to get ocr_regions
def load_aws_ocr(file_path):
    """Loads AWS OCR results and extracts recognized text with bounding boxes and IDs.

    Args:
        file_path: Path to a JSON file containing a top-level ``Blocks`` list.

    Returns:
        A list with one dict per ``LAYOUT_*`` block that has no ``LAYOUT_*`` child.
        Each dict holds ``Text`` (the block's LINE children joined by spaces),
        ``BoundingBox``, ``BlockType`` and ``Id``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    blocks = data.get("Blocks", [])

    def is_layout(block):
        # Layout blocks are recognised by their block type prefix
        return block.get("BlockType", "").startswith("LAYOUT_")

    # Text of every LINE block, keyed by block id
    ocr_text_map = {
        block["Id"]: block.get("Text", "")
        for block in blocks
        if block.get("BlockType") == "LINE"
    }

    # Collected once so the nested-layout test is a set lookup rather than a
    # scan over all blocks for every layout block.
    layout_ids = {block["Id"] for block in blocks if is_layout(block)}

    regions = []
    for block in blocks:
        if not is_layout(block):
            continue

        child_blocks = [
            child_id
            for rel in block.get("Relationships", [])
            if rel["Type"] == "CHILD"
            for child_id in rel.get("Ids", [])
        ]

        # Container blocks are skipped; their nested layout blocks are emitted on their own
        if not layout_ids.isdisjoint(child_blocks):
            print(f"Parent block {block['Id']} has layout children, hence skipped.")
            continue

        text = [ocr_text_map[child_id] for child_id in child_blocks if child_id in ocr_text_map]

        regions.append({
            "Text": " ".join(text),
            "BoundingBox": block["Geometry"]["BoundingBox"],
            "BlockType": block["BlockType"],
            "Id": block['Id']
        })

    return regions

def extract_output_list(response):
    """Pulls the first bracketed list following 'Output List:' out of the response.

    The result is the list as a stripped string, or "[]" when none is found.
    """
    hit = re.search(r"Output List:\s*(\[[^\]]*\])", response)
    if hit is None:
        return "[]"  # nothing that looks like a list follows the marker
    return hit.group(1).strip()

# for regions
def generate_prompt(transcript_text, ocr_texts):
    """Builds the prompt asking the model for the OCR regions matching a transcript line.

    The region descriptions in ocr_texts are listed one per line; the prompt
    states the expected output format and ends with "Output List:".
    """
    region_listing = "\n".join(ocr_texts)
    sections = (
        f"Given the transcript: \"{transcript_text}\", find the best matching OCR regions from the list below. "
        "Return only the most relevant ones, including figures if they contain relevant keywords.",
        f"OCR Regions:\n{region_listing}",
        "Output format:\n['Region: <Region Number>', ...]",
        "Output List:",
    )
    # Sections are separated by one empty line
    return "\n\n".join(sections)

# Additional function to check for figure relevance - just string matching 
def check_figure_relevance(transcript_text, figure_text):
    """Reports whether any lower-cased word of the transcript also occurs in the figure text."""
    figure_vocabulary = set(figure_text.lower().split())
    shared_words = set(transcript_text.lower().split()) & figure_vocabulary
    return bool(shared_words)

# Getting best matches
def get_best_matches(transcript_text, ocr_regions):
    """Finds the best matching OCR regions for a transcript line using Qwen.

    Args:
        transcript_text: Text of one transcript entry.
        ocr_regions: List of region dicts providing ``Text``, ``BlockType``,
            ``BoundingBox`` and ``Id``.

    Returns:
        A list of region dicts (``Text``, ``BlockType``, ``BoundingBox``, ``Id``)
        for every region label named in the model's output list, in output order.

    Raises:
        ValueError, SyntaxError: If the extracted output list is not a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    # Listing shown to the model; regions are numbered from 1
    ocr_texts = [
        f"Region {number}: {region['Text']} ({region['BlockType']})"
        for number, region in enumerate(ocr_regions, start=1)
    ]
    # Lookup table from the requested answer label to the data that is saved
    ocr_texts_extract = {
        f"Region: {number}": {
            "Text": region["Text"],
            "BlockType": region["BlockType"],
            "BoundingBox": region["BoundingBox"],
            "Id": region['Id']
        }
        for number, region in enumerate(ocr_regions, start=1)
    }

    prompt = generate_prompt(transcript_text, ocr_texts)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        # NOTE(review): max_length appears to bound prompt + generated tokens together,
        # so very long prompts leave little or no room for an answer - confirm.
        output = model.generate(**inputs, max_length=2000)

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    output_list = ast.literal_eval(extract_output_list(response))  # Convert to list

    matched_regions = []
    for label in output_list:
        if not isinstance(label, str):
            continue  # only textual labels can name a region
        # The listing says "Region N:" while the requested answer format is
        # "Region: N"; both spellings (any case/spacing) map to the same key.
        parsed = re.fullmatch(r"\s*region\s*:?\s*(\d+)\s*", label, re.IGNORECASE)
        key = f"Region: {int(parsed.group(1))}" if parsed else label
        if key in ocr_texts_extract:
            matched_regions.append(ocr_texts_extract[key])

    return matched_regions

# Slide matching
def match_transcript_to_regions(srt_entries, ocr_regions):
    """Pairs every transcript entry with the OCR regions selected by Qwen.

    Returns one dict per successfully processed entry with "start_time",
    "end_time", "transcript" and "MatchedRegion" (matches with non-empty text).
    An entry whose processing raises is reported and left out of the result.
    """
    matched_results = []
    for entry in srt_entries:
        try:
            matches = get_best_matches(entry["text"], ocr_regions)
            print(f'Checking matches: {matches}')
            record = {
                "start_time": entry["start_time"],
                "end_time": entry["end_time"],
                "transcript": entry["text"],
            }
            # Matches without any text are dropped
            record["MatchedRegion"] = [candidate for candidate in matches if candidate["Text"]]
            matched_results.append(record)
        except Exception as e:
            print(f'An error ocurred: {e}')
    return matched_results


# for getting entire ocr data
def extract_ocr_text(json_data):
    """Collects the text of the direct children of all PAGE blocks, joined by spaces.

    Child ids that are unknown, or whose block has no "Text" entry, are ignored.
    """
    all_blocks = json_data["Blocks"]
    lookup = {item["Id"]: item for item in all_blocks}
    collected = []

    for item in all_blocks:
        is_page_with_links = item["BlockType"] == "PAGE" and "Relationships" in item
        if not is_page_with_links:
            continue
        for link in item["Relationships"]:
            if link["Type"] != "CHILD":
                continue
            for child_id in link["Ids"]:
                child = lookup.get(child_id)
                if child and "Text" in child:
                    collected.append(child["Text"])

    return " ".join(collected)


# content overlap - to avoid animated slides (not used)
def check_content_overlap(text1, text2, threshold=0.90):
    """Checks if at least `threshold` proportion of text1 is present in text2.

    Args:
        text1: Text whose whitespace-separated words are looked up.
        text2: Text providing the words to look up in.
        threshold: Minimum fraction of ``text1`` words that must occur in ``text2``.

    Returns:
        True when the fraction reaches ``threshold``; False otherwise, and always
        False when ``text1`` contains no words.
    """
    words1 = text1.split()
    if not words1:
        return False
    # Looking words up in a set avoids rescanning text2's word list for every word of text1
    words2 = set(text2.split())
    match_count = sum(word in words2 for word in words1)
    return (match_count / len(words1)) >= threshold


# To save results
def save_results(matched_results, output_json):
    """Saves the matched results as JSON in the required format.

    Args:
        matched_results: List of dicts with ``start_time``, ``end_time``,
            ``transcript`` and the matched regions under ``MatchedRegion``
            (the key produced by ``match_transcript_to_regions``); the plural
            ``MatchedRegions`` is accepted as well.
        output_json: Path of the JSON file to write.

    The file contains ``{"MatchedRegions": [...]}`` with one entry per result,
    each carrying its regions under ``MatchedRegions``.
    """
    formatted_results = []

    for result in matched_results:
        # The matcher stores regions under the singular key; reading the plural key
        # only (and appending to a key missing from the entry) dropped every match.
        regions = result.get("MatchedRegion") or result.get("MatchedRegions") or []
        formatted_results.append({
            "start_time": result["start_time"],
            "end_time": result["end_time"],
            "transcript": result["transcript"],
            "MatchedRegions": list(regions),
        })

    with open(output_json, "w", encoding="utf-8") as file:
        json.dump({"MatchedRegions": formatted_results}, file, indent=4)


# Collect the concatenated transcript text of every SRT file (no embeddings are computed here).
# NOTE: entries are keyed by file name only, so equal names in different folders overwrite each other.
full_transcripts = {}
for sub_dir in sorted(os.listdir(transcript_parent_dir)):  # sorted for a reproducible order
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    if not os.path.isdir(sub_transcript_dir):
        continue
    for file in sorted(os.listdir(sub_transcript_dir)):  # sorted file names
        if not file.endswith(".srt"):
            continue
        file_path = os.path.join(sub_transcript_dir, file)
        text = " ".join(entry["text"] for entry in parse_srt(file_path))
        full_transcripts[file] = {"text": text}


# Walk every talk folder and process the slides that have no result file yet
for sub_dir in sorted(os.listdir(image_parent_dir)):  # sorted for a reproducible order
    sub_image_dir = os.path.join(image_parent_dir, sub_dir)
    sub_transcript_dir = os.path.join(transcript_parent_dir, sub_dir)
    sub_aws_ocr_dir = os.path.join(aws_ocr_parent_dir, sub_dir)
    output_dir = os.path.join(output_parent_dir, sub_dir)
    # NOTE: the output folder is created before the checks below, so skipped entries get one too
    os.makedirs(output_dir, exist_ok=True)

    # All three input folders must exist for this talk
    if not all(map(os.path.isdir, (sub_image_dir, sub_transcript_dir, sub_aws_ocr_dir))):
        continue

    for file in sorted(os.listdir(sub_image_dir)):  # sorted file names
        if not file.endswith(".png"):
            continue

        base_name = os.path.splitext(file)[0]
        file_path = os.path.join(output_parent_dir, sub_dir, f'{base_name}.json')
        print(f'This is file path: {file_path}')
        # An existing result file means the slide was handled by an earlier run
        if os.path.exists(file_path):
            print(f'It is present: {file_path}')
            continue

        print(f'This is the basename: {base_name}')
        image_path = os.path.join(sub_image_dir, file)
        transcript_path = os.path.join(sub_transcript_dir, f"{base_name}.srt")
        ocr_path = os.path.join(sub_aws_ocr_dir, f"{base_name}.json")
        output_json = os.path.join(output_dir, f"{base_name}.json")
        output_image = os.path.join(output_dir, f"{base_name}_bbox.png")

        # A slide is only processed when both its transcript and its OCR file are present
        if not (os.path.exists(transcript_path) and os.path.exists(ocr_path)):
            continue

        srt_entries = parse_srt(transcript_path)
        ocr_regions = load_aws_ocr(ocr_path)
        matched_results = match_transcript_to_regions(srt_entries, ocr_regions)
        print(f'This is the matching regions: {matched_results}')
        save_results(matched_results, output_json)
