In [None]:
# Run this cell to install Whisper in Colab (the leading `!` runs the line as a shell command).
!pip install git+https://github.com/openai/whisper.git

In [None]:
import whisper
import json
import os
import math

# Load the Whisper "large-v2" model once, up front, so the same `model`
# object can be reused for every transcription call instead of reloading it
# per file.
model = whisper.load_model("large-v2")

In [None]:
# Configuration
n = 5  # Number of consecutive Whisper segments merged into one chunk
input_folder = "audios"
output_folder = "jsons"


def parse_audio_name(filename):
    """Split a "Number - Title.ext" file name into ``(number, title)``.

    The extension is removed with ``os.path.splitext`` so extensions of any
    length (".mp3", ".flac", ".webm", ...) are handled, and only the FIRST
    " - " is treated as the separator so titles that themselves contain
    " - " are kept intact.

    Returns ``None`` when the name does not contain the " - " separator.
    """
    stem, _ext = os.path.splitext(filename)
    number, separator, title = stem.partition(" - ")
    if not separator:
        return None
    return number, title


def merge_segments(segments, number, title, group_size):
    """Merge every ``group_size`` consecutive segments into one chunk.

    ``segments`` is a sequence of dicts with "start", "end" and "text" keys
    (the shape of ``result["segments"]`` as used below). Each returned chunk
    spans from the first segment's start to the last segment's end of its
    group; the final group may be shorter than ``group_size``.

    Raises ``ValueError`` if ``group_size`` is smaller than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")

    merged = []
    for first in range(0, len(segments), group_size):
        group = segments[first:first + group_size]
        merged.append({
            "number": number,
            "title": title,
            "start": group[0]["start"],
            "end": group[-1]["end"],
            # Strip each piece before joining so segment texts that carry
            # their own leading/trailing whitespace do not produce doubled
            # spaces in the merged text.
            "text": " ".join(piece["text"].strip() for piece in group),
        })
    return merged


os.makedirs(output_folder, exist_ok=True)

# Branch instead of calling exit(): at the top level of a notebook, exit()
# can shut the kernel down, whereas skipping the work has the same effect
# when run as a plain script.
if not os.path.isdir(input_folder):
    print(f"Error: '{input_folder}' folder not found.")
else:
    # Sorted so files are processed in a predictable order on every run.
    files = sorted(os.listdir(input_folder))
    print(f"Found {len(files)} files in '{input_folder}'. Starting process...\n")

    for audio in files:
        audio_path = os.path.join(input_folder, audio)

        # Ignore hidden entries (e.g. .DS_Store) and sub-directories.
        if audio.startswith('.') or not os.path.isfile(audio_path):
            continue

        # Expected file name format: "Number - Title.ext"
        parsed = parse_audio_name(audio)
        if parsed is None:
            print(f"Skipping {audio}: Filename format mismatch.")
            continue
        number, title = parsed

        print(f"Processing: {number} - {title}")

        # One failing file must not abort the whole batch, so errors are
        # reported per file and the loop moves on.
        try:
            # task="transcribe" keeps the text in the spoken language. No
            # `language` argument is passed here; see the Whisper
            # documentation for forcing a specific language.
            result = model.transcribe(audio=audio_path,
                                      task="transcribe",
                                      word_timestamps=False)

            merged_chunks = merge_segments(result["segments"], number, title, n)

            final_data = {"chunks": merged_chunks, "text": result["text"]}

            # Saved into the output folder, indented for readability;
            # ensure_ascii=False keeps non-English text human-readable in
            # the UTF-8 file instead of \uXXXX escapes.
            output_path = os.path.join(output_folder, f"{number} - {title}.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(final_data, f, indent=4, ensure_ascii=False)

            print(f"Saved: {output_path}")

        except Exception as e:
            print(f"Error processing {audio}: {e}")

    print("\nAll files processed and merged!")