# Audio to JSON (Transcription & Chunking)

This notebook converts lecture audio files into structured JSON documents.

Pipeline:
1. Load MP3 lecture audio files
2. Transcribe audio using OpenAI Whisper (large-v2)
3. Merge Whisper segments into RAG-optimized chunks
4. Store timestamps, lecture metadata, and text
5. Save output as reusable JSON files

Chunking Strategy:
- Time-based chunking (15–25 seconds)
- Small overlap to preserve context
- Designed for embedding and retrieval performance


### Imports & Paths

In [None]:
import os
import json
import whisper
from tqdm import tqdm


In [None]:
# Project root; the audio inputs and JSON outputs live in sub-folders of it.
BASE_DIR = "/content/drive/MyDrive/RAG_BAS_PROJECT"
AUDIO_DIR, JSON_DIR = (
    os.path.join(BASE_DIR, subfolder) for subfolder in ("AUDIOS", "jsons")
)

# Only the output folder is created here; AUDIO_DIR is not created or checked.
os.makedirs(JSON_DIR, exist_ok=True)
print("Directories ready")


### Load Whisper Model

In [None]:
# Load the Whisper checkpoint once; the transcription loop reuses `model`.
model = whisper.load_model(name="large-v2")
print("Whisper large-v2 model loaded")


## Chunking Parameters

- MAX_DURATION: Target chunk length (seconds)
- OVERLAP: Context overlap between chunks


In [None]:
# Both values are in seconds:
#   MAX_DURATION - span at which the chunker closes the current chunk
#   OVERLAP      - context overlap between consecutive chunks
MAX_DURATION, OVERLAP = 20, 3


### Chunk Creation Function

In [None]:
def create_chunks(segments, lecture_number, lecture_title,
                  max_duration=None, overlap=None):
    """Merge transcript segments into time-bounded chunks with text overlap.

    Segments are accumulated in order. Once the span from the first
    accumulated segment's start to the current segment's end reaches
    ``max_duration`` seconds, a chunk is emitted. The trailing segments that
    start within the last ``overlap`` seconds of that chunk are then repeated
    at the head of the next chunk, so every chunk's "Start"/"End" bracket
    exactly the segments whose text it contains. Overlap works at segment
    granularity and may therefore be empty when no segment starts inside the
    overlap window.

    Args:
        segments: Iterable of dicts with "start", "end" and "text" keys,
            in chronological order.
        lecture_number: Stored verbatim under "Number" in every chunk.
        lecture_title: Stored verbatim under "Title" in every chunk.
        max_duration: Chunk span (seconds) that triggers emission. Defaults
            to the module-level ``MAX_DURATION``.
        overlap: Size (seconds) of the window whose segments are repeated in
            the next chunk. Defaults to the module-level ``OVERLAP``.

    Returns:
        A list of dicts with keys "Number", "Title", "Start", "End" (both
        rounded to 2 decimals) and "Text" (stripped segment texts joined by
        single spaces). Empty input yields an empty list.

    Raises:
        ValueError: If ``max_duration`` is not positive, or ``overlap`` is
            negative or not smaller than ``max_duration``.
    """
    if max_duration is None:
        max_duration = MAX_DURATION
    if overlap is None:
        overlap = OVERLAP
    if max_duration <= 0:
        raise ValueError("max_duration must be positive")
    if not 0 <= overlap < max_duration:
        raise ValueError("overlap must satisfy 0 <= overlap < max_duration")

    def build_chunk(window):
        # Skip segments whose text is empty after stripping so the joined
        # text never contains doubled spaces.
        texts = (seg["text"].strip() for seg in window)
        return {
            "Number": lecture_number,
            "Title": lecture_title,
            "Start": round(window[0]["start"], 2),
            "End": round(window[-1]["end"], 2),
            "Text": " ".join(text for text in texts if text),
        }

    chunks = []
    window = []   # segments that make up the chunk being built
    pending = 0   # segments in `window` not yet emitted in any chunk

    for seg in segments:
        window.append(seg)
        pending += 1

        if seg["end"] - window[0]["start"] < max_duration:
            continue

        chunks.append(build_chunk(window))

        # Carry over the contiguous tail of segments that start inside the
        # overlap window. Index 0 is never carried, so the window always
        # shrinks and the loop makes progress.
        keep_from = len(window)
        if overlap > 0:
            cutoff = seg["end"] - overlap
            while keep_from > 1 and window[keep_from - 1]["start"] >= cutoff:
                keep_from -= 1
        window = window[keep_from:]
        pending = 0

    # Emit the tail only if it holds new material; carried-over segments
    # alone were already part of the previous chunk.
    if pending:
        chunks.append(build_chunk(window))

    return chunks


#### List Audio Files

In [None]:
# Gather the names in AUDIO_DIR that end in ".mp3", then order them
# alphabetically.
audio_files = [name for name in os.listdir(AUDIO_DIR) if name.endswith(".mp3")]
audio_files.sort()

print(f"Found {len(audio_files)} audio files")


#### Transcribe & Save JSON

In [None]:
# Transcribe every listed audio file and store one JSON document per lecture.
for audio in tqdm(audio_files):
    # File names are expected to look like "<number>_<title>.mp3".
    # splitext removes only the trailing extension, and partition lets us
    # detect a missing "_" instead of failing with IndexError.
    stem = os.path.splitext(audio)[0]
    lecture_number, separator, lecture_title = stem.partition("_")
    if not separator:
        print(f"Skipping (unexpected file name, no '_'): {audio}")
        continue

    audio_path = os.path.join(AUDIO_DIR, audio)
    output_json_path = os.path.join(
        JSON_DIR,
        f"{lecture_number}_{lecture_title}.json"
    )

    # Skip if already processed
    if os.path.exists(output_json_path):
        print(f"Skipping (already exists): {lecture_number}_{lecture_title}")
        continue

    print(f"\nTranscribing: {audio}")

    result = model.transcribe(
        audio=audio_path,
        language="en",
        task="transcribe",
        word_timestamps=False
    )

    chunks = create_chunks(
        segments=result["segments"],
        lecture_number=lecture_number,
        lecture_title=lecture_title
    )

    output = {
        "lecture_number": lecture_number,
        "lecture_title": lecture_title,
        "full_text": result["text"],
        "chunks": chunks
    }

    # Write to a temporary file and rename it into place. An interrupted run
    # then never leaves a truncated JSON at the final path, which the
    # "already exists" check above would otherwise skip on every later run.
    tmp_json_path = output_json_path + ".tmp"
    with open(tmp_json_path, "w", encoding="utf-8") as f:
        # With an explicit UTF-8 file, non-ASCII characters can be stored
        # as-is rather than as \uXXXX escapes.
        json.dump(output, f, indent=2, ensure_ascii=False)
    os.replace(tmp_json_path, output_json_path)

    print(f"Saved JSON: {output_json_path}")


In [None]:
# Final cell: prints a completion message. It runs unconditionally when the
# cell is executed and does not itself verify any of the outputs.
print("03_audio_to_json.ipynb completed successfully.")
