In [64]:
import json
import os
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from openai import OpenAI
from pydub import AudioSegment

In [65]:
# Pull environment variables from the .env file in the current working directory.
env_path = Path(".env")
load_dotenv(dotenv_path=env_path)

# Single shared OpenAI client, authenticated with the key read from the environment.
openai_api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [74]:
# Transcribe audio file using OpenAI Whisper API with timestamp data

def transcribe_with_timestamps(audio_file_path, model="whisper-1"):
    """Send an audio file to the transcription endpoint and return its segments.

    The request asks for the ``verbose_json`` response format with
    segment-level timestamp granularity.

    Args:
        audio_file_path: Path of the audio file; it is opened in binary mode.
        model: Name of the transcription model passed to the API.

    Returns:
        The ``segments`` attribute of the API response, or ``None`` when any
        exception is raised (the error message is printed, not re-raised).
    """
    print(f"Starting transcription for: {audio_file_path}")

    request_options = {
        "model": model,
        "response_format": "verbose_json",
        "timestamp_granularities": ["segment"],
    }

    try:
        # The file handle only needs to stay open while the request is in flight.
        with open(audio_file_path, "rb") as audio_handle:
            response = client.audio.transcriptions.create(
                file=audio_handle,
                **request_options,
            )
        found_segments = response.segments
        print(f"Transcription completed. Found {len(found_segments)} segments")
    except Exception as error:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error during transcription: {str(error)}")
        return None

    return found_segments


In [75]:
# Smoke-test the transcription helper on a sample recording and show the raw segments.
# NOTE(review): this is an absolute path on one machine; the cell will not run elsewhere.
print(
    transcribe_with_timestamps(
        "/Users/mehmetpektas/Documents/Belgeler - Mehmet MacBook Air/Documents/Services/Creating_Dataset_With_Whisper/Talking_Book(mp3cut.net).mp3"
    )
)

Starting transcription for: /Users/mehmetpektas/Documents/Belgeler - Mehmet MacBook Air/Documents/Services/Creating_Dataset_With_Whisper/Talking_Book(mp3cut.net).mp3
Transcription completed. Found 23 segments
[TranscriptionSegment(id=0, avg_logprob=-0.3386313319206238, compression_ratio=1.26630437374115, end=5.480000019073486, no_speech_prob=0.027056396007537842, seek=0, start=0.0, temperature=0.0, text=' BİLGELİK TAPINAĞINA AÇILAN KAPI', tokens=[50364, 363, 20320, 43, 38, 3158, 20320, 42, 314, 4715, 1464, 32, 128, 252, 1464, 32, 316, 27707, 4620, 1770, 591, 4715, 40, 50638]), TranscriptionSegment(id=1, avg_logprob=-0.3386313319206238, compression_ratio=1.26630437374115, end=8.720000267028809, no_speech_prob=0.027056396007537842, seek=0, start=5.480000019073486, temperature=0.0, text=' 14. Bölüm 6. HİS', tokens=[50638, 3499, 13, 363, 13072, 8966, 1386, 13, 389, 20320, 50, 50800]), TranscriptionSegment(id=2, avg_logprob=-0.3386313319206238, compression_ratio=1.26630437374115, end=11.279

In [76]:
#Split audio file into segments based on timestamp data

def split_audio_by_segments(audio_file_path, segments, output_dir="output_segments"):
    """Cut an audio file into one WAV clip per transcript segment.

    Args:
        audio_file_path: Path of the audio file to load with pydub.
        segments: Sequence of objects exposing ``start`` and ``end``
            (in seconds) and ``text`` attributes.
        output_dir: Directory that receives the clips. It is created,
            together with any missing parent directories, if absent.

    Returns:
        A list with one dict per segment, in input order, holding ``id``
        (1-based), ``filename``, ``start``, ``end``, ``duration``, ``text``
        and ``file_path`` (the file name, relative to ``output_dir``).
    """
    print(f"Loading audio file: {audio_file_path}")

    audio = AudioSegment.from_file(audio_file_path)
    # os.makedirs also creates missing parents, so nested output paths work.
    os.makedirs(output_dir, exist_ok=True)

    segments_data = []

    print(f"Splitting audio into {len(segments)} segments...")

    for i, segment in enumerate(segments):
        # pydub slices by milliseconds, while the segment bounds are in seconds.
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)

        # Extract the clip for this segment.
        segment_audio = audio[start_ms:end_ms]

        filename = f"segment_{i+1:03d}.wav"
        output_path = os.path.join(output_dir, filename)

        segment_audio.export(output_path, format="wav")

        # Only the file name is stored so the metadata stays valid if the
        # output directory is moved.
        segment_data = {
            "id": i + 1,
            "filename": filename,
            "start": segment.start,
            "end": segment.end,
            "duration": segment.end - segment.start,
            "text": segment.text.strip(),
            "file_path": filename,
        }

        segments_data.append(segment_data)

        # Report progress every ten segments.
        if (i + 1) % 10 == 0:
            print(f"Processed {i + 1}/{len(segments)} segments")

    print(f"Audio splitting completed. All segments saved to: {output_dir}")
    return segments_data