In [5]:
import json
import os
import time
from pathlib import Path

import pandas as pd
import requests
from dotenv import load_dotenv
from openai import OpenAI
from pydub import AudioSegment

In [6]:
# Pull settings from a ".env" file in the current working directory, then
# build the shared OpenAI client used by the transcription helper below.
env_path = Path(".env")
load_dotenv(dotenv_path=env_path)

openai_api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=openai_api_key)

In [7]:
# Transcribe audio file using OpenAI Whisper API with timestamp data


def transcribe_with_timestamps(audio_file_path, model="whisper-1"):
    """Transcribe one audio file and request segment-level timestamps.

    Args:
        audio_file_path: Path of the audio file, opened in binary mode.
        model: Transcription model name passed to the API.

    Returns:
        The transcription object returned by the module-level ``client``,
        or ``None`` if anything raised (the error is printed, not re-raised).
    """
    print(f"Starting transcription for: {audio_file_path}")

    request_options = {
        "model": model,
        "response_format": "verbose_json",
        "timestamp_granularities": ["segment"],
    }

    try:
        with open(audio_file_path, "rb") as audio_stream:
            result = client.audio.transcriptions.create(
                file=audio_stream, **request_options
            )
        segment_count = len(result.segments)
        print(f"Transcription completed. Found {segment_count} segments")
    except Exception as error:
        # Best-effort: callers treat None as "nothing transcribed".
        print(f"Error during transcription: {str(error)}")
        return None

    return result

In [None]:
# Split audio file into segments based on timestamp data


def split_audio_by_segments(audio_file_path, segments, output_dir="output_segments"):
    """Cut an audio file into one WAV clip per transcript segment.

    Args:
        audio_file_path: Path of the audio file loaded through pydub.
        segments: Sized sequence of objects exposing ``start`` and ``end``
            (in seconds) and ``text``.
        output_dir: Directory that receives the clips. It is created,
            including any missing parent directories, when absent.

    Returns:
        A list with one dict per segment containing ``id`` (1-based),
        ``filename``, ``start``, ``end``, ``duration``, ``text`` and
        ``file_path`` (the filename relative to ``output_dir``).
    """
    print(f"Loading audio file: {audio_file_path}")

    audio = AudioSegment.from_file(audio_file_path)
    # parents=True so a nested output directory does not raise FileNotFoundError.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    segments_data = []
    total_segments = len(segments)

    print(f"Splitting audio into {total_segments} segments...")

    for segment_id, segment in enumerate(segments, start=1):
        # pydub slices by milliseconds; the transcript reports seconds.
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)

        filename = f"segment_{segment_id:03d}.wav"
        output_path = os.path.join(output_dir, filename)
        audio[start_ms:end_ms].export(output_path, format="wav")

        segments_data.append(
            {
                "id": segment_id,
                "filename": filename,
                "start": segment.start,
                "end": segment.end,
                "duration": segment.end - segment.start,
                "text": segment.text.strip(),
                # Relative to output_dir so the dataset stays portable.
                "file_path": filename,
            }
        )

        if segment_id % 10 == 0:
            print(f"Processed {segment_id}/{total_segments} segments")

    print(f"Audio splitting completed. All segments saved to: {output_dir}")
    return segments_data

In [None]:
# Save the dataset to CSV file in the same directory as audio segments


def save_dataset(segments_data, output_dir="output_segments"):
    """Write the segment records to ``dataset.csv`` inside ``output_dir``.

    Args:
        segments_data: List of per-segment dicts; each becomes one CSV row.
        output_dir: Directory for the CSV file. It is created, including
            missing parents, if it does not exist yet.

    Returns:
        The path of the written CSV file as a string.
    """
    # Make the function usable on its own, not only after the audio split
    # step has already created the directory.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    csv_file_path = os.path.join(output_dir, "dataset.csv")
    print(f"Saving dataset to: {csv_file_path}")

    dataset = pd.DataFrame(segments_data)
    dataset.to_csv(csv_file_path, index=False, encoding="utf-8")

    print(f"Dataset saved successfully with {len(segments_data)} entries")
    print(f"CSV file location: {csv_file_path}")

    return csv_file_path

In [None]:
# Upload size limit of the transcription endpoint, in megabytes.
_MAX_UPLOAD_MB = 25
# Exported chunks above this size (MB) are halved again to keep a safety margin.
_RESPLIT_THRESHOLD_MB = 20
# ffmpeg arguments for temporary chunks: 16 kHz sample rate, mono, 64 kbps.
_CHUNK_EXPORT_PARAMS = ["-ar", "16000", "-ac", "1", "-b:a", "64k"]


def _export_chunk(audio_slice, filename):
    """Export ``audio_slice`` as a compressed MP3 and return the file size in MB."""
    audio_slice.export(filename, format="mp3", parameters=list(_CHUNK_EXPORT_PARAMS))
    return os.path.getsize(filename) / (1024 * 1024)


def _write_chunk_files(audio, chunk_duration_ms):
    """Cut ``audio`` into temporary MP3 files of at most ``chunk_duration_ms``.

    Returns a list of dicts with ``filename`` and ``start_offset`` (seconds
    from the beginning of the original audio).
    """
    chunks = []
    chunk_starts = range(0, len(audio), chunk_duration_ms)

    for chunk_number, start_ms in enumerate(chunk_starts, start=1):
        chunk = audio[start_ms:start_ms + chunk_duration_ms]
        chunk_filename = f"temp_chunk_{chunk_number}.mp3"

        chunk_size_mb = _export_chunk(chunk, chunk_filename)
        print(f"Created {chunk_filename} - Size: {chunk_size_mb:.1f} MB")

        if chunk_size_mb <= _RESPLIT_THRESHOLD_MB:
            chunks.append({
                'filename': chunk_filename,
                'start_offset': start_ms / 1000,  # in seconds
            })
            continue

        # Still too big after compression: replace it with two halves.
        print(f"Chunk {chunk_filename} is too large ({chunk_size_mb:.1f} MB), splitting further...")
        os.remove(chunk_filename)

        sub_chunk_duration = max(1, chunk_duration_ms // 2)
        sub_starts = range(0, len(chunk), sub_chunk_duration)
        for part_number, sub_start_ms in enumerate(sub_starts, start=1):
            sub_chunk = chunk[sub_start_ms:sub_start_ms + sub_chunk_duration]
            sub_chunk_filename = f"temp_chunk_{chunk_number}_{part_number}.mp3"

            sub_chunk_size = _export_chunk(sub_chunk, sub_chunk_filename)
            print(f"Created sub-chunk {sub_chunk_filename} - Size: {sub_chunk_size:.1f} MB")

            chunks.append({
                'filename': sub_chunk_filename,
                'start_offset': (start_ms + sub_start_ms) / 1000,
            })

    return chunks


def process_large_audio_file(audio_file_path, chunk_duration_minutes=3):
    """Transcribe and segment an audio file that is too large for one upload.

    The audio is cut into temporary compressed MP3 chunks, each chunk is
    transcribed and split into WAV clips, and all clips are then gathered in
    ``output_segments`` together with a consolidated ``dataset.csv``.

    Args:
        audio_file_path: Path of the source audio file.
        chunk_duration_minutes: Upper bound for the length of one chunk.
            The effective length may be shorter (never below one minute)
            depending on the size estimate.

    Returns:
        A list of segment dicts with timestamps relative to the original
        file, sequential ``id`` values and final filenames. When the file is
        both short enough and small enough for a direct upload, the result
        of ``process_audio_file`` is returned instead (which may be ``None``).
    """
    print(f"Processing large audio file: {audio_file_path}")

    audio = AudioSegment.from_file(audio_file_path)
    duration_minutes = len(audio) / (1000 * 60)
    file_size_mb = os.path.getsize(audio_file_path) / (1024 * 1024)

    print(f"Audio duration: {duration_minutes:.2f} minutes")

    # A short file may still exceed the upload limit (e.g. uncompressed audio),
    # so the direct path requires both conditions. Empty audio has nothing to
    # chunk and is handed to the direct path as before.
    short_enough = duration_minutes <= chunk_duration_minutes
    if len(audio) == 0 or (short_enough and file_size_mb <= _MAX_UPLOAD_MB):
        print("File is small enough, processing directly...")
        return process_audio_file(audio_file_path)

    # Rough, deliberately pessimistic size estimate (source size per minute
    # times five) used to pick a chunk length that targets ~15 MB per chunk.
    estimated_audio_size_per_minute = file_size_mb / duration_minutes * 5
    target_chunk_minutes = min(15 / estimated_audio_size_per_minute, chunk_duration_minutes)
    target_chunk_minutes = max(1, target_chunk_minutes)  # Minimum 1 minute

    print(f"Estimated audio size per minute: {estimated_audio_size_per_minute:.1f} MB")
    print(f"Target chunk duration: {target_chunk_minutes:.1f} minutes")

    chunk_duration_ms = int(target_chunk_minutes * 60 * 1000)
    print(f"Splitting into {target_chunk_minutes:.1f}-minute chunks...")
    chunks = _write_chunk_files(audio, chunk_duration_ms)
    print(f"Created {len(chunks)} chunks")

    all_segments_data = []
    # Parallel to all_segments_data: the directory each clip was written to,
    # so consolidation never has to guess by probing every chunk directory.
    segment_source_dirs = []

    for idx, chunk_info in enumerate(chunks):
        chunk_filename = chunk_info['filename']
        start_offset = chunk_info['start_offset']
        print(f"Processing chunk {idx + 1}/{len(chunks)}: {chunk_filename}")

        try:
            chunk_size_mb = os.path.getsize(chunk_filename) / (1024 * 1024)
            if chunk_size_mb > _MAX_UPLOAD_MB:
                print(f"Skipping {chunk_filename} - still too large ({chunk_size_mb:.1f} MB)")
                continue

            transcript = transcribe_with_timestamps(chunk_filename)

            if transcript and transcript.segments:
                chunk_output_dir = f"1.audio_output_segments_chunk_{idx + 1}"

                # Transcript timestamps are relative to the chunk file, which
                # is exactly what the splitter needs.
                segments_data = split_audio_by_segments(
                    chunk_filename,
                    transcript.segments,
                    chunk_output_dir,
                )

                # Shift the reported timestamps to the original file's timeline.
                for segment in segments_data:
                    segment['start'] += start_offset
                    segment['end'] += start_offset
                    segment['duration'] = segment['end'] - segment['start']

                all_segments_data.extend(segments_data)
                segment_source_dirs.extend([chunk_output_dir] * len(segments_data))
        finally:
            # Always remove the temporary chunk, including when it was skipped.
            if os.path.exists(chunk_filename):
                os.remove(chunk_filename)

        # Pause between API calls to respect rate limits.
        if idx < len(chunks) - 1:
            time.sleep(1)

    print(f"Large file processing completed. Total segments: {len(all_segments_data)}")

    final_output_dir = "output_segments"
    Path(final_output_dir).mkdir(exist_ok=True)

    print("Consolidating all segments into single directory...")

    numbered = enumerate(zip(all_segments_data, segment_source_dirs), start=1)
    for position, (segment, source_dir) in numbered:
        # Per-chunk ids restart at 1; renumber so ids are unique in the dataset.
        segment['id'] = position

        source_file = os.path.join(source_dir, segment['filename'])
        if not os.path.exists(source_file):
            print(f"Warning: segment file not found, leaving entry unchanged: {source_file}")
            continue

        new_filename = f"segment_{position:03d}.wav"
        # os.replace overwrites leftovers from an earlier run on every platform.
        os.replace(source_file, os.path.join(final_output_dir, new_filename))

        segment['filename'] = new_filename
        segment['file_path'] = new_filename

    # Remove the per-chunk directories; ones that still hold files are kept.
    for chunk_idx in range(len(chunks)):
        chunk_dir = f"1.audio_output_segments_chunk_{chunk_idx + 1}"
        if os.path.exists(chunk_dir):
            try:
                os.rmdir(chunk_dir)
            except OSError:
                pass  # Directory not empty (or not removable); leave it in place

    save_dataset(all_segments_data, final_output_dir)

    return all_segments_data

In [None]:
# Pipeline for a single audio file that fits in one upload: transcribe, split into segments, save the dataset

def process_audio_file(audio_file_path):
    """Run the full pipeline on one audio file in a single pass.

    Transcribes the file, cuts it into per-segment clips and writes the
    dataset CSV into ``output_segments``.

    Returns:
        The list of segment dicts, or ``None`` when transcription failed or
        produced no segments.
    """
    print(f"Starting audio processing pipeline for: {audio_file_path}")

    transcript = transcribe_with_timestamps(audio_file_path)
    has_segments = bool(transcript) and bool(transcript.segments)

    if has_segments:
        segments_data = split_audio_by_segments(audio_file_path, transcript.segments)
        save_dataset(segments_data, "output_segments")
        return segments_data

    print("Transcription failed or no segments found")
    return None

In [None]:
if __name__ == "__main__":
    # Configuration: replace the placeholder with the path of the audio file.
    AUDIO_FILE_PATH = "{You need to add here the path of your file}"

    if os.path.exists(AUDIO_FILE_PATH):
        # The file size decides between a single upload and chunked processing.
        file_size_mb = os.path.getsize(AUDIO_FILE_PATH) / (1024 * 1024)
        print(f"Audio file size: {file_size_mb:.2f} MB")

        if file_size_mb <= 25:  # OpenAI has 25MB limit
            print("File size is acceptable, processing directly...")
            segments_data = process_audio_file(AUDIO_FILE_PATH)
        else:
            print("File is larger than 25MB, using chunked processing...")
            segments_data = process_large_audio_file(AUDIO_FILE_PATH)

        if segments_data:
            print("Processing completed successfully!")
            print(f"Total segments created: {len(segments_data)}")
            print("Check 'output_segments' directory for audio files and dataset.csv")

            # Point the reader at the CSV when it was actually written.
            csv_path = os.path.join("output_segments", "dataset.csv")
            if os.path.exists(csv_path):
                print(f"Dataset CSV file: {csv_path}")
    else:
        print(f"Audio file not found: {AUDIO_FILE_PATH}")
        print("Please update AUDIO_FILE_PATH with your actual audio file path")
