In [12]:
# Cell 1: Setup and Imports
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Add project root to Python path so project-local packages can be imported.
# NOTE(review): assumes the kernel's working directory is one level below the
# project root (e.g. a notebooks/ folder) — confirm for your launch setup.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    # Guard against piling up duplicate entries each time this cell is re-run
    sys.path.append(str(project_root))

# Load environment variables from the project's .env file
load_dotenv(project_root / '.env')

# Test API key
assemblyai_key = os.getenv('ASSEMBLYAI_API_KEY')
print(f"AssemblyAI API Key loaded: {'✅' if assemblyai_key else '❌'}")
# Never echo any part of the key: cell outputs are stored inside the notebook
# file and travel with it when it is shared or committed. The length is enough
# to spot an empty or truncated value.
print(f"Key length: {len(assemblyai_key) if assemblyai_key else 0} characters")


AssemblyAI API Key loaded: ✅
Key starts with: [REDACTED]...


In [17]:
# Cell 2: Import Dependencies
import assemblyai as aai
from langgraph.graph import StateGraph
from typing import TypedDict, Optional, List  # ← Added List here
import glob
import time
from pathlib import Path  # ← Also added Path here

# Import our database
from database.db_operations import db
from database.models import ConversationCreate

# Reached only when every import in this cell resolved; a failing import
# raises before this line runs.
print("✅ All imports successful")

✅ All imports successful


In [14]:
# Cell 3: Define LangGraph State
class AudioPipelineState(TypedDict):
    """State dictionary handed from node to node in the audio pipeline.

    Each node returns a copy of the incoming state with its own fields
    filled in, so every key is present from the start and the optional
    ones begin as None.
    """
    # Path of the audio file, stored as a string
    file_path: str
    # Base name of the audio file; used in log lines and the conversation title
    filename: str
    # Transcript text; None until the transcription node succeeds
    transcript_text: Optional[str]
    # Id returned by the database when the transcript is saved; None before that
    conversation_id: Optional[int]
    # Human-readable failure message; None while no step has failed
    error: Optional[str]
    # Progress marker. Values used in this notebook: "processing",
    # "transcribed", "transcription_failed", "completed", "database_failed",
    # "pipeline_error"
    status: str

print("✅ State defined")

✅ State defined


In [15]:
# Cell 4: Test AssemblyAI Connection
# Configure AssemblyAI
# Stores the key on the SDK's module-level settings object. os.getenv returns
# None when ASSEMBLYAI_API_KEY is not defined, so None is what gets assigned
# in that case.
aai.settings.api_key = os.getenv('ASSEMBLYAI_API_KEY')

# Verify the configured key with a real but inexpensive API request
# (no audio is uploaded or transcribed here)
def test_assemblyai_connection():
    """Check that AssemblyAI accepts the configured API key.

    Building an ``aai.Transcriber()`` does not by itself send a request, so a
    wrong key would only surface at the first real transcription. This check
    therefore asks for a one-item page of the account's transcript history,
    which needs a valid key but creates no transcription job.

    Returns:
        bool: True if the request succeeded, False if the key is missing or
        the request raised. The reason is printed, never raised.
    """
    try:
        if not aai.settings.api_key:
            raise ValueError("ASSEMBLYAI_API_KEY is not set")

        # Smallest authenticated call available: list at most one transcript
        aai.Transcriber().list_transcripts(aai.ListTranscriptParameters(limit=1))
        print("✅ AssemblyAI connection successful")
        return True
    except Exception as e:
        print(f"❌ AssemblyAI connection failed: {e}")
        return False

test_assemblyai_connection()

✅ AssemblyAI connection successful


True

In [18]:
# Cell 5: Batch File Discovery and Management
def find_audio_files(temp_folder: Path) -> List[Path]:
    """Find all audio files in temp folder.

    A file counts as audio when its name ends in .wav, .mp3 or .m4a. The
    comparison ignores case, so recorder output such as ``CLIP.WAV`` is
    picked up as well. Only the folder itself is scanned (no recursion),
    and directories are skipped even if their name ends in an audio suffix.

    Args:
        temp_folder: Folder to scan.

    Returns:
        Sorted list of matching file paths; empty when the folder does not
        exist or holds no audio files.
    """
    audio_suffixes = ('.wav', '.mp3', '.m4a')

    if not temp_folder.is_dir():
        return []

    return sorted(
        entry
        for entry in temp_folder.iterdir()
        # is_file() filters out directories that merely look like audio files
        if entry.is_file() and entry.name.lower().endswith(audio_suffixes)
    )

def display_batch_info(audio_files: List[Path]):
    """Print a summary of the files queued for processing.

    Shows the number of files, their combined size in MB, and a numbered
    listing with each file's own size.

    Args:
        audio_files: Paths of the files in the batch.

    Returns:
        True when there is at least one file, False (after printing a
        notice) when the batch is empty.
    """
    if audio_files:
        bytes_per_mb = 1024 * 1024

        # Add up the on-disk size of every file in the batch
        combined_bytes = 0
        for entry in audio_files:
            combined_bytes += entry.stat().st_size

        print("📊 BATCH PROCESSING INFO:")
        print(f"   Files to process: {len(audio_files)}")
        print(f"   Total size: {combined_bytes / bytes_per_mb:.1f} MB")
        print("\n📁 Files found:")

        # Numbered listing, starting at 1
        position = 0
        for entry in audio_files:
            position += 1
            entry_mb = entry.stat().st_size / bytes_per_mb
            print(f"   {position}. {entry.name} ({entry_mb:.1f} MB)")

        return True

    print("❌ No audio files found in temp folder!")
    return False

def cleanup_processed_files(processed_files: List[Path]):
    """Remove the given files from disk and report the outcome.

    Every file is attempted; one that cannot be removed is reported with
    its error and does not stop the remaining deletions.

    Args:
        processed_files: Paths of the files to delete.
    """
    total = len(processed_files)
    print(f"\n🗑️ CLEANUP: Deleting {total} processed files...")

    removed = 0
    for target in processed_files:
        try:
            target.unlink()
        except Exception as err:
            print(f"   ❌ Failed to delete {target.name}: {err}")
        else:
            print(f"   ✅ Deleted: {target.name}")
            removed += 1

    print(f"🗑️ Cleanup complete: {removed}/{total} files deleted")

# Staging folder for incoming audio lives under the project root;
# create it (and any missing parents) so a fresh checkout works
temp_folder = project_root.joinpath('data', 'temp')
temp_folder.mkdir(parents=True, exist_ok=True)

# Scan the folder and print the batch summary; the boolean is kept so
# later cells can tell whether there is anything to process
audio_files = find_audio_files(temp_folder)
files_available = display_batch_info(audio_files)

if not files_available:
    print("\n💡 TIP: Add .wav files to data/temp/ folder for testing")
else:
    print(f"\n🚀 Ready to process {len(audio_files)} files!")

📊 BATCH PROCESSING INFO:
   Files to process: 3
   Total size: 2.1 MB

📁 Files found:
   1. blog_barchthreee.wav (0.7 MB)
   2. blog_batchone.wav (0.9 MB)
   3. blog_batxhtwo.wav (0.5 MB)

🚀 Ready to process 3 files!


In [19]:
# Cell 6: Define LangGraph Nodes
def transcription_node(state: AudioPipelineState) -> AudioPipelineState:
    """Node 1: Transcribe audio file with AssemblyAI.

    Reads ``state['file_path']`` (the audio to transcribe) and
    ``state['filename']`` (for the progress line). The incoming state is
    not modified; a copy is returned.

    Returns:
        On success, the state with "transcript_text" set and "status" equal
        to "transcribed". If AssemblyAI reports an error status, or any
        exception is raised, the state with "error" set and "status" equal
        to "transcription_failed".
    """
    def _failure(message):
        # Copy of the incoming state marked as a failed transcription
        return {**state, "error": message, "status": "transcription_failed"}

    try:
        print(f"🎙️ Transcribing: {state['filename']}")

        # Run the transcription for this file
        outcome = aai.Transcriber().transcribe(state['file_path'])

        # The SDK reports service-side failures through the status field
        if outcome.status == aai.TranscriptStatus.error:
            return _failure(f"AssemblyAI error: {outcome.error}")

        return {**state, "transcript_text": outcome.text, "status": "transcribed"}

    except Exception as exc:
        return _failure(f"Transcription error: {str(exc)}")

def database_saver_node(state: AudioPipelineState) -> AudioPipelineState:
    """Node 2: Save transcript to database.

    The pipeline wires this node directly after the transcription node with
    an unconditional edge, so it is also invoked when transcription failed.
    In that case the state is passed through untouched: nothing is written
    to the database and the original error and status stay visible to the
    caller instead of being replaced by a follow-up database error.

    Returns:
        On success, a copy of the state with "conversation_id" set and
        "status" equal to "completed". If an earlier step already recorded
        an error, an unchanged copy of the state. If there is no transcript
        text, or saving raises, a copy with "error" set and "status" equal
        to "database_failed".
    """
    # An upstream failure must not be overwritten or saved as a conversation
    upstream_error = state.get("error")
    if upstream_error:
        print(f"⚠️ Skipping database save for {state.get('filename')}: {upstream_error}")
        return {**state}

    # Nothing to store: avoid creating a conversation row without text
    if state.get("transcript_text") is None:
        return {
            **state,
            "error": "Database error: no transcript text to save",
            "status": "database_failed"
        }

    try:
        print(f"💾 Saving to database: {state['filename']}")

        # Create conversation object
        conversation = ConversationCreate(
            title=f"Audio: {state['filename']}",
            raw_text=state['transcript_text'],
            source="transcribed"
        )

        # Save to database
        conversation_id = db.create_conversation(conversation)

        return {
            **state,
            "conversation_id": conversation_id,
            "status": "completed"
        }

    except Exception as e:
        return {
            **state,
            "error": f"Database error: {str(e)}",
            "status": "database_failed"
        }

print("✅ LangGraph nodes defined")

✅ LangGraph nodes defined


In [20]:
# Cell: Initialize Database
import sqlite3
from contextlib import closing

from database.init_db import create_database

print("🔧 Checking/creating database...")
try:
    create_database()
    print("✅ Database ready!")
except Exception as e:
    print(f"❌ Database error: {e}")

# Verify tables exist
# NOTE(review): 'data/app.db' is resolved against the kernel's working
# directory, while the audio folder is built from project_root — confirm both
# refer to the intended location. sqlite3.connect also creates an empty file
# when none exists, so an empty table list here means the schema is missing.
# closing() guarantees the connection is released even if the query raises
# (a plain `with connection:` only manages the transaction, not the handle).
with closing(sqlite3.connect('data/app.db')) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
print(f"📊 Available tables: {[table[0] for table in tables]}")

🔧 Checking/creating database...
Creating database at: data/app.db
✅ Database schema created successfully!
✅ Created tables: ['conversations', 'sqlite_sequence', 'blog_post_ideas', 'processing_status']
✅ Database ready!
📊 Available tables: ['conversations', 'sqlite_sequence', 'blog_post_ideas', 'processing_status']


In [21]:
# Cell 7: Build LangGraph Workflow
def build_pipeline():
    """Assemble and compile the two-step audio workflow.

    The graph starts at "transcribe", continues to "save_to_db", and
    finishes there.

    Returns:
        The compiled LangGraph workflow.
    """
    graph = StateGraph(AudioPipelineState)

    # Register each processing step under the name used for wiring below
    steps = (
        ("transcribe", transcription_node),
        ("save_to_db", database_saver_node),
    )
    for step_name, step_fn in steps:
        graph.add_node(step_name, step_fn)

    # Wiring: entry -> transcribe -> save_to_db -> finish
    graph.set_entry_point("transcribe")
    graph.add_edge("transcribe", "save_to_db")
    graph.set_finish_point("save_to_db")

    return graph.compile()

# Compile once; later cells reuse this object for every file
pipeline = build_pipeline()
print("✅ LangGraph pipeline compiled and ready")

✅ LangGraph pipeline compiled and ready


In [22]:
# Cell 8: Batch Processing Function
def process_audio_batch(audio_files: List[Path], pipeline) -> dict:
    """Process all audio files in batch.

    Each file is run through ``pipeline.invoke`` with a fresh initial state.
    A file counts as processed when the returned state has status
    "completed"; any other status, or an exception while handling the file,
    puts it in the failed list. One file failing never stops the batch.

    Args:
        audio_files: Paths of the audio files to process.
        pipeline: Object exposing ``invoke(state) -> state``.

    Returns:
        dict with the same four keys in every case:
            "processed": list of paths that completed,
            "failed":    list of paths that did not,
            "total":     number of input files,
            "results":   final state dict per file, in input order.
    """

    if not audio_files:
        print("❌ No files to process")
        # Same keys as the normal return so callers can read "results" safely
        return {"processed": [], "failed": [], "total": 0, "results": []}

    print(f"\n🚀 STARTING BATCH PROCESSING - {len(audio_files)} files")
    print("=" * 60)

    processed_files = []
    failed_files = []
    results = []

    for i, file_path in enumerate(audio_files, 1):
        print(f"\n📂 Processing {i}/{len(audio_files)}: {file_path.name}")
        print("-" * 40)

        # Create initial state
        initial_state = {
            "file_path": str(file_path),
            "filename": file_path.name,
            "transcript_text": None,
            "conversation_id": None,
            "error": None,
            "status": "processing"
        }

        try:
            # Run through pipeline
            result = pipeline.invoke(initial_state)

            if result.get("status") == "completed":
                # A completed run may still carry no text; the preview must not
                # raise, or a file that was already saved would be reported as
                # failed by the except branch below.
                preview = (result.get("transcript_text") or "")[:100]
                print(f"✅ SUCCESS: {file_path.name}")
                print(f"   Conversation ID: {result.get('conversation_id')}")
                print(f"   Transcript preview: {preview}...")
                processed_files.append(file_path)
            else:
                print(f"❌ FAILED: {file_path.name}")
                # "error" may be present but None, so fall back explicitly
                print(f"   Error: {result.get('error') or 'Unknown error'}")
                failed_files.append(file_path)

            results.append(result)

        except Exception as e:
            print(f"❌ PIPELINE ERROR: {file_path.name}")
            print(f"   Exception: {str(e)}")
            failed_files.append(file_path)

            results.append({
                **initial_state,
                "error": str(e),
                "status": "pipeline_error"
            })

    return {
        "processed": processed_files,
        "failed": failed_files,
        "total": len(audio_files),
        "results": results
    }

print("✅ Batch processing function ready")

✅ Batch processing function ready


In [23]:
# Cell 9: Execute Batch Processing with Cleanup
# Runs the whole batch, prints a summary, then optionally deletes the source
# files of successful runs. Deletion is permanent, so it only happens after an
# explicit "y"/"yes" answer.
if files_available:
    print("🎯 Starting batch processing...")

    # Process all files
    batch_results = process_audio_batch(audio_files, pipeline)

    # Display summary
    print("\n📊 BATCH PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"✅ Successfully processed: {len(batch_results['processed'])}")
    print(f"❌ Failed: {len(batch_results['failed'])}")
    print(f"📁 Total files: {batch_results['total']}")

    # Show failed files
    if batch_results['failed']:
        print("\n❌ Failed files:")
        for file_path in batch_results['failed']:
            print(f"   - {file_path.name}")

    # Cleanup successfully processed files
    if batch_results['processed']:
        try:
            confirm = input(f"\n🗑️ Delete {len(batch_results['processed'])} processed files? (y/N): ")
        except (EOFError, NotImplementedError):
            # No interactive stdin (e.g. headless "Run All" / nbconvert):
            # fall back to the safe default and keep the files.
            confirm = ''

        # strip() so an answer like " y " is not silently treated as "no"
        if confirm.strip().lower() in ('y', 'yes'):
            cleanup_processed_files(batch_results['processed'])
        else:
            print("🔧 Files kept in temp folder for inspection")

    print("\n🎉 Batch processing complete!")

else:
    print("💡 Add audio files to data/temp/ folder and rerun this cell")

🎯 Starting batch processing...

🚀 STARTING BATCH PROCESSING - 3 files

📂 Processing 1/3: blog_barchthreee.wav
----------------------------------------
🎙️ Transcribing: blog_barchthreee.wav
💾 Saving to database: blog_barchthreee.wav
✅ SUCCESS: blog_barchthreee.wav
   Conversation ID: 2
   Transcript preview: Testing batch processing number three....

📂 Processing 2/3: blog_batchone.wav
----------------------------------------
🎙️ Transcribing: blog_batchone.wav
💾 Saving to database: blog_batchone.wav
✅ SUCCESS: blog_batchone.wav
   Conversation ID: 3
   Transcript preview: Testing batch processing one....

📂 Processing 3/3: blog_batxhtwo.wav
----------------------------------------
🎙️ Transcribing: blog_batxhtwo.wav
💾 Saving to database: blog_batxhtwo.wav
✅ SUCCESS: blog_batxhtwo.wav
   Conversation ID: 4
   Transcript preview: Testing batch process into....

📊 BATCH PROCESSING COMPLETE!
✅ Successfully processed: 3
❌ Failed: 0
📁 Total files: 3

🗑️ CLEANUP: Deleting 3 processed files...
 

In [24]:
# Cell 10: View Results in Database
def show_recent_conversations(limit=10):
    """Print a short report of conversations stored in the database.

    Fetches every conversation via ``db.get_all_conversations()`` and prints
    the first ``limit`` of them, in the order the database layer returns
    them: id, title, source, word count, status, creation time and the
    first 150 characters of the text.

    Args:
        limit: Maximum number of conversations to print.
    """
    records = db.get_all_conversations()

    if not records:
        print("📝 No conversations found in database")
        return

    divider = "-" * 60
    shown_count = min(limit, len(records))

    print(f"📝 Recent Conversations (showing {shown_count}):")
    print(divider)

    for record in records[:limit]:
        print(f"ID: {record.id} | Title: {record.title}")
        print(f"Source: {record.source} | Words: {record.word_count} | Status: {record.status}")
        print(f"Created: {record.created_at}")
        print(f"Preview: {record.raw_text[:150]}...")
        print(divider)

# Show results
show_recent_conversations()

📝 Recent Conversations (showing 4):
------------------------------------------------------------
ID: 4 | Title: Audio: blog_batxhtwo.wav
Source: transcribed | Words: 4 | Status: pending
Created: 2025-09-24 08:20:07
Preview: Testing batch process into....
------------------------------------------------------------
ID: 3 | Title: Audio: blog_batchone.wav
Source: transcribed | Words: 4 | Status: pending
Created: 2025-09-24 08:20:02
Preview: Testing batch processing one....
------------------------------------------------------------
ID: 2 | Title: Audio: blog_barchthreee.wav
Source: transcribed | Words: 5 | Status: pending
Created: 2025-09-24 08:19:57
Preview: Testing batch processing number three....
------------------------------------------------------------
ID: 1 | Title: Audio: blog_recordcomtines.wav
Source: transcribed | Words: 30 | Status: pending
Created: 2025-09-24 07:48:54
Preview: I'm uploading a file from my telephone, and the idea is that this file is going to be monitored 