In [1]:
# Cell 1: Setup and Imports
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Add project root to Python path.
# NOTE(review): this assumes the notebook is started from a directory that sits
# directly under the project root - confirm for your layout.
project_root = Path.cwd().parent
# Guarded so that re-running this cell does not append a duplicate entry.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Load environment variables
load_dotenv(project_root / '.env')

# Test API key
assemblyai_key = os.getenv('ASSEMBLYAI_API_KEY')
print(f"AssemblyAI API Key loaded: {'✅' if assemblyai_key else '❌'}")
# Report only the key length: cell outputs are saved with the notebook, so
# printing any part of the secret would leak it to whoever receives the file.
print(f"Key length: {len(assemblyai_key) if assemblyai_key else 0} characters")


AssemblyAI API Key loaded: ✅
Key starts with: [REDACTED]...


In [6]:
# Cell 2: Import Dependencies
import assemblyai as aai
from langgraph.graph import StateGraph, END, START
from typing import TypedDict, Optional
import glob

# Import our database
from database.db_operations import db
from database.models import ConversationCreate

print("✅ All imports successful")

✅ All imports successful


In [7]:
# Cell 3: Define LangGraph State
class AudioPipelineState(TypedDict):
    """State dictionary passed between the nodes of the audio pipeline graph."""

    # Path of the audio file to transcribe, as a string.
    file_path: str
    # File name of the audio file; used in progress messages and in the
    # title of the saved conversation.
    filename: str
    # Transcript text; None until the transcription node has filled it in.
    transcript_text: Optional[str]
    # Identifier returned by db.create_conversation; None until saved.
    conversation_id: Optional[int]
    # Error message written by a node that failed; None otherwise.
    error: Optional[str]
    # Progress marker. Values used in this notebook: "starting",
    # "transcribed", "transcription_failed", "completed",
    # "database_save_failed".
    status: str

print("✅ State defined")

✅ State defined


In [4]:
# Cell 4: Test AssemblyAI Connection
# Configure AssemblyAI
aai.settings.api_key = os.getenv('ASSEMBLYAI_API_KEY')

# Test with a simple transcription (we'll use a file from temp folder)
def test_assemblyai_connection():
    """Check that the AssemblyAI SDK is configured and a client can be built.

    This verifies that an API key has been set on ``aai.settings`` and that
    ``aai.Transcriber()`` can be constructed without raising.

    NOTE(review): nothing here submits a transcription, so this presumably does
    not prove that the service accepts the key - confirm against the SDK.

    Returns:
        bool: True when a key is configured and the client was constructed,
        False otherwise (the reason is printed).
    """
    try:
        # Fail explicitly on a missing key instead of relying on whatever the
        # SDK constructor happens to do with an empty setting.
        if not aai.settings.api_key:
            print("❌ AssemblyAI connection failed: API key is not set")
            return False

        # Built only to confirm the SDK is usable; the instance is not needed
        # afterwards, so it is not bound to a name.
        aai.Transcriber()
        print("✅ AssemblyAI connection successful")
        return True
    except Exception as e:
        print(f"❌ AssemblyAI connection failed: {e}")
        return False

test_assemblyai_connection()

✅ AssemblyAI connection successful


True

In [5]:
# Cell 5: Find Test Audio Files
# Let's see what audio files we have in temp
temp_folder = project_root / 'data' / 'temp'

# Collect every .wav file first, then every .mp3 file, from the temp folder.
audio_files = [
    candidate
    for pattern in ('*.wav', '*.mp3')
    for candidate in temp_folder.glob(pattern)
]

print(f"📁 Found {len(audio_files)} audio files in temp:")
for file_path in audio_files:
    # File size converted from bytes to mebibytes for display.
    size_mb = file_path.stat().st_size / (1024 * 1024)
    print(f"   {file_path.name} ({size_mb:.1f} MB)")

# The first discovered file (if any) becomes the test input for later cells.
if audio_files:
    test_file = audio_files[0]
    print(f"\n🎯 Will use for testing: {test_file.name}")
else:
    test_file = None
    print("\n❌ No audio files found! Please add a .wav file to data/temp/")

📁 Found 1 audio files in temp:
   blog_record.wav (30.8 MB)

🎯 Will use for testing: blog_record.wav


In [8]:
# Cell 6: AssemblyAI Transcription Node
def _transcription_failure(state: AudioPipelineState, error_msg: str) -> AudioPipelineState:
    """Print ``error_msg`` and return a copy of ``state`` marked as failed."""
    print(f"❌ {error_msg}")
    return {
        **state,
        "error": error_msg,
        "status": "transcription_failed"
    }


def audio_transcriber_node(state: AudioPipelineState) -> AudioPipelineState:
    """Node 1: Transcribe audio file using AssemblyAI

    Reads ``state['file_path']`` (audio to transcribe) and
    ``state['filename']`` (used in progress messages only).

    Returns:
        A new state dict; the input dict is not modified.
        On success ``transcript_text`` holds the transcript and ``status`` is
        ``"transcribed"``. On any failure ``error`` holds a message and
        ``status`` is ``"transcription_failed"``.

    Exceptions raised while transcribing are caught and reported through the
    returned state rather than propagated.
    """

    print(f"🎙️ Starting transcription for: {state['filename']}")

    try:
        # Configure transcriber
        transcriber = aai.Transcriber()

        # Both progress messages are printed before the call. The result's
        # status is inspected immediately after transcribe() returns, so the
        # waiting happens inside that call, not after it.
        print("📡 Uploading to AssemblyAI...")
        print("⏳ Waiting for transcription...")
        transcript = transcriber.transcribe(state['file_path'])

        if transcript.status == aai.TranscriptStatus.error:
            return _transcription_failure(
                state, f"AssemblyAI transcription failed: {transcript.error}"
            )

        transcript_text = transcript.text
        if transcript_text is None:
            # Without this guard len(None) below would raise and the failure
            # would be reported as an opaque TypeError message.
            return _transcription_failure(
                state, "AssemblyAI returned no transcript text"
            )

        print("✅ Transcription completed!")
        print(f"📝 Transcript length: {len(transcript_text)} characters")
        print(f"🎯 First 100 chars: {transcript_text[:100]}...")

        return {
            **state,
            "transcript_text": transcript_text,
            "status": "transcribed"
        }

    except Exception as e:
        return _transcription_failure(state, f"Transcription error: {str(e)}")

print("✅ Transcription node defined")

✅ Transcription node defined


In [13]:
# Cell: Initialize Database
from database.init_db import create_database

# NOTE(review): the saved execution counts show this cell was run after the
# end-to-end test cells, which failed with "no such table: conversations".
# Run it before the pipeline cells on a fresh kernel.
print("🔧 Checking/creating database...")
try:
    create_database()
    print("✅ Database ready!")
except Exception as e:
    print(f"❌ Database error: {e}")

# Verify tables exist
# NOTE(review): 'data/app.db' is resolved against the current working
# directory - confirm it is the same file that create_database() and `db` use.
import sqlite3
conn = sqlite3.connect('data/app.db')
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
finally:
    # Release the connection even when the query raises, so a failed check
    # does not leave the database file open in the kernel.
    conn.close()
print(f"📊 Available tables: {[table[0] for table in tables]}")

🔧 Checking/creating database...
Creating database at: data/app.db
✅ Database schema created successfully!
✅ Created tables: ['conversations', 'sqlite_sequence', 'blog_post_ideas', 'processing_status']
✅ Database ready!
📊 Available tables: ['conversations', 'sqlite_sequence', 'blog_post_ideas', 'processing_status']


In [9]:
# Cell 7: Database Saving Node
def database_saver_node(state: AudioPipelineState) -> AudioPipelineState:
    """Node 2: Save transcript to database

    Builds a ``ConversationCreate`` from ``state['filename']`` and
    ``state['transcript_text']`` and stores it with ``db.create_conversation``.

    Returns:
        A new state dict; the input dict is not modified.
        - If an earlier step already recorded an error, the state is returned
          with that error and status left untouched and nothing is saved.
        - If there is no transcript text, ``error`` is set and ``status`` is
          ``"database_save_failed"``.
        - On success ``conversation_id`` is set and ``status`` is
          ``"completed"``.
        - If saving raises, ``error`` is set and ``status`` is
          ``"database_save_failed"``.
    """

    # This node can be reached after a failed transcription. Saving would then
    # fail too and replace the original error and status with a misleading
    # database error, so the upstream failure is passed through instead.
    if state.get('error'):
        print(f"⏭️ Skipping database save for {state['filename']}: {state['error']}")
        return {**state}

    if state.get('transcript_text') is None:
        error_msg = "Database save error: no transcript text to save"
        print(f"❌ {error_msg}")
        return {
            **state,
            "error": error_msg,
            "status": "database_save_failed"
        }

    print(f"💾 Saving to database: {state['filename']}")

    try:
        # Create conversation object
        conversation = ConversationCreate(
            title=f"Audio: {state['filename']}",
            raw_text=state['transcript_text'],
            source="transcribed"
        )

        # Save to database
        conversation_id = db.create_conversation(conversation)

        print(f"✅ Saved conversation with ID: {conversation_id}")
        print(f"📊 Word count: {conversation.word_count}")

        return {
            **state,
            "conversation_id": conversation_id,
            "status": "completed"
        }

    except Exception as e:
        error_msg = f"Database save error: {str(e)}"
        print(f"❌ {error_msg}")
        return {
            **state,
            "error": error_msg,
            "status": "database_save_failed"
        }

print("✅ Database saver node defined")

✅ Database saver node defined


In [10]:
# Cell 8: Build LangGraph Pipeline
def create_audio_pipeline():
    """Build and compile the audio-processing graph.

    The graph has two nodes, "transcribe" and "save_to_db", connected in a
    straight line from START to END.

    Returns:
        The compiled graph produced by ``StateGraph.compile()``.
    """
    graph = StateGraph(AudioPipelineState)

    # Register each processing step under the name the edges refer to.
    for node_name, node_fn in (
        ("transcribe", audio_transcriber_node),
        ("save_to_db", database_saver_node),
    ):
        graph.add_node(node_name, node_fn)

    # Link consecutive stops along the route: every stop feeds the next one.
    route = (START, "transcribe", "save_to_db", END)
    for source, target in zip(route, route[1:]):
        graph.add_edge(source, target)

    return graph.compile()

# Create the pipeline
pipeline = create_audio_pipeline()
print("✅ LangGraph pipeline created")

# Describe the graph (optional). This is a hand-written summary of the edges,
# not something rendered from the compiled graph object.
try:
    print("\n📊 Pipeline Structure:")
    print("START → transcribe → save_to_db → END")
except Exception:
    # Only ordinary errors are handled here; a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit and hide a kernel interrupt.
    print("Graph visualization not available")

✅ LangGraph pipeline created

📊 Pipeline Structure:
START → transcribe → save_to_db → END


In [11]:
# Cell 9: Test End-to-End Processing
def test_single_file(file_path):
    """Test the complete pipeline with one file"""
    
    if not file_path.exists():
        print(f"❌ File not found: {file_path}")
        return None
    
    print(f"🚀 Testing pipeline with: {file_path.name}")
    print("=" * 50)
    
    # Create initial state
    initial_state = {
        "file_path": str(file_path),
        "filename": file_path.name,
        "transcript_text": None,
        "conversation_id": None,
        "error": None,
        "status": "starting"
    }
    
    # Run the pipeline
    result = pipeline.invoke(initial_state)
    
    print("\n" + "=" * 50)
    print("📊 Final Result:")
    print(f"   Status: {result['status']}")
    print(f"   Conversation ID: {result.get('conversation_id')}")
    print(f"   Error: {result.get('error', 'None')}")
    
    if result.get('transcript_text'):
        print(f"   Transcript length: {len(result['transcript_text'])} chars")
        print(f"   First 200 chars: {result['transcript_text'][:200]}...")
    
    return result

# Run the end-to-end check on the file selected earlier, if one was found
if not test_file:
    print("❌ No test file available. Add a .wav file to data/temp/ first!")
else:
    test_result = test_single_file(test_file)

🚀 Testing pipeline with: blog_record.wav
🎙️ Starting transcription for: blog_record.wav
📡 Uploading to AssemblyAI...
⏳ Waiting for transcription...
✅ Transcription completed!
📝 Transcript length: 1444 characters
🎯 First 100 chars: I was sick yesterday. I had stomach problems. It was Tuesday and it was two days after a big party o...
💾 Saving to database: blog_record.wav
❌ Database save error: no such table: conversations

📊 Final Result:
   Status: database_save_failed
   Conversation ID: None
   Error: Database save error: no such table: conversations
   Transcript length: 1444 chars
   First 200 chars: I was sick yesterday. I had stomach problems. It was Tuesday and it was two days after a big party on Saturday and I was feeling like with nausea and I wanted to vomit. And so today is Wednesday was a...


In [12]:
# Cell 10: Check Database Results
def verify_database_results():
    """Print a short summary of the conversations stored in the database.

    Fetches everything via ``db.get_all_conversations()`` and prints id,
    title, creation time, word count, source and a 150-character text
    preview for up to three entries. Returns None.
    """

    print("🔍 Checking recent database entries...")

    stored = db.get_all_conversations()

    if stored:
        print(f"📊 Found {len(stored)} conversations:")

        # Only the first three entries of the returned list are shown; whether
        # those are the newest depends on the order the database layer uses.
        for entry in stored[:3]:
            print(f"\n📝 ID {entry.id}: {entry.title}")
            print(f"   📅 Created: {entry.created_at}")
            print(f"   📊 Words: {entry.word_count}")
            print(f"   🎯 Source: {entry.source}")
            print(f"   📄 Text preview: {entry.raw_text[:150]}...")
    else:
        print("❌ No conversations found in database")

verify_database_results()

🔍 Checking recent database entries...


OperationalError: no such table: conversations