In [2]:
# Cell 1: Setup and Imports
import os
import sys
from pathlib import Path
from dotenv import load_dotenv

# Add project root to Python path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Load environment variables
load_dotenv(project_root / '.env')

# Test API key
assemblyai_key = os.getenv('ASSEMBLYAI_API_KEY')
print(f"AssemblyAI API Key loaded: {'‚úÖ' if assemblyai_key else '‚ùå'}")
print(f"Key starts with: {assemblyai_key[:10] if assemblyai_key else 'None'}...")


AssemblyAI API Key loaded: ‚úÖ
Key starts with: 972365f41d...


In [27]:
import os, getpass

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = getpass.getpass(f"{var}: ")

_set_env("OPENAI_API_KEY")

In [28]:
_set_env("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "ai_content_ops"

In [3]:
import sqlite3

conn = sqlite3.connect("data/app.db")
cursor = conn.cursor()

# Get all table names
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("üìä Tables in app.db:")
for table in tables:
    table_name = table[0]
    
    # Get column info for each table
    cursor.execute(f"PRAGMA table_info({table_name})")
    columns = cursor.fetchall()
    
    print(f"\nüîß {table_name}:")
    for col in columns:
        print(f"   - {col[1]} ({col[2]})")  # column_name (type)

conn.close()

üìä Tables in app.db:

üîß conversations:
   - id (INTEGER)
   - title (TEXT)
   - raw_text (TEXT)
   - source (TEXT)
   - word_count (INTEGER)
   - created_at (DATETIME)
   - status (TEXT)

üîß sqlite_sequence:
   - name ()
   - seq ()

üîß blog_post_ideas:
   - id (INTEGER)
   - conversation_id (INTEGER)
   - title (TEXT)
   - description (TEXT)
   - usefulness_potential (INTEGER)
   - fitwith_seo_strategy (INTEGER)
   - fitwith_content_strategy (INTEGER)
   - inspiration_potential (INTEGER)
   - collaboration_potential (INTEGER)
   - innovation (INTEGER)
   - difficulty (INTEGER)
   - total_score (INTEGER)
   - sent_to_prod (BOOLEAN)
   - raw_llm_response (TEXT)
   - created_at (DATETIME)

üîß processing_status:
   - id (INTEGER)
   - conversation_id (INTEGER)
   - stage (TEXT)
   - status (TEXT)
   - error_message (TEXT)
   - started_at (DATETIME)
   - completed_at (DATETIME)


In [4]:
# Cell 2: Import Dependencies
import assemblyai as aai
from langgraph.graph import StateGraph
from typing import TypedDict, Optional, List, Dict  # ‚Üê Added List here
import glob
import time
from pathlib import Path  # ‚Üê Also added Path here

# Import our database
from database.db_operations import db
from database.models import ConversationCreate

print("‚úÖ All imports successful")

‚úÖ All imports successful


In [5]:
## 1. Pydantic Model for Structured Output

from pydantic import BaseModel, Field
from typing import List, Optional
from enum import Enum

class SpeakerRole(str, Enum):
    """Possible speaker roles in the conversation"""
    CLIENT = "client"
    INTERVIEWER = "interviewer"

class Speaker(BaseModel):
    """Information about a person speaking in the conversation"""
    name: Optional[str] = Field(default=None, description="Name of the speaker if mentioned")
    role: Optional[SpeakerRole] = Field(default=None, description="Role of the speaker in the conversation")
    company: Optional[str] = Field(default=None, description="Company they work for if mentioned")

class Challenge(BaseModel):
    """A challenge or problem mentioned in the conversation"""
    description: Optional[str] = Field(default=None, description="Description of the challenge")
    impact: Optional[str] = Field(default=None, description="How this challenge affects them")
    urgency: Optional[str] = Field(default=None, description="Low, Medium, or High urgency")

class CurrentSolution(BaseModel):
    """How they currently solve their problems"""
    solution: Optional[str] = Field(default=None, description="What they're currently doing")
    satisfaction_level: Optional[str] = Field(default=None, description="How satisfied they are: Very Satisfied, Satisfied, Neutral, Unsatisfied, Very Unsatisfied")
    limitations: Optional[List[str]] = Field(default=[], description="Limitations of current solution")

class Need(BaseModel):
    """A need identified using psychology frameworks like NVC"""
    need_category: Optional[str] = Field(default=None, description="Category of need (e.g., autonomy, efficiency, security, connection)")
    description: Optional[str] = Field(default=None, description="Specific need description")
    intensity: Optional[str] = Field(default=None, description="Low, Medium, or High intensity")

class ExtractedInsights(BaseModel):
    """Complete structured output from conversation analysis"""
    
    # Speakers
    speakers: Optional[List[Speaker]] = Field(default=[], description="People identified in the conversation")
    
    # What they care about
    core_values: Optional[List[str]] = Field(default=[], description="What this person/company cares about most")
    priorities: Optional[List[str]] = Field(default=[], description="Their current priorities and focus areas")
    
    # Challenges
    primary_challenges: Optional[List[Challenge]] = Field(default=[], description="Main problems they're facing")
    secondary_challenges: Optional[List[Challenge]] = Field(default=[], description="Secondary or related problems")
    
    # Current solutions
    current_solutions: Optional[List[CurrentSolution]] = Field(default=[], description="How they solve problems today")
    
    # Needs analysis
    psychological_needs: Optional[List[Need]] = Field(default=[], description="Underlying needs using NVC or similar frameworks")
 


In [6]:
# Cell 12: Raw Blog Idea Model (Simple)
class RawBlogIdea(BaseModel):
    """Raw blog idea from creative agent"""
    title: str
    description: str
    target_audience: str
    content_angle: str
    business_value: str

print("‚úÖ Simple RawBlogIdea model ready")

‚úÖ Simple RawBlogIdea model ready


In [7]:
def validate_raw_blog_ideas(raw_ideas: List[Dict]) -> List[RawBlogIdea]:
    """Validate and convert raw JSON to Pydantic models"""
    validated_ideas = []
    
    for idea in raw_ideas:
        try:
            validated_idea = RawBlogIdea(**idea)
            validated_ideas.append(validated_idea)
        except Exception as e:
            print(f"‚ö†Ô∏è Invalid blog idea skipped: {e}")
    
    print(f"‚úÖ Validated {len(validated_ideas)} out of {len(raw_ideas)} raw ideas")
    return validated_ideas

print("‚úÖ RawBlogIdea model and validation ready")

‚úÖ RawBlogIdea model and validation ready


In [8]:
# Cell 3: Define LangGraph State
class AudioPipelineState(TypedDict):
    file_path: str
    filename: str
    transcript_text: Optional[str]
    conversation_id: Optional[int]
    extracted_insights: Optional[ExtractedInsights]  
    raw_blog_ideas: Optional[List[Dict]]           # Pydantic objects from creative agent

    
    # Status & error handling
    status: str
    error: Optional[str]


print("‚úÖ State defined")

‚úÖ State defined


In [9]:
# Cell: Updated Company Strategy Context Loader (3 Documents)
def load_company_strategy_context():
    """Load company strategy, SEO strategy, and content strategy for context"""
    
    strategy_context = {}
    
    try:
        # Load company strategy
        company_strategy_path = "../data/processed/company_strategy.mkd"
        if os.path.exists(company_strategy_path):
            with open(company_strategy_path, "r", encoding="utf-8") as f:
                strategy_context["company_strategy"] = f.read()
            print(f"‚úÖ Loaded company strategy ({len(strategy_context['company_strategy'])} chars)")
        else:
            strategy_context["company_strategy"] = "Company strategy document not available."
            print("‚ö†Ô∏è Company strategy document not found")
        
        # Load SEO strategy
        seo_strategy_path = "../data/processed/seo_strategy.mkd"
        if os.path.exists(seo_strategy_path):
            with open(seo_strategy_path, "r", encoding="utf-8") as f:
                strategy_context["seo_strategy"] = f.read()
            print(f"‚úÖ Loaded SEO strategy ({len(strategy_context['seo_strategy'])} chars)")
        else:
            strategy_context["seo_strategy"] = "SEO strategy document not available."
            print("‚ö†Ô∏è SEO strategy document not found")
        
        # Load content strategy (NEW)
        content_strategy_path = "../data/processed/content_strategy.mkd"
        if os.path.exists(content_strategy_path):
            with open(content_strategy_path, "r", encoding="utf-8") as f:
                strategy_context["content_strategy"] = f.read()
            print(f"‚úÖ Loaded content strategy ({len(strategy_context['content_strategy'])} chars)")
        else:
            strategy_context["content_strategy"] = "Content strategy document not available."
            print("‚ö†Ô∏è Content strategy document not found")
            
    except Exception as e:
        print(f"‚ùå Error loading strategy documents: {e}")
        strategy_context = {
            "company_strategy": "Strategy document not available",
            "seo_strategy": "SEO strategy document not available", 
            "content_strategy": "Content strategy document not available"
        }
    
    return strategy_context

# Test loading all three documents
strategy_context = load_company_strategy_context()
print(f"üìä Strategy context keys: {list(strategy_context.keys())}")
print(f"üìä Total context size: {sum(len(v) for v in strategy_context.values() if isinstance(v, str))} chars")

‚úÖ Loaded company strategy (6555 chars)
‚úÖ Loaded SEO strategy (1120 chars)
‚úÖ Loaded content strategy (4469 chars)
üìä Strategy context keys: ['company_strategy', 'seo_strategy', 'content_strategy']
üìä Total context size: 12144 chars


In [10]:
# Cell: Fixed Creative Agent Function
def generate_blog_ideas_from_insights(insights: ExtractedInsights, strategy_context: dict) -> List[Dict]:
    """
    Fixed creative agent that handles Claude's markdown JSON response
    """
    
    creative_prompt = f"""
    You are a creative content strategist for Big Kids Automation, a company that helps businesses implement AI and automation solutions.
    
    COMPANY CONTEXT:
    {strategy_context.get('company_strategy', 'Strategy not available')[:1000]}...
    
    SEO STRATEGY:
    {strategy_context.get('seo_strategy', 'SEO strategy not available')[:500]}...
    
    CONVERSATION INSIGHTS TO WORK FROM:
    
    Speakers: {[f"{s.name} ({s.role}) from {s.company}" for s in insights.speakers] if insights.speakers else "Unknown speakers"}
    
    Core Values: {", ".join(insights.core_values) if insights.core_values else "None identified"}
    
    Priorities: {", ".join(insights.priorities) if insights.priorities else "None identified"}
    
    Primary Challenges:
    {chr(10).join([f"- {c.description} (Impact: {c.impact}, Urgency: {c.urgency})" for c in insights.primary_challenges]) if insights.primary_challenges else "None identified"}
    
    Current Solutions:
    {chr(10).join([f"- {s.solution} (Satisfaction: {s.satisfaction_level})" for s in insights.current_solutions]) if insights.current_solutions else "None identified"}
    
    Psychological Needs:
    {chr(10).join([f"- {n.description} ({n.need_category}, {n.intensity} intensity)" for n in insights.psychological_needs]) if insights.psychological_needs else "None identified"}
    
    TASK:
    Generate 4-5 creative blog post ideas that:
    1. Address the challenges and needs identified in this conversation
    2. Align with Big Kids Automation's mission to help businesses with AI/automation
    3. Provide value to potential clients facing similar challenges
    4. Support our SEO and content marketing strategy
    5. Are actionable and practical, not just theoretical
    
    For each blog post idea, provide:
    - title: Clear, engaging title that includes relevant keywords
    - description: 2-3 sentence description of what the post will cover
    - target_audience: Who this post is primarily for
    - content_angle: The unique angle or approach this post takes
    - business_value: How this post helps our business goals
    
    IMPORTANT: Return ONLY the JSON array, no markdown formatting, no code blocks, no explanatory text.
    
    Format:
    [
        {{
            "title": "How AI Proposal Systems Balance Speed with Brand Differentiation",
            "description": "A practical guide showing how modern AI-powered proposal systems solve the common problem of maintaining company uniqueness while leveraging automation. Includes real case studies and implementation steps.",
            "target_audience": "Business development directors and proposal managers at consulting firms",
            "content_angle": "Problem-solution with real case studies",
            "business_value": "Attracts prospects struggling with proposal automation while maintaining differentiation"
        }}
    ]
    """
    
    try:
        # Generate ideas using Claude
        response = llm.invoke(creative_prompt)
        raw_content = response.content.strip()
        
        print(f"üìù Raw response length: {len(raw_content)} chars")
        print(f"üìù Response starts with: {raw_content[:50]}...")
        
        # Handle markdown code blocks
        if raw_content.startswith('```'):
            print("üîß Removing markdown code blocks...")
            # Remove ```json and ``` wrappers
            lines = raw_content.split('\n')
            # Remove first line if it's ```json or ```
            if lines[0].startswith('```'):
                lines = lines[1:]
            # Remove last line if it's ```
            if lines and lines[-1].strip() == '```':
                lines = lines[:-1]
            raw_content = '\n'.join(lines).strip()
            print(f"üîß Cleaned content starts with: {raw_content[:50]}...")
        
        # Parse JSON response
        blog_ideas = json.loads(raw_content)
        
        print(f"‚úÖ Creative agent successfully parsed {len(blog_ideas)} blog ideas")
        return blog_ideas
        
    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error in creative agent: {e}")
        print(f"üìù Cleaned content: {raw_content[:500]}...")
        return []
    except Exception as e:
        print(f"‚ùå Error in creative agent: {e}")
        return []

print("‚úÖ Fixed creative agent function ready")

‚úÖ Fixed creative agent function ready


In [11]:
# Cell 13: Creative Agent Node (Direct Pydantic)
def creative_agent_node(state: AudioPipelineState) -> AudioPipelineState:
    """Creative agent that generates raw blog ideas as Pydantic objects"""
    
    try:
        print("üé® Starting creative blog idea generation...")
        
        insights = state.get('extracted_insights')
        if not insights:
            return {**state, "error": "No insights available", "status": "error"}
        
        # Load strategy context
        strategy_context = load_company_strategy_context()
        
        # Generate ideas (returns JSON)
        raw_ideas_json = generate_blog_ideas_from_insights(insights, strategy_context)
        
        # Convert directly to Pydantic objects
        raw_blog_ideas = []
        for idea_json in raw_ideas_json:
            try:
                idea = RawBlogIdea(**idea_json)
                raw_blog_ideas.append(idea)
            except Exception as e:
                print(f"‚ö†Ô∏è Skipping invalid idea: {e}")
        
        if raw_blog_ideas:
            print(f"üéâ Generated {len(raw_blog_ideas)} valid blog ideas")
            return {
                **state,
                "raw_blog_ideas": raw_blog_ideas,  # Direct Pydantic objects
                "status": "raw_ideas_generated"
            }
        else:
            return {**state, "error": "No valid ideas generated", "status": "error"}
            
    except Exception as e:
        print(f"‚ùå Creative agent error: {e}")
        return {**state, "error": str(e), "status": "error"}

print("‚úÖ Creative agent node (direct Pydantic) ready")

‚úÖ Creative agent node (direct Pydantic) ready


In [12]:
# Cell 4: Test AssemblyAI Connection
# Configure AssemblyAI
aai.settings.api_key = os.getenv('ASSEMBLYAI_API_KEY')

# Test with a simple transcription (we'll use a file from temp folder)
def test_assemblyai_connection():
    """Test if AssemblyAI is working"""
    try:
        # Just test the API key is valid
        transcriber = aai.Transcriber()
        print("‚úÖ AssemblyAI connection successful")
        return True
    except Exception as e:
        print(f"‚ùå AssemblyAI connection failed: {e}")
        return False

test_assemblyai_connection()

‚úÖ AssemblyAI connection successful


True

In [13]:
# Cell 5: Batch File Discovery and Management
def find_audio_files(temp_folder: Path) -> List[Path]:
    """Find all audio files in temp folder"""
    audio_extensions = ['*.wav', '*.mp3', '*.m4a']
    audio_files = []
    
    for ext in audio_extensions:
        audio_files.extend(temp_folder.glob(ext))
    
    return sorted(audio_files)

def display_batch_info(audio_files: List[Path]):
    """Display information about the batch of files"""
    if not audio_files:
        print("‚ùå No audio files found in temp folder!")
        return False
    
    total_size_mb = sum(f.stat().st_size for f in audio_files) / (1024 * 1024)
    
    print(f"üìä BATCH PROCESSING INFO:")
    print(f"   Files to process: {len(audio_files)}")
    print(f"   Total size: {total_size_mb:.1f} MB")
    print(f"\nüìÅ Files found:")
    
    for i, file_path in enumerate(audio_files, 1):
        size_mb = file_path.stat().st_size / (1024 * 1024)
        print(f"   {i}. {file_path.name} ({size_mb:.1f} MB)")
    
    return True

def cleanup_processed_files(processed_files: List[Path]):
    """Delete all successfully processed files"""
    print(f"\nüóëÔ∏è CLEANUP: Deleting {len(processed_files)} processed files...")
    deleted_count = 0
    
    for file_path in processed_files:
        try:
            file_path.unlink()  # Delete file
            print(f"   ‚úÖ Deleted: {file_path.name}")
            deleted_count += 1
        except Exception as e:
            print(f"   ‚ùå Failed to delete {file_path.name}: {e}")
    
    print(f"üóëÔ∏è Cleanup complete: {deleted_count}/{len(processed_files)} files deleted")

# Discover files in temp folder
temp_folder = project_root / 'data' / 'temp'
temp_folder.mkdir(parents=True, exist_ok=True)  # Ensure folder exists

audio_files = find_audio_files(temp_folder)
files_available = display_batch_info(audio_files)

if files_available:
    print(f"\nüöÄ Ready to process {len(audio_files)} files!")
else:
    print("\nüí° TIP: Add .wav files to data/temp/ folder for testing")

üìä BATCH PROCESSING INFO:
   Files to process: 1
   Total size: 19.3 MB

üìÅ Files found:
   1. blog_record_(manuelillo_cto).wav (19.3 MB)

üöÄ Ready to process 1 files!


In [14]:
# Batch Processing Function (Updated with Full Insights Display)
def process_audio_batch(audio_files: List[Path], pipeline) -> dict:
    """Process all audio files in batch with detailed insights display"""
    
    if not audio_files:
        print("‚ùå No files to process")
        return {"processed": [], "failed": [], "total": 0}
    
    print(f"\nüöÄ STARTING BATCH PROCESSING - {len(audio_files)} files")
    print("=" * 60)
    
    processed_files = []
    failed_files = []
    results = []
    
    for i, file_path in enumerate(audio_files, 1):
        print(f"\nüìÇ Processing {i}/{len(audio_files)}: {file_path.name}")
        print("-" * 40)
        
        # Create initial state
        initial_state = {
            "file_path": str(file_path),
            "filename": file_path.name,
            "transcript_text": None,
            "conversation_id": None,
            "extracted_insights": None,  
            "error": None,
            "status": "processing"
        }
        
        try:
            # Run through pipeline
            result = pipeline.invoke(initial_state)
            
            if result["status"] in ["completed", "insights_extracted"]:
                print(f"‚úÖ SUCCESS: {file_path.name}")
                print(f"   Conversation ID: {result['conversation_id']}")
                print(f"   Transcript preview: {result['transcript_text'][:100]}...")
                
                # FULL INSIGHTS DISPLAY
                if result.get('extracted_insights'):
                    insights = result['extracted_insights']
                    print(f"\nüß† === EXTRACTED INSIGHTS FOR: {file_path.name} ===")
                    print("=" * 50)
                    
                    # Speakers
                    if insights.speakers:
                        print("üë• SPEAKERS:")
                        for speaker in insights.speakers:
                            print(f"   ‚Ä¢ Name: {speaker.name or 'Unknown'}")
                            print(f"     Role: {speaker.role or 'Unknown'}")  
                            print(f"     Company: {speaker.company or 'Unknown'}")
                    
                    # Core Values
                    if insights.core_values:
                        print("üíé CORE VALUES:")
                        for value in insights.core_values:
                            print(f"   ‚Ä¢ {value}")
                    
                    # Priorities
                    if insights.priorities:
                        print("üéØ PRIORITIES:")
                        for priority in insights.priorities:
                            print(f"   ‚Ä¢ {priority}")
                    
                    # Primary Challenges
                    if insights.primary_challenges:
                        print("üî• PRIMARY CHALLENGES:")
                        for challenge in insights.primary_challenges:
                            print(f"   ‚Ä¢ Challenge: {challenge.description}")
                            print(f"     Impact: {challenge.impact}")
                            print(f"     Urgency: {challenge.urgency}")
                    
                    # Secondary Challenges
                    if insights.secondary_challenges:
                        print("‚ö†Ô∏è  SECONDARY CHALLENGES:")
                        for challenge in insights.secondary_challenges:
                            print(f"   ‚Ä¢ Challenge: {challenge.description}")
                            print(f"     Impact: {challenge.impact}")
                            print(f"     Urgency: {challenge.urgency}")
                    
                    # Current Solutions
                    if insights.current_solutions:
                        print("üîß CURRENT SOLUTIONS:")
                        for solution in insights.current_solutions:
                            print(f"   ‚Ä¢ Solution: {solution.solution}")
                            print(f"     Satisfaction: {solution.satisfaction_level}")
                            if solution.limitations:
                                print(f"     Limitations: {', '.join(solution.limitations)}")
                    
                    # Psychological Needs
                    if insights.psychological_needs:
                        print("üßò PSYCHOLOGICAL NEEDS:")
                        for need in insights.psychological_needs:
                            print(f"   ‚Ä¢ {need.description}")
                            print(f"     Category: {need.need_category}")
                            print(f"     Intensity: {need.intensity}")
                    
                    print("üß† === END INSIGHTS ===")
                    print("-" * 50)
                
                processed_files.append(file_path)
            else:
                print(f"‚ùå FAILED: {file_path.name}")
                print(f"   Status: {result.get('status', 'Unknown')}")
                print(f"   Error: {result.get('error', 'Unknown error')}")
                failed_files.append(file_path)
            
            results.append(result)
            
        except Exception as e:
            print(f"‚ùå PIPELINE ERROR: {file_path.name}")
            print(f"   Exception: {str(e)}")
            failed_files.append(file_path)
            
            results.append({
                **initial_state,
                "error": str(e),
                "status": "pipeline_error"
            })
    
    # Final Summary
    print(f"\nüìä BATCH PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"‚úÖ Successfully processed: {len(processed_files)}")
    print(f"‚ùå Failed: {len(failed_files)}")
    print(f"üìÅ Total files: {len(audio_files)}")
    
    if failed_files:
        print(f"\n‚ùå Failed files:")
        for failed_file in failed_files:
            print(f"   - {failed_file.name}")
    
    return {
        "processed": processed_files,
        "failed": failed_files,
        "total": len(audio_files),
        "results": results
    }

print("‚úÖ Batch processing function ready with full insights display")

‚úÖ Batch processing function ready with full insights display


In [15]:
# Cell 6: Define LangGraph Nodes
def transcription_node(state: AudioPipelineState) -> AudioPipelineState:
    """Node 1: Transcribe audio file with AssemblyAI"""
    try:
        print(f"üéôÔ∏è Transcribing: {state['filename']}")
        
        # Configure transcriber
        transcriber = aai.Transcriber()
        
        # Transcribe the file
        transcript = transcriber.transcribe(state['file_path'])
        
        if transcript.status == aai.TranscriptStatus.error:
            return {
                **state,
                "error": f"AssemblyAI error: {transcript.error}",
                "status": "transcription_failed"
            }
        
        return {
            **state,
            "transcript_text": transcript.text,
            "status": "transcribed"
        }
        
    except Exception as e:
        return {
            **state,
            "error": f"Transcription error: {str(e)}",
            "status": "transcription_failed"
        }

def database_saver_node(state: AudioPipelineState) -> AudioPipelineState:
    """Node 2: Save transcript to database"""
    try:
        print(f"üíæ Saving to database: {state['filename']}")
        
        # Create conversation object
        conversation = ConversationCreate(
            title=f"Audio: {state['filename']}",
            raw_text=state['transcript_text'],
            source="transcribed"
        )
        
        # Save to database
        conversation_id = db.create_conversation(conversation)
        
        return {
            **state,
            "conversation_id": conversation_id,
            "status": "completed"
        }
        
    except Exception as e:
        return {
            **state,
            "error": f"Database error: {str(e)}",
            "status": "database_failed"
        }

print("‚úÖ LangGraph nodes defined")

‚úÖ LangGraph nodes defined


In [16]:
def pain_extractor_node(state: AudioPipelineState) -> AudioPipelineState:
    """
    LangGraph node: Extract structured insights from conversation transcript
    """
    print("üß† Starting pain extraction...")
    
    try:
        # Extract insights using OpenAI structured output
        insights = extract_insights_from_transcript(state['transcript_text'])
        
        if insights:
            print(f"‚úÖ Extracted insights: {len(insights.primary_challenges)} primary challenges, {len(insights.speakers)} speakers")
            
            return {
                **state,
                "extracted_insights": insights,
                "status": "insights_extracted"
            }
        else:
            return {
                **state,
                "error": "Failed to extract insights from transcript",
                "status": "error"
            }
            
    except Exception as e:
        print(f"‚ùå Pain extraction failed: {e}")
        return {
            **state,
            "error": f"Pain extraction error: {str(e)}",
            "status": "error"
        }

In [17]:
# Cell: Build Current Pipeline (4 Nodes) - FIXED
def build_pipeline():
    """Build the current LangGraph workflow with transcription, save, insights, and creative agent"""
    workflow = StateGraph(AudioPipelineState)
    
    # Add current nodes (use consistent naming - no spaces)
    workflow.add_node("transcribe", transcription_node)
    workflow.add_node("save_to_db", database_saver_node)  
    workflow.add_node("extract_insights", pain_extractor_node)
    workflow.add_node("creative_agent", creative_agent_node)  # ‚Üê Fixed: no space
    
    # Chain them together (use exact node names)
    workflow.add_edge("transcribe", "save_to_db")
    workflow.add_edge("save_to_db", "extract_insights")
    workflow.add_edge("extract_insights", "creative_agent")  # ‚Üê Fixed: consistent names
    
    workflow.set_entry_point("transcribe")
    workflow.set_finish_point("creative_agent")  # ‚Üê Fixed: no space
    
    return workflow.compile()

# Build the pipeline
pipeline = build_pipeline()
print("‚úÖ LangGraph pipeline compiled (4 nodes: transcribe ‚Üí save_to_db ‚Üí extract_insights ‚Üí creative_agent)")

‚úÖ LangGraph pipeline compiled (4 nodes: transcribe ‚Üí save_to_db ‚Üí extract_insights ‚Üí creative_agent)


In [18]:
# Cell: Clean Conversations Table
def clean_conversations_table():
    """Delete all records from conversations table"""
    
    # First show what will be deleted
    conversations = db.get_all_conversations()
    print(f"üìä Found {len(conversations)} conversations to delete:")
    for conv in conversations[:5]:  # Show first 5
        print(f"  - ID {conv.id}: {conv.title}")
    if len(conversations) > 5:
        print(f"  ... and {len(conversations) - 5} more")
    
    # Ask for confirmation
    response = input(f"\n‚ùì Delete all {len(conversations)} conversations? (y/N): ")
    
    if response.lower() in ['y', 'yes']:
        conn = db.get_connection()
        try:
            cursor = conn.cursor()
            
            # Delete all conversations (this will also delete related blog_post_ideas due to foreign key)
            cursor.execute("DELETE FROM blog_post_ideas")
            cursor.execute("DELETE FROM processing_status") 
            cursor.execute("DELETE FROM conversations")
            conn.commit()
            
            print("‚úÖ All conversations deleted!")
            print("‚úÖ Related blog ideas deleted!")
            print("‚úÖ Processing status cleared!")
            
        finally:
            conn.close()
    else:
        print("‚ùå Deletion cancelled")

# Run the cleaner
clean_conversations_table()

üìä Found 2 conversations to delete:
  - ID 14: Audio: blog_record_(manuelillo_cto).wav
  - ID 13: Audio: blog_record_(manuel_cto).wav
‚úÖ All conversations deleted!
‚úÖ Related blog ideas deleted!
‚úÖ Processing status cleared!


In [41]:
# Cell 9: Execute Batch Processing with Cleanup
if files_available:
    print("üéØ Starting batch processing...")
    
    # Process all files
    batch_results = process_audio_batch(audio_files, pipeline)
    
    # Display summary
    print(f"\nüìä BATCH PROCESSING COMPLETE!")
    print("=" * 60)
    print(f"‚úÖ Successfully processed: {len(batch_results['processed'])}")
    print(f"‚ùå Failed: {len(batch_results['failed'])}")
    print(f"üìÅ Total files: {batch_results['total']}")
    
    # Show failed files
    if batch_results['failed']:
        print(f"\n‚ùå Failed files:")
        for file_path in batch_results['failed']:
            print(f"   - {file_path.name}")
    
    # Cleanup successfully processed files
    if batch_results['processed']:
        confirm = input(f"\nüóëÔ∏è Delete {len(batch_results['processed'])} processed files? (y/N): ")
        if confirm.lower() in ['y', 'yes']:
            cleanup_processed_files(batch_results['processed'])
        else:
            print("üîß Files kept in temp folder for inspection")
    
    print("\nüéâ Batch processing complete!")
    
else:
    print("üí° Add audio files to data/temp/ folder and rerun this cell")

üí° Add audio files to data/temp/ folder and rerun this cell


In [19]:
# Cell 7: Setup Anthropic LLM for Insights Extraction (FIXED)
from langchain_anthropic import ChatAnthropic
import json

# Initialize Anthropic with correct model name
anthropic_key = os.getenv('ANTHROPIC_API_KEY')
if not anthropic_key:
    print("‚ö†Ô∏è  ANTHROPIC_API_KEY not found in .env file")
    print("Please add: ANTHROPIC_API_KEY=your_key_here")
else:
    llm = ChatAnthropic(
        model="claude-haiku-4-5",  # ‚Üê Updated model name
        api_key=anthropic_key,
        temperature=0.1
    )
    print("‚úÖ Anthropic LLM initialized with Claude Haiku 4.5")

‚úÖ Anthropic LLM initialized with Claude Haiku 4.5


In [20]:
## 3. PainExtractor Node Implementation


import openai
import json
from typing import Dict, Any

# System prompt
PAIN_EXTRACTOR_SYSTEM_PROMPT = """
You are a UX researcher and business analyst for BigKids Automation. Your job is listening to transcripts from interviews with users and potential clients. 

You pay special attention to problems that users have regarding how their company is automating, using web apps and AI to save time and move towards a more ethical and sovereign tech infrastructure.

You will be given the transcript of an interview with a user or potential client.

Your task is to extract structured information about:
- Who is speaking and their role
- What this person cares about (values, priorities)
- Their main primary and secondary challenges
- How they are solving problems today
- Are there AI agents that can assist them?
- Their underlying psychological needs (using frameworks like NVC - Non-Violent Communication)

Focus on automation, web apps, AI, time-saving, ethical tech, and sovereign infrastructure themes.

Be thorough but concise. 

IMPORTANT: Only extract information that is explicitly mentioned in the transcript. 
If information is not clearly stated, leave the field empty/null rather than guessing or inferring.
Do not hallucinate or make assumptions about missing information.
"""

In [21]:
# Cell: Extract Insights Function - ROLE FIXED VERSION
def extract_insights_from_transcript(transcript: str) -> ExtractedInsights:
    """Extract structured insights using Anthropic Claude - ROLE FIXED VERSION"""
    
    prompt = f"""
    Analyze this conversation transcript and extract structured insights:
    
    Transcript: {transcript}
    
    IMPORTANT: For speaker roles, use ONLY these exact values:
    - "client" for the person being interviewed/consulted (CTO, CEO, Manager, business owner, etc.)
    - "interviewer" for the person asking questions or conducting the interview
    
    Extract the following information in JSON format:
    - speakers: List of people mentioned with name, role (client/interviewer only), company
    - core_values: What they care about most  
    - priorities: Current focus areas
    - primary_challenges: Main problems they face with description, impact, urgency
    - secondary_challenges: Secondary problems
    - current_solutions: How they solve problems now with satisfaction level
    - psychological_needs: Underlying needs with category, description, intensity
    
    Return ONLY valid JSON in this exact structure - no markdown, no code blocks:
    {{
        "speakers": [
            {{"name": "Manuel", "role": "client", "company": "Drone flytech"}}
        ],
        "core_values": ["efficiency", "transparency"],
        "priorities": ["improving processes"],
        "primary_challenges": [
            {{
                "description": "Tracking payment issues",
                "impact": "Creates confusion in processes", 
                "urgency": "High"
            }}
        ],
        "secondary_challenges": [
            {{
                "description": "Secondary challenge",
                "impact": "Secondary impact",
                "urgency": "Medium"
            }}
        ],
        "current_solutions": [
            {{
                "solution": "Current approach",
                "satisfaction_level": "Neutral",
                "limitations": ["limitation1", "limitation2"]
            }}
        ],
        "psychological_needs": [
            {{
                "need_category": "security",
                "description": "Need for confidence",
                "intensity": "High"
            }}
        ]
    }}
    
    Remember: 
    - Use "client" for Manuel (even though he's CTO)
    - Use "interviewer" for the person asking questions
    - Use exact urgency values: "Low", "Medium", "High"
    - Use exact satisfaction levels: "Very Satisfied", "Satisfied", "Neutral", "Unsatisfied", "Very Unsatisfied"
    - Use exact intensity values: "Low", "Medium", "High"
    """
    
    try:
        # Use the Claude LLM you already set up
        response = llm.invoke(prompt)
        
        print(f"üìù Raw response length: {len(response.content)} chars")
        print(f"üìù Response starts with: {response.content[:50]}...")
        
        # Clean markdown code blocks
        content = response.content.strip()
        if content.startswith('```json'):
            print("üîß Removing JSON markdown blocks...")
            content = content.replace('```json', '').replace('```', '').strip()
            print(f"üîß Cleaned content starts with: {content[:50]}...")
        
        # Parse the cleaned JSON response
        insights_data = json.loads(content)
        
        # Convert to Pydantic model
        result = ExtractedInsights(**insights_data)
        print(f"‚úÖ Successfully extracted insights with correct speaker roles!")
        return result
        
    except json.JSONDecodeError as e:
        print(f"‚ùå JSON parsing error: {e}")
        print(f"üìù Raw response: {response.content[:500]}...")
        raise
    except Exception as e:
        print(f"‚ùå Error in LLM call: {e}")
        raise

print("‚úÖ Updated extract_insights_from_transcript with speaker role fix")

‚úÖ Updated extract_insights_from_transcript with speaker role fix


In [22]:
# Cell: Test Fixed Pain Extraction
def test_pain_extraction_fix():
    """Test the fixed pain extraction with your transcript"""
    
    # Get the conversation that just failed
    conversations = db.get_all_conversations()
    latest_conversation = conversations[0] if conversations else None
    
    if latest_conversation and latest_conversation.raw_text:
        print("üß™ Testing fixed pain extraction...")
        print(f"üìù Using conversation: {latest_conversation.title}")
        
        try:
            # Test the fixed extraction
            insights = extract_insights_from_transcript(latest_conversation.raw_text)
            print(f"‚úÖ Success! Extracted {len(insights.primary_challenges)} challenges")
            print(f"üë• Found {len(insights.speakers)} speakers")
            return True
        except Exception as e:
            print(f"‚ùå Still failing: {e}")
            return False
    else:
        print("‚ùå No conversation found to test with")
        return False

# Test the fix
test_pain_extraction_fix()

‚ùå No conversation found to test with


False

In [23]:
# Cell: Audio File Finder Function
from pathlib import Path
import glob

def find_audio_files_in_temp():
    """Find all audio files in temp folder"""
    temp_folder = Path("../data/temp")  # Adjust path based on notebook location
    
    if not temp_folder.exists():
        print(f"‚ùå Temp folder not found: {temp_folder}")
        return []
    
    # Find audio files
    audio_extensions = ['*.wav', '*.mp3', '*.m4a']
    audio_files = []
    
    for ext in audio_extensions:
        files = list(temp_folder.glob(ext))
        audio_files.extend(files)
    
    return sorted(audio_files)

# Test the function
audio_files = find_audio_files_in_temp()
print(f"üìÅ Found {len(audio_files)} audio files in temp:")
for file in audio_files:
    size_mb = file.stat().st_size / (1024 * 1024)
    print(f"   {file.name} ({size_mb:.1f} MB)")

üìÅ Found 1 audio files in temp:
   blog_record_(manuelillo_cto).wav (19.3 MB)


In [37]:
# Cell: Complete 4-Node Pipeline Test
def test_complete_4_node_pipeline():
    """Test the complete pipeline: Audio ‚Üí Transcribe ‚Üí Save ‚Üí Insights ‚Üí Blog Ideas"""
    
    print("üöÄ TESTING COMPLETE 4-NODE PIPELINE")
    print("=" * 60)
    print("Flow: Audio ‚Üí Transcribe ‚Üí Save to DB ‚Üí Extract Insights ‚Üí Generate Ideas")
    print("=" * 60)
    
    # Check if pipeline is built
    if 'pipeline' not in globals():
        print("‚ùå Pipeline not found!")
        print("üí° Please run the build_pipeline() cell first")
        return None
    
    # Find audio files in temp
    audio_files = find_audio_files_in_temp()
    
    if not audio_files:
        print("‚ùå No audio files found in data/temp/ folder")
        print("\nüí° SOLUTIONS:")
        print("   1. Add a .wav file manually to data/temp/")
        print("   2. Disable cleanup in file_monitor.py and upload new file")
        print("   3. Copy an existing audio file:")
        print("      cp /path/to/audio.wav data/temp/test_file.wav")
        return None
    
    # Use the first audio file
    test_file = audio_files[0]
    print(f"üìÅ Found {len(audio_files)} audio files")
    print(f"üéØ Testing with: {test_file.name}")
    print(f"üìä File size: {test_file.stat().st_size / 1024:.1f} KB")
    
    # Create initial state for complete 4-node pipeline
    initial_state = {
        "file_path": str(test_file),
        "filename": test_file.name,
        "transcript_text": None,           # Will be filled by Node 1
        "conversation_id": None,           # Will be filled by Node 2  
        "extracted_insights": None,        # Will be filled by Node 3
        "raw_blog_ideas": None,            # Will be filled by Node 4
        "scored_blog_ideas": None,         # For future Node 5
        "saved_idea_ids": None,            # For future Node 6
        "error": None,
        "status": "processing"
    }
    
    print(f"\nüé¨ STARTING COMPLETE PIPELINE EXECUTION...")
    print("=" * 60)
    
    try:
        # Execute the complete 4-node pipeline
        print("‚è≥ Running pipeline.invoke()...")
        final_state = pipeline.invoke(initial_state)
        
        print(f"\nüìä COMPLETE PIPELINE RESULTS:")
        print("=" * 60)
        
        # Check final status
        final_status = final_state.get('status', 'unknown')
        print(f"üéØ Final Status: {final_status}")
        
        # Check each stage
        print(f"\nüìã STAGE RESULTS:")
        print(f"   üéôÔ∏è  Transcription: {'‚úÖ' if final_state.get('transcript_text') else '‚ùå'}")
        print(f"   üíæ Database Save: {'‚úÖ' if final_state.get('conversation_id') else '‚ùå'}")
        print(f"   üß† Insights Extraction: {'‚úÖ' if final_state.get('extracted_insights') else '‚ùå'}")
        print(f"   üé® Blog Ideas Generation: {'‚úÖ' if final_state.get('raw_blog_ideas') else '‚ùå'}")
        
        # Show detailed results if successful
        if final_state.get('raw_blog_ideas'):
            ideas = final_state['raw_blog_ideas']
            insights = final_state.get('extracted_insights')
            
            print(f"\nüéâ COMPLETE SUCCESS! End-to-end pipeline worked!")
            print("=" * 70)
            print(f"üìù Conversation ID: {final_state.get('conversation_id')}")
            print(f"üìä Transcript Length: {len(final_state.get('transcript_text', ''))} characters")
            
            if insights:
                print(f"üß† Extracted Insights:")
                print(f"   üë• Speakers: {len(insights.speakers)}")
                print(f"   üî• Primary Challenges: {len(insights.primary_challenges)}")
                print(f"   üßò Psychological Needs: {len(insights.psychological_needs)}")
                print(f"   üíé Core Values: {len(insights.core_values)}")
            
            print(f"\nüé® Generated Blog Ideas ({len(ideas)}):")
            print("-" * 70)
            
            for i, idea in enumerate(ideas, 1):
                print(f"\nüí° IDEA {i}:")
                # Handle both dict and Pydantic object formats
                if hasattr(idea, 'title'):
                    # Pydantic object
                    print(f"   üìù Title: {idea.title}")
                    print(f"   üìÑ Description: {idea.description[:100]}...")
                    print(f"   üéØ Target Audience: {idea.target_audience}")
                    print(f"   üìà Business Value: {idea.business_value}")
                else:
                    # Dictionary
                    print(f"   üìù Title: {idea.get('title', 'No title')}")
                    print(f"   üìÑ Description: {idea.get('description', 'No description')[:100]}...")
                    print(f"   üéØ Target Audience: {idea.get('target_audience', 'Unknown')}")
                    print(f"   üìà Business Value: {idea.get('business_value', 'Unknown')}")
            
            print("=" * 70)
            print("üéâ COMPLETE 4-NODE PIPELINE: SUCCESS!")
            print("‚úÖ System is working end-to-end!")
            print("üöÄ Ready to build Node 5 (Analyst Agent)")
            
            return final_state
            
        else:
            # Pipeline failed somewhere
            print(f"\n‚ùå PIPELINE INCOMPLETE")
            print("=" * 50)
            
            error_msg = final_state.get('error', 'No specific error message')
            print(f"‚ùå Error: {error_msg}")
            print(f"üîç Status: {final_status}")
            
            # Debug info
            print(f"\nüîç DEBUG INFO:")
            print(f"   Transcript exists: {bool(final_state.get('transcript_text'))}")
            if final_state.get('transcript_text'):
                print(f"   Transcript preview: {final_state['transcript_text'][:100]}...")
            print(f"   Insights exist: {bool(final_state.get('extracted_insights'))}")
            print(f"   Ideas exist: {bool(final_state.get('raw_blog_ideas'))}")
            
            return final_state
        
    except Exception as e:
        print(f"\n‚ùå COMPLETE PIPELINE EXECUTION FAILED!")
        print("=" * 60)
        print(f"üí• Exception: {str(e)}")
        
        # Show full traceback for debugging
        import traceback
        print(f"\nüîç FULL ERROR TRACEBACK:")
        traceback.print_exc()
        
        print(f"\nüí° DEBUGGING TIPS:")
        print("   1. Check if all 4 nodes are properly defined")
        print("   2. Verify AssemblyAI API key is working")
        print("   3. Check Anthropic API key is working")
        print("   4. Ensure audio file is not corrupted")
        
        return None

# Execute the complete pipeline test
print("üß™ EXECUTING COMPLETE 4-NODE PIPELINE TEST...")
print("This will test: Audio ‚Üí Transcribe ‚Üí Save ‚Üí Insights ‚Üí Blog Ideas")
print("-" * 60)

complete_test_result = test_complete_4_node_pipeline()

# Show final summary
if complete_test_result:
    print(f"\nüìã FINAL TEST SUMMARY:")
    print(f"   Test Status: {'SUCCESS' if complete_test_result.get('raw_blog_ideas') else 'PARTIAL/FAILED'}")
    print(f"   Pipeline Status: {complete_test_result.get('status', 'unknown')}")
    print(f"   Blog Ideas Generated: {len(complete_test_result.get('raw_blog_ideas', []))}")
else:
    print(f"\nüìã FINAL TEST SUMMARY:")
    print(f"   Test Status: FAILED")
    print(f"   Pipeline could not complete execution")

üß™ EXECUTING COMPLETE 4-NODE PIPELINE TEST...
This will test: Audio ‚Üí Transcribe ‚Üí Save ‚Üí Insights ‚Üí Blog Ideas
------------------------------------------------------------
üöÄ TESTING COMPLETE 4-NODE PIPELINE
Flow: Audio ‚Üí Transcribe ‚Üí Save to DB ‚Üí Extract Insights ‚Üí Generate Ideas
üìÅ Found 1 audio files
üéØ Testing with: blog_record_(manuelillo_cto).wav
üìä File size: 19758.0 KB

üé¨ STARTING COMPLETE PIPELINE EXECUTION...
‚è≥ Running pipeline.invoke()...
üéôÔ∏è Transcribing: blog_record_(manuelillo_cto).wav
üíæ Saving to database: blog_record_(manuelillo_cto).wav
üß† Starting pain extraction...
üìù Raw response length: 2538 chars
üìù Response starts with: ```json
{
    "speakers": [
        {"name": "Manu...
üîß Removing JSON markdown blocks...
üîß Cleaned content starts with: {
    "speakers": [
        {"name": "Manuel", "ro...
‚úÖ Successfully extracted insights with correct speaker roles!
‚úÖ Extracted insights: 3 primary challenges, 1 speakers
üé

In [24]:
# Cell 16: Enhanced Strategy Context for Scoring (Updated for 3 Documents)
def prepare_strategy_context_for_scoring():
    """Prepare strategy context for scoring using all three strategy documents"""
    
    # Load all three strategy documents
    strategy_context = load_company_strategy_context()
    
    # Add scoring guidelines
    strategy_context["scoring_guidelines"] = """
    SCORING CRITERIA (1-10 scale):
    
    1. usefulness_potential: How useful will this post be to readers with problems?
    2. fitwith_seo_strategy: How well does this align with our SEO strategy and keywords?
    3. fitwith_content_strategy: How well does this fit our content strategy and voice?
    4. inspiration_potential: How likely is this to inspire readers to take action?
    5. collaboration_potential: How likely is this to encourage prospects to contact us?
    6. innovation: How unique/differentiated is this topic (10 = very unique)?
    7. difficulty: How complex is this to write (1 = easy, 10 = very complex)?
    """
    
    # Create summaries for LLM prompt efficiency (all three documents)
    if strategy_context.get('company_strategy'):
        strategy_context["company_strategy_summary"] = strategy_context['company_strategy'][:800] + "..."
    
    if strategy_context.get('seo_strategy'):
        strategy_context["seo_strategy_summary"] = strategy_context['seo_strategy'][:600] + "..."
    
    if strategy_context.get('content_strategy'):  # NEW
        strategy_context["content_strategy_summary"] = strategy_context['content_strategy'][:600] + "..."
    
    print(f"‚úÖ Enhanced strategy context for scoring with 3 documents")
    print(f"   Company strategy: {len(strategy_context.get('company_strategy', ''))} chars")
    print(f"   SEO strategy: {len(strategy_context.get('seo_strategy', ''))} chars")
    print(f"   Content strategy: {len(strategy_context.get('content_strategy', ''))} chars")
    
    return strategy_context

# Test the enhanced context
enhanced_context = prepare_strategy_context_for_scoring()

‚úÖ Loaded company strategy (6555 chars)
‚úÖ Loaded SEO strategy (1120 chars)
‚úÖ Loaded content strategy (4469 chars)
‚úÖ Enhanced strategy context for scoring with 3 documents
   Company strategy: 6555 chars
   SEO strategy: 1120 chars
   Content strategy: 4469 chars


In [25]:
# Cell 17: Updated Scoring Engine with Content Strategy Context
def score_blog_idea_with_llm(idea: dict, strategy_context: dict, conversation_context: str = "") -> dict:
    """Score a single blog idea using LLM with all three strategy contexts"""
    
    scoring_prompt = f"""
    You are an expert content strategist for Big Kids Automation. Score this blog post idea on a 1-10 scale using our strategic context.
    
    COMPANY STRATEGY:
    {strategy_context.get('company_strategy_summary', 'Not available')}
    
    SEO STRATEGY:
    {strategy_context.get('seo_strategy_summary', 'Not available')}
    
    CONTENT STRATEGY:
    {strategy_context.get('content_strategy_summary', 'Not available')}
    
    BLOG IDEA TO SCORE:
    Title: {idea.get('title', 'No title')}
    Description: {idea.get('description', 'No description')}
    Target Audience: {idea.get('target_audience', 'Unknown')}
    Business Value: {idea.get('business_value', 'Unknown')}
    Content Angle: {idea.get('content_angle', 'Unknown')}
    
    CONVERSATION CONTEXT:
    {conversation_context[:300] if conversation_context else 'No context available'}...
    
    SCORING INSTRUCTIONS:
    Rate each criterion from 1-10 (10 = excellent, 1 = poor):
    
    1. usefulness_potential: How useful will this be to readers with real problems?
    2. fitwith_seo_strategy: How well does this align with our SEO keywords and strategy?
    3. fitwith_content_strategy: How well does this fit our content strategy, voice, and approach?
    4. inspiration_potential: How likely to inspire readers to take meaningful action?
    5. collaboration_potential: How likely to generate leads/prospects who contact us?
    6. innovation: How unique is this topic compared to existing content?
    7. difficulty: How complex/time-consuming will this be to write? (1=easy, 10=very hard)
    
    Return ONLY valid JSON with your scores and brief reasoning:
    {{
        "usefulness_potential": 8,
        "fitwith_seo_strategy": 7,
        "fitwith_content_strategy": 9,
        "inspiration_potential": 6,
        "collaboration_potential": 8,
        "innovation": 7,
        "difficulty": 4,
        "reasoning": "This idea scores well because it aligns with our content strategy focus on..."
    }}
    """
    
    # ... rest of the function stays the same
    try:
        response = llm.invoke(scoring_prompt)
        
        content = response.content.strip()
        if content.startswith('```json'):
            content = content.replace('```json', '').replace('```', '').strip()
        
        scores = json.loads(content)
        
        # Validate scores are in range
        for criterion in ['usefulness_potential', 'fitwith_seo_strategy', 'fitwith_content_strategy', 
                         'inspiration_potential', 'collaboration_potential', 'innovation', 'difficulty']:
            if criterion in scores:
                scores[criterion] = max(1, min(10, scores[criterion]))
        
        # Calculate total score
        total_score = sum([
            scores.get('usefulness_potential', 5),
            scores.get('fitwith_seo_strategy', 5),
            scores.get('fitwith_content_strategy', 5),
            scores.get('inspiration_potential', 5),
            scores.get('collaboration_potential', 5),
            scores.get('innovation', 5),
            scores.get('difficulty', 5)
        ])
        
        scores['total_score'] = total_score
        return scores
        
    except Exception as e:
        print(f"‚ùå Error scoring idea: {e}")
        return {
            "usefulness_potential": 5, "fitwith_seo_strategy": 5, "fitwith_content_strategy": 5,
            "inspiration_potential": 5, "collaboration_potential": 5, "innovation": 5,
            "difficulty": 5, "total_score": 35, "reasoning": f"Default scores due to error: {str(e)}"
        }

print("‚úÖ Updated LLM scoring engine with content strategy context")

‚úÖ Updated LLM scoring engine with content strategy context


In [26]:
# Cell: Test Three-Document Strategy Loading
def test_three_document_loading():
    """Test loading all three strategy documents"""
    
    print("üß™ Testing three-document strategy loading...")
    
    # Test basic loading
    context = load_company_strategy_context()
    
    # Test enhanced loading for scoring
    enhanced = prepare_strategy_context_for_scoring()
    
    print(f"\nüìä DOCUMENT SUMMARY:")
    for doc_type in ['company_strategy', 'seo_strategy', 'content_strategy']:
        if doc_type in context:
            length = len(context[doc_type]) if context[doc_type] else 0
            status = "‚úÖ Loaded" if length > 100 else "‚ö†Ô∏è Missing/Short"
            print(f"   {doc_type}: {status} ({length} chars)")
    
    return context

# Run the test
test_context = test_three_document_loading()

üß™ Testing three-document strategy loading...
‚úÖ Loaded company strategy (6555 chars)
‚úÖ Loaded SEO strategy (1120 chars)
‚úÖ Loaded content strategy (4469 chars)
‚úÖ Loaded company strategy (6555 chars)
‚úÖ Loaded SEO strategy (1120 chars)
‚úÖ Loaded content strategy (4469 chars)
‚úÖ Enhanced strategy context for scoring with 3 documents
   Company strategy: 6555 chars
   SEO strategy: 1120 chars
   Content strategy: 4469 chars

üìä DOCUMENT SUMMARY:
   company_strategy: ‚úÖ Loaded (6555 chars)
   seo_strategy: ‚úÖ Loaded (1120 chars)
   content_strategy: ‚úÖ Loaded (4469 chars)


In [27]:
def test_analyst_components():
    """Test the analyst agent components using existing functions"""
    
    # Sample idea from your recent pipeline success
    sample_idea = {
        "title": "From Chaos to Clarity: How Automation Fixes Invoice Tracking Without Losing Control",
        "description": "A practical guide exploring how SMEs can implement AI-powered invoice tracking systems...",
        "target_audience": "Finance managers and business owners at SMEs struggling with manual invoice tracking",
        "business_value": "Directly addresses high-urgency pain points while positioning Big Kids as trusted guide"
    }
    
    print("üß™ Testing Analyst Agent Components...")
    print(f"üìù Sample idea: {sample_idea['title'][:50]}...")
    
    # Use the enhanced existing function
    context = prepare_strategy_context_for_scoring()
    
    # Score the idea
    scores = score_blog_idea_with_llm(sample_idea, context)
    
    print(f"\nüìä SCORING RESULTS:")
    print(f"   Usefulness: {scores.get('usefulness_potential', 0)}/10")
    print(f"   SEO Fit: {scores.get('fitwith_seo_strategy', 0)}/10") 
    print(f"   Content Fit: {scores.get('fitwith_content_strategy', 0)}/10")
    print(f"   Inspiration: {scores.get('inspiration_potential', 0)}/10")
    print(f"   Collaboration: {scores.get('collaboration_potential', 0)}/10")
    print(f"   Innovation: {scores.get('innovation', 0)}/10")
    print(f"   Difficulty: {scores.get('difficulty', 0)}/10")
    print(f"   TOTAL SCORE: {scores.get('total_score', 0)}/70")
    
    if scores.get('reasoning'):
        print(f"\nüí≠ Reasoning: {scores['reasoning']}")
    
    return scores

# Run the test with existing functions
test_scores = test_analyst_components()

üß™ Testing Analyst Agent Components...
üìù Sample idea: From Chaos to Clarity: How Automation Fixes Invoic...
‚úÖ Loaded company strategy (6555 chars)
‚úÖ Loaded SEO strategy (1120 chars)
‚úÖ Loaded content strategy (4469 chars)
‚úÖ Enhanced strategy context for scoring with 3 documents
   Company strategy: 6555 chars
   SEO strategy: 1120 chars
   Content strategy: 4469 chars

üìä SCORING RESULTS:
   Usefulness: 8/10
   SEO Fit: 8/10
   Content Fit: 7/10
   Inspiration: 6/10
   Collaboration: 8/10
   Innovation: 0/10
   Difficulty: 4/10
   TOTAL SCORE: 46/70

üí≠ Reasoning: Strong alignment with SME pain points and long-tail keywords ('business process automation for small companies', 'how to implement AI in small business workflows'). Directly addresses high-urgency finance problems. However, the content angle is undefined and the title leans toward operational efficiency rather than the deeper philosophical relationship with tech that defines Big Kids' mission. The 'Without Los