In [7]:
import os
import re
import json
from pathlib import Path

In [8]:
def preprocess_log_trace(
    log_content,
    roles=('Alice', 'Bob', 'Charlie'),
    skip_markers=('[?2004', ']0;', '[01;', '[00m', '(base)', '(multiagent)', 'mert@bliss1'),
):
    """Clean up a raw terminal log trace, keeping only the relevant content.

    Two kinds of lines survive the filtering:
    - fenced code blocks (the fence lines starting with three backticks and
      everything between them)
    - INFO messages that mention one of ``roles`` (ANSI colour codes removed)

    Every line is first checked against the shell-artifact filters, so blank
    lines and lines containing a skip marker are dropped even when they sit
    inside a code block.

    Args:
        log_content (str): Raw terminal log content as string
        roles (iterable of str): Role names whose INFO messages are kept.
            A single string is treated as one role name.
        skip_markers (iterable of str): Substrings identifying terminal
            control sequences and shell prompts; any line containing one of
            them is dropped. A single string is treated as one marker.

    Returns:
        str: Cleaned log content, kept lines joined with newlines

    Example:
        raw_log = '''
        (base) mert@bliss1: ~$ python3 examples/build...
        2025-01-17 11:39:37.311 | INFO | Alice(SimpleCoder): writing code...
        ```python
        def my_function():
            pass
        ```
        '''
        cleaned = preprocess_log_trace(raw_log)
        # Returns only the INFO message and code block
    """
    # Guard against a bare string being iterated character by character.
    if isinstance(roles, str):
        roles = (roles,)
    if isinstance(skip_markers, str):
        skip_markers = (skip_markers,)
    roles = tuple(roles)
    skip_markers = tuple(skip_markers)

    # Compiled once instead of being looked up for every INFO line.
    ansi_escape = re.compile(r'\x1b\[[0-9;]*[a-zA-Z]')

    cleaned_lines = []
    in_code_block = False

    for line in log_content.split('\n'):
        stripped = line.strip()

        # Skip empty lines
        if not stripped:
            continue

        # Skip command echo lines, terminal control sequences and prompts
        if stripped.startswith('$') or any(marker in line for marker in skip_markers):
            continue

        if line.startswith('```'):
            # A fence toggles the code-block state and is itself kept
            in_code_block = not in_code_block
            cleaned_lines.append(line)
        elif in_code_block:
            # Code content is kept verbatim
            cleaned_lines.append(line)
        elif 'INFO' in line and any(role in line for role in roles):
            # System message from a known role: keep it without colour codes
            cleaned_lines.append(ansi_escape.sub('', line))

    return '\n'.join(cleaned_lines)

def parse_log_file(log_path):
    """Parse and clean a log file to extract only relevant content.

    This function reads a log file, passes it through preprocess_log_trace
    to remove shell artifacts, and pulls a timestamp and the final running
    cost out of the text.

    Args:
        log_path (str): Path to the log file

    Returns:
        dict: Cleaned log content plus extracted metadata
            {
                'timestamp': str or None,  # First 'YYYY-MM-DD HH:MM:SS' found
                                           # in the cleaned content
                'cost': float or None,     # Last 'Total running cost: $X'
                                           # value found in the raw log
                'content': str             # Cleaned log content
            }

    Example:
        log_data = parse_log_file('logs/1.txt')
        print(log_data['content'])  # Shows only relevant content
    """
    # Raw terminal captures can contain bytes that are not valid text in the
    # platform's default encoding; decode as UTF-8 and substitute anything
    # undecodable instead of aborting the whole run.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()

    # Clean up the raw trace
    cleaned_content = preprocess_log_trace(content)

    # Extract the first timestamp present in the cleaned content
    timestamp_match = re.search(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', cleaned_content)
    timestamp = timestamp_match.group(1) if timestamp_match else None

    # Costs are searched in the RAW content because the cleaning step only
    # keeps role messages. The running total is cumulative, so the last match
    # is the final cost. The fractional part is optional so "$1" also parses.
    cost_matches = re.findall(r'Total running cost: \$(\d+(?:\.\d+)?)', content)
    final_cost = float(cost_matches[-1]) if cost_matches else None

    return {
        'timestamp': timestamp,
        'cost': final_cost,
        'content': cleaned_content
    }



In [9]:


def extract_idea_from_command(command):
    """Return the double-quoted value following ``--idea`` in *command*.

    Only the first occurrence is considered. Returns None when the command
    has no ``--idea "<text>"`` argument or the quoted text is empty.
    """
    found = re.search(r'--idea "([^"]+)"', command)
    if not found:
        return None
    return found.group(1)

def build_log_database():
    """Build a database of all runs with their associated commands and logs.

    Reads one command per non-blank line from ``commands.txt``. The i-th
    command (1-based) is paired with ``logs/{i}.txt``; commands whose log file
    does not exist are skipped. Each processed run is written to
    ``runs/run_{i}_newer.json`` (the ``runs`` directory is created if needed).

    Returns:
        list[dict]: One entry per processed run with the keys ``run_id``,
            ``prompt``, ``content``, ``timestamp``, ``cost`` and ``log_file``.
            ``prompt`` is None when the command has no ``--idea "..."``
            argument.
    """
    # Read commands file
    with open('commands.txt', 'r', encoding='utf-8') as f:
        commands = [line.strip() for line in f if line.strip()]

    database = []

    # Create runs directory if it doesn't exist
    runs_dir = Path('runs')
    runs_dir.mkdir(exist_ok=True)

    # Process each command and its corresponding log
    for i, command in enumerate(commands, 1):
        log_path = f'logs/{i}.txt'
        if not os.path.exists(log_path):
            continue

        # May be None: a command without --idea must not abort the whole
        # build, so the prompt is stored as-is without further processing.
        idea = extract_idea_from_command(command)
        log_data = parse_log_file(log_path)

        entry = {
            'run_id': i,
            'prompt': idea,
            'content': log_data['content'],
            'timestamp': log_data['timestamp'],
            'cost': log_data['cost'],
            'log_file': log_path
        }

        # Save individual run data to separate JSON file
        run_file = runs_dir / f'run_{i}_newer.json'
        with open(run_file, 'w', encoding='utf-8') as f:
            json.dump(entry, f, indent=2)

        database.append(entry)

    print(f"Created {len(database)} individual run files in {runs_dir}")
    return database

# Build the run database when this cell executes; the visualization cell
# further down consumes `database`.
database = build_log_database()


Created 31 individual run files in runs


In [None]:
def generate_visualizations(database):
    """
    Generate visualizations from the log database.

    Draws a 2x2 grid (cost over time, cost distribution, prompt length vs
    cost, response length vs cost) followed by a bar chart of total cost per
    day, and shows both figures.

    Args:
        database (list[dict]): Run entries providing at least the keys
            'timestamp', 'cost', 'prompt' and 'content'.

    Returns:
        pandas.DataFrame: The database as a DataFrame with 'timestamp'
            converted to datetime, 'cost' converted to numeric, and the
            derived columns 'prompt_length' and 'content_length'. An empty
            database returns an empty DataFrame without plotting anything.
    """
    import pandas as pd

    # Convert database to DataFrame
    df = pd.DataFrame(database)
    if df.empty:
        # Nothing to plot, and the columns used below would not exist.
        return df

    # Plotting libraries are only needed once there is data to draw.
    import matplotlib.pyplot as plt
    import seaborn as sns

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Runs without a logged cost carry None; make sure the column is numeric
    # (NaN for missing) so the plots and the daily sum behave.
    df['cost'] = pd.to_numeric(df['cost'], errors='coerce')

    # Set style. Matplotlib 3.6 renamed the bundled 'seaborn' style to
    # 'seaborn-v0_8' and later releases dropped the old name, so use
    # whichever of the two this installation provides.
    style_name = next(
        (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
        None,
    )
    if style_name is not None:
        plt.style.use(style_name)

    # Create figure with subplots
    _, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax1, ax2), (ax3, ax4) = axes

    # 1. Cost over time
    sns.scatterplot(data=df, x='timestamp', y='cost', ax=ax1)
    ax1.set_title('Cost Over Time')
    ax1.set_xlabel('Timestamp')
    ax1.set_ylabel('Cost')
    ax1.tick_params(axis='x', labelrotation=45)

    # 2. Cost distribution
    sns.histplot(data=df, x='cost', bins=20, ax=ax2)
    ax2.set_title('Cost Distribution')
    ax2.set_xlabel('Cost')
    ax2.set_ylabel('Count')

    # 3. Prompt length vs Cost
    df['prompt_length'] = df['prompt'].str.len()
    sns.scatterplot(data=df, x='prompt_length', y='cost', ax=ax3)
    ax3.set_title('Prompt Length vs Cost')
    ax3.set_xlabel('Prompt Length (characters)')
    ax3.set_ylabel('Cost')

    # 4. Response length vs Cost
    df['content_length'] = df['content'].str.len()
    sns.scatterplot(data=df, x='content_length', y='cost', ax=ax4)
    ax4.set_title('Response Length vs Cost')
    ax4.set_xlabel('Response Length (characters)')
    ax4.set_ylabel('Cost')

    plt.tight_layout()
    plt.show()

    # Additional time series analysis. Rows without a timestamp cannot be
    # assigned to a day, so they are left out of the daily totals.
    dated = df.dropna(subset=['timestamp'])
    daily_costs = dated.set_index('timestamp').resample('D')['cost'].sum()

    if not daily_costs.empty:
        plt.figure(figsize=(12, 4))
        daily_costs.plot(kind='bar')
        plt.title('Daily Total Costs')
        plt.xlabel('Date')
        plt.ylabel('Total Cost')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return df

# Generate visualizations from the database and keep the returned DataFrame
# (including the derived prompt_length / content_length columns) for
# follow-up analysis.
analysis_df = generate_visualizations(database)
