In [1]:
# variables = ['datadate', 'ticker', 'agr', 'bm', 'mom12m', 'mve', 'operprof', 'roeq', 'absacc', 'acc', 'aeavol', 'age', 'baspread', 'BETA', 'bm_ia', 
#              'cash', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia', 'chatoia', 'chcsho', 'chempia', 'chfeps', 'chinv', 'chmom', 'chnanalyst', 
#              'chpmia', 'chtx', 'cinvest', 'convind', 'currat', 'depr', 'disp', 'divi', 'divo', 'dy', 'ear', 'egr', 'ep', 'fgr5yr', 
#              'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'idiovol', 'ill', 'indmom', 'invest', 'IPO', 'lev', 'mom1m', 'mom36m', 'ms', 
#              'mve_ia', 'nanalyst', 'nincr', 'orgcap', 'pchcapx_ia', 'pchcurrat', 'pchdepr', 'pchgm_pchsale', 'pchsale_pchinvt', 
#              'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'pricedelay', 'ps', 'rd', 'rd_mve', 'rd_sale', 'realestate', 
#              'retvol', 'roaq', 'roavol', 'roic', 'rsup', 'salecash', 'saleinv', 'salerec', 'secured', 'securedind', 'sfe', 'sgr', 'sin', 
#              'sp', 'std_dolvol', 'std_turn', 'stdcf', 'sue', 'tang', 'tb', 'turn', 'zerotrade']
variables = ['datadate', 'ticker', 'bm', 'mom12m', 'mve']

In [6]:
from dotenv import find_dotenv, load_dotenv
from getpass import getpass
import os

load_dotenv(find_dotenv())

if not os.getenv("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Enter your GEMINI_API_KEY:")

In [7]:
import sqlite3
import pandas as pd
from ddgs import DDGS
from crewai.tools import tool
from crewai import Agent, Task, Crew, Process
from agentics import AG

# ============================================
# STEP 1: One-time database setup
# ============================================
def setup_firm_database(csv_path, db_path='firm_data.db', selected_columns=None):
    """
    Load your CSV into SQLite database (run this once).
    
    Args:
        csv_path: Path to your 600MB CSV file
        db_path: Path where SQLite database will be created
        selected_columns: List of column names to include (None = include all columns)
    """
    print(f"Loading CSV from {csv_path}...")
    
    # First, read a sample to see available columns
    print("Reading CSV header...")
    sample_df = pd.read_csv(csv_path, nrows=5)
    
    print(f"\nAvailable columns ({len(sample_df.columns)} total):")
    for i, col in enumerate(sample_df.columns, 1):
        print(f"  {i}. {col}")
    
    # If no columns selected, prompt user or use all
    if selected_columns is None:
        print("\n" + "="*70)
        print("No columns specified. You can either:")
        print("1. Use all columns (press Enter)")
        print("2. Or modify the code to pass selected_columns parameter")
        print("="*70)
        response = input("\nPress Enter to use ALL columns, or type 'stop' to exit and modify code: ").strip()
        
        if response.lower() == 'stop':
            print("Exiting. Please modify your code to specify selected_columns.")
            return
        
        selected_columns = sample_df.columns.tolist()
    else:
        # Validate that selected columns exist
        missing = [col for col in selected_columns if col not in sample_df.columns]
        if missing:
            print(f"\n❌ Error: These columns don't exist in CSV: {missing}")
            print("Please check your column names and try again.")
            return
        
        print(f"\n✓ Using {len(selected_columns)} selected columns:")
        for col in selected_columns:
            print(f"  - {col}")
    
    # Filter to selected columns
    sample_df = sample_df[selected_columns]
    
    # Check for duplicates in selected columns
    duplicates = [col for col in selected_columns if selected_columns.count(col) > 1]
    if duplicates:
        print(f"\n⚠ Warning: Found duplicate columns in selection: {set(duplicates)}")
        print("Removing duplicates...")
        selected_columns = list(dict.fromkeys(selected_columns))  # Remove duplicates, preserve order
    
    # Clean column names - remove special characters that SQL doesn't like
    clean_columns = [col.replace(' ', '_').replace('-', '_').replace('.', '_').replace('(', '').replace(')', '').replace('/', '_') 
                    for col in selected_columns]
    
    print(f"\nCleaned column names: {clean_columns[:10]}...")
    
    # Delete existing database to start fresh
    import os
    if os.path.exists(db_path):
        os.remove(db_path)
        print(f"Removed existing database: {db_path}")
    
    # For large files, use chunksize to avoid memory issues
    chunk_size = 50000
    conn = sqlite3.connect(db_path)
    
    first_chunk = True
    chunk_num = 0
    total_rows = 0
    
    print("\nLoading data into database...")
    for chunk in pd.read_csv(csv_path, chunksize=chunk_size, usecols=selected_columns):
        # Apply cleaned column names
        chunk.columns = clean_columns
        
        chunk_num += 1
        total_rows += len(chunk)
        
        if first_chunk:
            chunk.to_sql('firms', conn, if_exists='replace', index=False)
            first_chunk = False
            print(f"\n✓ First chunk loaded. Database columns ({len(clean_columns)} total):")
            print(f"  {', '.join(clean_columns)}")
            print()
        else:
            chunk.to_sql('firms', conn, if_exists='append', index=False)
        
        print(f"  Chunk {chunk_num}: +{len(chunk):,} rows (Total: {total_rows:,})")
    
    # Create indexes for better query performance
    cursor = conn.cursor()
    try:
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_ticker ON firms(ticker)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_date ON firms(datadate)")
        print("\nIndexes created successfully")
    except Exception as e:
        print(f"Note: Could not create indexes - {e}")
    
    conn.commit()
    conn.close()
    
    print(f"\n{'='*70}")
    print(f"✓ Database created successfully!")
    print(f"  Location: {db_path}")
    print(f"  Total rows: {total_rows:,}")
    print(f"  Columns: {len(clean_columns)}")
    print(f"{'='*70}")
    
    return clean_columns

# ============================================
# STEP 2: Define database query tools
# ============================================
@tool("query_firm_database")
def query_firm_database(sql_query: str) -> str:
    """
    Execute SQL queries on the S&P 500 firm characteristics database.
    
    Available table: 'firms' (contains all firm characteristic data)
    
    IMPORTANT DATA NOTES:
    - 'mve' column = log(market value of equity), i.e., log size
    - ALL features are standardized: mean = 0, standard deviation = 1
    - Higher values = more extreme positive, lower values = more extreme negative
    
    Example queries:
    - SELECT * FROM firms WHERE ticker = 'AAPL' ORDER BY datadate DESC LIMIT 1
    - SELECT ticker, mve, bm, mom12m FROM firms ORDER BY datadate DESC LIMIT 20
    - SELECT ticker, bm, mom12m FROM firms WHERE bm < -1 AND mom12m > 1 ORDER BY datadate DESC
    
    Args:
        sql_query: A valid SQL SELECT query
    
    Returns:
        Query results as a formatted string
    """
    try:
        conn = sqlite3.connect('firm_data.db')
        df = pd.read_sql_query(sql_query, conn)
        conn.close()
        
        if df.empty:
            return "Query returned no results."
        
        result = f"Query returned {len(df)} rows:\n\n"
        result += df.to_string(index=False)
        
        return result
    except Exception as e:
        return f"Error executing query: {str(e)}\nPlease check your SQL syntax and try again."

@tool("get_database_schema")
def get_database_schema() -> str:
    """
    Get the schema of the firms database including all column names and types.
    
    CRITICAL DATA INFORMATION:
    - 'mve' = log(market value of equity), represents log firm size
    - ALL features are standardized to mean=0, std=1
    - Values represent standard deviations from mean (z-scores)
    - Example: bm=2.0 means book-to-market is 2 std devs above average
    """
    try:
        conn = sqlite3.connect('firm_data.db')
        cursor = conn.cursor()
        
        cursor.execute("PRAGMA table_info(firms)")
        columns = cursor.fetchall()
        
        cursor.execute("SELECT COUNT(*) FROM firms")
        row_count = cursor.fetchone()[0]
        
        conn.close()
        
        schema = f"""Database: firm_data.db
Table: firms
Total rows: {row_count:,}

DATA STANDARDIZATION:
- 'mve' column = log(market value of equity) - represents log firm size
- ALL features are standardized: mean = 0, standard deviation = 1
- Interpret values as z-scores (standard deviations from mean)
- Example: A value of 2.0 means 2 standard deviations above average
- Example: A value of -1.5 means 1.5 standard deviations below average

Columns:
"""
        for col in columns:
            schema += f"  - {col[1]} ({col[2]})\n"
        
        return schema
    except Exception as e:
        return f"Error retrieving schema: {str(e)}"

@tool("analyze_stock_characteristics")
def analyze_stock_characteristics(ticker: str) -> str:
    """
    Get detailed analysis of a specific stock's standardized characteristics.
    Remember: mve = log size, all values are z-scores (mean=0, std=1).
    
    Args:
        ticker: Stock ticker symbol (e.g., 'AAPL', 'MSFT')
    """
    try:
        conn = sqlite3.connect('firm_data.db')
        query = f"SELECT * FROM firms WHERE ticker = '{ticker.upper()}' ORDER BY datadate DESC LIMIT 1"
        df = pd.read_sql_query(query, conn)
        conn.close()
        
        if df.empty:
            return f"No data found for ticker: {ticker}"
        
        analysis = f"Stock Analysis for {ticker.upper()}:\n"
        analysis += "(Remember: mve = log size, all features are z-scores with mean=0, std=1)\n\n"
        analysis += df.to_string(index=False)
        
        return analysis
    except Exception as e:
        return f"Error analyzing stock: {str(e)}"

@tool("screen_stocks_by_criteria")
def screen_stocks_by_criteria(criteria: str) -> str:
    """
    Screen stocks based on standardized firm characteristics.
    
    REMEMBER: 
    - mve = log firm size (standardized)
    - All features are z-scores (mean=0, std=1)
    - Strong signals typically: |z-score| > 1.0
    
    Args:
        criteria: Screening criteria description
    """
    examples = f"""
Screening examples for "{criteria}" (Remember: all values are standardized z-scores):

1. Strong value stocks (low book-to-market):
   SELECT ticker, bm, mom12m, mve FROM firms 
   WHERE bm < -1.0 
   ORDER BY datadate DESC, bm ASC LIMIT 10

2. Strong momentum stocks:
   SELECT ticker, mom12m, bm, mve FROM firms 
   WHERE mom12m > 1.0 
   ORDER BY datadate DESC, mom12m DESC LIMIT 10

3. Large cap value + momentum:
   SELECT ticker, mve, bm, mom12m FROM firms 
   WHERE mve > 1.0 AND bm < -0.5 AND mom12m > 0.5
   ORDER BY datadate DESC LIMIT 10

4. High conviction signals (extreme values):
   SELECT ticker, bm, mom12m, mve FROM firms 
   WHERE ABS(bm) > 1.5 OR ABS(mom12m) > 1.5
   ORDER BY datadate DESC LIMIT 10

Use query_firm_database with appropriate SQL for your analysis.
"""
    return examples

@tool("compare_stocks")
def compare_stocks(tickers: str) -> str:
    """
    Compare multiple stocks' standardized characteristics side-by-side.
    Remember: mve = log size, all values are z-scores.
    
    Args:
        tickers: Comma-separated ticker symbols (e.g., "AAPL,MSFT,GOOGL")
    """
    try:
        ticker_list = [t.strip().upper() for t in tickers.split(',')]
        ticker_str = "','".join(ticker_list)
        
        conn = sqlite3.connect('firm_data.db')
        query = f"SELECT * FROM firms WHERE ticker IN ('{ticker_str}') ORDER BY ticker, datadate DESC"
        df = pd.read_sql_query(query, conn)
        conn.close()
        
        if df.empty:
            return f"No data found for tickers: {tickers}"
        
        # Get most recent data for each ticker
        df = df.groupby('ticker').first().reset_index()
        
        comparison = f"Comparison of {len(ticker_list)} stocks:\n"
        comparison += "(mve = log size, all features are standardized z-scores)\n\n"
        comparison += df.to_string(index=False)
        
        return comparison
    except Exception as e:
        return f"Error comparing stocks: {str(e)}"

@tool("web_search")
def web_search(query: str) -> str:
    """Search web for current market news and context."""
    return str(DDGS().text(query, max_results=10))

# ============================================
# STEP 3: Create specialized agents
# ============================================
research_agent = Agent(
    role="Quantitative Research Analyst",
    goal="Analyze standardized firm characteristics to identify high-conviction BUY/HOLD/SELL opportunities",
    backstory="""
    You are an expert quantitative analyst specializing in factor-based stock recommendations.
    
    CRITICAL DATA UNDERSTANDING:
    - 'mve' column = log(market value of equity), represents log firm size
    - ALL features are standardized: mean = 0, standard deviation = 1
    - Values are z-scores showing standard deviations from mean
    - Strong signals typically have |z-score| > 1.0 (beyond 1 std dev)
    - Very strong signals have |z-score| > 1.5 or 2.0
    
    Your analytical approach:
    1. Check database schema to understand available standardized factors
    2. Screen for stocks with EXTREME characteristic values (strong signals)
    3. Focus on proven factors: value (bm), momentum (mom12m), size (mve)
    4. Identify 5-10 stocks with the most CONFIDENT signals
    5. Interpret z-scores correctly: 
       - High positive momentum (mom12m > 1.5) = strong BUY signal
       - Low book-to-market (bm < -1.0) = undervalued = potential BUY
       - Negative momentum (mom12m < -1.5) = potential SELL
    
    Remember: Only recommend stocks where you have HIGH CONFIDENCE based on extreme factor values.
    Academic research shows factors like value, momentum, and quality predict returns.
    """,
    llm=AG.get_llm_provider(),
    memory=True,
    verbose=True
)

investment_agent = Agent(
    role="Portfolio Manager - Buy/Hold/Sell Decisions",
    goal="Make definitive BUY/HOLD/SELL recommendations for the 5-10 most confident opportunities",
    backstory="""
    You are a decisive portfolio manager who makes clear BUY/HOLD/SELL calls based on quantitative signals.
    
    DATA UNDERSTANDING:
    - All features are standardized z-scores (mean=0, std=1)
    - 'mve' = log firm size
    - Strong signals = |z-score| > 1.0, very strong = |z-score| > 1.5
    
    Your decision framework:
    1. Review research analyst's high-conviction candidates
    2. For EACH stock, make ONE clear decision: BUY, HOLD, or SELL
    3. Base decisions on standardized factor signals:
       - BUY: Strong positive momentum, undervalued, high quality
       - SELL: Negative momentum, overvalued, deteriorating fundamentals  
       - HOLD: Mixed signals or moderate factor values
    4. Only include 5-10 MOST CONFIDENT recommendations
    5. Rank by conviction level (strongest signal first)
    6. Explain reasoning using z-score interpretations
    
    OUTPUT FORMAT for each stock:
    - Ticker: [TICKER]
    - Recommendation: BUY / HOLD / SELL
    - Conviction: HIGH / MEDIUM
    - Key Signals: [e.g., "Momentum z-score = 2.1 (very strong), BM z-score = -1.3 (undervalued)"]
    - Rationale: [Brief explanation]
    
    Be decisive. Every recommendation must be BUY, HOLD, or SELL - no ambiguity.
    """,
    llm=AG.get_llm_provider(),
    memory=True,
    verbose=True
)

# ============================================
# STEP 4: Define tasks
# ============================================
research_task = Task(
    description="""
    Analyze standardized firm characteristics to identify high-conviction stock opportunities.
    
    User request: {input}
    
    REMEMBER: 
    - mve = log firm size
    - All features are z-scores (mean=0, std=1)
    - Focus on extreme values (|z-score| > 1.0 for strong signals)
    
    Process:
    1. Check database schema to understand available factors
    2. Screen for stocks with EXTREME characteristic values
    3. Focus on: value (bm), momentum (mom12m), size (mve), profitability
    4. Identify 5-10 stocks with strongest factor signals
    5. Provide detailed z-score analysis for each candidate
    
    Output: Shortlist of 5-10 stocks with the most confident factor-based signals.
    """,
    expected_output="Detailed analysis of 5-10 stocks with extreme factor values (z-scores), ranked by signal strength with specific z-score values for key characteristics",
    agent=research_agent,
    tools=[
        get_database_schema,
        query_firm_database,
        analyze_stock_characteristics,
        screen_stocks_by_criteria,
        compare_stocks
    ],
)

investment_task = Task(
    description="""
    Make definitive BUY/HOLD/SELL recommendations for the most confident opportunities.
    
    User request: {input}
    
    REMEMBER:
    - All features are standardized z-scores
    - mve = log size
    - Strong signals: |z-score| > 1.0
    
    Process:
    1. Review research analyst's shortlist
    2. For EACH stock, decide: BUY, HOLD, or SELL
    3. Base on factor signals (momentum, value, quality, size)
    4. Only include 5-10 MOST CONFIDENT recommendations
    5. Rank by conviction (strongest first)
    
    Required output format for EACH stock:
    ---
    Ticker: [TICKER]
    Recommendation: BUY / HOLD / SELL
    Conviction: HIGH / MEDIUM
    Key Signals: [z-score values]
    Rationale: [Why this recommendation based on standardized factors]
    ---
    
    Be decisive. Every stock gets exactly ONE recommendation: BUY, HOLD, or SELL.
    """,
    expected_output="Final list of 5-10 stocks with clear BUY/HOLD/SELL recommendations, conviction levels, z-score analysis, and specific rationale for each decision",
    agent=investment_agent,
    tools=[
        analyze_stock_characteristics,
        compare_stocks,
        web_search
    ],
    context=[research_task]
)

crew = Crew(
    agents=[research_agent, investment_agent],
    tasks=[research_task, investment_task],
    process=Process.sequential,
    memory=False,
    verbose=True
)

# ============================================
# STEP 5: Main loop
# ============================================
if __name__ == "__main__":
    # IMPORTANT: Run this ONCE to create your database
    # Uncomment and specify your columns
    # selected_cols = ['ticker', 'datadate', 'mve', 'bm', 'mom12m', ...]  # Add your columns
    # setup_firm_database('../green cleaned.csv', selected_columns=selected_cols)
    
    print("=" * 70)
    print("AI STOCK ANALYST - BUY/HOLD/SELL Recommendations")
    print("=" * 70)
    print("\nGenerates 5-10 high-conviction BUY/HOLD/SELL recommendations")
    print("Based on standardized firm characteristics (z-scores)")
    print("\nData notes:")
    print("  - 'mve' = log firm size")
    print("  - All features standardized: mean=0, std=1")
    print("  - Values are z-scores (standard deviations from mean)")
    print("\nExample queries:")
    print("  - 'Give me your top BUY recommendations'")
    print("  - 'What stocks should I SELL right now?'")
    print("  - 'Analyze value and momentum stocks for BUY/HOLD/SELL'")
    print("  - 'Find stocks with extreme factor signals'")
    print("\nType 'exit' to quit\n")
    print("=" * 70)
    
    conversation = ""
    while user_input := input("\nUSER: ").strip():
        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("\nThank you for using the AI Stock Analyst. Goodbye!")
            break
            
        print(f"\n{'=' * 70}")
        print(f"ANALYZING: {user_input}")
        print('=' * 70)
        
        result = crew.kickoff(inputs={"input": conversation + user_input})
        
        print(f"\n{'=' * 70}")
        print("FINAL RECOMMENDATIONS (BUY/HOLD/SELL):")
        print('=' * 70)
        print(f"\n{result}\n")
        
        conversation += f"User>{user_input}\nAI>{result}\n"

AI STOCK ANALYST - BUY/HOLD/SELL Recommendations

Generates 5-10 high-conviction BUY/HOLD/SELL recommendations
Based on standardized firm characteristics (z-scores)

Data notes:
  - 'mve' = log firm size
  - All features standardized: mean=0, std=1
  - Values are z-scores (standard deviations from mean)

Example queries:
  - 'Give me your top BUY recommendations'
  - 'What stocks should I SELL right now?'
  - 'Analyze value and momentum stocks for BUY/HOLD/SELL'
  - 'Find stocks with extreme factor signals'

Type 'exit' to quit




USER:  what stocks should i sell right now



ANALYZING: what stocks should i sell right now


Output()

Output()

Output()

[91mAn unknown error occurred. Please check the details below.[0m
[91mError details: list index out of range[0m
[91mAn unknown error occurred. Please check the details below.[0m
[91mError details: list index out of range[0m


Output()

Output()

BadRequestError: litellm.BadRequestError: VertexAIException BadRequestError - {
  "error": {
    "code": 400,
    "message": "API key expired. Please renew the API key.",
    "status": "INVALID_ARGUMENT",
    "details": [
      {
        "@type": "type.googleapis.com/google.rpc.ErrorInfo",
        "reason": "API_KEY_INVALID",
        "domain": "googleapis.com",
        "metadata": {
          "service": "generativelanguage.googleapis.com"
        }
      },
      {
        "@type": "type.googleapis.com/google.rpc.LocalizedMessage",
        "locale": "en-US",
        "message": "API key expired. Please renew the API key."
      }
    ]
  }
}
