In [1]:
# Load .env first so every os.environ lookup below sees the project secrets.
from dotenv import load_dotenv
load_dotenv()

# Standard library
import asyncio
import json
import os
import re
import textwrap
import time

# Third-party
import anthropic
import pandas as pd
import pyperclip
from google import genai
# NOTE: the next import rebinds the name `genai` to google.generativeai (same as before).
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from openai import AsyncOpenAI
from supabase import create_client, Client

# Supabase
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
SERVICE_ROLE_KEY = os.environ["SUPABASE_SERVICE_ROLE_KEY"]

# The client is created with SERVICE_ROLE_KEY; SUPABASE_KEY is read but not used here.
supabase: Client = create_client(SUPABASE_URL, SERVICE_ROLE_KEY)

# Anthropic
anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]
client = anthropic.Anthropic(
    # defaults to os.environ.get("ANTHROPIC_API_KEY")
    api_key=anthropic_api_key,
)

ANTHROPIC_MODEL = "claude-haiku-4-5-20251001"
# Misspelled alias kept for backward compatibility: a later cell reads ANTHORPIC_MODEL.
ANTHORPIC_MODEL = ANTHROPIC_MODEL

# Gemini
gemini_api_key = os.environ["GOOGLE_API_KEY"]

# OpenAI
openai_api_key = os.environ["OPENAI_API_KEY"]

# Setup

## Grab data from Supabase

In [2]:
# ✅ Function to fetch data from Supabase
def fetch_data(table_name, batch_size=500, filters=None, related_tables=None):
    """Fetch every matching row of a Supabase table into a DataFrame, page by page.

    Args:
        table_name: Name of the table to read.
        batch_size: Rows requested per page; must be >= 1.
        filters: Optional ``{column: value}`` mapping. A list value becomes an
            ``in_`` filter, ``None`` becomes an ``is_`` null filter, and any
            other value an ``eq`` filter.
        related_tables: Optional iterable of related table names; each is added
            to the select string as ``"<table>(*)"``.

    Returns:
        A DataFrame with all fetched rows. An empty DataFrame is returned when
        nothing matches or when any error occurs (the error is printed, not
        raised).
    """
    try:
        # A non-positive batch size would never advance `start`, so the paging
        # loop could spin forever. Raised inside the try so it is reported the
        # same way as every other failure of this function.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        if related_tables:
            select_string = "*, " + ", ".join(f"{tbl}(*)" for tbl in related_tables)
        else:
            select_string = "*"

        all_data = []
        start = 0
        while True:
            # The query is rebuilt for every page, as in the original implementation.
            query = supabase.table(table_name).select(select_string)
            query = _apply_filters(query, filters)

            # range() bounds are inclusive on both ends.
            response = query.range(start, start + batch_size - 1).execute()
            page = response.data
            if not page:
                break

            all_data.extend(page)
            start += batch_size
            if len(page) < batch_size:
                # A short page means the table is exhausted.
                break

        if all_data:
            print(f"✅ Successfully fetched `{table_name}` table with filter '{filters}' and {len(all_data)} rows.")
            return pd.DataFrame(all_data)

        print(f"⚠️ `{table_name}` is empty.")
        return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error fetching data from '{table_name}': {e}")
        return pd.DataFrame()


def _apply_filters(query, filters):
    """Attach one filter per ``{column: value}`` entry and return the resulting query."""
    for column, value in (filters or {}).items():
        if isinstance(value, list):
            query = query.in_(column, value)
        elif value is None:
            query = query.is_(column, None)
        else:
            query = query.eq(column, value)
    return query

# ✅ Fetch data from tables
companies = fetch_data(
    "companies",
    filters={"status": ["trial", "active"]},
)
competitors = fetch_data("competitors")

signals = fetch_data("signals")

# fetch_data returns a column-less DataFrame on error or when nothing matches.
# Fail here with a clear message instead of an opaque KeyError on the column
# lookups that follow in this and later cells.
if companies.empty:
    raise RuntimeError("No trial/active companies were fetched; cannot continue.")
if signals.empty:
    raise RuntimeError("No signals were fetched; cannot continue.")

# Keep only competitors that belong to the trial and active companies fetched above.
if competitors.empty:
    filtered_competitors = competitors
else:
    filtered_competitors = competitors[
        competitors["company_id"].isin(companies["id"])
    ].reset_index(drop=True)
competitors = filtered_competitors
print(f"✅ Filtered to {len(companies)} trial and active accounts with {len(competitors)} competitors")

✅ Successfully fetched `companies` table with filter '{'status': ['trial', 'active']}' and 6 rows.
✅ Successfully fetched `competitors` table with filter 'None' and 59 rows.
✅ Successfully fetched `signals` table with filter 'None' and 931 rows.
✅ Filtered to 6 trial and active accounts with 25 competitors


In [3]:
# Derive a relative link for each signal from its id; rows with a missing id get None.
# NOTE(review): this builds "/signal/<id>", while the LLM prompt later in this
# notebook asks for "/signals/<id>" links — confirm which route is correct.
signals["signal_url"] = signals["id"].apply(
    lambda signal_id: None if pd.isna(signal_id) else f"/signal/{int(signal_id)}"
)
len(signals)

931

## Filter signals to trial and active accounts

In [4]:
# IDs of the companies fetched above (status trial or active)
active_company_ids = set(companies["id"])

# Keep only signals whose company_id IS one of those companies.
# .copy() makes the result an independent frame, so later column assignments on
# `signals` do not raise pandas' SettingWithCopyWarning.
signals = signals[signals["company_id"].isin(active_company_ids)].copy()
signals["company_id"].unique()

array([23., 66., 71., 72., 73.,  7.])

## Filter signals to last 7 days

In [5]:
# Parse both timestamp columns as timezone-aware UTC datetimes.
created_at = pd.to_datetime(signals["created_at"], utc=True)
# Where last_updated is missing (None / NaT), fall back to created_at.
last_updated = pd.to_datetime(signals["last_updated"], utc=True).fillna(created_at)

# assign() returns a new frame, so nothing is written onto a filtered slice.
signals = signals.assign(created_at=created_at, last_updated=last_updated)

# Keep signals whose last_updated falls within the past 7 days.
# Timestamp.now(tz="UTC") replaces the deprecated Timestamp.utcnow().
cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=7)
recent_signals = signals[signals["last_updated"] >= cutoff].copy()

len(recent_signals)

73

## Group signals

In [6]:
# One row per company: `signal_data` holds that company's recent signals as a
# list of record dicts (every column kept, including company_id).
# Iterating the groupby directly avoids the pandas deprecation warning about
# groupby.apply operating on the grouping column, and the explicit `columns`
# keeps the frame well-formed even when there are no recent signals.
grouped_signals = pd.DataFrame(
    [
        {"company_id": company_id, "signal_data": group.to_dict(orient="records")}
        for company_id, group in recent_signals.groupby("company_id")
    ],
    columns=["company_id", "signal_data"],
)
grouped_signals

  .apply(lambda x: x.to_dict(orient="records"))


Unnamed: 0,company_id,signal_data
0,7.0,"[{'id': 2018, 'created_at': 2025-12-08 21:14:4..."
1,23.0,"[{'id': 2528, 'created_at': 2026-01-02 19:54:3..."
2,66.0,"[{'id': 2352, 'created_at': 2025-12-15 18:55:3..."
3,71.0,"[{'id': 2568, 'created_at': 2026-01-07 16:24:5..."
4,72.0,"[{'id': 2772, 'created_at': 2026-01-27 01:59:2..."
5,73.0,"[{'id': 2779, 'created_at': 2026-01-27 01:59:2..."


# Run through LLM

In [7]:
import asyncio
from anthropic import AsyncAnthropic

# Model and concurrency settings for the batch of LLM calls below.
MODEL_NAME = ANTHORPIC_MODEL
MAX_CONCURRENCY = 100

# Async client; note this rebinds `client`, replacing the synchronous one created earlier.
client = AsyncAnthropic(api_key=anthropic_api_key)

# `lock` guards the shared progress counter; `semaphore` caps in-flight requests.
lock = asyncio.Lock()
semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

async def fetch_response(prompt, company_id, progress):
    """Send one prompt to the model and return its text for a company.

    Args:
        prompt: Full prompt text, sent as a single user message.
        company_id: Identifier echoed back in the result and used in log lines.
        progress: Shared dict with ``"done"`` and ``"total"`` counters;
            ``"done"`` is incremented once per call, success or failure.

    Returns:
        ``{"company_id": company_id, "response": text}``, where ``text`` is the
        concatenated text of the reply, or ``None`` if the request failed or
        the reply contained no text.
    """
    async with semaphore:
        try:
            response = await client.messages.create(
                model=MODEL_NAME,
                max_tokens=1200,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )
            text = _extract_text(response)

        except Exception as e:
            # Best effort: one failed company must not abort the whole batch.
            text = None
            print(f"⚠️ Error for company {company_id}: {e}")

        # Update progress safely
        async with lock:
            progress["done"] += 1
            done = progress["done"]
            total = progress["total"]
            print(f"✅ Completed {done}/{total} ({done/total:.0%})")

        return {
            "company_id": company_id,
            "response": text
        }


def _extract_text(response):
    """Join the text of every text-type content block; None when there is none."""
    # The reply content is a list of blocks and the first one is not guaranteed
    # to be text, so collect all text blocks instead of reading content[0].
    parts = [
        block.text
        for block in (response.content or [])
        if getattr(block, "type", None) == "text"
    ]
    return "".join(parts) or None


# Static instructions for the weekly briefing prompt. This is deliberately a
# plain string, NOT an f-string: it contains the literal placeholder `{id}`,
# which an f-string would replace with the repr of the builtin `id` function.
# NOTE(review): the prompt asks for "/signals/<id>" links, while the
# `signal_url` column built earlier uses "/signal/<id>" — confirm the route.
_PROMPT_INSTRUCTIONS = """
**Role:** You are a competitive intelligence analyst delivering a weekly briefing to C-suite executives.

**Task:** Analyze the provided signals and synthesize them into 4-5 key strategic themes.

**Output Format (Markdown):**
- Each theme gets an `##` header (keep titles short—5-7 words max)
- 3 bullet points per theme
- Each bullet must be ONE short sentence—25 words MAX
- Do not camel case the theme title
- Weave signal links naturally INTO the sentence using proper markdown: `[anchor text](/signals/{id})`
- Do NOT append signal titles at the end
- Multiple signals can be linked in a single bullet
- Each bullet should have AT LEAST one signal linked, it is not good to state something without referring to a source

**Quality Standards:**
- Bullets are punchy, not paragraphs
- No preamble or labels
- Be specific: names, numbers, timeframes
- Avoid hedging and filler

**Example Theme:**

## Linear TV budgets up for grabs

- [Disney](/signals/456) and [Netflix](/signals/457) ramping ad tech hiring for linear TV budgets.
- FAST channels proliferating as [low-friction acquisition funnels](/signals/789).
- CTV CPMs at [3-4x premium](/signals/012) over mobile.

**Bad (too long):**
- DoorDash is scaling Dot sidewalk robots and Waymo vehicle delivery with dedicated operations and drone strategy hires in Phoenix/Mesa, integrating autonomous orchestration into DashPass to lower unit costs.

**Good (punchy):**
- [Dot robots](/signals/123) and [Waymo partnership](/signals/456) expanding in Phoenix—autonomous delivery at scale.

-- Here is the content --
"""


def _build_prompt(signal_data):
    """Return the briefing instructions followed by the string form of `signal_data`."""
    return f"{_PROMPT_INSTRUCTIONS}{signal_data}\n"


async def process_all(df):
    """Generate one briefing per row of `df`, running the requests concurrently.

    Args:
        df: DataFrame with a ``company_id`` column and a ``signal_data`` column;
            each ``signal_data`` value is embedded in the prompt via its string
            form.

    Returns:
        A list with one result per row, in row order, as returned by
        ``fetch_response``.
    """
    progress = {"done": 0, "total": len(df)}
    tasks = [
        fetch_response(_build_prompt(row["signal_data"]), row["company_id"], progress)
        for _, row in df.iterrows()
    ]
    # gather() preserves the order of `tasks`, so results line up with the rows.
    return await asyncio.gather(*tasks)

# Run the whole batch. Top-level `await` relies on the notebook kernel's
# (IPython) autoawait support; in a plain script this would need asyncio.run().
results = await process_all(grouped_signals)
# One row per company: columns `company_id` and `response` (None when the call failed).
results_df = pd.DataFrame(results)

✅ Completed 1/6 (17%)
✅ Completed 2/6 (33%)
✅ Completed 3/6 (50%)
✅ Completed 4/6 (67%)
✅ Completed 5/6 (83%)
✅ Completed 6/6 (100%)


In [8]:
# Preview the generated briefings (one row per company) before writing them to Supabase.
results_df.head(15)

Unnamed: 0,company_id,response
0,7.0,## HappyNest betting on sustained brand awaren...
1,23.0,## Oracle weaponizing security narrative after...
2,66.0,## DoorDash dominates through vertical integra...
3,71.0,## Ethical banking as competitive moat\n\n- [T...
4,72.0,## AI embedding unlocking fintech moats\n\n- [...
5,73.0,## Vertical integration reshaping trade econom...


### Send foresights to Supabase

In [9]:
# Write one row per company into the `trends` table.
# NOTE(review): this is a plain insert, so re-running the cell adds duplicate
# rows — confirm whether an upsert / uniqueness rule is wanted.
for row in results_df.itertuples(index=False):
    company_id = int(row.company_id)
    themes = row.response

    # A failed LLM call leaves `response` as None; skip it rather than storing
    # a briefing with null themes.
    if not isinstance(themes, str) or not themes.strip():
        print(f"⚠️ Skipping company {company_id}: no themes were generated")
        continue

    try:
        response = (
            supabase
            .table("trends")
            .insert({
                "company_id": company_id,
                "themes": themes
            })
            .execute()
        )

        if response.data:
            print(f"✅ Inserted company {company_id}")
        else:
            print(f"⚠️ Insert returned no data for company {company_id}")

    except Exception as e:
        # Best effort: keep inserting the remaining companies.
        print(f"💥 Exception inserting company {company_id}: {e}")

✅ Inserted company 7
✅ Inserted company 23
✅ Inserted company 66
✅ Inserted company 71
✅ Inserted company 72
✅ Inserted company 73
