## Company Filter

In [1]:
# company_name = "Nextiva"
# company_name

# Setup

In [2]:
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import praw
import requests
import time
from datetime import datetime, timezone
from supabase import create_client, Client
import ast
import json
import pyperclip
import anthropic

SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
SERVICE_ROLE_KEY = os.environ["SUPABASE_SERVICE_ROLE_KEY"]

supabase: Client = create_client(SUPABASE_URL, SERVICE_ROLE_KEY)

# Anthropic
anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]
client = anthropic.Anthropic(
    api_key=anthropic_api_key,
)

praw_client = praw.Reddit(
    client_id="zqMK7UDnOSUy1N-ak-PCFQ",
    client_secret="eUkjU_vH7QZT_X2Jwm_nJ8kdiNFQYw",
    user_agent="search-script by u/Disastrous-Olive-441"
)

# Fetch data

In [3]:
# ‚úÖ Function to Fetch Data from Supabase
def fetch_data(table_name, batch_size=500, filters=None, related_tables=None):
    try:
        all_data = []
        start = 0

        # Build select string
        if related_tables:
            select_string = "*, " + ", ".join(f"{tbl}(*)" for tbl in related_tables)
        else:
            select_string = "*"

        while True:
            query = supabase.table(table_name).select(select_string)
            
            if filters:
                for column, value in filters.items():
                    if isinstance(value, list):
                        query = query.in_(column, value)
                    elif value is None:
                        query = query.is_(column, None)
                    else:
                        query = query.eq(column, value)
            
            response = query.range(start, start + batch_size - 1).execute()
            
            if response.data:
                all_data.extend(response.data)
                start += batch_size
                if len(response.data) < batch_size:
                    break
            else:
                break

        if all_data:
            print(f"‚úÖ Successfully fetched `{table_name}` table with filter '{filters}' and {len(all_data)} rows.")
            return pd.DataFrame(all_data)
        else:
            print(f"‚ö†Ô∏è `{table_name}` is empty.")
            return pd.DataFrame()

    except Exception as e:
        print(f"‚ùå Error fetching data from '{table_name}': {e}")
        return pd.DataFrame()

In [4]:
# ‚úÖ Fetch data from tables
companies = fetch_data(
    "companies",
     filters={"status": ["trial", "active"]},
)
competitors = fetch_data(
    "competitors",
)

‚úÖ Successfully fetched `companies` table with filter '{'status': ['trial', 'active']}' and 5 rows.
‚úÖ Successfully fetched `competitors` table with filter 'None' and 56 rows.


# Filter competitors by account status

In [5]:
# Filter competitors by account status
filtered_competitors = competitors[competitors["company_id"].isin(companies["id"])]

# Optional: reset index if you want a clean one
filtered_competitors = filtered_competitors.reset_index(drop=True)
competitors = filtered_competitors
print(f"‚úÖ Filtered to {len(companies)} trial accounts with {len(competitors)} competitors")

‚úÖ Filtered to 5 trial accounts with 22 competitors


In [6]:
# ‚úÖ Keep only rows where 'reddit' is not NaN
companies = companies[~companies["reddit"].isna()]
companies

Unnamed: 0,id,created_at,company_name,logo,website_url,status,reddit,primary_color,company_custom_prompt,news_topics_search,company_custom_feed_prompt
1,23,2025-08-12T18:46:46.880546+00:00,Abridge,https://www.google.com/s2/favicons?domain=abri...,https://www.abridge.com/,active,"{'keywords': ['medicalscribe', 'nursepractitio...",ea2c00,Abridge is an AI-powered clinical documentatio...,,## Target Content Classification Prompt\n\nYou...
2,66,2025-12-03T03:09:31.833544+00:00,toast,https://www.google.com/s2/favicons?domain=pos....,https://pos.toasttab.com/,trial,"{'keywords': [], 'subreddits': ['smallbusiness...",ff4c00,Toast POS is a cloud-based point-of-sale and r...,,## Competitive Intelligence Relevance Classifi...
3,72,2026-01-17T21:49:43.563146+00:00,MX Build,,https://mxbuild.co/,trial,"{""keywords"": [], ""subreddits"": [""HomeImproveme...",,MX Build combines field service management wit...,,## Competitive Intelligence Relevance Classifi...
4,71,2026-01-05T21:56:47.393102+00:00,Acumen,,https://acumen.org/,trial,"{""keywords"": [], ""subreddits"": [""philanthropy""...",,Acumen is a global nonprofit impact investment...,,## Competitive Intelligence Relevance Classifi...


# Add competitors to keyword query

In [7]:
comp_lists = (
    competitors
    .groupby('company_id', as_index=False)
    .agg(competitors=('competitor_name', lambda x: [c for c in x.dropna()]))
)

companies_with_comps = (
    companies
    .merge(comp_lists, left_on='id', right_on='company_id', how='left')
    .drop(columns=['company_id'])
)

companies_with_comps['competitors'] = companies_with_comps['competitors'].apply(
    lambda v: v if isinstance(v, list) else []
)

companies = companies_with_comps

## Split out keywords and reddit into own columns

In [8]:
def safe_parse(value):
    if isinstance(value, str):
        try:
            return ast.literal_eval(value)
        except Exception:
            return {}
    return value

def extract_keywords_and_subs(row):
    reddit_data = safe_parse(row.get("reddit", {}))
    competitors_data = safe_parse(row.get("competitors", []))

    # Extract reddit keywords & subreddits
    reddit_keywords = reddit_data.get("keywords", []) if isinstance(reddit_data, dict) else []
    subreddits = reddit_data.get("subreddits", []) if isinstance(reddit_data, dict) else []

    # Ensure both are lists
    if not isinstance(reddit_keywords, list):
        reddit_keywords = []
    if not isinstance(subreddits, list):
        subreddits = []

    # Handle competitors
    if not isinstance(competitors_data, list):
        competitors_data = []

    # Combine competitors + reddit keywords
    all_keywords = list({kw.lower().strip() for kw in (reddit_keywords + competitors_data) if kw})

    return pd.Series({
        "keywords": all_keywords,
        "subreddits": subreddits
    })

# Apply function
companies[["keywords", "subreddits"]] = companies.apply(extract_keywords_and_subs, axis=1)
companies

Unnamed: 0,id,created_at,company_name,logo,website_url,status,reddit,primary_color,company_custom_prompt,news_topics_search,company_custom_feed_prompt,competitors,keywords,subreddits
0,23,2025-08-12T18:46:46.880546+00:00,Abridge,https://www.google.com/s2/favicons?domain=abri...,https://www.abridge.com/,active,"{'keywords': ['medicalscribe', 'nursepractitio...",ea2c00,Abridge is an AI-powered clinical documentatio...,,## Target Content Classification Prompt\n\nYou...,"[Oracle Health, Suki, Pieces, Ambience Healthc...","[pieces, nursepractitioner, oracle health, amb...","[medicalscribe, nursepractitioner, healthtech]"
1,66,2025-12-03T03:09:31.833544+00:00,toast,https://www.google.com/s2/favicons?domain=pos....,https://pos.toasttab.com/,trial,"{'keywords': [], 'subreddits': ['smallbusiness...",ff4c00,Toast POS is a cloud-based point-of-sale and r...,,## Competitive Intelligence Relevance Classifi...,"[Square, SpotOn, DoorDash]","[doordash, spoton, square]","[smallbusiness, restaurantowners, restaurantma..."
2,72,2026-01-17T21:49:43.563146+00:00,MX Build,,https://mxbuild.co/,trial,"{""keywords"": [], ""subreddits"": [""HomeImproveme...",,MX Build combines field service management wit...,,## Competitive Intelligence Relevance Classifi...,"[Field Pulse, Jobber, Workiz, Simply Wise, Ser...","[housecall pro, workiz, field pulse, jobber, s...","[HomeImprovement, smallbusiness, FieldServiceM..."
3,71,2026-01-05T21:56:47.393102+00:00,Acumen,,https://acumen.org/,trial,"{""keywords"": [], ""subreddits"": [""philanthropy""...",,Acumen is a global nonprofit impact investment...,,## Competitive Intelligence Relevance Classifi...,"[The Rockefeller Foundation, Triodos, Ashoka]","[triodos, the rockefeller foundation, ashoka]","[philanthropy, socialenterprise, impactinvesting]"


# Grab posts & comments from subbreddits

In [9]:
import prawcore

posts_by_id: dict[str, dict] = {}
comments: list[dict] = []
seen_comment_ids: set[str] = set()

MAX_POSTS_PER_QUERY = 20
MAX_COMMENTS_PER_POST = None

def _to_list(value) -> list[str]:
    """Normalize a cell value into a list of unique, lowercase strings."""
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return []
    if isinstance(value, (list, tuple, pd.Series)):
        items = [str(x) for x in value if pd.notna(x)]
    else:
        items = [s for s in str(value).split(",")]
    out, seen = [], set()
    for s in items:
        norm = " ".join(s.strip().split()).lower()
        if norm and norm not in seen:
            out.append(norm)
            seen.add(norm)
    return out

for _, row in companies.iterrows():
    company_id = row["id"]

    subs = _to_list(row.get("subreddits"))
    keywords = _to_list(row.get("keywords"))

    if not subs or not keywords:
        # Nothing to search for this company
        continue

    for sub in subs:
        try:
            sr = praw_client.subreddit(sub)
        except Exception as e:
            print(f"‚ö†Ô∏è Unable to init subreddit r/{sub}: {e}")
            continue

        for kw in keywords:
            try:
                submissions = list(sr.search(kw, sort="new", limit=MAX_POSTS_PER_QUERY))
            except prawcore.Forbidden:
                print(f"‚ö†Ô∏è Skipping r/{sub} (search forbidden)")
                continue
            except Exception as e:
                print(f"‚ö†Ô∏è Error searching r/{sub} kw='{kw}': {e}")
                continue

            if not submissions:
                # print(f"‚ÑπÔ∏è No results for r/{sub} kw='{kw}'")
                continue

            for submission in submissions:
                sid = getattr(submission, "id", None)
                title = getattr(submission, "title", "(no title)")
                author = getattr(submission, "author", None)
                print(f"üìÑ Found post in r/{sub} for '{kw}': {title[:100]} by {author}")
                
                if not sid:
                    continue

                if sid not in posts_by_id:
                    posts_by_id[sid] = {
                        "company_id": company_id,
                        "subreddit": sub,
                        "post_id": sid,
                        "post_author": str(submission.author) if getattr(submission, "author", None) else None,
                        "post_title": getattr(submission, "title", None),
                        "post_url": (
                            f"https://www.reddit.com{submission.permalink}"
                            if getattr(submission, "permalink", None) else None
                        ),
                        "post_created_utc": getattr(submission, "created_utc", None),
                        "post_selftext": getattr(submission, "selftext", None),
                        "_matched_keywords_set": {kw},
                    }
                else:
                    posts_by_id[sid].setdefault("_matched_keywords_set", set()).add(kw)

                # Fetch comments
                try:
                    submission.comments.replace_more(limit=0)
                    flat = submission.comments.list()
                    if MAX_COMMENTS_PER_POST is not None:
                        flat = flat[:MAX_COMMENTS_PER_POST]

                    for c in flat:
                        cid = getattr(c, "id", None)
                        if not cid or cid in seen_comment_ids:
                            continue
                        seen_comment_ids.add(cid)

                        comments.append({
                            "post_id": sid,
                            "comment_id": cid,
                            "comment_body": getattr(c, "body", None),
                            "comment_author": str(c.author) if getattr(c, "author", None) else None,
                            "comment_created_utc": getattr(c, "created_utc", None),
                            "parent_id": getattr(c, "parent_id", None),
                            "permalink": (
                                f"https://www.reddit.com{c.permalink}"
                                if getattr(c, "permalink", None) else None
                            ),
                            "search_keyword": kw,
                        })
                except Exception:
                    # swallow any per-post comment scrape errors
                    continue

for v in posts_by_id.values():
    mk = v.pop("_matched_keywords_set", None)
    v["matched_keywords"] = sorted(list(mk)) if mk else []

posts_payload = list(posts_by_id.values())
comments_payload = comments

print(f"‚úÖ Reddit scan complete. Posts: {len(posts_payload)}, Comments: {len(comments_payload)}")

üìÑ Found post in r/medicalscribe for 'pieces': Anyone else feel stuck between ‚Äútype faster‚Äù and ‚ÄúAI will take my job anyway‚Äù? by conquest333
üìÑ Found post in r/medicalscribe for 'pieces': ICU scribing & ScribeU by Interesting_Box948
üìÑ Found post in r/medicalscribe for 'pieces': New scribe having a bad time by sofattoofurious
üìÑ Found post in r/medicalscribe for 'pieces': Scribing Final Mayhem  by RepulsiveMood8901
üìÑ Found post in r/medicalscribe for 'pieces': Worldwide remote medical documentation jobs by mislamrahul
üìÑ Found post in r/medicalscribe for 'pieces': proscribe vs scribeamerica applications (since july 20, 2022) by efinlvy
üìÑ Found post in r/medicalscribe for 'pieces': Any tips for turnover psych notes? 5150/5250/voluntary? by mateyman
üìÑ Found post in r/medicalscribe for 'pieces': is this an acceptable practice? by Accomplished-Humor92
üìÑ Found post in r/medicalscribe for 'pieces': Online certification course? by scribethrowaway9385
üìÑ Found p

# Send to supabase

In [10]:
def to_iso(dt):
    if dt is None:
        return None
    try:
        if isinstance(dt, (int, float)):
            ts = pd.to_datetime(dt, unit="s", utc=True, errors="coerce")
        else:
            ts = pd.to_datetime(dt, utc=True, errors="coerce")
        return ts.isoformat().replace("+00:00", "Z") if pd.notnull(ts) else None
    except Exception:
        return None

def normalize_timestamps(records, keys):
    for r in records:
        for k in keys:
            if k in r and r[k] is not None:
                r[k] = to_iso(r[k])
    return records

posts_payload = normalize_timestamps(posts_payload, ["post_created_utc"])
comments_payload = normalize_timestamps(comments_payload, ["comment_created_utc"])

def chunk_list(data, chunk_size):
    for i in range(0, len(data), chunk_size):
        yield data[i : i + chunk_size]

BATCH_SIZE = 500
POST_TABLE = "reddit_posts"
COMMENT_TABLE = "reddit_comments"

inserted_posts = 0
for batch in chunk_list(posts_payload, BATCH_SIZE):
    resp = supabase.table(POST_TABLE).upsert(batch, on_conflict="post_id").execute()
    if resp.data is not None:
        inserted_posts += len(batch)
        print(f"‚úÖ Upserted {len(batch)} posts")
    else:
        raise RuntimeError(f"‚ùå Post upsert failed: {resp.error}")
print(f"üéâ Total posts sent: {inserted_posts}")

inserted_comments = 0
for batch in chunk_list(comments_payload, BATCH_SIZE):
    resp = supabase.table(COMMENT_TABLE).upsert(batch, on_conflict="comment_id").execute()
    if resp.data is not None:
        inserted_comments += len(batch)
        print(f"‚úÖ Upserted {len(batch)} comments")
    else:
        raise RuntimeError(f"‚ùå Comment upsert failed: {resp.error}")
print(f"üéâ Total comments sent: {inserted_comments}")

‚úÖ Upserted 486 posts
üéâ Total posts sent: 486
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 500 comments
‚úÖ Upserted 92 comments
üéâ Total comments sent: 5592
