# Setup

In [1]:
# Announce the start of the run (plain literal: the message has no placeholders).
print("‚úÖ Starting to pull news articles...")

‚úÖ Starting to pull news articles...


In [2]:
import datetime
import json
import os
import time
from urllib.parse import urlparse

import pandas as pd
import requests
from dotenv import load_dotenv
from supabase import create_client, Client

# Load variables from a local .env file before any os.environ lookups below.
load_dotenv()

# Supabase connection settings, read from the process environment.
# Each lookup raises KeyError when the variable is not set.
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]  # NOTE(review): not referenced again in the cells shown here
SERVICE_ROLE_KEY = os.environ["SUPABASE_SERVICE_ROLE_KEY"]

# The client is created with the service-role key (not SUPABASE_KEY).
supabase: Client = create_client(SUPABASE_URL, SERVICE_ROLE_KEY)

# Grab data from database

In [3]:
# ‚úÖ Function to Fetch Data from Supabase
def fetch_data(table_name, batch_size=500, filters=None, related_tables=None):
    """Download every matching row of a Supabase table, page by page.

    Args:
        table_name: Name of the table to read.
        batch_size: Number of rows requested per page.
        filters: Optional ``{column: value}`` mapping. A list value becomes an
            ``in_`` filter, ``None`` becomes an ``is_`` (null) filter and any
            other value becomes an ``eq`` filter.
        related_tables: Optional iterable of related table names; each one is
            added to the select string as ``name(*)``.

    Returns:
        A ``pandas.DataFrame`` with all fetched rows. An empty, column-less
        DataFrame is returned when no rows matched or when any exception was
        raised while fetching (the exception is printed, not re-raised).
    """
    try:
        # Select string: every column, plus every column of each related table.
        columns = "*"
        if related_tables:
            columns = "*, " + ", ".join(f"{tbl}(*)" for tbl in related_tables)

        collected = []
        offset = 0

        while True:
            # A fresh query is built for every page.
            page_query = supabase.table(table_name).select(columns)

            for column, value in (filters or {}).items():
                if value is None:
                    page_query = page_query.is_(column, None)
                elif isinstance(value, list):
                    page_query = page_query.in_(column, value)
                else:
                    page_query = page_query.eq(column, value)

            # range() bounds are inclusive, hence the "- 1".
            page = page_query.range(offset, offset + batch_size - 1).execute().data

            if not page:
                break

            collected.extend(page)
            offset += batch_size

            # A short page means there is nothing left to request.
            if len(page) < batch_size:
                break

        if not collected:
            print(f"‚ö†Ô∏è `{table_name}` is empty.")
            return pd.DataFrame()

        print(f"‚úÖ Successfully fetched `{table_name}` table with filter '{filters}' and {len(collected)} rows.")
        return pd.DataFrame(collected)

    except Exception as exc:
        print(f"‚ùå Error fetching data from '{table_name}': {exc}")
        return pd.DataFrame()

# ‚úÖ Fetch data from tables
# Only companies whose status is "trial" or "active" are in scope.
companies = fetch_data(
    "companies",
    filters={"status": ["trial", "active"]},
)
competitors = fetch_data("competitors")

# fetch_data() returns a column-less DataFrame when a table is empty or the
# request failed. Indexing "id" / "company_id" on such a frame raises a bare
# KeyError, so stop here with a message that says what actually went wrong.
if companies.empty or competitors.empty:
    raise RuntimeError(
        "Cannot continue: `companies` (trial/active) or `competitors` came back "
        "empty. Check the fetch_data() messages above."
    )

# Keep only the competitors that belong to a trial or active company, with a
# clean 0..n-1 index.
filtered_competitors = competitors[
    competitors["company_id"].isin(companies["id"])
].reset_index(drop=True)
competitors = filtered_competitors
print(f"‚úÖ Filtered to {len(companies)} trial and active accounts with {len(competitors)} competitors")

‚úÖ Successfully fetched `companies` table with filter '{'status': ['trial', 'active']}' and 6 rows.
‚úÖ Successfully fetched `competitors` table with filter 'None' and 60 rows.
‚úÖ Filtered to 6 trial and active accounts with 26 competitors


# Collect articles

In [4]:
# Google News search endpoint served through RapidAPI.
url = "https://google-news13.p.rapidapi.com/search"

# SECURITY: the RapidAPI key used to be hard-coded in this cell. It is now read
# from the environment (e.g. the .env file loaded by load_dotenv()), the same
# way the Supabase credentials are. The lookup raises KeyError when
# RAPIDAPI_KEY is not set. The key that was previously committed should be
# rotated.
headers = {
    "x-rapidapi-key": os.environ["RAPIDAPI_KEY"],
    "x-rapidapi-host": "google-news13.p.rapidapi.com"
}

## Collect news for competitors

In [5]:
# Seconds to wait for a single API response before giving up on that request.
REQUEST_TIMEOUT_SECONDS = 30
# Pause between consecutive API calls (presumably to stay under a rate limit).
REQUEST_PAUSE_SECONDS = 0.7
# Column order of the collected articles; also keeps the columns present when
# no article was found at all.
NEWS_COLUMNS = [
    "competitor_id",
    "domain",
    "search_term",
    "timestamp",
    "title",
    "snippet",
    "url",
    "publisher",
    "thumbnail",
]


def _extract_domain(website_url):
    """Return the host part of ``website_url`` without a leading ``www.``.

    Accepts URLs with or without a scheme (``https://www.acme.com/pricing`` and
    ``www.acme.com/pricing`` both give ``acme.com``). Returns ``""`` when the
    value is missing or not a string.
    """
    if not isinstance(website_url, str):
        return ""
    cleaned = website_url.strip()
    if not cleaned:
        return ""
    # urlparse() only fills `netloc` when the URL has a scheme or starts with
    # "//"; without that the host would end up in `path` together with any
    # trailing path segments.
    parsed = urlparse(cleaned if "://" in cleaned else f"//{cleaned}")
    host = parsed.netloc.strip("/")
    # Strip "www." only as a prefix, never from the middle of a host name.
    return host[4:] if host.startswith("www.") else host


news_rows = []

for _, row in competitors.iterrows():
    competitor_id = row["id"]
    competitor_name = row["competitor_name"]
    domain = _extract_domain(row["website_url"])

    # LLM-generated search query
    # query = row["news_search_query"]

    # Temporarily replacing search query with competitor name
    query = competitor_name

    # Search for both the name and the domain. Blank values and duplicates are
    # dropped (order preserved) so no API call is spent on them.
    search_terms = [
        term
        for term in dict.fromkeys([query, domain])
        if isinstance(term, str) and term.strip()
    ]

    print(f"\nüîé Searching for competitor {competitor_name} using:")
    print("   1) Query:", query)
    print("   2) Domain:", domain)

    for term in search_terms:
        params = {"keyword": term, "lr": "en-US"}

        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.get(
                url,
                headers=headers,
                params=params,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            # Report HTTP errors (bad key, rate limit, ...) instead of showing
            # them as "0 articles".
            response.raise_for_status()
            items = response.json().get("items") or []

            print(f"   ‚Üí Found {len(items)} articles for '{term}'")

            # Store results
            for article in items:
                news_rows.append({
                    "competitor_id": competitor_id,
                    "domain": domain,
                    "search_term": term,
                    "timestamp": article.get("timestamp"),
                    "title": article.get("title"),
                    "snippet": article.get("snippet"),
                    "url": article.get("newsUrl"),
                    "publisher": article.get("publisher"),
                    # "images" may be present but null, so `or {}` is needed.
                    "thumbnail": (article.get("images") or {}).get("thumbnail")
                })

        except Exception as e:
            # Best effort: a failing term is reported and the loop continues.
            print(f"‚ùå Error searching '{term}': {e}")

        time.sleep(REQUEST_PAUSE_SECONDS)

# Build DataFrame + dedupe URLs (the first occurrence of each URL is kept).
# Passing `columns` keeps the "url" column present even with zero rows.
news_df = pd.DataFrame(news_rows, columns=NEWS_COLUMNS)
news_df = news_df.drop_duplicates(subset=["url"])

news_df.head(5)


üîé Searching for competitor hampr using:
   1) Query: hampr
   2) Domain: tryhampr.com
   ‚Üí Found 19 articles for 'hampr'
   ‚Üí Found 5 articles for 'tryhampr.com'

üîé Searching for competitor Oracle Health using:
   1) Query: Oracle Health
   2) Domain: oracle.com
   ‚Üí Found 100 articles for 'Oracle Health'
   ‚Üí Found 100 articles for 'oracle.com'

üîé Searching for competitor HappyNest using:
   1) Query: HappyNest
   2) Domain: happynest.com
   ‚Üí Found 52 articles for 'HappyNest'
   ‚Üí Found 32 articles for 'happynest.com'

üîé Searching for competitor Pieces using:
   1) Query: Pieces
   2) Domain: piecestech.com
   ‚Üí Found 100 articles for 'Pieces'
   ‚Üí Found 6 articles for 'piecestech.com'

üîé Searching for competitor Suki using:
   1) Query: Suki
   2) Domain: suki.ai
   ‚Üí Found 100 articles for 'Suki'
   ‚Üí Found 100 articles for 'suki.ai'

üîé Searching for competitor Workiz using:
   1) Query: Workiz
   2) Domain: workiz.com
   ‚Üí Found 29 articles

Unnamed: 0,competitor_id,domain,search_term,timestamp,title,snippet,url,publisher,thumbnail
0,24,tryhampr.com,hampr,1755673200000,Hampr lands $500K investment from Louisiana Gr...,"Hampr, the on-demand laundry service, received...",https://www.theadvocate.com/acadiana/news/busi...,The Advocate,https://news.google.com/api/attachments/CC8iI0...
1,24,tryhampr.com,hampr,1741075200000,Dallas-Based Hampr Turns a Household Chore Int...,Founder Laurel Hess was attending a T-ball gam...,https://dallasinnovates.com/dallas-based-hampr...,Dallas Innovates,https://news.google.com/api/attachments/CC8iK0...
2,24,tryhampr.com,hampr,1669104000000,"I Made $40,000 Last Year Washing Laundry on th...",I can have anywhere from 18 to 25 orders on re...,https://www.businessinsider.com/made-40000-was...,Business Insider,https://news.google.com/api/attachments/CC8iK0...
3,24,tryhampr.com,hampr,1582531200000,Lafayette's Hampr is preparing to bring on-dem...,Hampr charges a yearly membership fee of $39 a...,https://www.theadvertiser.com/story/money/busi...,"The Daily Advertiser | Lafayette, Louisiana",https://news.google.com/api/attachments/CC8iK0...
4,24,tryhampr.com,hampr,1579161600000,"Hampr, A New Local Startup Providing On-demand...",hampr is an on-demand platform where users can...,https://developinglafayette.com/wp/hampr-a-new...,Developing Lafayette -,https://news.google.com/api/attachments/CC8iL0...


## Collect news for company related keywords

In [6]:
# Keep only the companies that have a non-null `news_topics_search` value.
companies_with_keywords = companies.loc[companies["news_topics_search"].notna()]
companies_with_keywords.head()

Unnamed: 0,id,created_at,company_name,logo,website_url,status,reddit,primary_color,company_custom_prompt,news_topics_search,company_custom_feed_prompt


In [7]:
# keyword_news_rows = []

# for _, row in companies_with_keywords.iterrows():
#     company_id = row["id"]
#     company_name = row.get("company_name") or row.get("name")  # just in case naming differs
#     clusters = row["news_topics_search"] or []

#     print(f"\nüîé Searching for company {company_name} ({company_id}):")

#     # clusters is expected to be:
#     # [{"label": "...", "keywords": ["...", "..."]}, ...]
#     for cluster in clusters:
#         label = cluster.get("label", "Unlabeled")
#         keywords = cluster.get("keywords", []) or []

#         for term in keywords:
#             params = {"keyword": term, "lr": "en-US"}  # IMPORTANT: term (not the whole list)

#             try:
#                 response = requests.get(url, headers=headers, params=params, timeout=30)
#                 data = response.json()
#                 items = data.get("items", [])

#                 print(f"   ‚Üí [{label}] '{term}': {len(items)} articles")

#                 for article in items:
#                     keyword_news_rows.append({
#                         "company_id": company_id,
#                         "company_name": company_name,
#                         "topic_label": label,
#                         "search_term": term,
#                         "timestamp": article.get("timestamp"),
#                         "title": article.get("title"),
#                         "snippet": article.get("snippet"),
#                         "url": article.get("newsUrl"),
#                         "publisher": article.get("publisher"),
#                         "thumbnail": (article.get("images") or {}).get("thumbnail"),
#                         "source": "google_news_api"
#                     })

#             except Exception as e:
#                 print(f"‚ùå Error searching [{label}] '{term}': {e}")

#             time.sleep(0.7)

# # Build DataFrame + dedupe URLs
# keyword_news_df = pd.DataFrame(keyword_news_rows)

# if not keyword_news_df.empty:
#     keyword_news_df = keyword_news_df.drop_duplicates(subset=["url"])

# keyword_news_df.head(5)

## Convert timestamp

In [8]:
def _epoch_ms_to_iso_utc(ts):
    """Format an epoch-milliseconds value as ``YYYY-MM-DDTHH:MM:SS+00:00`` (UTC).

    Values that cannot be read as an integer epoch (``None``, NaN, strings that
    are already formatted dates) are returned unchanged. That way a missing
    timestamp does not abort the run and the cell can safely be executed twice.
    """
    try:
        seconds = int(ts) / 1000
        moment = datetime.datetime.fromtimestamp(seconds, tz=datetime.timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return ts
    return moment.strftime("%Y-%m-%dT%H:%M:%S+00:00")


news_df["timestamp"] = news_df["timestamp"].apply(_epoch_ms_to_iso_utc)
news_df

# keyword_news_df["timestamp"] = keyword_news_df["timestamp"].apply(
#     lambda ts: datetime.datetime.fromtimestamp(int(ts)/1000, tz=datetime.timezone.utc)
#         .strftime("%Y-%m-%dT%H:%M:%S+00:00")
# )
# keyword_news_df

Unnamed: 0,competitor_id,domain,search_term,timestamp,title,snippet,url,publisher,thumbnail
0,24,tryhampr.com,hampr,2025-08-20T07:00:00+00:00,Hampr lands $500K investment from Louisiana Gr...,"Hampr, the on-demand laundry service, received...",https://www.theadvocate.com/acadiana/news/busi...,The Advocate,https://news.google.com/api/attachments/CC8iI0...
1,24,tryhampr.com,hampr,2025-03-04T08:00:00+00:00,Dallas-Based Hampr Turns a Household Chore Int...,Founder Laurel Hess was attending a T-ball gam...,https://dallasinnovates.com/dallas-based-hampr...,Dallas Innovates,https://news.google.com/api/attachments/CC8iK0...
2,24,tryhampr.com,hampr,2022-11-22T08:00:00+00:00,"I Made $40,000 Last Year Washing Laundry on th...",I can have anywhere from 18 to 25 orders on re...,https://www.businessinsider.com/made-40000-was...,Business Insider,https://news.google.com/api/attachments/CC8iK0...
3,24,tryhampr.com,hampr,2020-02-24T08:00:00+00:00,Lafayette's Hampr is preparing to bring on-dem...,Hampr charges a yearly membership fee of $39 a...,https://www.theadvertiser.com/story/money/busi...,"The Daily Advertiser | Lafayette, Louisiana",https://news.google.com/api/attachments/CC8iK0...
4,24,tryhampr.com,hampr,2020-01-16T08:00:00+00:00,"Hampr, A New Local Startup Providing On-demand...",hampr is an on-demand platform where users can...,https://developinglafayette.com/wp/hampr-a-new...,Developing Lafayette -,https://news.google.com/api/attachments/CC8iL0...
...,...,...,...,...,...,...,...,...,...
3434,158,doordash.com,doordash.com,2020-03-21T07:00:00+00:00,DoorDash Launches #OpenForDelivery Campaign To...,"Over the past few days, cities and states have...",https://about.doordash.com/en-us/news/doordash...,DoorDash,
3435,158,doordash.com,doordash.com,2025-05-16T07:00:00+00:00,DoorDash partners with Velocity Frequent Flyer,"By linking a Velocity account to DoorDash, mem...",https://retailworldmagazine.com.au/doordash-pa...,Retail World Magazine,https://news.google.com/api/attachments/CC8iK0...
3436,158,doordash.com,doordash.com,2025-01-15T08:00:00+00:00,Georgia DoorDash drivers could be eligible soo...,DoorDash drivers are possibly eligible for new...,https://www.augustachronicle.com/story/news/20...,The Augusta Chronicle,https://news.google.com/api/attachments/CC8iK0...
3437,158,doordash.com,doordash.com,2017-06-22T07:00:00+00:00,Four years in and just getting started,Four years in and just getting started. Four y...,https://about.doordash.com/en-us/news/four-yea...,DoorDash,


## Drop all rows older than one week

In [9]:
# How far back an article may have been published and still be kept.
NEWS_MAX_AGE = pd.Timedelta(weeks=1)

# Parse to timezone-aware UTC datetimes; unparseable values become NaT and are
# dropped by the comparison below (NaT >= cutoff is False).
news_df["timestamp"] = pd.to_datetime(
    news_df["timestamp"],
    utc=True,
    errors="coerce"
)

# Timestamp.now(tz="UTC") replaces Timestamp.utcnow(), which is deprecated in
# recent pandas releases; both give the current time as a UTC-aware Timestamp.
cutoff = pd.Timestamp.now(tz="UTC") - NEWS_MAX_AGE

# Keep only articles published on or after the cutoff.
filtered_df = news_df[news_df["timestamp"] >= cutoff].copy()

print(f"Before: {len(news_df)} rows")
print(f"After:  {len(filtered_df)} rows")
news_df = filtered_df
news_df.head(5)

Before: 3143 rows
After:  621 rows


Unnamed: 0,competitor_id,domain,search_term,timestamp,title,snippet,url,publisher,thumbnail
24,152,oracle.com,Oracle Health,2026-02-02 17:51:42+00:00,Oracle Health Adds Order Creation Capabilities...,Expanded AI capabilities leverage ambient list...,https://www.oracle.com/news/announcement/oracl...,Oracle,https://news.google.com/api/attachments/CC8iK0...
25,152,oracle.com,Oracle Health,2026-02-05 08:36:40+00:00,Accenture Federal Services Selected to Support...,"Accenture Federal Services, a subsidiary of Ac...",https://newsroom.accenture.com/news/2026/accen...,Accenture,https://news.google.com/api/attachments/CC8iK0...
26,152,oracle.com,Oracle Health,2026-02-06 13:53:31+00:00,"""Oracle considering drastic measures to financ...",Oracle's investments in AI may prove costly. I...,https://www.techzine.eu/news/infrastructure/13...,Techzine Global,https://news.google.com/api/attachments/CC8iK0...
27,152,oracle.com,Oracle Health,2026-02-03 09:53:10+00:00,Oracle Health Clinical AI Agent Adds Automated...,Oracle Health has expanded its Clinical AI Age...,https://hlth.com/insights/news/oracle-health-c...,HLTH,https://news.google.com/api/attachments/CC8iK0...
28,152,oracle.com,Oracle Health,2026-02-04 13:05:00+00:00,Multiple Canadian Healthcare Organizations Sel...,Lumeo Regional Health Information System in So...,https://www.nasdaq.com/press-release/multiple-...,Nasdaq,


In [10]:
# # Ensure proper datetime format
# keyword_news_df["timestamp"] = pd.to_datetime(
#     keyword_news_df["timestamp"],
#     # unit="ms",
#     utc=True
# )

# # Define cutoff 1 week ago
# cutoff = pd.Timestamp.utcnow() - pd.Timedelta(weeks=1)

# # Filter
# filtered_df = keyword_news_df[keyword_news_df["timestamp"] >= cutoff].copy()

# print(f"Before: {len(keyword_news_df)} rows")
# print(f"After:  {len(filtered_df)} rows")
# keyword_news_df = filtered_df
# keyword_news_df

# Send to supabase

## Competitor related news

In [11]:
# Maximum number of rows sent to Supabase in one upsert request.
UPSERT_BATCH_SIZE = 500


def _null_if_missing(value):
    """Return ``None`` for missing scalars (None/NaN/NaT), else the value.

    NaN is not valid JSON, so missing values are sent as null instead.
    """
    return None if pd.api.types.is_scalar(value) and pd.isna(value) else value


# Map the DataFrame columns onto the `news_feed` payload fields.
rows = []

for _, row in news_df.iterrows():
    rows.append({
        "competitor_id": _null_if_missing(row["competitor_id"]),
        "published_date": row["timestamp"].isoformat() if pd.notna(row["timestamp"]) else None,
        "publisher": _null_if_missing(row["publisher"]),
        "title": _null_if_missing(row["title"]),
        "description": _null_if_missing(row["snippet"]),
        "url": _null_if_missing(row["url"]),
        "thumbnail": _null_if_missing(row["thumbnail"])
    })

# Upsert in batches so one request never carries the whole result set. With no
# rows the loop body never runs, so no empty request is sent.
count = 0
response = None

for batch_start in range(0, len(rows), UPSERT_BATCH_SIZE):
    batch = rows[batch_start:batch_start + UPSERT_BATCH_SIZE]
    response = supabase.table("news_feed").upsert(
        batch,
        on_conflict="url"
    ).execute()
    count += len(response.data) if response.data else 0

print(f"Upsert complete ‚Äî {count} rows inserted/updated")

Upsert complete ‚Äî 621 rows inserted/updated


## Company keyword related news

In [12]:
# keyword_news_df.head(1)

In [13]:
# rows = []

# for _, row in keyword_news_df.iterrows():
#     rows.append({
#         "company_id": row["company_id"],
#         "published_date": row["timestamp"].isoformat() if pd.notna(row["timestamp"]) else None,
#         "publisher": row["publisher"],
#         "title": row["title"],
#         "description": row["snippet"],
#         "url": row["url"],
#         "thumbnail": row["thumbnail"]
#     })

# response = supabase.table("news_feed").upsert(
#     rows,
#     on_conflict="url"
# ).execute()

# count = len(response.data) if response.data else 0

# print(f"Upsert complete ‚Äî {count} rows inserted/updated")