In [1]:
print("Starting linkedin feed script...")

Starting linkedin feed script...


# Setup

In [2]:
from dotenv import load_dotenv
load_dotenv()

import requests
import json
import pandas as pd
import anthropic
import pyperclip
from supabase import create_client, Client
import time
import math
from time import perf_counter
from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
import asyncio
import aiohttp

# Supabase API
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
SERVICE_ROLE_KEY = os.environ["SUPABASE_SERVICE_ROLE_KEY"]

supabase: Client = create_client(SUPABASE_URL, SERVICE_ROLE_KEY)

anthropic_api_key = os.environ["ANTHROPIC_API_KEY"]
client = anthropic.Anthropic(
    api_key=anthropic_api_key,
)

In [58]:
# ‚úÖ Function to Fetch Data from Supabase
def fetch_data(table_name, batch_size=500, filters=None, related_tables=None):
    try:
        all_data = []
        start = 0

        # Build select string
        if related_tables:
            select_string = "*, " + ", ".join(f"{tbl}(*)" for tbl in related_tables)
        else:
            select_string = "*"

        while True:
            query = supabase.table(table_name).select(select_string)
            
            if filters:
                for column, value in filters.items():
                    if isinstance(value, list):
                        query = query.in_(column, value)
                    elif value is None:
                        query = query.is_(column, None)
                    else:
                        query = query.eq(column, value)
            
            response = query.range(start, start + batch_size - 1).execute()
            
            if response.data:
                all_data.extend(response.data)
                start += batch_size
                if len(response.data) < batch_size:
                    break
            else:
                break

        if all_data:
            print(f"‚úÖ Successfully fetched `{table_name}` table with filter '{filters}' and {len(all_data)} rows.")
            return pd.DataFrame(all_data)
        else:
            print(f"‚ö†Ô∏è `{table_name}` is empty.")
            return pd.DataFrame()

    except Exception as e:
        print(f"‚ùå Error fetching data from '{table_name}': {e}")
        return pd.DataFrame()

# ‚úÖ Fetch data from tables
companies = fetch_data(
    "companies",
     filters={"status": ["trial", "active"]},
)
competitors = fetch_data(
    "competitors",
)

‚úÖ Successfully fetched `companies` table with filter '{'status': ['trial', 'active']}' and 6 rows.
‚úÖ Successfully fetched `competitors` table with filter 'None' and 60 rows.


In [59]:
# Filter competitors to only trial accounts
filtered_competitors = competitors[competitors["company_id"].isin(companies["id"])]

# Optional: reset index if you want a clean one
filtered_competitors = filtered_competitors.reset_index(drop=True)
competitors = filtered_competitors
print(f"‚úÖ Filtered to {len(companies)} trial accounts with {len(competitors)} competitors")

‚úÖ Filtered to 6 trial accounts with 26 competitors


# LinkedIn Feed

## Search by keyword

In [60]:
# competitors = competitors[competitors["company_id"] == 73]
competitors

Unnamed: 0,id,created_at,website_url,competitor_name,company_id,logo,coresignal_api_response,facebook_id,google_ads_id,linkedin_id,custom_prompt,news_search_query
0,24,2025-02-07T16:02:05.586464+00:00,https://www.tryhampr.com/,hampr,7,https://www.google.com/s2/favicons?domain=tryh...,"{""id"":12931125,""source_id"":""33519287"",""company...",0.0,AR03382551659182817281,0,Hampr is an on-demand laundry service that con...,"""hampr"" laundry service OR ""hampr"" on-demand l..."
1,152,2025-12-01T20:26:58.956972+00:00,https://www.oracle.com/,Oracle Health,23,https://www.google.com/s2/favicons?domain=orac...,"{""id"":6068905,""source_id"":""1028"",""company_name...",,,1028,Oracle Health (formerly Cerner) is Oracle‚Äôs he...,"(""Oracle Health"" OR Cerner OR ""Oracle healthca..."
2,22,2025-01-27T16:36:50.780226+00:00,https://www.happynest.com/,HappyNest,7,https://www.google.com/s2/favicons?domain=happ...,"{""id"":24236484,""source_id"":""51650575"",""company...",153571900000000.0,AR12657824252641148929,0,HappyNest is a pickup and delivery laundry ser...,"""HappyNest"" laundry service OR ""HappyNest"" pic..."
3,81,2025-08-12T18:48:23.719461+00:00,https://www.piecestech.com/,Pieces,23,https://www.google.com/s2/favicons?domain=piec...,"{""id"":1219335,""source_id"":""10449851"",""company_...",0.0,0,10449851,"Pieces Technologies, Inc. (often just Pieces) ...","""Pieces Technologies"" OR (""Pieces"" AND (health..."
4,80,2025-08-12T18:48:23.589624+00:00,https://www.suki.ai/,Suki,23,https://www.google.com/s2/favicons?domain=suki...,"{""id"":11856286,""source_id"":""17877435"",""company...",0.0,AR04099081465491357697,17877435,Suki is an AI-powered digital assistant design...,"""Suki"" healthcare AI OR ""Suki"" voice assistant..."
5,166,2026-01-17T21:49:52.466505+00:00,https://workiz.com,Workiz,72,https://www.google.com/s2/favicons?domain=work...,"{""id"":12588920,""source_id"":""3794219"",""company_...",275323600000000.0,AR18044909407396954113,3794219,Workiz is a field service management software ...,"""Workiz"" field service software OR ""Workiz"" se..."
6,168,2026-01-17T21:49:53.054686+00:00,https://simplywise.com,Simply Wise,72,https://www.google.com/s2/favicons?domain=simp...,"{""id"":26634625,""source_id"":""55054627"",""company...",0.0,AR16542392064705298433,5151868,Simply Wise (simplywise.com) is a financial te...,"""Simply Wise"" financial services OR ""Simply Wi..."
7,170,2026-01-22T22:20:29.682464+00:00,https://www.turnerconstruction.com/,Turner,73,https://www.google.com/s2/favicons?domain=www....,"{""id"":2458006,""source_id"":""5772"",""company_name...",363996100000000.0,,5772,Turner Construction Company is a leading inter...,"""Turner Construction"" construction OR ""Turner ..."
8,164,2026-01-17T21:49:51.804079+00:00,https://fieldpulse.com,Field Pulse,72,https://www.google.com/s2/favicons?domain=fiel...,"{""id"":10905323,""source_id"":""10872551"",""company...",1740011000000000.0,AR12871690310899466241,10872551,Field Pulse is a field service management soft...,"""Field Pulse"" field service management OR ""Fie..."
9,162,2026-01-17T21:49:51.577829+00:00,https://getjobber.com,Jobber,72,https://www.google.com/s2/favicons?domain=getj...,"{""id"":8103265,""source_id"":""1617574"",""company_n...",183668200000000.0,AR07389967871058640897,1617574,Jobber is a cloud-based field service manageme...,"""Jobber"" field service software OR ""Jobber"" bu..."


In [61]:
print("Starting linkedin feed keyword competitor search..")

API_URL = "https://professional-network-data.p.rapidapi.com/search-posts"
HEADERS = {
    "x-rapidapi-key": "e88f5d3d95msh96c8e7a091f4a90p1bee3cjsn2b537c4486ca",
    "x-rapidapi-host": "professional-network-data.p.rapidapi.com",
    "Content-Type": "application/json"
}

MAX_PAGES = 5
CONCURRENCY_LIMIT = 1

async def fetch_page(session, competitor_id, competitor_name, linkedin_id, page, sem, search_type):

    # Base payload
    payload = {
        "keyword": "",
        "sortBy": "date_posted",
        "datePosted": "pastWeek",
        "page": page,
        "contentType": "",
        "fromMember": [],
        "fromCompany": [],
        "mentionsMember": [],
        "mentionsOrganization": [],
        "authorIndustry": [],
        "authorCompany": [],
        "authorTitle": ""
    }

    # Apply OR logic via separate requests:
    if search_type == "mentions":
        payload["mentionsOrganization"] = [linkedin_id]
        payload["fromCompany"] = []
    elif search_type == "from_company":
        payload["fromCompany"] = [linkedin_id]
        payload["mentionsOrganization"] = []
    else:
        raise ValueError(f"Unknown search_type: {search_type}")

    async with sem:  # rate limit
        try:
            async with session.post(API_URL, json=payload, headers=HEADERS, timeout=30) as resp:
                data = await resp.json()
                posts = data.get("data", {}).get("items") or []

                if not posts:
                    print(f"No more posts for {competitor_name} [{search_type}] page {page}")
                    return []

                results = []
                for post in posts:
                    author = post.get("author") or {}
                    pics = author.get("profilePictures") or []
                    highest_res_pic = (
                        sorted(pics, key=lambda x: x.get("width", 0), reverse=True)[0]["url"]
                        if pics else None
                    )

                    results.append({
                        "competitor_id": competitor_id,
                        "competitor_name": competitor_name,
                        "search_type": search_type,
                        "text": post.get("text"),
                        "postUrl": post.get("url"),
                        "postedDate": post.get("postedDate"),
                        "author_id": author.get("id"),
                        "author_fullName": author.get("fullName"),
                        "author_username": author.get("username"),
                        "author_url": author.get("url"),
                        "headline": author.get("headline"),
                        "author_profile_pic": highest_res_pic,
                    })

                print(f"Processed {competitor_name} [{search_type}] page {page} ‚Äî {len(posts)} posts")
                return results

        except Exception as e:
            print(f"‚ùå Error {competitor_name} [{search_type}] page {page}: {e}")
            return []


async def fetch_all_competitors():
    linkedin_posts = []
    sem = asyncio.Semaphore(CONCURRENCY_LIMIT)

    async with aiohttp.ClientSession() as session:
        tasks = []

        for _, row in competitors.iterrows():
            competitor_name = row["competitor_name"]
            competitor_id = int(row["id"])
            linkedin_id = row.get("linkedin_id")

            if not linkedin_id:
                print(f"Skipping {competitor_name} ‚Äî no linkedin_id")
                continue

            for search_type in ("mentions", "from_company"):
                for page in range(1, MAX_PAGES + 1):
                    tasks.append(
                        fetch_page(
                            session=session,
                            competitor_id=competitor_id,
                            competitor_name=competitor_name,
                            linkedin_id=linkedin_id,
                            page=page,
                            sem=sem,
                            search_type=search_type,
                        )
                    )

        # Run everything concurrently
        results = await asyncio.gather(*tasks)

        # Flatten list-of-lists
        for batch in results:
            if batch:  # safety check
                linkedin_posts.extend(batch)

        return linkedin_posts


# run async function in Jupyter
linkedin_posts = await fetch_all_competitors()

linkedin_feed = pd.DataFrame(linkedin_posts)
print("Completed linkedin feed keyword competitor search..")

Starting linkedin feed keyword competitor search..
Skipping hampr ‚Äî no linkedin_id
Skipping HappyNest ‚Äî no linkedin_id
Processed Oracle Health [mentions] page 1 ‚Äî 10 posts
Processed Oracle Health [mentions] page 2 ‚Äî 10 posts
Processed Oracle Health [mentions] page 3 ‚Äî 10 posts
Processed Oracle Health [mentions] page 4 ‚Äî 10 posts
Processed Oracle Health [mentions] page 5 ‚Äî 10 posts
Processed Oracle Health [from_company] page 1 ‚Äî 10 posts
Processed Oracle Health [from_company] page 2 ‚Äî 2 posts
No more posts for Oracle Health [from_company] page 3
No more posts for Oracle Health [from_company] page 4
No more posts for Oracle Health [from_company] page 5
Processed Pieces [mentions] page 1 ‚Äî 1 posts
No more posts for Pieces [mentions] page 2
No more posts for Pieces [mentions] page 3
No more posts for Pieces [mentions] page 4
No more posts for Pieces [mentions] page 5
No more posts for Pieces [from_company] page 1
No more posts for Pieces [from_company] page 2
No more po

In [62]:
linkedin_feed.head(50)

Unnamed: 0,competitor_id,competitor_name,search_type,text,postUrl,postedDate,author_id,author_fullName,author_username,author_url,headline,author_profile_pic
0,152,Oracle Health,mentions,Organizations that are deploying specialized A...,https://www.linkedin.com/posts/kirk-a-frailey-...,2026-02-09 21:09:08.948 +0000 UTC,1115557,Kirk A. Frailey,kirk-a-frailey-6b1459,https://www.linkedin.com/in/kirk-a-frailey-6b1459,Talent Acquisition Partner/Advisor - Senior Re...,https://media.licdn.com/dms/image/v2/C4E03AQFT...
1,152,Oracle Health,mentions,"Why the ""AI Bubble"" debate is asking the wrong...",https://www.linkedin.com/posts/jeremy-sullivan...,2026-02-09 21:06:23.348 +0000 UTC,73800995,"Jeremy Sullivan, JP",jeremy-sullivan-jp-aab87020,https://www.linkedin.com/in/jeremy-sullivan-jp...,Investment Adviser | Justice of the Peace I NZ...,https://media.licdn.com/dms/image/v2/D5603AQHG...
2,152,Oracle Health,mentions,"‚ú® This week, we‚Äôre celebrating 25 years of Mot...",https://www.linkedin.com/posts/motivcx_this-we...,2026-02-09 21:00:13.162 +0000 UTC,24650,Motiv,motivcx,https://www.linkedin.com/company/motivcx/,"14,507 followers",https://media.licdn.com/dms/image/v2/D560BAQG7...
3,152,Oracle Health,mentions,Hi everyone! üëã\n\nI am currently seeking remot...,https://www.linkedin.com/posts/olaliacarlomicc...,2026-02-09 20:57:32.155 +0000 UTC,794023533,Carlo Micco Olalia,olaliacarlomicco,https://www.linkedin.com/in/olaliacarlomicco,Localization Project Manager (PMP Basic Certif...,https://media.licdn.com/dms/image/v2/D5635AQHM...
4,152,Oracle Health,mentions,I got a kick of inspiration last week while in...,https://www.linkedin.com/posts/mickelibedore_i...,2026-02-09 20:51:52.597 +0000 UTC,20440494,Mickeli Bedore,mickelibedore,https://www.linkedin.com/in/mickelibedore,I help software engineers level up,https://media.licdn.com/dms/image/v2/D5603AQF5...
5,152,Oracle Health,mentions,¬°Hola! ‚ú®\nDespu√©s de algunos meses ha terminad...,https://www.linkedin.com/posts/lorena-raygoza0...,2026-02-09 20:47:43.083 +0000 UTC,1370808784,Lorena Maria Raygoza Ibarra,lorena-raygoza09,https://www.linkedin.com/in/lorena-raygoza09,Desarrolladora Jr. | HTML | CSS | Javascript |...,https://media.licdn.com/dms/image/v2/D4E03AQGk...
6,152,Oracle Health,mentions,Organizations that are deploying specialized A...,https://www.linkedin.com/posts/christina-kupec...,2026-02-09 20:43:52.362 +0000 UTC,1796134,Christina Kupec,christina-kupec-51a727,https://www.linkedin.com/in/christina-kupec-51...,"MBA, Solutions Architect at Oracle",
7,152,Oracle Health,mentions,#Feb092026 #0320pmEST\n\n#From : Dean V. Reich...,https://www.linkedin.com/posts/dean-v-reich-cn...,2026-02-09 20:39:45.331 +0000 UTC,902389216,Dean V. Reich. CNBI LLC,dean-v-reich-cnbi-llc-b9425b212,https://www.linkedin.com/in/dean-v-reich-cnbi-...,CNBI LLC . Consultant | Homeland Security LEO‚Äô...,https://media.licdn.com/dms/image/v2/D5603AQHq...
8,152,Oracle Health,mentions,Hi everyone! üëã\n\nI am currently seeking remot...,https://www.linkedin.com/posts/olaliacarlomicc...,2026-02-09 20:55:42.424 +0000 UTC,794023533,Carlo Micco Olalia,olaliacarlomicco,https://www.linkedin.com/in/olaliacarlomicco,Localization Project Manager (PMP Basic Certif...,https://media.licdn.com/dms/image/v2/D5635AQHM...
9,152,Oracle Health,mentions,Discover the future of AI at #AIWorld Tour Abu...,https://www.linkedin.com/posts/elie-yasmine-mb...,2026-02-09 20:38:25.649 +0000 UTC,79396643,"Elie Yasmine, MBA",elie-yasmine-mba-16b70b22,https://www.linkedin.com/in/elie-yasmine-mba-1...,"Sales Director, Digital BSS & Cloud Solutions ...",https://media.licdn.com/dms/image/v2/D4E03AQH0...


## Send data back to supabase

In [63]:
# Remove trailing " UTC"
linkedin_feed['postedDate'] = pd.to_datetime(
    linkedin_feed['postedDate'].str.replace(" UTC", "", regex=False),
    format='mixed',
    errors='coerce'
).dt.date.astype(str)

# Convert to datetime and extract date as string in YYYY-MM-DD format
linkedin_feed['postedDate'] = pd.to_datetime(linkedin_feed['postedDate']).dt.date.astype(str)

In [64]:
# 1. Remove duplicates in-memory before upsert
linkedin_feed = linkedin_feed.drop_duplicates(subset=["postUrl"], keep="last")

rows = []
for _, row in linkedin_feed.iterrows():
    rows.append({
        "author_id": int(row["author_id"]) if pd.notna(row["author_id"]) else None,
        "author_fullName": row.get("author_fullName"),
        "author_profile_pic": row.get("author_profile_pic"),
        "postUrl": row.get("postUrl"),
        "headline": row.get("headline"),
        "author_url": row.get("author_url"),
        "text": row.get("text"),
        "postedDate": row.get("postedDate"),
        "competitor_id": int(row["competitor_id"]) if pd.notna(row["competitor_id"]) else None,
    })

# 2. Upsert batch
response = supabase.table("linkedin_feed").upsert(
    rows,
    on_conflict="postUrl"
).execute()

count = len(response.data) if response.data else 0
print(f"Upsert complete ‚Äî {count} rows inserted/updated")

Upsert complete ‚Äî 848 rows inserted/updated
