# Setup

In [1]:
from dotenv import load_dotenv
load_dotenv()

print("Starting jobs feed script...")

import requests
import json
import pandas as pd
import anthropic
import pyperclip
from supabase import create_client, Client
import time
from typing import List, Dict
import asyncio
import aiohttp
import nest_asyncio

# Supabase API
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
SERVICE_ROLE_KEY = os.environ["SUPABASE_SERVICE_ROLE_KEY"]

supabase: Client = create_client(SUPABASE_URL, SERVICE_ROLE_KEY)

Starting jobs feed script...


## Fetch data from supabase

In [2]:
# ✅ Function to Fetch Data from Supabase
def fetch_data(table_name, batch_size=500, filters=None, related_tables=None):
    try:
        all_data = []
        start = 0

        # Build select string
        if related_tables:
            select_string = "*, " + ", ".join(f"{tbl}(*)" for tbl in related_tables)
        else:
            select_string = "*"

        while True:
            query = supabase.table(table_name).select(select_string)
            
            if filters:
                for column, value in filters.items():
                    if isinstance(value, list):
                        query = query.in_(column, value)
                    elif value is None:
                        query = query.is_(column, None)
                    else:
                        query = query.eq(column, value)
            
            response = query.range(start, start + batch_size - 1).execute()
            
            if response.data:
                all_data.extend(response.data)
                start += batch_size
                if len(response.data) < batch_size:
                    break
            else:
                break

        if all_data:
            print(f"✅ Successfully fetched `{table_name}` table with filter '{filters}' and {len(all_data)} rows.")
            return pd.DataFrame(all_data)
        else:
            print(f"⚠️ `{table_name}` is empty.")
            return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error fetching data from '{table_name}': {e}")
        return pd.DataFrame()
# ✅ Fetch data from tables
companies = fetch_data(
    "companies",
     filters={"status": ["trial", "active"]},
)
competitors = fetch_data(
    "competitors",
)

jobs = fetch_data("jobs")

✅ Successfully fetched `companies` table with filter '{'status': ['trial', 'active']}' and 6 rows.
✅ Successfully fetched `competitors` table with filter 'None' and 59 rows.
✅ Successfully fetched `jobs` table with filter 'None' and 4452 rows.


In [3]:
# Filter competitors to only trial accounts
filtered_competitors = competitors[competitors["company_id"].isin(companies["id"])]

# Optional: reset index if you want a clean one
filtered_competitors = filtered_competitors.reset_index(drop=True)
competitors = filtered_competitors
print(f"✅ Filtered to {len(companies)} trial and active accounts with {len(competitors)} competitors")

✅ Filtered to 6 trial and active accounts with 25 competitors


In [4]:
filtered_df = competitors[
    competitors['linkedin_id'].notna() & 
    (competitors['linkedin_id'] != 0) 
]
competitors = filtered_df

# Replace NaN with 0, convert to numeric safely, then cast to int
competitors["linkedin_id"] = (
    pd.to_numeric(competitors["linkedin_id"], errors="coerce")
    .fillna(0)
    .astype(int)
)

# Print number of competitors kept
print(f"✅ {len(competitors)} competitors kept after filtering.")

✅ 23 competitors kept after filtering.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  competitors["linkedin_id"] = (


# Job Data API

## Get list of jobs

In [5]:
# API setup
url = "https://professional-network-data.p.rapidapi.com/search-jobs"
headers = {
    "x-rapidapi-key": "e88f5d3d95msh96c8e7a091f4a90p1bee3cjsn2b537c4486ca",
    "x-rapidapi-host": "professional-network-data.p.rapidapi.com"
}

# Prepare final DataFrame
all_jobs = []

# Iterate through competitors
for _, row in competitors.iterrows():
    competitor_name = row["competitor_name"]
    competitor_id = row["id"]
    linkedin_id = row["linkedin_id"]

    # Set dynamic query parameters
    querystring = {
        "companyIds": str(linkedin_id),
        "locationId": "92000000",
        "datePosted": "anyTime",
        "sort": "mostRecent"
    }
    print(querystring)

    try:
        response = requests.get(url, headers=headers, params=querystring)
        response.raise_for_status()
        result = response.json()

        # Validate structure
        if isinstance(result, dict) and isinstance(result.get("data"), list):
            for job in result["data"]:
                job["competitor_name"] = competitor_name
                job["competitor_id"] = competitor_id
                all_jobs.append(job)
            print(f"Jobs collected for {competitor_name}")
        else:
            print(f"Unexpected response format for {competitor_name}: {result}")

    except Exception as e:
        print(f"Error fetching jobs for {competitor_name}: {e}")

    time.sleep(1)

# Create final DataFrame
jobs_df = pd.DataFrame(all_jobs)
print("Grabbed jobs via API...")

{'companyIds': '1028', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Jobs collected for Oracle Health
{'companyIds': '17877435', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Jobs collected for Suki
{'companyIds': '10449851', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Jobs collected for Pieces
{'companyIds': '77105967', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Jobs collected for Ambience Healthcare
{'companyIds': '3794219', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Jobs collected for Workiz
{'companyIds': '5151868', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Unexpected response format for Simply Wise: {'success': False, 'message': 'The request failed. You will not be charged for this request', 'data': None}
{'companyIds': '5772', 'locationId': '92000000', 'datePosted': 'anyTime', 'sort': 'mostRecent'}
Jobs colle

## Remove jobs that already exist

In [6]:
# Extract ID and append to dataframe
jobs_df['job_id'] = jobs_df['url'].str.extract(r'/view/(\d+)')
jobs['job_id'] = jobs['url'].str.extract(r'/view/(\d+)')

In [7]:
# Ensure both id columns are strings
jobs['job_id'] = jobs['job_id'].astype(str)
jobs_df['job_id'] = jobs_df['job_id'].astype(str)

# Count before filtering
count_before = len(jobs_df)

# Filter out matching job_ids
jobs_df_filtered = jobs_df[~jobs_df['job_id'].isin(jobs['job_id'])]

# Count after filtering
count_after = len(jobs_df_filtered)

# Set dataframe back to jobs_df
jobs_df = jobs_df_filtered
print(f"Dropped duplicate jobs, {len(jobs_df)} jobs remaining...")

Dropped duplicate jobs, 121 jobs remaining...


## Get job details

In [8]:
import asyncio
import aiohttp
import time
import nest_asyncio

nest_asyncio.apply()

url = "https://professional-network-data.p.rapidapi.com/get-job-details"
headers = {
    "x-rapidapi-key": "e88f5d3d95msh96c8e7a091f4a90p1bee3cjsn2b537c4486ca",
    "x-rapidapi-host": "professional-network-data.p.rapidapi.com"
}

MAX_REQUESTS_PER_MINUTE = 50
MIN_INTERVAL = 60 / MAX_REQUESTS_PER_MINUTE

last_request_time = 0
rate_lock = asyncio.Lock()

async def rate_limited_get(session, params):
    global last_request_time

    async with rate_lock:
        # wait if last request was too recent
        elapsed = time.time() - last_request_time
        if elapsed < MIN_INTERVAL:
            await asyncio.sleep(MIN_INTERVAL - elapsed)

        last_request_time = time.time()

        return await session.get(url, headers=headers, params=params, timeout=20)


async def fetch_job(session, row):
    job_id = row["id"]
    competitor_name = row["competitor_name"]
    
    params = {"id": job_id}

    try:
        resp = await rate_limited_get(session, params)

        if resp.status == 429:
            print(f"⚠️ STILL got 429 for {competitor_name} ({job_id}) — slowing further")
            await asyncio.sleep(2)
            return None

        if resp.status != 200:
            print(f"⚠️ {competitor_name} ({job_id}) HTTP {resp.status}")
            return None

        data = await resp.json()

        if "data" in data and isinstance(data["data"], dict):
            job = data["data"]
            job["competitor_id"] = row["competitor_id"]
            job["competitor_name"] = competitor_name
            print(f"✅ Job: {competitor_name} ({job_id})")
            return job
        
        print(f"⚠️ Unexpected format for {competitor_name}")
        return None

    except Exception as e:
        print(f"❌ Error {competitor_name} ({job_id}): {e}")
        return None


async def fetch_all(df):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_job(session, row) for _, row in df.iterrows()]
        return await asyncio.gather(*tasks)

# RUN
results = await fetch_all(jobs_df)
job_descriptions_df = pd.DataFrame([r for r in results if r])
job_descriptions_df.head()

✅ Job: Oracle Health (4355500486)
✅ Job: Oracle Health (4355500473)
✅ Job: Oracle Health (4355500469)
✅ Job: Oracle Health (4355470762)
✅ Job: Oracle Health (4355412151)
✅ Job: Oracle Health (4355327256)
✅ Job: Oracle Health (4355480666)
✅ Job: Oracle Health (4355402364)
✅ Job: Oracle Health (4355382601)
✅ Job: Oracle Health (4355382596)
✅ Job: Oracle Health (4355364485)
✅ Job: Oracle Health (4355327258)
✅ Job: Oracle Health (4355510167)
✅ Job: Oracle Health (4355510163)
✅ Job: Oracle Health (4355510159)
✅ Job: Oracle Health (4355500474)
✅ Job: Oracle Health (4355500472)
✅ Job: Oracle Health (4355500466)
✅ Job: Oracle Health (4355480669)
✅ Job: Oracle Health (4355470763)
✅ Job: Oracle Health (4355470761)
✅ Job: Oracle Health (4355460761)
✅ Job: Oracle Health (4355450716)
✅ Job: Oracle Health (4355450712)
❌ Error Oracle Health (4355354941): 
⚠️ Unexpected format for Suki
✅ Job: Turner (4365525229)
✅ Job: Turner (4343311655)
✅ Job: Turner (4343129372)
✅ Job: Turner (4343441086)
✅ Job: Tu

Unnamed: 0,id,state,title,description,url,applyMethod,company,contentLanguage,location,type,...,industries,formattedIndustries,formattedExperienceLevel,listedAt,listedAtDate,originalListedAt,originalListedDate,competitor_id,competitor_name,workRemoteAllowed
0,4355500486,LISTED,Data Center Technician 2,Job Description\n\nDaily Job Duties:\n\nHardwa...,https://www.linkedin.com/jobs/view/4355500486/,{'companyApplyUrl': 'https://eeho.fa.us2.oracl...,"{'id': 1028, 'name': 'Oracle', 'universalName'...","{'code': 'EN', 'name': 'English'}","Chicago, IL",Full-time,...,[96],[IT Services and IT Consulting],Mid-Senior level,1769185441000,2026-01-23 16:24:01 +0000 UTC,1769184016000,2026-01-23 16:00:16 +0000 UTC,152,Oracle Health,
1,4355500473,LISTED,Software Developer 3,Job Description\n\nThe Oracle Cloud Infrastruc...,https://www.linkedin.com/jobs/view/4355500473/,{'companyApplyUrl': 'https://eeho.fa.us2.oracl...,"{'id': 1028, 'name': 'Oracle', 'universalName'...","{'code': 'EN', 'name': 'English'}",United States,Full-time,...,[96],[IT Services and IT Consulting],Mid-Senior level,1769185441000,2026-01-23 16:24:01 +0000 UTC,1769183930000,2026-01-23 15:58:50 +0000 UTC,152,Oracle Health,True
2,4355500469,LISTED,Principal Software Engineer - Server Managemen...,Job Description\n\nPrincipal Software Develope...,https://www.linkedin.com/jobs/view/4355500469/,{'companyApplyUrl': 'https://eeho.fa.us2.oracl...,"{'id': 1028, 'name': 'Oracle', 'universalName'...","{'code': 'EN', 'name': 'English'}","Santa Clara, CA",Full-time,...,[96],[IT Services and IT Consulting],Mid-Senior level,1769185441000,2026-01-23 16:24:01 +0000 UTC,1769183921000,2026-01-23 15:58:41 +0000 UTC,152,Oracle Health,
3,4355470762,LISTED,Industry Sales Executive - Professional Services,Job Description\n\nSells a subset of product o...,https://www.linkedin.com/jobs/view/4355470762/,{'companyApplyUrl': 'https://eeho.fa.us2.oracl...,"{'id': 1028, 'name': 'Oracle', 'universalName'...","{'code': 'EN', 'name': 'English'}",United States,Full-time,...,[96],[IT Services and IT Consulting],Mid-Senior level,1769185441000,2026-01-23 16:24:01 +0000 UTC,1769183919000,2026-01-23 15:58:39 +0000 UTC,152,Oracle Health,True
4,4355412151,LISTED,Enterprise Application Sales Executive – Cloud...,Job Description\n\nThis position is responsibl...,https://www.linkedin.com/jobs/view/4355412151/,{'companyApplyUrl': 'https://eeho.fa.us2.oracl...,"{'id': 1028, 'name': 'Oracle', 'universalName'...","{'code': 'EN', 'name': 'English'}",United States,Full-time,...,[96],[IT Services and IT Consulting],Mid-Senior level,1769185441000,2026-01-23 16:24:01 +0000 UTC,1769183919000,2026-01-23 15:58:39 +0000 UTC,152,Oracle Health,True


## Send to supabase

In [9]:
job_descriptions_df.groupby("competitor_name").size()

competitor_name
Ashoka                         7
Clayco                         4
DoorDash                       1
Field Pulse                    2
Housecall Pro                  6
Oracle Health                 24
Skanksa                       18
SpotOn                        22
Square                         1
The Rockefeller Foundation     1
Triodos                        1
Turner                        24
Wex FSM                        5
dtype: int64

In [10]:
for _, row in job_descriptions_df.iterrows():
    competitor_id = int(row["competitor_id"])
    title = row["title"]
    description = row["description"]
    postedAt_raw = row["originalListedDate"]

    postedAt = postedAt_raw.split(" ")[0]

    url = row["url"]

    response = supabase.table("jobs").insert({
        "competitor_id": competitor_id,
        "title": title,
        "postedAt": postedAt,
        "description": description,
        "url": url,
        # "relevant": True,
        "processed": False,
    }).execute()

    if response.data:
        print(f"✅ Insert record {title}")
    else:
        print(f"❌ Failed to insert record {title}")
print("Inserted job descriptions into supabase...")

✅ Insert record Data Center Technician 2
✅ Insert record Software Developer 3
✅ Insert record Principal Software Engineer - Server Management Firmware (BMC/ILOM)
✅ Insert record Industry Sales Executive - Professional Services
✅ Insert record Enterprise Application Sales Executive – Cloud Communications (Southeast and Gulf Region)
✅ Insert record Senior Principal NetSuite Consultant -Retail & Retail+ (AFA/HB/FB) - NetSuite ACS
✅ Insert record SDR NetSuite Bilingüe - Telesales Business Development Representative I
✅ Insert record Data Center Technician
✅ Insert record SDR NetSuite Bilingüe - Telesales Business Development Representative I
✅ Insert record PAM Partner Account Manager ISV
✅ Insert record Executivo (a) de Vendas Enterprise/Corporate – Oracle NetSuite - HUNTER
✅ Insert record SDR NetSuite Bilingüe - Telesales Business Development Representative I
✅ Insert record Senior Applications Developer
✅ Insert record Referral - Cloud Finance/PPM/FAH/Tax/RMCS/S&C Functional
✅ Insert re