In [16]:
from dotenv import load_dotenv
load_dotenv()

import difflib
import os
import random
import re

import html2text
import numpy as np
import pandas as pd
import requests
import scrapy
from bs4 import BeautifulSoup
from firecrawl import FirecrawlApp
from scrapy.crawler import CrawlerProcess
from supabase import create_client, Client

# Supabase credentials are read from the environment (load_dotenv() is called
# in the import cell, so a local .env file can provide them).
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_KEY = os.environ["SUPABASE_KEY"]
SERVICE_ROLE_KEY = os.environ["SUPABASE_SERVICE_ROLE_KEY"]
# The client is created with the service-role key, not SUPABASE_KEY.
supabase: Client = create_client(SUPABASE_URL, SERVICE_ROLE_KEY)

# Firecrawl API key: read from the environment like the Supabase secrets so it
# never lives in the notebook. Set FIRECRAWL_API_KEY in .env before running.
# NOTE(review): the key that used to be hard-coded here should be rotated.
firecrawl_api_key = os.environ["FIRECRAWL_API_KEY"]

# Pool of browser User-Agent strings for scraping; get_random_ua() picks one
# of these at random for each outgoing request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.67",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 13; Pixel 7 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; SAMSUNG SM-G998B) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/21.0 Chrome/115.0.0.0 Mobile Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.5790.171 Safari/537.36 OPR/100.0.4815.76",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13.4; rv:109.0) Gecko/20100101 Firefox/109.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
]

In [17]:
def fetch_data(table_name, batch_size=1000, join_fields=None, flatten_joins=False):
    """Download every row of a Supabase table into a DataFrame, page by page.

    Parameters
    ----------
    table_name : str
        Table to read through the module-level ``supabase`` client.
    batch_size : int
        Number of rows requested per ``.range()`` call.
    join_fields : dict[str, list[str]] | None
        Related tables to embed, as ``{related_table: [column, ...]}``. Each
        entry is added to the select clause as ``related_table(col1,col2)``.
    flatten_joins : bool
        When true, every embedded column listed in ``join_fields`` is expanded
        into flat ``<related_table>_<field>`` columns.

    Returns
    -------
    pd.DataFrame
        The fetched rows. An empty DataFrame is returned when the table has no
        rows or when any exception is raised (the error is printed, not
        re-raised).
    """
    try:
        # Build select clause: "*" plus one "table(col,...)" entry per join
        embedded = [
            f"{relation}({','.join(columns)})"
            for relation, columns in (join_fields or {}).items()
        ]
        select_clause = ", ".join(["*"] + embedded)

        rows = []
        offset = 0
        while True:
            query = supabase.table(table_name).select(select_clause)
            page = query.range(offset, offset + batch_size - 1).execute().data

            if not page:
                break

            rows.extend(page)
            offset += batch_size

            # A short page means the table has been exhausted.
            if len(page) < batch_size:
                break

        if not rows:
            print(f"‚ö†Ô∏è `{table_name}` is empty.")
            return pd.DataFrame()

        df = pd.DataFrame(rows)

        # Optional flattening of the embedded (joined) columns
        if flatten_joins and join_fields:
            for relation in join_fields.keys():
                if relation not in df.columns:
                    continue
                flattened = pd.json_normalize(df[relation]).add_prefix(f"{relation}_")
                df = df.drop(columns=[relation]).join(flattened)

        print(f"‚úÖ Successfully fetched `{table_name}` ({len(df)} items).")
        return df

    except Exception as e:
        print(f"‚ùå Error fetching data from '{table_name}': {e}")
        return pd.DataFrame()

# Pull the tables that need no joins
change_log = fetch_data("change_log")
competitors = fetch_data("competitors")

# page_indexing is pulled together with the company_id of its related
# competitor; flattening exposes it as the column `competitors_company_id`.
page_indexing_joins = {"competitors": ["company_id"]}
page_indexing = fetch_data("page_indexing", join_fields=page_indexing_joins, flatten_joins=True)
companies = fetch_data("companies")

‚úÖ Successfully fetched `change_log` (56097 items).
‚úÖ Successfully fetched `competitors` (60 items).
‚úÖ Successfully fetched `page_indexing` (8403 items).
‚úÖ Successfully fetched `companies` (12 items).


## Filter down to active or trial companies

In [18]:
# Ids of the companies whose status is "active" or "trial"
is_active_or_trial = companies["status"].isin(["active", "trial"])
active_companies = companies.loc[is_active_or_trial, ["id"]]

# Keep only the pages whose competitor belongs to one of those companies
belongs_to_active = page_indexing["competitors_company_id"].isin(active_companies["id"])
filtered_page_indexing = page_indexing[belongs_to_active]

print(f"Rows before: {len(page_indexing)}")
print(f"Rows after:  {len(filtered_page_indexing)}")

# From here on `page_indexing` refers to the filtered subset.
page_indexing = filtered_page_indexing

Rows before: 8403
Rows after:  3588


In [24]:
# Sanity check: list the distinct company ids left in `page_indexing`.
page_indexing["competitors_company_id"].unique()


array([23,  7, 66, 71])

## Scrape pages

In [7]:
def get_random_ua() -> str:
    """Return one User-Agent string picked at random from ``USER_AGENTS``."""
    return random.choice(USER_AGENTS)

# Shared Firecrawl client, used by firecrawl_scrape() as the last fallback
# when both Scrapy and plain `requests` fail for a URL.
firecrawl = FirecrawlApp(api_key=firecrawl_api_key)

def firecrawl_scrape(url):
    """Fetch a page's HTML through the module-level ``firecrawl`` client.

    Parameters
    ----------
    url : str
        Page to scrape.

    Returns
    -------
    dict | None
        ``{"html": <html>}`` when the response carries a non-empty ``html``
        entry; ``None`` when the response is empty, has no HTML, or the call
        raises (the exception is printed, not re-raised).
    """
    scrape_params = {"formats": ["html"], "timeout": 15000}

    try:
        result = firecrawl.scrape_url(url, params=scrape_params)
        html = result.get("html") if result else None
        if html:
            return {"html": html}
    except Exception as e:
        print(f"‚ùå Firecrawl exception for {url}: {e}")

    return None

# One dict per processed page; filled by the spider callbacks below.
results = []


class MultiURLSpider(scrapy.Spider):
    """Fetch every URL in ``page_indexing`` and collect the HTML in ``results``.

    Each page is tried with Scrapy first. If Scrapy reports an error, the
    errback falls back to a plain ``requests.get`` and then to Firecrawl.
    Every attempted page yields exactly one entry in the module-level
    ``results`` list with the keys ``page_id``, ``page_url``,
    ``html_content`` and ``status``.
    """

    name = "multi_url_spider"

    @staticmethod
    def _record(page_id, page_url, html_content, status):
        """Append one scrape outcome to the module-level ``results`` list."""
        results.append({
            "page_id": page_id,
            "page_url": page_url,
            "html_content": html_content,
            "status": status
        })

    def start_requests(self):
        """Yield one request per row of ``page_indexing`` that has a usable URL.

        Rows with a missing URL are skipped silently. Rows whose URL is
        rejected by ``scrapy.Request`` are skipped with a message instead of
        letting the exception escape: an exception raised inside this
        generator would stop every later row from being requested.
        """
        for _, row in page_indexing.iterrows():
            page_id = row["id"]
            page_url = row["page_url"]

            if pd.isna(page_url):
                continue

            try:
                request = scrapy.Request(
                    url=page_url,
                    callback=self.parse,
                    errback=self.handle_error,
                    cb_kwargs={"page_id": page_id, "page_url": page_url},
                    headers={"User-Agent": get_random_ua()},
                    meta={"download_timeout": 15, "max_retry_times": 2},
                    # The source table may list the same URL under several
                    # ids; each one still needs its own result row.
                    dont_filter=True
                )
            except (ValueError, TypeError) as e:
                # Raised for URLs without a scheme or non-string values.
                print(f"Skipping {page_id} | invalid URL {page_url!r}: {e}")
                continue

            print(f"‚û°Ô∏è Requesting {page_id} | {page_url}")
            yield request

    def parse(self, response, page_id, page_url):
        """Store the body of a successful Scrapy response."""
        print(f"‚úÖ Scrapy success: {page_url}")
        self._record(page_id, page_url, response.text, "scrapy_success")

    def handle_error(self, failure):
        """Errback: retry the failed URL with ``requests``, then Firecrawl.

        Always records exactly one result for the page; ``html_content`` is
        ``None`` and ``status`` is ``"firecrawl_failed"`` when every fallback
        fails.
        """
        request = failure.request
        page_id = request.cb_kwargs["page_id"]
        page_url = request.cb_kwargs["page_url"]

        print(f"‚ö†Ô∏è Scrapy failed: {page_url} ‚Üí trying requests")

        # ---- Fallback #1: requests ----
        # NOTE(review): this is a synchronous call inside a Scrapy callback;
        # it presumably stalls other in-flight requests while it runs.
        try:
            r = requests.get(
                page_url,
                headers={"User-Agent": get_random_ua()},
                timeout=15
            )

            if r.status_code == 200 and r.text.strip():
                print(f"‚úÖ Requests fallback success: {page_url}")
                self._record(page_id, page_url, r.text, "requests_fallback_success")
                return

            print(f"‚ö†Ô∏è Requests failed ({r.status_code})")

        except Exception as e:
            print(f"‚ö†Ô∏è Requests exception: {e}")

        # ---- Fallback #2: Firecrawl ----
        print(f"üî• Trying Firecrawl: {page_url}")
        firecrawl_result = firecrawl_scrape(page_url)

        if firecrawl_result:
            print(f"‚úÖ Firecrawl success: {page_url}")
            self._record(page_id, page_url, firecrawl_result["html"], "firecrawl_success")
        else:
            print(f"‚ùå Firecrawl failed: {page_url}")
            self._record(page_id, page_url, None, "firecrawl_failed")

# Run spider
# LOG_LEVEL=ERROR keeps Scrapy's own logging quiet so the print() progress
# lines from the spider stay readable.
# NOTE(review): start() presumably blocks until the crawl has finished, which
# is what makes `results` complete before the DataFrame is built — confirm.
process = CrawlerProcess(settings={"LOG_LEVEL": "ERROR"})
process.crawl(MultiURLSpider)
process.start()

# Results list -> one row per attempted page (page_id, page_url, html_content, status)
page_scrape_list = pd.DataFrame(results)
page_scrape_list

‚û°Ô∏è Requesting 8276 | https://docs.nabla.com/guides/intro
‚û°Ô∏è Requesting 8285 | https://docs.nabla.com/guides/intro/
‚û°Ô∏è Requesting 2119 | https://www.happynest.com/locations/new-jersey
‚û°Ô∏è Requesting 8440 | https://docs.nabla.com/server/oauth-generate-server-access-token
‚û°Ô∏è Requesting 8443 | https://docs.nabla.com/guides/api-versioning/changelog-and-upgrades
‚û°Ô∏è Requesting 8446 | https://docs.nabla.com/user/get-generate-note-async
‚û°Ô∏è Requesting 8449 | https://docs.nabla.com/next/user/update-user-dot-phrase
‚û°Ô∏è Requesting 8452 | https://docs.nabla.com/user/update-user-custom-dictionary-expression
‚û°Ô∏è Requesting 8455 | https://docs.nabla.com/server/core-server-api
‚û°Ô∏è Requesting 8458 | https://docs.nabla.com/user/delete-user-dot-phrase
‚úÖ Scrapy success: https://docs.nabla.com/guides/intro
‚ö†Ô∏è Scrapy failed: https://docs.nabla.com/guides/intro/ ‚Üí trying requests
‚ö†Ô∏è Requests failed (404)
üî• Trying Firecrawl: https://docs.nabla.com/guides/intro/

Unnamed: 0,page_id,page_url,html_content,markdown_content,status
0,8276,https://docs.nabla.com/guides/intro,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
1,8285,https://docs.nabla.com/guides/intro/,,,firecrawl_failed
2,8449,https://docs.nabla.com/next/user/update-user-d...,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
3,2119,https://www.happynest.com/locations/new-jersey,"<!doctype html><html lang=""en""><head>\n <me...",,scrapy_success
4,8455,https://docs.nabla.com/server/core-server-api,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
5,8458,https://docs.nabla.com/user/delete-user-dot-ph...,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
6,8443,https://docs.nabla.com/guides/api-versioning/c...,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
7,8452,https://docs.nabla.com/user/update-user-custom...,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
8,8440,https://docs.nabla.com/server/oauth-generate-s...,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success
9,8446,https://docs.nabla.com/user/get-generate-note-...,"<!doctype html>\n<html lang=""en"" dir=""ltr"" cla...",,scrapy_success


## Convert to markdown

In [8]:
# Initialize html2text converter
# A single shared instance is used by extract_text_and_tagline() for every page.
converter = html2text.HTML2Text()
converter.ignore_links = False      # keep hyperlinks in the output
converter.ignore_images = True      # drop images
converter.ignore_tables = False     # keep table content
converter.ignore_emphasis = True    # drop bold/italic markers
converter.body_width = 0            # presumably disables line wrapping — confirm against html2text docs

# Function to extract readable text and tagline from stored HTML content
def extract_text_and_tagline(html_content):
    """Convert one page's HTML into markdown-style text plus its first ``<h1>``.

    Parameters
    ----------
    html_content : str | None
        Raw HTML of a scraped page. Anything that is not a non-blank string
        (``None``, NaN, ``""``, other types) is treated as "no content".

    Returns
    -------
    pd.Series
        Two entries labelled ``readable_text_new`` and ``tagline``:

        * ``readable_text_new`` - the ``<body>`` rendered by the module-level
          ``converter`` after boilerplate tags and empty anchors are removed;
          ``None`` when there is no content, no ``<body>``, or processing
          fails.
        * ``tagline`` - text of the first ``<h1>`` in the document, or
          ``None`` when there is none or processing fails.

    Notes
    -----
    A processing failure is printed and reported as ``None`` text. An error
    message must never be returned as the text: it would differ from the
    stored version, be flagged as a page update, and be written to
    ``change_log`` as if it were page content.
    """
    labels = ["readable_text_new", "tagline"]

    # isinstance is checked first: it covers None/NaN and also keeps
    # array-like values away from a truth test on pd.isna's result.
    if not isinstance(html_content, str) or html_content.strip() == "":
        return pd.Series([None, None], index=labels)

    try:
        soup = BeautifulSoup(html_content, "html.parser")

        # The tagline is read before <header> elements are stripped below,
        # so an <h1> inside a header is still found.
        h1 = soup.find("h1")
        tagline = h1.get_text(strip=True) if h1 else None

        body = soup.body
        if not body:
            return pd.Series([None, tagline], index=labels)

        # Remove common unwanted elements inside body
        for tag in body.find_all(["script", "style", "nav", "footer", "aside", "form", "noscript", "header"]):
            tag.extract()

        # Remove empty or standalone anchor links that cause `[](/)` issues
        for tag in body.find_all("a"):
            if not tag.text.strip():
                tag.extract()

        # Convert to readable markdown-style text
        extracted_text = converter.handle(str(body)).strip()
        extracted_text = extracted_text.replace("[](/)", "").strip()

        return pd.Series([extracted_text, tagline], index=labels)

    except Exception as e:
        print(f"Error processing HTML: {str(e)}")
        # Report "no text" rather than an error string (see Notes).
        return pd.Series([None, None], index=labels)

# Apply transformation to extract readable text and tagline
# extract_text_and_tagline returns a two-entry Series per page, so apply()
# yields two columns that are written back onto page_scrape_list.
page_scrape_list[["readable_text_new", "tagline"]] = page_scrape_list["html_content"].apply(extract_text_and_tagline)
print("‚úÖ Readable text and tagline extraction complete")

‚úÖ Readable text and tagline extraction complete


## Compare and contrast versions

In [10]:
def _norm(s: pd.Series) -> pd.Series:
    """Return ``s`` as strings with missing values blanked and whitespace collapsed."""
    as_text = s.fillna('').astype(str)
    collapsed = as_text.str.replace(r'\s+', ' ', regex=True)
    return collapsed.str.strip()


# --- Parse created_at so rows can be ordered chronologically
change_log['created_at'] = pd.to_datetime(change_log['created_at'], format='mixed', utc=True)

# --- Every page_id with at least one historical row
pages_with_history = set(change_log['page_id'].unique())

# --- Newest row per page (kept even when its readable_text is NaN)
newest_first = change_log.sort_values('created_at', ascending=False)
newest_row_per_page = newest_first.drop_duplicates(subset='page_id', keep='first')
latest_per_page = newest_row_per_page[['page_id', 'readable_text']].rename(
    columns={'readable_text': 'previous_readable_text'}
)

# --- Attach the previous text to each freshly scraped page
updated_change_log = page_scrape_list.merge(latest_per_page, on='page_id', how='left')

# --- Status rules:
# 1) page_id never in change_log                          -> initial_scrape
# 2) page_id exists, new text present and differs from the
#    previous text (whitespace-normalised comparison)      -> update_detected
# 3) anything else                                         -> no_update_detected
has_prev_record = updated_change_log['page_id'].isin(pages_with_history)
has_new_text    = updated_change_log['readable_text_new'].notna()

prev_norm = _norm(updated_change_log['previous_readable_text'])
new_norm  = _norm(updated_change_log['readable_text_new'])
text_changed = new_norm != prev_norm

cond_initial = ~has_prev_record
cond_update  = has_prev_record & has_new_text & text_changed

# np.select takes the first matching condition, so rule 1 wins over rule 2.
updated_change_log['page_status'] = np.select(
    [cond_initial, cond_update],
    ['initial_scrape', 'update_detected'],
    default='no_update_detected'
)

print("Comparing page changes")
print(updated_change_log['page_status'].value_counts())

Comparing page changes
page_status
no_update_detected    9
update_detected       1
Name: count, dtype: int64


## Drop rows where no change detected

In [11]:
# Keep only pages flagged as an initial scrape or as updated.
needs_logging = updated_change_log["page_status"].ne("no_update_detected")
filtered_df = updated_change_log.loc[needs_logging]
updated_change_log = filtered_df
print(f"Changes or initial scraped detected for {len(updated_change_log)} pages")

Changes or initial scraped detected for 1 pages


## Send updates to supabase

In [12]:
batch_size = 500
records_to_insert = []


def _none_if_missing(value):
    """Map pandas missing markers (NaN/None) to ``None`` so they serialise as JSON null."""
    return None if pd.isna(value) else value


def _has_null_byte(value):
    """Return True when ``value`` is a string containing a NUL character."""
    return isinstance(value, str) and '\u0000' in value


def _insert_batch(records):
    """Insert ``records`` into ``change_log`` and print the outcome."""
    response = supabase.table("change_log").insert(records).execute()
    if response.data:
        print(f"‚úÖ Inserted {len(records)} records")
    else:
        print(f"‚ùå Failed to insert {len(records)} records")


for _, row in updated_change_log.iterrows():
    page_id = int(row["page_id"])
    page_status = row["page_status"]
    # NaN is not valid JSON; send a proper null for missing text instead.
    readable_text = _none_if_missing(row["readable_text_new"])
    tagline = _none_if_missing(row["tagline"])

    # Skip the row if \u0000 appears in any text field that is sent. The
    # tagline is scraped text just like readable_text, so it is checked too;
    # one bad value would otherwise presumably fail the whole batch.
    text_fields = (
        ("readable_text", readable_text),
        ("page_status", page_status),
        ("tagline", tagline),
    )
    offending = next((name for name, value in text_fields if _has_null_byte(value)), None)
    if offending:
        print(f"‚ö†Ô∏è Skipping page_id {page_id} due to null byte in {offending}")
        continue

    records_to_insert.append({
        "page_id": page_id,
        "readable_text": readable_text,
        "page_status": page_status,
        "h1_copy": tagline,
        "processed": False,
    })

    # When we hit the batch size, send them
    if len(records_to_insert) >= batch_size:
        _insert_batch(records_to_insert)
        records_to_insert = []

# Insert any remaining records
if records_to_insert:
    _insert_batch(records_to_insert)

‚úÖ Inserted 1 records
