In [330]:
!pip install selenium
!pip install python-dotenv



In [331]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re
import time
import tempfile
from tqdm import tqdm
from pprint import pprint
import google.generativeai as genai
import os
from dotenv import load_dotenv
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [332]:
load_dotenv()

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [333]:
def init_driver(headless=True):
  options = Options()
  if headless:
    options.add_argument("--headless")
  options.add_argument("--disable-gpu")
  options.add_argument("--window-size=1920,1080")
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-dev-shm-usage")

  temp_profile = tempfile.mkdtemp()
  options.add_argument(f"--user-data-dir={temp_profile}")

  driver = webdriver.Chrome(options=options)

  return driver

In [334]:
def detect_footer_year(soup):
    text = soup.get_text(separator=' ')
    years = re.findall(r"\b(19|20)\d{2}\b", text)
    years = [int(y) for y in years if int(y) <= 2025]
    return max(years) if years else None

In [335]:
def detect_last_blog_post(soup):
    dates = []

    for tag in soup.find_all("time"):
        if tag.has_attr("datetime"):
            date = tag["datetime"]
            if re.match(r"\d{4}-\d{2}-\d{2}", date):
                dates.append(date)

    for link in soup.find_all('a', href=True):
        match = re.search(r'(\d{4})[-/](\d{2})[-/](\d{2})', link['href'])
        if match:
            dates.append("-".join(match.groups()))

    return max(dates) if dates else None

In [336]:
def detect_keywords(text, keywords):
  return [kw for kw in keywords if kw.lower() in text.lower()]

In [337]:
def detect_internal_links(soup, substrings):
    matches = []
    for link in soup.find_all("a", href=True):
        href = link["href"].lower()
        if any(sub in href for sub in substrings):
            matches.append(href)
    return matches

In [338]:
TECH_FINGERPRINTS = {
    "HubSpot": ["js.hs-scripts.com", "forms.hsforms.com", "hs-analytics"],
    "Salesforce": ["salesforce.com", "pardot.com"],
    "Marketo": ["mktoweb.com"],
    "Shopify": ["cdn.shopify.com", "Shopify.theme"],
    "WooCommerce": ["/wp-content/plugins/woocommerce"],
    "Magento": ["mage.js", "Magento_"],
    "Google Analytics": ["google-analytics.com", "gtag/js"],
    "Hotjar": ["hotjar.com", "hj.js"],
    "Segment": ["cdn.segment.com"],
    "WordPress": ["/wp-content/", "/wp-includes/"],
    "Wix": ["wix.com", "static.parastorage.com"],
    "Squarespace": ["squarespace.com"],
    "Intercom": ["widget.intercom.io"],
    "Zendesk": ["zendesk.com", "zdassets.com"],
    "Drift": ["js.driftt.com"],
}

In [339]:
def detect_tech_stack(html):
  tech_stack = []
  html_lower = html.lower()
  for tech, fingerprints in TECH_FINGERPRINTS.items():
    if any(f.lower() in html_lower for f in fingerprints):
      tech_stack.append(tech)

  return tech_stack

In [340]:
def scrape_signals(url, driver):
    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)

        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        text = soup.get_text(separator=' ', strip=True)

        growth_keywords = ["careers", "we're hiring", "join our team", "job openings", "now hiring", "open roles"]
        stability_keywords = ["family-owned", "since", "established", "founded in", "legacy", "serving since", "tradition"]

        return {
            "url": url,
            "tech_stack": detect_tech_stack(html),
            "neglect_signals": {
                "footer_year": detect_footer_year(soup),
                "last_blog_post": detect_last_blog_post(soup)
            },
            "stability_signals": (
                detect_keywords(text, stability_keywords) +
                detect_internal_links(soup, ["about", "history", "legacy"])
            ),
            "growth_signals": (
                detect_keywords(text, growth_keywords) +
                detect_internal_links(soup, ["careers", "jobs", "join"])
            )
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return None

In [341]:
def build_gemini_fit_score_prompt(web_signals):
  return f"""
    You are an M&A analyst. Evaluate this business based on web signals.

    Return a single number from 1 to 10 that represents how good of a fit this business is for acquisition (10 = very attractive, 1 = poor target).

    Web signals:
    - Tech Stack: {web_signals['tech_stack']}
    - Neglect Signals: {web_signals['neglect_signals']}
    - Stability Signals: {web_signals['stability_signals']}
    - Growth Signals: {web_signals['growth_signals']}

    Respond with only a single number between 1 and 10.
  """

In [342]:
def build_deal_memo_prompt(data):
  return f"""
    You are an M&A analyst. Based on the following web signals from a company, generate a 2–4 sentence investment memo explaining whether this company could be a good acquisition opportunity. Mention any signs of neglect, stability, or growth, and justify your opinion.

    Here is the data:
    - URL: {data['url']}
    - Tech Stack: {data.get('tech_stack')}
    - Neglect Signals: {data.get('neglect_signals')}
    - Stability Signals: {data.get('stability_signals')}
    - Growth Signals: {data.get('growth_signals')}
    - Fit Score: {data.get('fit_score')}

    Output:
  """

In [343]:
def build_outreach_prompt(data):
  return f"""
    You are a search fund associate who identifies and reaches out to small business owners about potential acquisitions.

    Based on the following web data, write a short, authentic outreach email. Keep it professional, warm, and non-pushy. The email should be 4-6 sentences. Do not sell anything — the goal is to start a conversation.

    Company Info:
    - URL: {data['url']}
    - Tech Stack: {data.get('tech_stack')}
    - Neglect Signals: {data.get('neglect_signals')}
    - Stability Signals: {data.get('stability_signals')}
    - Growth Signals: {data.get('growth_signals')}
    - Fit Score: {data.get('fit_score')}
    - Deal Memo: {data.get('deal_memo')}

    Output just the email body (no subject line or extra text).
  """

In [344]:
def get_fit_score_from_gemini(web_signals):
  prompt = build_gemini_fit_score_prompt(web_signals)

  model = genai.GenerativeModel("gemini-2.5-flash")
  response = model.generate_content(prompt)

  try:
    score = int(response.text.strip())
    return max(1, min(score, 10))
  except Exception as e:
    print("Gemini Error:", e, "Response:", response.text)
    return None

In [345]:
def get_deal_memo_from_gemini(data):
  prompt = build_deal_memo_prompt(data)
  model = genai.GenerativeModel("gemini-2.5-flash")
  response = model.generate_content(prompt)

  return response.text.strip()

In [346]:
def get_outreach_email_from_gemini(data: dict):
    prompt = build_outreach_prompt(data)
    model = genai.GenerativeModel("gemini-2.5-flash")
    response = model.generate_content(prompt)
    return response.text.strip()

In [347]:
def batch_scrape(urls):
  driver = init_driver()
  results = []
  for url in tqdm(urls, desc="Scraping websites"):
    if not url.startswith("http"):
      url = "http://" + url
    result = scrape_signals(url, driver)
    if result is None:
      print(f"Skipping {url}: scrape_signals returned None")
      continue
    try:
        result["fit_score"] = get_fit_score_from_gemini(result)
        result["deal_memo"] = get_deal_memo_from_gemini(result)
        result["outreach_email"] = get_outreach_email_from_gemini(result)
    except Exception as e:
        print(f"Error generating fit score for {url}: {e}")
        result["fit_score"] = None
        result["deal_memo"] = None
        result["outreach_email"] = None
    results.append(result)

  driver.quit()
  return results

In [348]:
input_urls = [
    "http://sportsstation.indonetwork.co.id/",
    "https://mapvisionindo.com/",
    "https://www.belahdoeren.id/",
    "http://mangtani.id/",
    "https://www.caturkartikajaya.co.id/"
]

In [349]:
driver = init_driver()
results = batch_scrape(input_urls)
pprint(results)

Scraping websites:  40%|████      | 2/5 [01:25<02:08, 42.68s/it]ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 4206.58ms
Scraping websites:  80%|████████  | 4/5 [03:04<00:45, 45.65s/it]ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 7380.72ms
Scraping websites: 100%|██████████| 5/5 [03:56<00:00, 47.39s/it]


[{'deal_memo': 'Based on the web signals, Sports Station '
               '(http://sportsstation.indonetwork.co.id/) does not appear to '
               'be a viable acquisition opportunity. The extreme neglect is '
               'evident from the "20" footer year and absence of any recent '
               'blog posts, indicating a dormant or unmaintained online '
               'presence. While it benefits from the foundational stability of '
               'being hosted on the Indonetwork platform, there are no '
               'independent growth signals detected. Combined with a very low '
               'Fit Score of 2, this entity presents as an unattractive target '
               'for acquisition.',
  'fit_score': 2,
  'growth_signals': [],
  'neglect_signals': {'footer_year': 20, 'last_blog_post': None},
  'outreach_email': 'Dear [Owner Name],\n'
                    '\n'
                    'My name is [Your Name] and I lead [Your Search Fund '
                    'Name], a f