In [2]:
import requests
import re
from bs4 import BeautifulSoup
from googlesearch import search
from urllib.parse import urljoin, urlparse

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

def find_website(company_name):
    query = f"{company_name} official website"
    for result in search(query, num_results=5):
        if result.startswith("http"):
            return result
    return None

def extract_emails_from_url(url, visited):
    emails = set()
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract emails from page text
        text = soup.get_text()
        emails.update(re.findall(EMAIL_REGEX, text))

        # Find internal links to crawl (contact/about)
        base_domain = urlparse(url).netloc
        for link in soup.find_all("a", href=True):
            href = link["href"].lower()
            if any(x in href for x in ["contact", "about"]):
                full_url = urljoin(url, link["href"])
                if full_url not in visited and base_domain in full_url:
                    visited.add(full_url)
                    emails.update(extract_emails_from_url(full_url, visited))

    except Exception:
        pass

    return emails

def get_company_emails(company_name):
    website = find_website(company_name)
    if not website:
        return None, set()

    visited = {website}
    emails = extract_emails_from_url(website, visited)
    return website, emails

if __name__ == "__main__":
    company = input("Enter company name: ").strip()
    website, emails = get_company_emails(company)

    if website:
        print(f"\nWebsite: {website}")
    else:
        print("\nWebsite not found")

    if emails:
        print("\nEmails found:")
        for email in emails:
            print(email)
    else:
        print("\nNo emails found")



Website not found

No emails found


In [10]:
!python -m pip install googlesearch-python


Collecting googlesearch-python
  Using cached googlesearch_python-1.3.0-py3-none-any.whl.metadata (3.4 kB)
Using cached googlesearch_python-1.3.0-py3-none-any.whl (5.6 kB)
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.3.0


In [2]:
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

COMMON_TLDS = ["com", "in", "co", "io", "net", "org"]

def generate_domains(company_name):
    base = company_name.lower().replace("&", "and").replace(" ", "")
    return [f"https://{base}.{tld}" for tld in COMMON_TLDS]

def is_valid_site(url):
    try:
        r = requests.get(url, headers=HEADERS, timeout=6)
        return r.status_code == 200
    except:
        return False

def extract_emails(url, visited):
    emails = set()
    try:
        r = requests.get(url, headers=HEADERS, timeout=10)
        soup = BeautifulSoup(r.text, "html.parser")

        emails.update(re.findall(EMAIL_REGEX, soup.get_text()))

        domain = urlparse(url).netloc
        for link in soup.find_all("a", href=True):
            href = link["href"].lower()
            if any(k in href for k in ["contact", "about"]):
                full_url = urljoin(url, link["href"])
                if domain in full_url and full_url not in visited:
                    visited.add(full_url)
                    emails.update(extract_emails(full_url, visited))

    except:
        pass

    return emails

def get_company_emails(company_name):
    for site in generate_domains(company_name):
        if is_valid_site(site):
            visited = {site}
            emails = extract_emails(site, visited)
            return site, emails

    return None, set()

if __name__ == "__main__":
    company = input("Enter company name: ").strip()
    site, emails = get_company_emails(company)

    if site:
        print(f"\nWebsite: {site}")
    else:
        print("\nWebsite not found")

    if emails:
        print("\nEmails found:")
        for e in emails:
            print(e)
    else:
        print("\nNo emails found")



Website: https://tesla.in

No emails found


In [6]:
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

MAX_PAGES = 20   # crawl limit per site
MAX_DEPTH = 2    # link depth limit

def extract_emails(text):
    return set(re.findall(EMAIL_REGEX, text))

def crawl_site(start_url):
    domain = urlparse(start_url).netloc
    visited = set()
    queue = [(start_url, 0)]
    emails = set()

    while queue and len(visited) < MAX_PAGES:
        url, depth = queue.pop(0)
        if url in visited or depth > MAX_DEPTH:
            continue

        visited.add(url)

        try:
            r = requests.get(url, headers=HEADERS, timeout=10)
            soup = BeautifulSoup(r.text, "html.parser")

            # extract emails
            emails.update(extract_emails(soup.get_text()))

            # find all internal links (tabs, menus, footer)
            for link in soup.find_all("a", href=True):
                href = link["href"]
                full_url = urljoin(url, href)
                parsed = urlparse(full_url)

                if parsed.netloc == domain and full_url not in visited:
                    queue.append((full_url, depth + 1))

        except:
            continue

    return emails

def get_company_emails(company_name):
    base = company_name.lower().replace(" ", "").replace("&", "and")
    site = f"https://{base}.com"

    try:
        requests.get(site, headers=HEADERS, timeout=5)
    except:
        return None, set()

    emails = crawl_site(site)
    return site, emails

if __name__ == "__main__":
    company = input("Enter company name: ").strip()
    site, emails = get_company_emails(company)

    if site:
        print(f"\nWebsite: {site}")
    else:
        print("\nWebsite not found")

    if emails:
        print("\nEmails found:")
        for e in emails:
            print(e)
    else:
        print("\nNo emails found")



Website not found

No emails found


In [5]:
import re
from urllib.parse import urljoin, urlparse
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
OBFUSCATED_REGEX = [
    r"([a-zA-Z0-9._%+-]+)\s*\[at\]\s*([a-zA-Z0-9.-]+)\s*\[dot\]\s*([a-zA-Z]{2,})",
    r"([a-zA-Z0-9._%+-]+)\s*\(at\)\s*([a-zA-Z0-9.-]+)\s*\(dot\)\s*([a-zA-Z]{2,})",
]

MAX_PAGES = 25
MAX_DEPTH = 2

def normalize_email(match):
    return f"{match[0]}@{match[1]}.{match[2]}"

def extract_emails(text):
    emails = set(re.findall(EMAIL_REGEX, text))
    for pattern in OBFUSCATED_REGEX:
        for match in re.findall(pattern, text, flags=re.I):
            emails.add(normalize_email(match))
    return emails

def crawl_site(start_url):
    visited = set()
    queue = [(start_url, 0)]
    emails = set()
    domain = urlparse(start_url).netloc

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        while queue and len(visited) < MAX_PAGES:
            url, depth = queue.pop(0)
            if url in visited or depth > MAX_DEPTH:
                continue

            visited.add(url)

            try:
                page.goto(url, timeout=60000)
                page.wait_for_timeout(3000)

                html = page.content()
                soup = BeautifulSoup(html, "html.parser")
                text = soup.get_text(separator=" ")

                emails.update(extract_emails(text))

                for link in soup.find_all("a", href=True):
                    full_url = urljoin(url, link["href"])
                    parsed = urlparse(full_url)

                    if parsed.netloc == domain and full_url not in visited:
                        queue.append((full_url, depth + 1))

            except:
                continue

        browser.close()

    return emails

if __name__ == "__main__":
    website = input("Enter company website (https://...): ").strip()
    emails = crawl_site(website)

    if emails:
        print("\nEmails found:")
        for e in sorted(emails):
            print(e)
    else:
        print("\nNo public emails found")


ModuleNotFoundError: No module named 'playwright'

In [None]:
import re
from urllib.parse import urljoin, urlparse
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

# Regex patterns
EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
OBFUSCATED_PATTERNS = [
    r"([a-zA-Z0-9._%+-]+)\s*\[at\]\s*([a-zA-Z0-9.-]+)\s*\[dot\]\s*([a-zA-Z]{2,})",
    r"([a-zA-Z0-9._%+-]+)\s*\(at\)\s*([a-zA-Z0-9.-]+)\s*\(dot\)\s*([a-zA-Z]{2,})",
]

MAX_PAGES = 30
MAX_DEPTH = 3

def extract_emails(text):
    emails = set(re.findall(EMAIL_REGEX, text))
    for pattern in OBFUSCATED_PATTERNS:
        for m in re.findall(pattern, text, flags=re.I):
            emails.add(f"{m[0]}@{m[1]}.{m[2]}")
    return emails

def crawl_website(start_url):
    visited = set()
    queue = [(start_url, 0)]
    found_emails = set()
    domain = urlparse(start_url).netloc

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        while queue and len(visited) < MAX_PAGES:
            url, depth = queue.pop(0)
            if url in visited or depth > MAX_DEPTH:
                continue

            visited.add(url)

            try:
                page.goto(url, timeout=60000)
                page.wait_for_timeout(2500)

                html = page.content()
                soup = BeautifulSoup(html, "html.parser")
                text = soup.get_text(separator=" ")

                found_emails.update(extract_emails(text))

                for link in soup.find_all("a", href=True):
                    full_url = urljoin(url, link["href"])
                    parsed = urlparse(full_url)

                    if parsed.netloc == domain and full_url not in visited:
                        queue.append((full_url, depth + 1))

            except:
                continue

        browser.close()

    return found_emails

if __name__ == "__main__":
    website = input("Enter website URL (https://...): ").strip()
    emails = crawl_website(website)

    if emails:
        print("\nEmails found:")
        for email in sorted(emails):
            print(email)
    else:
        print("\nNo public emails found")


In [1]:
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque

HEADERS = {
    "User-Agent": "Mozilla/5.0"
}

EMAIL_REGEX = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

OBFUSCATED_REGEX = [
    r"([a-zA-Z0-9._%+-]+)\s*\[at\]\s*([a-zA-Z0-9.-]+)\s*\[dot\]\s*([a-zA-Z]{2,})",
    r"([a-zA-Z0-9._%+-]+)\s*\(at\)\s*([a-zA-Z0-9.-]+)\s*\(dot\)\s*([a-zA-Z]{2,})",
]

MAX_PAGES = 50

def extract_emails(text):
    emails = set(re.findall(EMAIL_REGEX, text))
    for pattern in OBFUSCATED_REGEX:
        for m in re.findall(pattern, text, flags=re.I):
            emails.add(f"{m[0]}@{m[1]}.{m[2]}")
    return emails

def crawl_website(start_url):
    visited = set()
    queue = deque([start_url])
    domain = urlparse(start_url).netloc
    found_emails = set()

    while queue and len(visited) < MAX_PAGES:
        url = queue.popleft()
        if url in visited:
            continue

        visited.add(url)

        try:
            response = requests.get(url, headers=HEADERS, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")

            text = soup.get_text(separator=" ")
            found_emails.update(extract_emails(text))

            for link in soup.find_all("a", href=True):
                full_url = urljoin(url, link["href"])
                parsed = urlparse(full_url)

                if parsed.scheme in ["http", "https"] and parsed.netloc == domain:
                    if full_url not in visited:
                        queue.append(full_url)

        except:
            continue

    return found_emails

if __name__ == "__main__":
    website = input("Enter website URL (https://...): ").strip()
    emails = crawl_website(website)

    if emails:
        print("\nEmails found:")
        for e in sorted(emails):
            print(e)
    else:
        print("\nNo public emails found")



No public emails found


In [1]:
import re
import time
from queue import Queue
from urllib.parse import urljoin, urlparse
from threading import Thread, Lock

import tldextract
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Regex patterns
EMAIL_REGEX = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")
OBFUSCATED_REGEX = re.compile(
    r"([a-zA-Z0-9._%+-]+)\s*(?:\[at\]|\(at\)|at)\s*"
    r"([a-zA-Z0-9.-]+)\s*(?:\[dot\]|\(dot\)|dot|\.)\s*"
    r"([a-zA-Z]{2,})",
    re.IGNORECASE
)

NUM_THREADS = 3  # number of browser threads

def normalize_url(url):
    if not url.startswith("http"):
        url = "https://" + url
    return url.rstrip("/")

def setup_driver():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--window-size=1920,1080")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    )
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )

def worker(queue, visited, emails, lock, base_domain, max_pages):
    driver = setup_driver()
    while True:
        try:
            url = queue.get(timeout=5)
        except:
            break  # queue empty

        with lock:
            if url in visited or len(visited) >= max_pages:
                queue.task_done()
                continue
            visited.add(url)

        print(f"Crawling: {url}")
        try:
            driver.get(url)
            time.sleep(2)  # allow JS
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # Extract emails
            found_emails = set(EMAIL_REGEX.findall(html))
            for match in OBFUSCATED_REGEX.findall(html):
                found_emails.add(f"{match[0]}@{match[1]}.{match[2]}")

            with lock:
                emails.update(found_emails)

            # Find internal links
            for a in soup.find_all("a", href=True):
                link = urljoin(url, a["href"])
                parsed = urlparse(link)
                if parsed.scheme not in ("http", "https"):
                    continue
                domain = tldextract.extract(link).registered_domain
                if domain == base_domain:
                    clean_link = link.split("#")[0].rstrip("/")
                    with lock:
                        if clean_link not in visited:
                            queue.put(clean_link)

        except Exception as e:
            pass
        finally:
            queue.task_done()

    driver.quit()

def crawl_website_multithread(start_url, max_pages=10):
    start_url = normalize_url(start_url)
    base_domain = tldextract.extract(start_url).registered_domain
    visited = set()
    emails = set()
    lock = Lock()
    queue = Queue()
    queue.put(start_url)

    threads = []
    for _ in range(NUM_THREADS):
        t = Thread(target=worker, args=(queue, visited, emails, lock, base_domain, max_pages))
        t.daemon = True
        t.start()
        threads.append(t)

    queue.join() 

    return sorted(emails), len(visited)

if __name__ == "__main__":
    website_input = input("Enter website URL (example.com): ").strip()
    print("\nStarting multithreaded crawl...\n")
    emails, pages = crawl_website_multithread(website_input)

    print("\nCrawl completed.")
    print(f"Pages crawled: {pages}")
    if emails:
        print("\nEmails found:")
        for e in emails:
            print(e)
    else:
        print("\nNo emails found on the website.")



Starting multithreaded crawl...



  base_domain = tldextract.extract(start_url).registered_domain


Crawling: https://tesla.com


  domain = tldextract.extract(link).registered_domain


Crawling: https://www.tesla.com//support/browser-support
Crawling: https://shop.tesla.com
Crawling: https://tesla.com/support
Crawling: https://tesla.com/teslaaccount
Crawling: https://tesla.com/models
Crawling: https://tesla.com/models/design
Crawling: https://tesla.com/model3
Crawling: https://tesla.com/model3/design
Crawling: https://tesla.com/modely

Crawl completed.
Pages crawled: 10

Emails found:
Homepage-Fe@ures-Desktop.png
Homepage-Fe@ures-Mobile.png
Homepage-Fe@ures-Tablet.png
apigateway-pricing-g@eway.tesla.com
ch@-loader.js
cua-ch@-ui.tesla.com
loc@ion-script.js
window.loc@ion.href
window.navig@or.cookieEnabled


In [1]:
import requests
import re
from bs4 import BeautifulSoup

# User se URL input lena
url = input("Enter the URL to scrape emails from: ")

try:
    response = requests.get(url)
    response.raise_for_status()  # Check if request was successful
except requests.exceptions.RequestException as e:
    print(f"Error fetching URL: {e}")
    exit()

html_content = response.text

# Parse HTML
soup = BeautifulSoup(html_content, "html.parser")

# Get text only
text = soup.get_text()

# Regex for email extraction
emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text)

# Remove duplicates
unique_emails = set(emails)

if unique_emails:
    print("Emails Found:")
    for email in unique_emails:
        print(email)
else:
    print("No emails found on this page.")


Emails Found:
care@stylo.pkCustomer


In [6]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

headers = {
    "User-Agent": "Mozilla/5.0"
}

def is_internal_link(base_url, link):
    return urlparse(base_url).netloc == urlparse(link).netloc

def extract_emails_from_site(base_url, max_pages=20):
    visited = set()
    to_visit = [base_url]
    found_emails = set()

    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:
            continue

        visited.add(url)

        try:
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
        except:
            continue

        text = soup.get_text()
        emails = re.findall(
            r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
            text
        )
        found_emails.update(emails)

        # Collect internal links (tabs/pages)
        for a in soup.find_all("a", href=True):
            link = urljoin(base_url, a["href"])
            if is_internal_link(base_url, link) and link not in visited:
                to_visit.append(link)

    return found_emails

# Excel Input
df = pd.read_excel("companies.xlsx")
df["Emails_Found"] = ""

for index, row in df.iterrows():
    url = row["Company_URL"]
    print(f"Crawling full site: {url}")

    emails = extract_emails_from_site(url)

    if emails:
        df.at[index, "Emails_Found"] = ", ".join(emails)
    else:
        df.at[index, "Emails_Found"] = "No email found"

df.to_excel("companies_with_emails.xlsx", index=False)

print("Full website crawling completed.")


Crawling full site: https://www.gnu.org
Crawling full site: https://www.python.org
Crawling full site: https://www.apache.org
Crawling full site: https://www.linuxfoundation.org
Crawling full site: https://www.wikipedia.org
Crawling full site: https://www.mozilla.org
Crawling full site: https://www.djangoproject.com


  k = self.parse_starttag(i)


Crawling full site: https://www.postgresql.org
Crawling full site: https://www.mysql.com
Crawling full site: https://www.php.net
Full website crawling completed.
