In [1]:
import requests
import urllib.robotparser
from bs4 import BeautifulSoup
import random
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service 
import pandas as pd
import numpy as np
from datetime import datetime
from selenium import webdriver
import streamlit as st

# Section 1: robots.txt sitemap and crawl-delay
print("📄 Checking sitemap and crawl-delay in robots.txt")
print("=" * 60)

url = "https://www.amazon.eg/robots.txt"
response = requests.get(url)

found = False
lines = response.text.splitlines()
for line in lines:
    line = line.strip()
    if line.lower().startswith("sitemap:") or line.lower().startswith("crawl-delay"):
        print(line)
        found = True

if not found:
    print("No 'Sitemap' or 'Crawl-delay' directives found in robots.txt.")

print("=" * 60)

# Section 2: Crawl permission check
print("🔍 Checking crawl permissions for selected URLs")
print("=" * 60)

# Initialize and read robots.txt from the website
robots_url = "https://www.amazon.eg/robots.txt"
rp = urllib.robotparser.RobotFileParser()
rp.set_url(robots_url)
rp.read()

# Define user-agent and test URLs
user_agent = "*"
test_urls = [
    "https://www.amazon.eg/gp/cart",
    "https://www.amazon.eg/gp/help/customer/display.html",
    "https://www.amazon.eg/-/en/ref=nav_logo",
    "https://www.amazon.eg/wishlist/universal",
    "https://www.amazon.eg/exec/obidos/account-access-login"
]

# Check permission for each URL
for url in test_urls:
    allowed = rp.can_fetch(user_agent, url)
    print(f"Crawling allowed for {url}? {'✅ YES' if allowed else '❌ NO'}")

print("=" * 60)

url = "https://www.amazon.com/s?k=playstation+4"
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Accept-Language': 'en-US,en;q=0.9'
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Look for product cards
products = soup.find_all("div", {"data-component-type": "s-search-result"})
print(f"Found {len(products)} product blocks (no JS).")

print("=" * 60)


base_url = "https://www.amazon.eg"

# Common RSS and feed endpoints to test
potential_feeds = [
    "/rss",
    "/feed",
    "/feeds",
    "/rss/new",
    "/feeds/latest",
    "/blog/rss",
]

print("🔎 Checking for RSS or Feed URLs:\n")
for path in potential_feeds:
    full_url = base_url + path
    response = requests.get(full_url)
    status = response.status_code
    print(f"{full_url} → Status: {status}")
    if status == 200:
        print("✅ Feed Found:", full_url)
    else:
        print("❌ Not Found")
    print("-" * 60)

# Inspect homepage for <link rel="alternate"> RSS hints
print("\n🔍 Scanning homepage for RSS <link> tags...\n")
home_response = requests.get(base_url)
soup = BeautifulSoup(home_response.text, "html.parser")

rss_links = soup.find_all("link", type="application/rss+xml")

if rss_links:
    for link in rss_links:
        print("✅ RSS Feed:", link.get("href"))
else:
    print("❌ No RSS feed link tags found on homepage.")

# BONUS: Scan for open API hints (very basic)
print("\n🧪 Scanning homepage for common API-related keywords...\n")
keywords = ["api", "graphql", "/v1/", "/v2/", "endpoint", "feed"]
for keyword in keywords:
    if keyword.lower() in home_response.text.lower():
        print(f"🔗 Possible API mention: {keyword}")

        
print("-" * 60)
   




📄 Checking sitemap and crawl-delay in robots.txt
No 'Sitemap' or 'Crawl-delay' directives found in robots.txt.
🔍 Checking crawl permissions for selected URLs
Crawling allowed for https://www.amazon.eg/gp/cart? ❌ NO
Crawling allowed for https://www.amazon.eg/gp/help/customer/display.html? ✅ YES
Crawling allowed for https://www.amazon.eg/-/en/ref=nav_logo? ❌ NO
Crawling allowed for https://www.amazon.eg/wishlist/universal? ❌ NO
Crawling allowed for https://www.amazon.eg/exec/obidos/account-access-login? ❌ NO
Found 16 product blocks (no JS).
🔎 Checking for RSS or Feed URLs:

https://www.amazon.eg/rss → Status: 404
❌ Not Found
------------------------------------------------------------
https://www.amazon.eg/feed → Status: 404
❌ Not Found
------------------------------------------------------------
https://www.amazon.eg/feeds → Status: 404
❌ Not Found
------------------------------------------------------------
https://www.amazon.eg/rss/new → Status: 404
❌ Not Found
-----------------------

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random
from datetime import datetime

# ----------------- Scraping helper functions ------------------

def get_title(soup):
    try:
        title = soup.find("span", id="productTitle")
        return title.get_text(strip=True) if title else "N/A"
    except:
        return "N/A"

def get_price(soup):
    try:
        price = soup.find("span", class_="a-price-whole")
        return price.get_text(strip=True) if price else "N/A"
    except:
        return "N/A"

def get_rating(soup):
    try:
        rating = soup.find("span", class_="a-icon-alt")
        return rating.get_text(strip=True) if rating else "N/A"
    except:
        return "N/A"

def get_review_count(soup):
    try:
        reviews = soup.find("span", id="acrCustomerReviewText")
        return reviews.get_text(strip=True) if reviews else "N/A"
    except:
        return "N/A"

def get_availability(soup):
    try:
        availability = soup.find("div", id="availability")
        return availability.get_text(strip=True) if availability else "N/A"
    except:
        return "N/A"

def get_image_url(soup):
    try:
        img = soup.find("img", id="landingImage")
        if img and img.has_attr('src'):
            return img['src']
        wrapper = soup.find("div", id="imgTagWrapperId")
        if wrapper:
            img = wrapper.find("img")
            if img and img.has_attr('src'):
                return img['src']
        return "N/A"
    except:
        return "N/A"

# ----------------- Main Execution ------------------

if __name__ == '__main__':

    # Add a realistic user-agent
    HEADERS = {
        'User-Agent': random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/114.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15"
        ]),
        'Accept-Language': 'en-US,en;q=0.5'
    }

    URL = "https://www.amazon.com/s?k=playstation+4"

    print("🔍 Fetching product search results...")
    webpage = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Get product links
    links = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    links_list = ["https://www.amazon.com" + link.get('href') for link in links if link.get('href')]

    print(f"🔗 Found {len(links_list)} product links")

    # Prepare storage dictionary
    d = {
        "Title": [],
        "Price": [],
        "Rating": [],
        "Reviews": [],
        "Availability": [],
        "Image URL": [],
        "Link": []
    }

    # Fetch each product page
    for i, link in enumerate(links_list[:100]):  # Limit to first 100 links for demo
        print(f"📦 Scraping product {i+1}: {link}")
        try:
            product_page = requests.get(link, headers=HEADERS)
            product_soup = BeautifulSoup(product_page.content, "html.parser")

            d['Title'].append(get_title(product_soup))
            d['Price'].append(get_price(product_soup))
            d['Rating'].append(get_rating(product_soup))
            d['Reviews'].append(get_review_count(product_soup))
            d['Availability'].append(get_availability(product_soup))
            d['Image URL'].append(get_image_url(product_soup))
            d['Link'].append(link)

            time.sleep(random.uniform(1.5, 3))  # Be respectful with delay

        except Exception as e:
            print(f"❌ Failed to scrape {link}: {e}")
            continue

    # Save to Excel
    df = pd.DataFrame.from_dict(d)
    df['Title'].replace('', np.nan, inplace=True)
    df = df.dropna(subset=['Title'])

    filename = f"amazon_products_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
    df.to_excel(filename, index=False, engine='openpyxl')

    print(f"\n✅ Saved {len(df)} products to {filename}")


🔍 Fetching product search results...
🔗 Found 27 product links
📦 Scraping product 1: https://www.amazon.com/PlayStation-4-Slim-1TB-Console/dp/B071CV8CG2/ref=sr_1_1?dib=eyJ2IjoiMSJ9.DYCLh1lfDEqmNIOXcI70zR4V_RUUqtnwnbWGRpMJuZ5YryuywpXRHy1qVrPH-je4bL5Qmn-eMntZUpKwciLRF6ooywVX8dow1DNlSu4aZtU_WloTfckukcvLMDkTsuTLNnVVuEvItBNWsy_2tQXHLj17s6RGi_733-t3h68OXea2WC9fzFEFM8DnyCnBD4WbK4WwJXtAB3eO5Tlm1phFxCoS5Opgzf_dfpKNNCjW6kg.EVa1UzkGXvpg1yD5fckOC0p9htF66SdCFHoUn1kfRHc&dib_tag=se&keywords=playstation+4&qid=1747872596&sr=8-1
📦 Scraping product 2: https://www.amazon.com/Sony-Playstation-PS4-Black-Console/dp/B012CZ41ZA/ref=sr_1_2?dib=eyJ2IjoiMSJ9.DYCLh1lfDEqmNIOXcI70zR4V_RUUqtnwnbWGRpMJuZ5YryuywpXRHy1qVrPH-je4bL5Qmn-eMntZUpKwciLRF6ooywVX8dow1DNlSu4aZtU_WloTfckukcvLMDkTsuTLNnVVuEvItBNWsy_2tQXHLj17s6RGi_733-t3h68OXea2WC9fzFEFM8DnyCnBD4WbK4WwJXtAB3eO5Tlm1phFxCoS5Opgzf_dfpKNNCjW6kg.EVa1UzkGXvpg1yD5fckOC0p9htF66SdCFHoUn1kfRHc&dib_tag=se&keywords=playstation+4&qid=1747872596&sr=8-2
📦 Scraping product 3: http

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Title'].replace('', np.nan, inplace=True)



✅ Saved 27 products to amazon_products_20250522_031230.xlsx


In [3]:
import gradio as gr
import requests
import urllib.robotparser
from bs4 import BeautifulSoup
import pandas as pd
from io import BytesIO

# --- Crawlability analyzer function ---
def analyze_site(base_url):
    try:
        # 1. Robots.txt checks
        robots_url = base_url.rstrip('/') + "/robots.txt"
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()

        # Get crawl-delay and sitemap from robots.txt text
        rtxt = requests.get(robots_url).text
        crawl_delay = None
        sitemap_url = None
        for line in rtxt.splitlines():
            line = line.strip().lower()
            if line.startswith("crawl-delay:"):
                try:
                    crawl_delay = float(line.split(":")[1].strip())
                except:
                    crawl_delay = None
            if line.startswith("sitemap:"):
                sitemap_url = line.split(":", 1)[1].strip()

        # Test URLs for crawl permission (some common Amazon paths)
        test_urls = [
            "/gp/cart",
            "/gp/help/customer/display.html",
            "/wishlist/universal",
            "/exec/obidos/account-access-login"
        ]

        allowed_count = 0
        for path in test_urls:
            full_url = base_url.rstrip('/') + path
            if rp.can_fetch("*", full_url):
                allowed_count += 1
        crawlability_score = allowed_count / len(test_urls) * 100

        # Format outputs
        crawl_info = f"**Crawlability Score:** {crawlability_score:.1f}%  \n"
        crawl_info += f"**Crawl-delay:** {crawl_delay if crawl_delay else 'None'} seconds  \n"
        crawl_info += f"**Sitemap URL:** {sitemap_url if sitemap_url else 'No sitemap found.'}"

        # Recommendations
        recommendations = []
        if crawl_delay:
            recommendations.append(f"Crawl-delay detected: {crawl_delay} seconds — consider respecting this in your crawler.")
        else:
            recommendations.append("No crawl-delay found — you can crawl faster but stay polite!")

        if crawlability_score == 100:
            recommendations.append("All tested URLs are crawlable. You can use lightweight tools like requests + BeautifulSoup.")
        elif crawlability_score >= 50:
            recommendations.append("Partial crawlability. Consider using Scrapy with rate limiting.")
        else:
            recommendations.append("Low crawlability — consider using Selenium with human-like browsing to avoid blocks.")

        rec_text = "\n".join(recommendations)

        return crawl_info, pd.DataFrame(columns=["Title", "Price", "Rating"]), rec_text, []
    except Exception as e:
        return f"Error during analysis: {e}", pd.DataFrame(columns=["Title", "Price", "Rating"]), "", []

# --- Load Excel Data function ---
def load_excel_data():
    file_path = r"C:\Users\btech\Desktop\ir\amazon_products_20250522_025027.xlsx"
    try:
        df = pd.read_excel(file_path, engine='openpyxl')
        df_subset = df[["Title", "Price", "Rating"]].head(20)
        status = f"✅ Loaded {len(df_subset)} products from Excel."

        # Load images
        images = []
        if 'Image URL' in df.columns:
            for _, row in df.head(10).iterrows():
                image_url = row.get('Image URL', '')
                title = str(row.get('Title', 'No Title'))
                price = str(row.get('Price', 'N/A'))
                rating = str(row.get('Rating', 'N/A'))

                if pd.notna(image_url) and image_url != "N/A":
                    caption = f"{title}\n💲{price} | ⭐{rating}"
                    images.append((image_url, caption))  # ✅ Use URL directly

        return status, df_subset, "", images
    except Exception as e:
        return f"❌ Failed to load Excel file: {e}", pd.DataFrame(columns=["Title", "Price", "Rating"]), "", []


# --- Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("<h1 style='text-align:center; color:#3f51b5;'>🌐 Website Crawlability Analyzer + Excel Product Viewer</h1>")

    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("## Crawlability Analysis")
            base_url = gr.Textbox(label="Enter Base URL (e.g. https://www.amazon.com)", value="https://www.amazon.com")
            analyze_btn = gr.Button("Analyze Site Live", variant="primary")
            crawl_info = gr.Markdown(label="Crawlability Info")
            recs = gr.Markdown(label="Recommendations")

        with gr.Column(scale=3):
            gr.Markdown("## Product Details from Excel")
            load_btn = gr.Button("Load Products from Excel", variant="secondary")
            load_status = gr.Markdown()
            product_table = gr.Dataframe(headers=["Title", "Price", "Rating"], interactive=False)
            image_gallery = gr.Gallery(label="Product Images", columns=2)

    # Button connections
    analyze_btn.click(fn=analyze_site, inputs=base_url, outputs=[crawl_info, product_table, recs, image_gallery])
    load_btn.click(fn=load_excel_data, inputs=[], outputs=[load_status, product_table, recs, image_gallery])

demo.launch()


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


