# Web Scraping Smart Phones From 2 Diffrent Websities

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from datetime import datetime
import time
import re

In [10]:
# Setup Chrome driver
chrome_options = Options()
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [11]:
all_products = []

# Search for phones
phones = ["apple phone", "Samsung phone"]

# Extract specs from product name
def get_specs(name):
    specs = {'storage': 'N/A', 'color': 'N/A', 'model': 'N/A'}
    
    # Get storage
    storage_match = re.search(r'(\d+)\s*GB', name)
    if storage_match:
        specs['storage'] = storage_match.group(1) + 'GB'
    
    # Get color
    colors = ['black', 'white', 'blue', 'red', 'gold', 'silver', 'purple']
    for color in colors:
        if color in name.lower():
            specs['color'] = color
            break
    
    # Get model
    if 'iPhone' in name:
        model_match = re.search(r'iPhone\s*(\d+\s*\w*)', name)
        if model_match:
            specs['model'] = model_match.group(1)
    elif 'Samsung' in name:
        model_match = re.search(r'Samsung\s*Galaxy\s*(\w+)', name)
        if model_match:
            specs['model'] = model_match.group(1)
    
    return specs

In [12]:
# Scrape Amazon

# start fresh for this run
all_products = []

for phone in phones:
    query = phone.replace(' ', '+')
    driver.get(f"https://www.amazon.eg/-/en/s?k={query}")
    time.sleep(2)
    page_num = 1

    while True:
        # get all product elements on the current page (use same XPath)
        products = driver.find_elements(By.XPATH, "//*[@data-component-type='s-search-result']")
        if not products:
            break

        for position, product in enumerate(products, start=1):
            try:
                # use relative XPaths scoped to product element
                try:
                    name = product.find_element(By.XPATH, ".//h2//span").text
                except:
                    name = "N/A"

                # price extraction (robust)
                try:
                    price_whole = product.find_element(By.XPATH, ".//span[contains(@class,'a-price-whole')]").text
                    price_frac = product.find_element(By.XPATH, ".//span[contains(@class,'a-price-fraction')]").text
                    price = f"{price_whole}{price_frac}"
                except:
                    try:
                        price = product.find_element(By.XPATH, ".//span[contains(@class,'a-offscreen')]").text
                    except:
                        price = "N/A"

                # url
                try:
                    url = product.find_element(By.XPATH, ".//h2//a").get_attribute("href")
                except:
                    url = "N/A"

                # asin (data attribute on the product element)
                asin = product.get_attribute("data-asin") or "N/A"

                # badges
                badges = []
                try:
                    badge_elements = product.find_elements(By.XPATH, ".//span[contains(@class,'a-badge-text')]")
                    for b in badge_elements:
                        t = b.text.strip()
                        if t:
                            badges.append(t)
                except:
                    pass

                # prime detection
                is_prime = False
                try:
                    if product.find_elements(By.XPATH, ".//i[contains(@aria-label,'Prime')]") or product.find_elements(By.XPATH, ".//span[contains(text(),'Prime')]"):
                        is_prime = True
                except:
                    is_prime = False

                # rating and review count
                try:
                    rating = product.find_element(By.XPATH, ".//span[contains(@class,'a-icon-alt')]").text
                except:
                    rating = "N/A"
                try:
                    review_count = product.find_element(By.XPATH, ".//span[contains(@class,'a-size-base') and (contains(@class,'s-underline-text') or contains(@class,'a-size-base'))]").text
                except:
                    # different layouts: try anchor with reviews
                    try:
                        review_count = product.find_element(By.XPATH, ".//a[contains(@href,'/product-reviews/') or contains(@href,'/gp/customer-reviews/')]").text
                    except:
                        review_count = "N/A"

                # parse specs from name (uses your existing get_specs)
                specs = get_specs(name)

                # cleaned numeric price if possible
                price_numeric = None
                try:
                    # normalize comma to dot then strip non-numeric except dot
                    cleaned = re.sub(r'[^\d.]', '', price.replace(',', '.'))
                    if cleaned:
                        price_numeric = float(cleaned)
                except:
                    price_numeric = None

                all_products.append({
                    "scrape_time": datetime.utcnow().isoformat(),
                    "brand": phone,
                    "page": page_num,
                    "position_on_page": position,
                    "asin": asin,
                    "name": name,
                    "model": specs.get('model', 'N/A'),
                    "storage": specs.get('storage', 'N/A'),
                    "color": specs.get('color', 'N/A'),
                    "price_raw": price,
                    "price_numeric": price_numeric,
                    "currency": "EGP",  # assume Egyptian site
                    "rating": rating,
                    "review_count": review_count,
                    "is_prime": is_prime,
                    "badges": ", ".join(badges) if badges else "None",
                    "url": url,
                    "website": "Amazon"
                })
            except Exception:
                # skip problematic product entries
                pass

        # try to go to next page for the current phone
        try:
            # find the "Next" pagination control (works for the modern Amazon layout)
            next_btn = driver.find_element(By.XPATH, "//a[contains(@class,'s-pagination-next') or contains(@aria-label,'Next')]")
            aria_disabled = next_btn.get_attribute("aria-disabled")
            if aria_disabled and aria_disabled.lower() == "true":
                break
            # click and wait for new page to load
            next_btn.click()
            page_num += 1
            time.sleep(2)
        except Exception:
            # no next button or can't click -> stop paginating for this phone
            break
# number of products scraped from Amazon
print(f"Scraped {len(all_products)} products from Amazon.")

  "scrape_time": datetime.utcnow().isoformat(),


Scraped 520 products from Amazon.


In [13]:
# Scrape Noon (append results to existing all_products). Uses XPath selectors.
# Requires: driver, phones, get_specs, re, datetime, time already defined in notebook.

for phone in phones:
    query = phone.replace(' ', '+')
    driver.get(f"https://www.noon.com/egypt-en/search?q={query}")
    time.sleep(2)
    page_num = 1

    while True:
        # broad XPath to capture product cards across different Noon layouts
        products = driver.find_elements(
            By.XPATH,
            "//div[@data-qa='product-item'] | //div[contains(@class,'productCard')] | //div[contains(@class,'product')] | //div[contains(@data-qa,'product-card')] | //a[contains(@href,'/product/') or contains(@href,'/products/')]"
        )
        if not products:
            break

        for position, product in enumerate(products, start=1):
            try:
                # name (try a few likely relative XPaths)
                try:
                    name = product.find_element(By.XPATH,
                        ".//h3//span | .//a//span | .//div[contains(@class,'productName')]//span | .//span[@data-qa='product-name']"
                    ).text
                except:
                    name = "N/A"

                # price (look for price spans)
                try:
                    price = product.find_element(By.XPATH,
                        ".//span[contains(@data-qa,'price')] | .//span[contains(@class,'price')] | .//div[contains(@class,'price')]/span"
                    ).text
                except:
                    price = "N/A"

                # url (anchor inside product card)
                try:
                    url = product.find_element(By.XPATH,
                        ".//a[contains(@href,'/product') or contains(@href,'/products') or contains(@class,'productLink')]"
                    ).get_attribute("href")
                except:
                    url = "N/A"

                # product id (noon often exposes data attributes)
                product_id = product.get_attribute("data-product-id") or product.get_attribute("data-sku") or product.get_attribute("data-id") or "N/A"

                # badges / labels (e.g., "Best Seller", discounts)
                badges = []
                try:
                    badge_elements = product.find_elements(By.XPATH,
                        ".//span[contains(@class,'badge')] | .//span[contains(@data-qa,'badge')] | .//div[contains(@class,'badge')]"
                    )
                    for b in badge_elements:
                        t = b.text.strip()
                        if t:
                            badges.append(t)
                except:
                    pass

                # express/noon delivery detection
                is_express = False
                try:
                    if product.find_elements(By.XPATH,
                        ".//span[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'noon express')] | .//span[contains(.,'Express')]"
                    ):
                        is_express = True
                except:
                    is_express = False

                # rating and review count (if present)
                try:
                    rating = product.find_element(By.XPATH,
                        ".//span[contains(@class,'rating') or contains(@data-qa,'rating')]"
                    ).text
                except:
                    rating = "N/A"
                try:
                    review_count = product.find_element(By.XPATH,
                        ".//span[contains(@class,'reviews') or contains(@data-qa,'reviews') or contains(@class,'review-count')]"
                    ).text
                except:
                    review_count = "N/A"

                # parse specs from name (uses existing get_specs)
                specs = get_specs(name)

                # cleaned numeric price if possible
                price_numeric = None
                try:
                    cleaned = re.sub(r'[^\d.]', '', price.replace(',', '.'))
                    if cleaned:
                        price_numeric = float(cleaned)
                except:
                    price_numeric = None

                all_products.append({
                    "scrape_time": datetime.utcnow().isoformat(),
                    "brand": phone,
                    "page": page_num,
                    "position_on_page": position,
                    "asin": product_id,               # use product id as asin-equivalent
                    "name": name,
                    "model": specs.get('model', 'N/A'),
                    "storage": specs.get('storage', 'N/A'),
                    "color": specs.get('color', 'N/A'),
                    "price_raw": price,
                    "price_numeric": price_numeric,
                    "currency": "EGP",                # assuming Egyptian noon site
                    "rating": rating,
                    "review_count": review_count,
                    "is_prime": is_express,           # treat as express delivery flag
                    "badges": ", ".join(badges) if badges else "None",
                    "url": url,
                    "website": "Noon"
                })
            except Exception:
                # skip problematic entries
                pass

        # try to go to next page
        try:
            next_btn = driver.find_element(By.XPATH,
                "//a[@rel='next'] | //button[contains(@aria-label,'Next')] | //a[contains(@class,'pagination-next')] | //button[contains(@class,'next')]"
            )
            aria_disabled = next_btn.get_attribute("aria-disabled")
            if aria_disabled and aria_disabled.lower() == "true":
                break
            # click next (works for link or button)
            try:
                next_btn.click()
            except:
                driver.execute_script("arguments[0].click();", next_btn)
            page_num += 1
            time.sleep(2)
        except Exception:
            break

print(f"Total products after Noon scrape: {len(all_products)}")

# Close the driver
driver.quit()

Total products after Noon scrape: 520


In [14]:
# Save the raw scraped data for market analysis
df_raw = pd.DataFrame(all_products)
df_raw.to_csv('Raw_Smart_phone_Data.csv', index=False)
print(f"Saved {len(df_raw)} raw products to Raw Smart phone Data.csv")

Saved 520 raw products to Raw Smart phone Data.csv


# EDA For Scraping Products

In [15]:
df_raw.head()

Unnamed: 0,scrape_time,brand,page,position_on_page,asin,name,model,storage,color,price_raw,price_numeric,currency,rating,review_count,is_prime,badges,url,website
0,2025-10-20T11:04:45.502098,apple phone,1,1,B09G9DHS68,Apple,,,,3044400,30.444,EGP,,Apple,False,,,Amazon
1,2025-10-20T11:04:46.403054,apple phone,1,2,B09G9BJD28,Apple,,,,3044400,30.444,EGP,,Apple,False,Save 5%,,Amazon
2,2025-10-20T11:04:46.633482,apple phone,1,3,B0DGJ2FB85,Apple,,,,7575000,75.75,EGP,,Apple,False,,,Amazon
3,2025-10-20T11:04:47.003698,apple phone,1,4,B0CHXNZPP1,Apple,,,,5444400,54.444,EGP,,Apple,False,,,Amazon
4,2025-10-20T11:04:47.214114,apple phone,1,5,B0DXQHPY34,Apple,,,,4999900,49.999,EGP,,Apple,False,,,Amazon


In [16]:
df_raw.shape

(520, 18)