In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Initialize WebDriver
# Chrome is configured through command-line switches; they are applied in the
# order listed so the resulting options object matches a hand-written sequence
# of add_argument() calls.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # Run in headless mode (no browser window)
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # Single switch: the three literals below are concatenated by Python.
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/112.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# webdriver_manager resolves (installing if needed) the chromedriver binary
# and returns its path, which is handed to the Selenium service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)

# eBay iPhone Listings URL
# Search-results URL with the page number left open: scrape_ebay() appends
# str(page) directly after the trailing "_pgn=".
# NOTE(review): presumably "_nkw" is the search keyword and "_ipg" the
# requested items per page; the recorded runs found 62 's-item' elements per
# page, so confirm how eBay actually treats "_ipg=50".
base_url = "https://www.ebay.com/sch/i.html?_nkw=iphone&_ipg=50&_pgn="

# Function to Scrape a Single Page
def scrape_ebay(page):
    """Scrape one eBay search-results page into a list of row dicts.

    Loads ``base_url + str(page)`` in the module-level ``driver``, waits for
    elements with class ``s-item`` to appear, and extracts one record per
    matching element.

    Args:
        page: Page number appended to ``base_url`` (converted with ``str``).

    Returns:
        A list of dicts with the keys ``'Product Name'``, ``'Price'``,
        ``'Ratings'``, ``'Reviews'`` and ``'Availability'``. All values are
        strings. A field that cannot be read falls back to ``"N/A"`` (name)
        or ``"0"`` (price, rating, reviews), so a missing value is not
        distinguishable from a genuine zero downstream. Returns ``[]`` when
        no ``s-item`` element appears within the 10 second wait.

    Raises:
        Any Selenium error other than the wait timing out (for example a
        failure in ``driver.get``) propagates to the caller.
    """
    # Imported locally so this definition is self-contained within its cell.
    from selenium.common.exceptions import TimeoutException

    url = base_url + str(page)
    driver.get(url)
    time.sleep(5)  # Wait for page to load

    try:
        # Wait for product elements to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 's-item'))
        )
    except TimeoutException:
        # Only a genuine timeout is reported as "no products"; other failures
        # are real errors and must not be mislabelled or hidden.
        print(f"Timeout: No products found on page {page}")
        return []
    print(f"Page {page} loaded successfully.")

    # Find all product cards
    products = driver.find_elements(By.CLASS_NAME, 's-item')
    print(f"Found {len(products)} products on page {page}")

    data = []
    for product in products:
        # Product Name
        name = _extract_or_default(
            product, 's-item__title', lambda el: el.text.strip(), "N/A"
        )

        # Price: strip the currency symbol and thousands separators.
        price = _extract_or_default(
            product,
            's-item__price',
            lambda el: el.text.replace("$", "").replace(",", "").strip(),
            "0",
        )

        # Ratings: first space-separated token of the aria-label.
        # NOTE(review): the recorded runs show every rating as 0, which
        # suggests this selector never matches on the results page - verify.
        rating = _extract_or_default(
            product,
            'x-star-rating',
            lambda el: el.get_attribute("aria-label").split(" ")[0],
            "0",
        )

        # Reviews: first space-separated token of the review-count text.
        review = _extract_or_default(
            product,
            's-item__reviews-count',
            lambda el: el.text.strip().split(" ")[0],
            "0",
        )

        # Availability is derived solely from whether a title was found.
        availability = "Available" if name != "N/A" else "Unavailable"

        # Append the extracted data
        data.append({
            'Product Name': name,
            'Price': price,
            'Ratings': rating,
            'Reviews': review,
            'Availability': availability
        })

    return data


def _extract_or_default(card, class_name, extract, default):
    """Return ``extract(element)`` for the first ``class_name`` match in ``card``.

    Falls back to ``default`` when the element is missing or ``extract``
    fails (for example ``get_attribute`` returning ``None``).
    """
    try:
        return extract(card.find_element(By.CLASS_NAME, class_name))
    except Exception:
        # Best-effort per field. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt/SystemExit stop a running cell.
        return default

# Scrape Multiple Pages
# Collect the rows of every page into one flat list. The loop runs inside
# try/finally so the browser process is shut down even when a page raises
# or the cell is interrupted.
all_data = []
total_pages = 3  # Set how many pages to scrape
try:
    for page in range(1, total_pages + 1):
        print(f"\nScraping page {page}...")
        page_data = scrape_ebay(page)
        all_data.extend(page_data)
        if page < total_pages:
            # Delay to avoid being blocked; pointless after the final page.
            time.sleep(5)
finally:
    # Close WebDriver
    driver.quit()

# Convert to DataFrame
# One row per scraped record; columns come from the record keys.
df = pd.DataFrame(all_data)

# Handle missing data
# If nothing was scraped the frame has no columns at all, so make sure the
# columns used later exist, filled with the same "0" placeholder the scraper uses.
for required_column in ('Price', 'Ratings', 'Reviews'):
    if required_column not in df.columns:
        df[required_column] = "0"

# Save to CSV
# The file is written before the numeric conversion, so it holds the raw
# scraped strings.
df.to_csv('iphones_data.csv', index=False)
print("Data saved to iphones_data.csv")

# Convert "Price" to Numeric for EDA
# Ratings and Reviews get the same treatment; values that cannot be parsed
# become NaN (errors='coerce').
for numeric_column in ('Price', 'Ratings', 'Reviews'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# Basic EDA
# Print a heading followed by the describe() summary for each numeric column.
for heading, column_name in (
    ("\nSummary of Prices:", 'Price'),
    ("\nSummary of Ratings:", 'Ratings'),
    ("\nSummary of Reviews:", 'Reviews'),
):
    print(heading)
    print(df[column_name].describe())



Scraping page 1...
Page 1 loaded successfully.
Found 62 products on page 1

Scraping page 2...
Page 2 loaded successfully.
Found 62 products on page 2

Scraping page 3...
Page 3 loaded successfully.
Found 62 products on page 3
Data saved to iphones_data.csv

Summary of Prices:
count    137.000000
mean     276.852190
std      115.407103
min      102.500000
25%      210.000000
50%      254.550000
75%      332.980000
max      674.990000
Name: Price, dtype: float64

Summary of Ratings:
count    186.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: Ratings, dtype: float64

Summary of Reviews:
count    186.000000
mean      22.102151
std       31.309360
min        0.000000
25%        0.000000
50%        9.000000
75%       38.000000
max      229.000000
Name: Reviews, dtype: float64


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

In [3]:
# Initialize WebDriver
# Chrome is configured through command-line switches; they are applied in the
# order listed so the resulting options object matches a hand-written sequence
# of add_argument() calls.
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--headless",  # Run in headless mode (no browser window)
    "--no-sandbox",
    "--disable-dev-shm-usage",
    # Single switch: the three literals below are concatenated by Python.
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/112.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# webdriver_manager resolves (installing if needed) the chromedriver binary
# and returns its path, which is handed to the Selenium service.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=options,
)


In [4]:
# eBay iPhone Listings URL
# Search-results URL with the page number left open: scrape_ebay() appends
# str(page) directly after the trailing "_pgn=".
# NOTE(review): presumably "_nkw" is the search keyword and "_ipg" the
# requested items per page; the recorded runs found 62 's-item' elements per
# page, so confirm how eBay actually treats "_ipg=50".
base_url = "https://www.ebay.com/sch/i.html?_nkw=iphone&_ipg=50&_pgn="


In [5]:
# Function to Scrape a Single Page
def scrape_ebay(page):
    """Scrape one eBay search-results page into a list of row dicts.

    Loads ``base_url + str(page)`` in the module-level ``driver``, waits for
    elements with class ``s-item`` to appear, and extracts one record per
    matching element.

    Args:
        page: Page number appended to ``base_url`` (converted with ``str``).

    Returns:
        A list of dicts with the keys ``'Product Name'``, ``'Price'``,
        ``'Ratings'``, ``'Reviews'`` and ``'Availability'``. All values are
        strings. A field that cannot be read falls back to ``"N/A"`` (name)
        or ``"0"`` (price, rating, reviews), so a missing value is not
        distinguishable from a genuine zero downstream. Returns ``[]`` when
        no ``s-item`` element appears within the 10 second wait.

    Raises:
        Any Selenium error other than the wait timing out (for example a
        failure in ``driver.get``) propagates to the caller.
    """
    # Imported locally so this definition is self-contained within its cell.
    from selenium.common.exceptions import TimeoutException

    url = base_url + str(page)
    driver.get(url)
    time.sleep(5)  # Wait for page to load

    try:
        # Wait for product elements to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 's-item'))
        )
    except TimeoutException:
        # Only a genuine timeout is reported as "no products"; other failures
        # are real errors and must not be mislabelled or hidden.
        print(f"Timeout: No products found on page {page}")
        return []
    print(f"Page {page} loaded successfully.")

    # Find all product cards
    products = driver.find_elements(By.CLASS_NAME, 's-item')
    print(f"Found {len(products)} products on page {page}")

    data = []
    for product in products:
        # Product Name
        name = _extract_or_default(
            product, 's-item__title', lambda el: el.text.strip(), "N/A"
        )

        # Price: strip the currency symbol and thousands separators.
        price = _extract_or_default(
            product,
            's-item__price',
            lambda el: el.text.replace("$", "").replace(",", "").strip(),
            "0",
        )

        # Ratings: first space-separated token of the aria-label.
        # NOTE(review): the recorded runs show every rating as 0, which
        # suggests this selector never matches on the results page - verify.
        rating = _extract_or_default(
            product,
            'x-star-rating',
            lambda el: el.get_attribute("aria-label").split(" ")[0],
            "0",
        )

        # Reviews: first space-separated token of the review-count text.
        review = _extract_or_default(
            product,
            's-item__reviews-count',
            lambda el: el.text.strip().split(" ")[0],
            "0",
        )

        # Availability is derived solely from whether a title was found.
        availability = "Available" if name != "N/A" else "Unavailable"

        # Append the extracted data
        data.append({
            'Product Name': name,
            'Price': price,
            'Ratings': rating,
            'Reviews': review,
            'Availability': availability
        })

    return data


def _extract_or_default(card, class_name, extract, default):
    """Return ``extract(element)`` for the first ``class_name`` match in ``card``.

    Falls back to ``default`` when the element is missing or ``extract``
    fails (for example ``get_attribute`` returning ``None``).
    """
    try:
        return extract(card.find_element(By.CLASS_NAME, class_name))
    except Exception:
        # Best-effort per field. Unlike a bare ``except:``, this still lets
        # KeyboardInterrupt/SystemExit stop a running cell.
        return default


In [6]:
# Scrape Multiple Pages
# Collect the rows of every page into one flat list. The loop runs inside
# try/finally so the browser process is shut down even when a page raises
# or the cell is interrupted.
all_data = []
total_pages = 3  # Set how many pages to scrape
try:
    for page in range(1, total_pages + 1):
        print(f"\nScraping page {page}...")
        page_data = scrape_ebay(page)
        all_data.extend(page_data)
        if page < total_pages:
            # Delay to avoid being blocked; pointless after the final page.
            time.sleep(5)
finally:
    # Close WebDriver
    driver.quit()



Scraping page 1...
Page 1 loaded successfully.
Found 62 products on page 1

Scraping page 2...
Page 2 loaded successfully.
Found 62 products on page 2

Scraping page 3...
Page 3 loaded successfully.
Found 62 products on page 3


In [7]:
# Convert to DataFrame
# One row per scraped record; columns come from the record keys.
df = pd.DataFrame(all_data)

# Handle missing data
# If nothing was scraped the frame has no columns at all, so make sure the
# columns used later exist, filled with the same "0" placeholder the scraper uses.
for required_column in ('Price', 'Ratings', 'Reviews'):
    if required_column not in df.columns:
        df[required_column] = "0"


In [8]:
# Save to CSV
# The file is written before the numeric conversion, so it holds the raw
# scraped strings.
df.to_csv('iphones_data.csv', index=False)
print("Data saved to iphones_data.csv")

# Convert "Price" to Numeric for EDA
# Ratings and Reviews get the same treatment; values that cannot be parsed
# become NaN (errors='coerce').
for numeric_column in ('Price', 'Ratings', 'Reviews'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')


Data saved to iphones_data.csv


In [9]:
# Basic EDA
# Print a heading followed by the describe() summary for each numeric column.
for heading, column_name in (
    ("\nSummary of Prices:", 'Price'),
    ("\nSummary of Ratings:", 'Ratings'),
    ("\nSummary of Reviews:", 'Reviews'),
):
    print(heading)
    print(df[column_name].describe())



Summary of Prices:
count       111.000000
mean       1296.145946
std       11274.668572
min           9.990000
25%         117.250000
50%         202.500000
75%         292.490000
max      119000.000000
Name: Price, dtype: float64

Summary of Ratings:
count    186.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: Ratings, dtype: float64

Summary of Reviews:
count    186.000000
mean      44.951613
std      100.852918
min        0.000000
25%        0.000000
50%        3.000000
75%       39.500000
max      582.000000
Name: Reviews, dtype: float64
