In [None]:
# Install the browser-automation dependencies into the running kernel's environment
# (%pip targets the kernel's interpreter rather than whatever `pip` is on PATH).
# NOTE(review): versions are unpinned, and webdriver-manager is not imported by any
# cell visible in this notebook — confirm it is still required.
%pip install selenium
%pip install webdriver-manager

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

import time  # imported before first use so the cell runs on a fresh kernel

# Seconds to pause after scrolling so lazily loaded content can finish rendering.
SCROLL_SETTLE_SECONDS = 8
# Maximum seconds to wait for the first product card to appear in the DOM.
PRODUCT_WAIT_SECONDS = 10

# Configure a headless Chrome session.
options = Options()
options.add_argument("start-maximized")
# NOTE(review): this user-agent value ends in a literal "..." and looks like a
# truncated placeholder — confirm the intended full string.
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)...")

options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Initialize the driver
driver = webdriver.Chrome(options=options)

try:
    # Load the Myntra women's dresses page
    url = "https://www.myntra.com/women-dresses"
    driver.get(url)

    # Block until at least one product card exists; raises TimeoutException
    # if the listing never renders (e.g. the request was blocked).
    WebDriverWait(driver, PRODUCT_WAIT_SECONDS).until(
        EC.presence_of_element_located((By.CLASS_NAME, "product-base"))
    )

    # Scroll to the bottom and give dynamic content time to load.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(SCROLL_SETTLE_SECONDS)

    # Grab the HTML
    html = driver.page_source
finally:
    # Always release the browser process, even when the wait times out.
    # No later cell in this notebook uses `driver`; only `html` is consumed.
    driver.quit()


In [None]:
from bs4 import BeautifulSoup

# Build a navigable document tree from the captured page source,
# using Python's built-in HTML parser backend.
soup = BeautifulSoup(markup=html, features='html.parser')

In [None]:
from urllib.parse import urljoin

# Site root used to resolve the (possibly relative) product hrefs.
BASE_URL = "https://www.myntra.com/"


def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.text.strip() if tag is not None else None


products = soup.find_all('li', {'class': 'product-base'})
print(f"Found {len(products)} products")


# List to store scraped data (one dict per product card)
data = []

for item in products:
    # A card may lack any of these nodes; record None instead of crashing the whole scrape.
    brand = _text_or_none(item.find('h3', {'class': 'product-brand'}))
    name = _text_or_none(item.find('h4', {'class': 'product-product'}))

    # Finding Original and Discounted Prices.
    # The spans are taken positionally. NOTE(review): the first span appears to be a
    # wrapper whose text holds both prices run together; the cleaning cell later splits
    # 'Discounted Price' with a regex, so the positional values are kept unchanged here.
    price_elements = item.find('div', {'class': 'product-price'})
    prices = price_elements.find_all('span') if price_elements is not None else []
    discounted_price = prices[0].text.strip() if len(prices) > 0 else None
    original_price = prices[1].text.strip() if len(prices) > 1 else None
    discount = prices[2].text.strip() if len(prices) > 2 else None

    # Product Link: urljoin inserts the separator that plain concatenation dropped
    # for hrefs without a leading slash ("dresses/..." -> ".../dresses/...").
    link_tag = item.find('a', href=True)
    product_link = urljoin(BASE_URL, link_tag['href']) if link_tag else None

    data.append({
        'Brand': brand,
        'Name': name,
        'Discounted Price': discounted_price,
        'Original Price': original_price,
        'Discount': discount,
        'Link': product_link
    })

In [None]:
import pandas as pd

# Turn the list of per-product dicts into a table (one row per product).
df = pd.DataFrame(data=data)

# Show the first five rows as a quick sanity check of the scrape.
preview = df.head(5)
print(preview)

In [None]:
# Strip extra spaces from column names so the exact-name lookups further down
# in this cell (e.g. 'Discounted Price', 'Link') cannot miss because of stray whitespace.
df.columns = df.columns.str.strip()

# Split 'Discounted Price' if it has two prices jammed together
def extract_prices(price_str):
    """Parse up to two "Rs." amounts out of a raw price string.

    Args:
        price_str: Raw text such as "Rs. 719Rs. 1799" or "Rs. 1,799".
            Anything that is not a string (None, NaN, numbers) is treated
            as missing.

    Returns:
        A 2-tuple of (first_amount, second_amount):
        - two amounts found  -> (int, int)
        - one amount found   -> (int, None)
        - otherwise (none, or more than two) -> (None, None)
    """
    import re

    # Missing values arrive as None / float NaN; neither is a str, and
    # re.findall would raise TypeError on them.
    if not isinstance(price_str, str):
        return None, None

    # Allow thousands separators inside the number ("1,799") and drop them
    # before converting, so comma-grouped prices are not truncated to "1".
    amounts = [
        int(raw.replace(',', ''))
        for raw in re.findall(r'Rs\.?\s?(\d[\d,]*)', price_str)
    ]

    if len(amounts) == 2:
        return amounts[0], amounts[1]
    if len(amounts) == 1:
        return amounts[0], None
    return None, None

# Parse the raw 'Discounted Price' text into two numeric columns.
# extract_prices returns a 2-tuple; wrapping it in pd.Series makes apply()
# expand the result into two columns.
df[['Discounted_Price', 'Original_Price']] = df['Discounted Price'].apply(
    lambda x: pd.Series(extract_prices(x))
)

# Fix broken URLs (missing slash after domain).
# regex=False makes the match literal on every pandas version; pandas < 2.0
# defaults to regex matching, where each "." would match any character.
df['Link'] = df['Link'].str.replace(
    "www.myntra.comdresses", "www.myntra.com/dresses", regex=False
)

# Recompute the discount from the parsed prices instead of trusting the scraped
# 'Discount' text. Rows without an original price yield NaN here.
df['Discount_Percentage'] = (
    (df['Original_Price'] - df['Discounted_Price']) / df['Original_Price'] * 100
).round(1)

# Drop old columns (the raw scraped text is superseded by the parsed columns)
df = df.drop(columns=['Discounted Price', 'Original Price', 'Discount'])

# Optional: drop rows with missing prices
df = df.dropna(subset=['Discounted_Price'])

# Reset index so it is contiguous again after the row drop
df = df.reset_index(drop=True)

print(df.head())


In [None]:
import os
from pathlib import Path

# Destination for the cleaned dataset. Defaults to the original location, but can
# be redirected without editing the notebook by setting MYNTRA_CSV_PATH.
output_path = Path(
    os.environ.get(
        "MYNTRA_CSV_PATH",
        "/Users/miaalex/Desktop/indian-fashion-trends/data/myntra_womens_dresses.csv",
    )
)

# to_csv does not create missing directories, so make sure the parent exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
