In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup
import time

In [2]:
# Search terms to scrape; each one is substituted into the listing URL below.
categories = [
    'shirt', 't-shirt', 'jacket', 'pants', 'skirt',
    'shorts', 'shoes', 'sneakers', 'heels', 'flats',
    'dress', 'hats', 'bags',
]


def base_url(pname, pn):
    """Build the Myntra listing URL for search term ``pname`` at page ``pn``.

    The query string always carries ``rows=100``.
    """
    return f"https://www.myntra.com/{pname}?p={pn}&rows=100"

In [3]:
def with_driver(func):
    """Decorator that runs ``func`` with a freshly started Edge WebDriver.

    A new ``webdriver.Edge()`` instance is created for every call and handed
    to ``func`` through the ``driver`` keyword argument; all other positional
    and keyword arguments are forwarded unchanged. The driver is quit when
    ``func`` finishes, whether it returns normally or raises.

    Because the driver is supplied by keyword, a decorated function whose
    first positional parameter is ``driver`` must receive its remaining
    arguments by keyword.

    Args:
        func: Callable that accepts a ``driver`` keyword argument.

    Returns:
        A wrapper that returns whatever ``func`` returns.
    """
    # Local import keeps this cell self-contained.
    from functools import wraps

    @wraps(func)
    def wrap(*args, **kwargs):
        driver = webdriver.Edge()
        try:
            return func(*args, driver=driver, **kwargs)
        finally:
            # Always shut the browser down; without this an exception inside
            # func would leave the Edge process running.
            driver.quit()
    return wrap

In [12]:
def make_data(html, name):
    """Extract product records from a results-page HTML fragment.

    Every ``<li class="product-base">`` element in ``html`` is turned into
    one record. Items missing a link, an image source or an ``<h3>`` name are
    skipped.

    Args:
        html: Markup to parse with BeautifulSoup's ``html.parser``.
        name: Value stored under ``product_category`` in every record.

    Returns:
        list[dict]: One dict per product with the keys ``product_category``,
        ``product_name``, ``product_price``, ``product_img_url`` and
        ``product_source``. ``product_price`` is the text of the
        ``product-discountedPrice`` span, or ``None`` when the item has no
        such span.
    """
    data = []
    soup = BeautifulSoup(html, "html.parser")
    for product in soup.find_all('li', {'class': 'product-base'}):
        try:
            product_source = product.find('a')['href']
            product_img_url = product.find('picture').find('img')['src']
            product_name = product.find('h3').text
        except (AttributeError, KeyError, TypeError):
            # A find() returned None or the attribute is absent: the item is
            # incomplete, so skip it. Other errors are allowed to surface.
            continue

        # Store the price text, not the Tag object, so the record holds plain
        # strings and serialises cleanly.
        price_tag = product.find('span', {'class': 'product-discountedPrice'})
        product_price = price_tag.text if price_tag is not None else None

        data.append({'product_category': name,
                     'product_name': product_name,
                     'product_price': product_price,
                     'product_img_url': product_img_url,
                     'product_source': product_source})
    return data

In [14]:
@with_driver
def get_data(driver, product_names, pn = 20):
    """Scrape listing pages 1..``pn`` for every search term in ``product_names``.

    Args:
        driver: WebDriver used to load the pages (supplied by ``with_driver``).
        product_names: Iterable of search terms passed to ``base_url``.
        pn: Number of pages requested per search term.

    Returns:
        list: The concatenated records produced by ``make_data`` for every
        page on which a ``results-base`` element was found.
    """
    collected = []
    for category in tqdm(product_names, leave=False):
        for page in range(1, pn + 1):
            driver.get(base_url(category, page))
            try:
                results = driver.find_element(by=By.CLASS_NAME, value='results-base')
            except NoSuchElementException:
                # No results container on this page; try the next page.
                continue
            collected.extend(make_data(results.get_attribute('innerHTML'), category))
        # Running total after each search term.
        print(len(collected))
    return collected

In [None]:
# Scrape every category with the default page count (starts an Edge browser).
# product_names must be passed by keyword: with_driver injects `driver` as a
# keyword argument, and get_data's first positional parameter is `driver`, so a
# positional argument here would collide with it.
data = get_data(product_names=categories)

In [16]:
# One row per scraped record; the dict keys become the columns.
df = pd.DataFrame(data)

In [17]:
# Number of scraped products per category.
df["product_category"].value_counts()

shirt       220
t-shirt     220
jacket      220
pants       220
skirt       220
shorts      220
shoes       220
sneakers    220
heels       220
flats       220
dress       220
bags        209
hats         55
Name: product_category, dtype: int64

In [18]:
# Write the scraped table to data.csv in the working directory, without the
# DataFrame index column.
df.to_csv("data.csv", index = False)

In [56]:
# Preview the first four rows.
# NOTE(review): the saved output of this cell has an "Unnamed: 0" column that
# the scraped records do not contain, and the execution count is out of
# sequence; presumably `df` was reloaded from a CSV in a cell not shown here.
# Confirm the notebook reproduces this output on Restart & Run All.
df.head(4)

Unnamed: 0,product_category,product_name,product_img_url,product_source,product_price
0,shirt,Roadster,"https://assets.myntassets.com/dpr_2,q_60,w_210...",shirts/roadster/roadster-men-black--grey-check...,Rs. 449
1,shirt,Dennis Lingo,"https://assets.myntassets.com/dpr_2,q_60,w_210...",shirts/dennis-lingo/dennis-lingo-men-pink-slim...,Rs. 684
2,shirt,HIGHLANDER,"https://assets.myntassets.com/dpr_2,q_60,w_210...",shirts/highlander/highlander-men-white--blue-s...,Rs. 597
3,shirt,H&M,"https://assets.myntassets.com/dpr_2,q_60,w_210...",shirts/hm/hm-men-black-relaxed-fit-linen-blend...,
