In [1]:
# imports
try:
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    from selenium.common.exceptions import TimeoutException
    from selenium.common.exceptions import NoSuchElementException
    # from ..Modules import SaveDataAsCSV
    from dataclasses import dataclass
except Exception as e:
    print(e)
    print("Exception encountered while importing modules in Scripts/purewow_fashion")

In [2]:
# driver
path = "D:\Selenium\chromedriver.exe"
driver = webdriver.Chrome(path)
driver.maximize_window()
website = 'https://www.purewow.com/fashion'
driver.get(website)

In [3]:
# dealing with popup
print("Waiting for popup")
try:
    WebDriverWait(driver, 60).until(EC.frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[contains(@id, 'lightbox-iframe')]")))
except TimeoutException:
    print("TimeoutException: Popup not available in 60s, assume no popup")
except Exception as e:
    print(e)
    print("Error occured while waiting for popup")
else:
    skip_popup_button = driver.find_element_by_xpath(".//*[@id='layout']//button[text()='NO THANKS']")
    skip_popup_button.click()
    driver.switch_to.default_content()
    print("Popup closed successfully")

Waiting for popup
Popup closed successfully


In [4]:
# dataclass defined to store all scraped data
@dataclass
class blog_data:
    blog_title: str
    blog_date: str
    author_name: str
    blog_link: str
    author_profile_link: str
    thumbnail_link: str
    thumbnail_credit: str

all_data = []
column_names = ["Blog Title", "Blog Date", "Author Name", "Blog Link", "Author Profile Link", "Thumbnail Link", "Thumbnail Credit"]

In [5]:
# button clicks
flag = 0
var = 2
temp = 0
while True:
    try:
        more_button = WebDriverWait(driver, 60).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='container show-more-container']//a[@href='javascript:void(0);']")))
    except TimeoutException:
        flag = 1
        print("TimeoutException: All clicks successful")
        break
    except Exception as e:
        print(e)
        print("Some error occured while clicking more_button")
        break
    else:
        more_button.click()
        temp += 1
        print("No TimeoutException: Single click successful")
        if temp == var:
            flag = 1
            print("No TimeoutException: All clicks successful")
            break

No TimeoutException: Single click successful
No TimeoutException: Single click successful
No TimeoutException: All clicks successful


In [6]:
# function to extract data from cards
def extract_data_from_cards(*card_types):
    """
    Traverses and extracts data from all cards passed.\n
    \n
    Parameters:\n
    Takes variable number of lists, each list is of a specific card type
    """
    
    for card_type in card_types:
        for card in card_type:
            # blog title
            try:
                element = card.find_element_by_xpath(".//h1")
            except NoSuchElementException:
                try:
                    element = card.find_element_by_xpath(".//h4")
                except NoSuchElementException:
                    card_blog_title = ""
                else:
                    card_blog_title = element.text.strip()
            else:
                card_blog_title = element.text.strip()

            # blog link
            try:
                element = card.find_element_by_xpath(".//a")
            except NoSuchElementException:
                card_blog_link = ""
            else:
                element_href = str(element.get_attribute("href")).strip()
                if element_href[0] == '/':
                    card_blog_link = "https://www.purewow.com" + element_href
                else:
                    card_blog_link = element_href

            # thumbnail link
            try:
                element = card.find_element_by_xpath(".//img")
            except NoSuchElementException:
                card_thumbnail_link = ""
            else:
                card_thumbnail_link = str(element.get_attribute("src")).strip()

            # thumbnail credit
            try:
                element = card.find_element_by_xpath(".//span[@class='photo_credit']")
            except NoSuchElementException:
                card_thumbnail_credit = ""
            else:
                card_thumbnail_credit = element.text.strip()

            # add data to list
            all_data.append(blog_data(card_blog_title, "", "", card_blog_link, "", card_thumbnail_link, card_thumbnail_credit))

In [7]:
# there are three different types of blog cards on webpage which are stored in cards_sliding, cards_big and cards_normal
if flag:
    print("Inside if")

    # cards_sliding
    try:
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, "//div[@class='row slide-content']")))
    except TimeoutException:
        print("TimeoutException: No cards_sliding found")
    except Exception as e:
        print(e)
        print("Some error while waiting for cards_sliding")
    else:
        cards_sliding = driver.find_elements_by_xpath("//div[@class='row slide-content']")
        print(len(cards_sliding))

    # cards_big
    try:
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, "//div[@class='col-sm-6 ']")))
    except TimeoutException:
        print("TimeoutException: No cards_big found")
    except Exception as e:
        print(e)
        print("Some error while waiting for cards_big")
    else:
        cards_big = driver.find_elements_by_xpath("//div[@class='col-sm-6 ']")
        print(len(cards_big))
    
    # cards_normal
    try:
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.XPATH, "//div[@class='col-sm-4' or @class='col-sm-4 ']")))
    except TimeoutException:
        print("TimeoutException: No cards_normal found")
    except Exception as e:
        print(e)
        print("Some error while waiting for cards_normal")
    else:
        cards_normal = driver.find_elements_by_xpath("//div[@class='col-sm-4' or @class='col-sm-4 ']")
        print(len(cards_normal))

    # extract data from cards
    extract_data_from_cards(cards_sliding, cards_big, cards_normal)

Inside if
6
4
24


In [8]:
# saving dataframe as csv
import pandas as pd
test_df = pd.DataFrame(all_data)
test_df
# SaveDataAsCSV.dataclass_to_csv_in_data(dataclass_list=all_data, column_name_list=column_names, caller_path=__file__)

Unnamed: 0,blog_title,blog_date,author_name,blog_link,author_profile_link,thumbnail_link,thumbnail_credit
0,,,,https://www.purewow.com/news/rihanna-green-bra...,,https://purewows3.imgix.net/images/articles/20...,
1,,,,https://www.purewow.com/fashion/best-party-dre...,,https://purewows3.imgix.net/images/articles/20...,
2,Sofía Vergara Takes Holiday Party-Chic to Anot...,,,https://www.purewow.com/news/sofia-vergara-ski...,,https://purewows3.imgix.net/images/articles/20...,STEVE GRANITZ/GETTY IMAGES
3,15 Chic Ways to Style Overalls,,,https://www.purewow.com/fashion/how-to-style-o...,,https://purewows3.imgix.net/images/articles/20...,GETTY IMAGES
4,,,,https://www.purewow.com/news/rihanna-green-bra...,,https://purewows3.imgix.net/images/articles/20...,
5,,,,https://www.purewow.com/fashion/best-party-dre...,,https://purewows3.imgix.net/images/articles/20...,
6,19 Stores Like Madewell to Shop Right Now,,,https://www.purewow.com/fashion/stores-like-ma...,,https://purewows3.imgix.net/images/articles/20...,ANTHROPOLOGIE/FREE PEOPLE/JENNI KAYNE
7,The 32 Best Sales to Shop Right Now,,,https://www.purewow.com/fashion/best-sales-to-...,,https://purewows3.imgix.net/images/articles/20...,MCKENZIE CORDELL
8,"Telfar, Brandon Blackwood and 25 Black-Owned H...",,,https://www.purewow.com/fashion/black-owned-ha...,,https://purewows3.imgix.net/images/articles/20...,SOFIA KRAUSHAAR
9,The 5 Rules of Wearing Winter Boots in 2022,,,https://www.purewow.com/fashion/winter-boots-s...,,https://purewows3.imgix.net/images/articles/20...,@GRACEATWOOD/INSTAGRAM


In [10]:
# driver.quit()