In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import numpy as np
import re
import os

In [None]:
def initializeDriver(path="D:/Selenium/chromedriver_win32/chromedriver.exe"):
    """Start a Chrome WebDriver session and return it.

    Parameters
    ----------
    path : str, optional
        Location of the chromedriver executable. The default is the path the
        notebook was originally written against, so existing zero-argument
        calls behave exactly as before; pass a different path to run the
        notebook on another machine.

    Returns
    -------
    The object produced by ``webdriver.Chrome(path)``.
    """
    # NOTE(review): the executable path is passed positionally, matching the
    # original call; confirm this is accepted by the installed Selenium version.
    return webdriver.Chrome(path)

In [None]:
# Module-level browser session; closePopUp() and scrape_product_details()
# read this `driver` global rather than taking it as an argument.
driver = initializeDriver()

In [None]:
# Folder scanned for input CSV files; each file's base name becomes the
# value of the `category` column for the rows it contributes.
directory = './Data/'

# os.listdir() returns names in arbitrary order, so sort them to keep the row
# order of the combined frame reproducible between runs and machines.
csv_filenames = sorted(
    name for name in os.listdir(directory) if name.endswith(".csv")
)
if not csv_filenames:
    # Without this guard pd.concat([]) fails with an unhelpful message.
    raise FileNotFoundError("No .csv files found in {!r}".format(directory))

dfs = []

for filename in csv_filenames:
    filepath = os.path.join(directory, filename)
    # `df` intentionally stays bound at module level after the loop:
    # a later cell looks up df.columns[0].
    df = pd.read_csv(filepath)
    df['category'] = os.path.splitext(filename)[0]
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first rows of the concatenated frame.
combined_df.head()

In [None]:
# Remove the column named like the first column of `df` (the last CSV read by
# the loading loop) and the row labelled 1, then rename column "0" to "link".
#
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" is passed, so re-running this cell is a no-op rather than a
# KeyError on the already-removed labels.
#
# NOTE(review): index=1 drops the *row* labelled 1. This may have been meant
# as axis=1 (which `columns=` already implies); confirm that discarding that
# row is intended. Behaviour is kept as in the original.
combined_df = (
    combined_df
    .drop(columns=df.columns[0], index=1, errors="ignore")
    .rename(columns={"0": "link"})
)

In [None]:
# Preview the frame again after the drop/rename step.
combined_df.head()

In [None]:
def closePopUp(timeout=10):
    """Try to dismiss the "UpdateCityModal" pop-up on the current page.

    Waits up to ``timeout`` seconds for the modal body to become visible and
    clicks its cancel button. This is best-effort: any ordinary error (the
    wait timing out, the button missing, the click failing) is reported with a
    printed message and otherwise ignored.

    Parameters
    ----------
    timeout : int or float, optional
        Seconds to wait for the modal; defaults to the original value of 10.

    Notes
    -----
    Uses the module-level ``driver``.
    """
    try:
        popup = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located(
                (By.CLASS_NAME, "UpdateCityModal__modal-body___H98MG")
            )
        )
        popup.find_element(By.CLASS_NAME, "UpdateCityModal__cancel-btn___2jWwS").click()
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop a long scraping run.
        print("Pop Up not found!!")

In [None]:
def _first_or_none(by, selector):
    """Return the first element matching the locator on the current page, or None."""
    try:
        return driver.find_element(by, selector)
    except NoSuchElementException:
        return None


def _text_or_nan(by, selector):
    """Return the ``.text`` of the first matching element, or NaN if there is none."""
    element = _first_or_none(by, selector)
    return np.nan if element is None else element.text


def scrape_product_details(url, flag, category='baby-sunscreens'):
    """Load a product page and collect its fields into a list.

    Parameters
    ----------
    url : str
        Page to open with the module-level ``driver``.
    flag : bool
        When truthy, ``closePopUp()`` is called after the page is opened.
    category : str, optional
        Value placed in the last slot of the result. Defaults to
        ``'baby-sunscreens'``, the value the function previously hard-coded.

    Returns
    -------
    list
        ``[image_link, title, highlights, number_ratings, rating,
        description, price, qty, category]``. Any field whose element is not
        found on the page is ``np.nan``.
    """
    driver.get(url)
    if flag:
        closePopUp()

    # Image: the `src` attribute of the preview image.
    image_element = _first_or_none(
        By.CSS_SELECTOR, '.col-xs-10.ProductImage__preview-container___2oTeX img'
    )
    image_link = np.nan if image_element is None else image_element.get_attribute('src')

    title = _text_or_nan(By.CLASS_NAME, "ProductTitle__product-title___3QMYH")

    # Highlights: find_elements returns an empty list instead of raising, so
    # the "missing" case has to be detected explicitly to yield NaN (and not
    # an empty string) like every other field.
    highlight_elements = driver.find_elements(
        By.CSS_SELECTOR, '.ProductHighlights__highlights-text___dc-WQ li'
    )
    if highlight_elements:
        highlights = '+'.join(element.text for element in highlight_elements)
    else:
        highlights = np.nan

    number_ratings = _text_or_nan(By.CSS_SELECTOR, '.RatingDisplay__ratings-header___ZNj5b')

    rating = _text_or_nan(
        By.CSS_SELECTOR,
        '.ProductTitle__ratings___4MWF_ .RatingDisplay__ratings-container___3oUuo span',
    )

    # Description: take the inner HTML and strip anything that looks like a tag.
    description = np.nan
    description_element = _first_or_none(
        By.CSS_SELECTOR, '.ProductDescription__description-content___A_qCZ'
    )
    if description_element is not None:
        inner_html = description_element.get_attribute("innerHTML")
        if inner_html is not None:
            description = re.sub('<.*?>', '', inner_html)

    price = _text_or_nan(By.CLASS_NAME, "PriceBoxPlanOption__offer-price___3v9x8")

    qty = _text_or_nan(By.CLASS_NAME, "PackSizeLabel__single-packsize___3KEr_")

    return [image_link, title, highlights, number_ratings, rating, description, price, qty, category]

In [None]:
# NOTE(review): this rebinds `combined_df` from data.csv, replacing whatever an
# earlier cell built under the same name; confirm data.csv is the intended input.
combined_df = pd.read_csv("data.csv")
columns = ['image_url', 'title', 'highlights', 'number_ratings', 'rating_element', 'description', 'price', 'qty', 'category']

# Collect one record per link and build the frame once at the end, instead of
# growing a DataFrame row by row.
records = []
flag = True  # closePopUp() is only attempted on the first page load
for index, row in combined_df.iterrows():
    url = row['link']
    info = scrape_product_details(url, flag)
    flag = False
    # Tag this record with its own category. Previously the whole `category`
    # column was overwritten on every iteration, so every row ended up with
    # the category of the last link processed.
    info[-1] = row['category']
    records.append(info)

df_info = pd.DataFrame(records, columns=columns)
df_info.to_csv("product_description.csv")

In [None]:
# Load the scraped product table back from the CSV written by the scraping cell.
data = pd.read_csv("product_description.csv")

In [None]:
# Number of missing values in each column of the scraped table.
data.isna().sum()

In [None]:
# Copy of the scraped table without the `category` column.
removed_cat = data.drop(columns=['category'])