# Selenium Frontend Web Scraping

In [4]:
import random
import pandas as pd
import time
import logging

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import *
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from fake_useragent import UserAgent

import chromedriver_autoinstaller
# Run chromedriver_autoinstaller's install step before any driver is created
# (presumably this fetches a chromedriver matching the local Chrome — confirm).
chromedriver_autoinstaller.install()

# Show every column and up to 100 rows when pandas renders a DataFrame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Module-level logger shared by the helper functions and scraping cells below.
# NOTE(review): no handler is attached here, so whether INFO records are
# actually displayed depends on logging configuration done elsewhere — confirm.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [5]:
def try_find_element_by_xpath(xpath_code):
    """Look up the first element matching an XPath on the global ``driver``.

    Args:
        xpath_code: XPath expression to search for.

    Returns:
        The matching element, or the placeholder string ``'-'`` when no
        element matches (the miss is logged at INFO level).
    """
    try:
        # Locator-based lookup: the ``find_element_by_*`` shortcuts were
        # removed in Selenium 4.3, while ``find_element(By.XPATH, ...)`` is
        # available on both Selenium 3 and 4.
        # The result is an element object, never text, so no empty-string
        # substitution is applied here.
        return driver.find_element(By.XPATH, xpath_code)
    except NoSuchElementException:
        logger.info('NoSuchElementException')
        return '-'
        
def try_webdriver_text(webdriver_object):
    """Read ``.text`` from an object, substituting placeholder strings.

    Args:
        webdriver_object: Any object expected to expose a ``text`` attribute.

    Returns:
        The ``text`` value as-is; ``'Unavailable'`` when it equals the empty
        string; ``'-'`` when an ``AttributeError`` is raised (logged at INFO).
    """
    try:
        content = webdriver_object.text
        return 'Unavailable' if content == '' else content
    except AttributeError:
        logger.info('AttributeError')
        return '-'
    
def try_get_attribute(element, string):
    """Fetch an attribute via ``element.get_attribute``, with placeholders.

    Args:
        element: Object expected to provide a ``get_attribute`` method.
        string: Name of the attribute to request.

    Returns:
        The value returned by ``get_attribute``; ``'Unavailable'`` when that
        value equals the empty string; ``'-'`` when an ``AttributeError`` is
        raised (logged at INFO).
    """
    try:
        value = element.get_attribute(string)
        return 'Unavailable' if value == '' else value
    except AttributeError:
        logger.info('AttributeError')
        return '-'
        
def web_driver(headless=False):
    """Create a Chrome WebDriver configured with a random user agent.

    Args:
        headless: When equal to ``True``, ``--headless`` is added to the
            Chrome arguments.

    Returns:
        A new ``webdriver.Chrome`` instance built from the assembled options.
    """
    chrome_options = webdriver.ChromeOptions()
    if headless == True:  # noqa: E712 - equality test kept on purpose
        chrome_options.add_argument('--headless')

    # Pick a random user-agent string and record which one was chosen.
    random_agent = UserAgent().random
    logger.info(random_agent)
    chrome_options.add_argument(f'user-agent={random_agent}')

    # Remaining fixed switches, added in this order (incognito stays off).
    for flag in ('start-maximized', 'disable-infobars', '--disable-extensions'):
        chrome_options.add_argument(flag)

    return webdriver.Chrome(options=chrome_options)

In [9]:
# Start a browser session and open the Lazada Malaysia home page.
driver = web_driver()
driver.get('https://www.lazada.com.my')
time.sleep(3)  # fixed pause after navigation before querying the page
# First link inside the channels card. On a failed lookup the helper returns
# the string '-' rather than an element.
element = try_find_element_by_xpath('//*[@class="hp-mod-card-content card-channels-content"]//a[1]') # /html/body/div[5]/div[3]/div/div[1]/a
# Move the pointer onto that link, then click it.
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
time.sleep(3)  # fixed pause after the click

In [None]:
# Parallel result lists: index i of each list describes the same store card.
shop_url = []
shop_titles = []
img_url = []

# XPath of the "load more" button shown under the store list.
load_more_xpath = '//*[@class="button J_LoadMoreButton"]'

try:
    element = try_find_element_by_xpath('//*[@class="J_Flipsnap slider-list"]/li[2]/div[2]')
    # Read the label once, up front: the element reference may no longer be
    # usable after the clicks below have changed the page.
    category_name = try_webdriver_text(element)
    logger.info('Scraping Category 1: %s', category_name)
    cat_element = element

    webdriver.ActionChains(driver).move_to_element(cat_element).click(cat_element).perform()
    time.sleep(2)

    # Pass 1: click "load more" until the button is gone or has no text,
    # counting how many clicks were made.
    counter = 0
    while True:
        # Ctrl+End scrolls to the bottom of the page. Control is released
        # again so that the clicks that follow are not sent as Ctrl-clicks.
        webdriver.ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.END).key_up(Keys.CONTROL).perform()
        time.sleep(1)

        load_more_element = try_find_element_by_xpath(load_more_xpath)
        # The lookup helper returns the string '-' when the button is not in
        # the page; treat that the same as a button with empty text instead
        # of trying to click a string (which aborted the whole scrape).
        if isinstance(load_more_element, str) or try_webdriver_text(load_more_element) == 'Unavailable':
            break

        webdriver.ActionChains(driver).move_to_element(load_more_element).click(load_more_element).perform()
        counter += 1
        time.sleep(1)

    time.sleep(1)

    # Pass 2: click the category again and replay one click fewer than pass 1
    # counted.
    # NOTE(review): the reason for dropping the last click is not evident from
    # this cell — confirm against the page's behaviour.
    limit = counter - 1

    webdriver.ActionChains(driver).move_to_element(cat_element).click(cat_element).perform()
    time.sleep(2)

    for _ in range(limit):
        webdriver.ActionChains(driver).key_down(Keys.CONTROL).send_keys(Keys.END).key_up(Keys.CONTROL).perform()
        time.sleep(1)
        load_more_element = try_find_element_by_xpath(load_more_xpath)
        if isinstance(load_more_element, str):
            # Button missing: nothing more to load, scrape what is on the page.
            break
        webdriver.ActionChains(driver).move_to_element(load_more_element).click(load_more_element).perform()
        time.sleep(1)

    # Gets all LazMall information.
    # Locator-based calls are used because the ``find_element(s)_by_*``
    # shortcuts were removed in Selenium 4.3; these work on Selenium 3 too.
    lazmall_list = driver.find_elements(By.XPATH, '//*[@class="lazmall-card-store-for-you J_StoreForYou"]/div')

    for container in lazmall_list:
        store_link = container.find_element(By.CLASS_NAME, 'store-title').get_attribute('href')
        store_name = container.find_element(By.CLASS_NAME, 'header-text').text
        store_image = container.find_element(By.CLASS_NAME, 'image').get_attribute('src')
        # Append only after all three lookups succeeded so the lists always
        # have equal length.
        shop_url.append(store_link)
        shop_titles.append(store_name)
        img_url.append(store_image)

    logger.info('Scraped and Appended Category 1: %s', category_name)
except Exception as ex:
    # Best-effort scrape: keep whatever was collected, but record the failure
    # with its traceback at ERROR level.
    logger.exception("Error %s", ex)
finally:
    # Always close the browser, including on interruption.
    driver.quit()

brands_df = pd.DataFrame(zip(shop_url, shop_titles, img_url), columns=['Shop URL', 'LazMall Brand Name', 'Image URL'])

In [54]:
# Bare expression: shows the assembled DataFrame as this cell's output.
brands_df

Unnamed: 0,Shop URL,LazMall Brand Name,Image URL
0,https://www.lazada.com.my/shop/tesco-groceries...,Tesco Groceries,https://my-test-11.slatic.net/shop/921f69441aa...
1,https://www.lazada.com.my/shop/signature-snack...,Signature Market,https://my-test-11.slatic.net/shop/81b090ae028...
2,https://www.lazada.com.my/shop/alpro-pharmacy?...,Alpro Pharmacy,https://my-test-11.slatic.net/shop/a4652b82435...
3,https://www.lazada.com.my/shop/ramshomedecor?p...,RamsHomeDecor,https://my-test-11.slatic.net/shop/b02c92c310d...
4,https://www.lazada.com.my/shop/caring-estore16...,CARiNG ESTORE,https://my-test-11.slatic.net/v2/resize/page_d...
5,https://www.lazada.com.my/shop/muji1619165809?...,MUJI,https://my-test-11.slatic.net/shop/cb751b48aa4...
6,https://www.lazada.com.my/shop/lzq-malaysia162...,LZQ Malaysia,https://my-test-11.slatic.net/shop/2eac2843114...
7,https://www.lazada.com.my/shop/mondelez1/?pos=...,Mondelez,https://my-test-11.slatic.net/shop/2d322041cea...
8,https://www.lazada.com.my/shop/oldtown-white-c...,OLDTOWN White Coffee,https://my-test-11.slatic.net/shop/f899a12c2e0...
9,https://www.lazada.com.my/shop/nestle?pos=5&ac...,NESTLE,https://my-test-11.slatic.net/shop/19beeba213d...
