# Web scraping with BeautifulSoup and Selenium

### Prerequisites:

#### Python 3.6
#### ChromeDriver <-- the driver executable that lets Selenium launch and control Chrome when running automated scripts

In [1]:
#Prerequisites

import os  #for creating the output folder
import random
import re
import time

import numpy as np
import pandas as pd
import requests  #for handling HTTP requests
from bs4 import BeautifulSoup  #for parsing HTML
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options #Set options for running headless Chrome browser with Selenium
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By  #locator strategies for find_element
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait #for making WebDriver wait for an element to meet expected condition

In [2]:
#Listing page to scrape: the mobile-phone category of broadwaylifestyle.com
#This is the page the Chrome session opens first
url = "https://www.broadwaylifestyle.com/categories/usage/mobile-products/mobile-phone.html"

In [3]:
#Create a new headless Chrome session and open the listing page
options = Options()
#Pass the Chrome switch directly instead of setting `options.headless`:
#that property was deprecated and later removed in Selenium 4, while
#add_argument("--headless") has the same effect on every Selenium version
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get(url)
#Let WebDriver keep looking for up to 5 seconds when an element (or elements) is not immediately available
driver.implicitly_wait(5)

#Simulate pressing 'Esc' to dismiss a pop-up window (if any)
ActionChains(driver).send_keys(Keys.ESCAPE).perform()

In [4]:
#Scroll to the bottom of the page repeatedly until its height stops changing
lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
match = False

while not match:
    #remember the height measured on the previous pass
    lastCount = lenOfPage
    #wait before scrolling and measuring again
    time.sleep(3)
    lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")

    #two identical consecutive heights end the loop
    match = (lastCount == lenOfPage)

In [5]:
#Change webpage display to English
#find_element(By.<strategy>, value) is used instead of the find_element_by_* helpers,
#which were removed in Selenium 4.3; this form works on Selenium 3 and 4 alike
footer_button = driver.find_element(By.CLASS_NAME, 'footer_title')
#NOTE(review): clicking the footer title presumably reveals the language links - confirm against the page
footer_button.click()

try:
    eng_ver = driver.find_element(By.LINK_TEXT, "ENG")
    eng_ver.click()
except NoSuchElementException:
    #no "ENG" link on the page: nothing to switch
    print("English version already.")

In [6]:
#Scroll to the bottom and click "View More Products" until the link can no longer be
#found and the page height has stopped changing
lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")

while True:
    lastCount = lenOfPage
    lenOfPage = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")

    time.sleep(3)

    try:
        #find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text,
        #which was removed in Selenium 4.3; this form works on Selenium 3 and 4 alike
        driver.find_element(By.LINK_TEXT, "View More Products").click()
    except NoSuchElementException:
        #the link is gone; stop once the page height is stable as well,
        #otherwise scroll again and re-check
        if lastCount == lenOfPage:
            print("There's no more products at the bottom of this page.")
            break

There's no more products at the bottom of this page.


In [7]:
#Parse the listing page as currently rendered in the browser
containers = BeautifulSoup(driver.page_source, 'html.parser')

#Collect the link of every product tile.
#Tiles without an <a> tag or without an href are skipped; previously a missing anchor
#raised AttributeError and a missing href put None into the URL list
products_urls = [
    tile.a.get('href')
    for tile in containers.find_all("div", class_="product-item-info")
    if tile.a is not None and tile.a.get('href')
]

#Drop duplicates and sort; the result is a list of plain str objects
products_urls = sorted(set(products_urls))

print("We finally fetched " + str(len(products_urls)) + " products URLs.")

We finally fetched 85 products URLs.


#### To build and maintain an efficient automated web scraper, we first need to understand the structure of the website, e.g. its HTML source code, XPath expressions, etc.
#### (P.S. press F12 in the browser to inspect an element)

In [8]:
#Shuffle the product URLs in place so the pages are visited in random order
#NOTE(review): the intent appears to be to avoid visiting colour variants of the
#same product back to back - confirm. No random seed is set in the visible cells,
#so the visiting order differs from run to run
random.shuffle(products_urls)

In [9]:
#Accumulators for the scraped product details: one independent empty list per field
# - color / capacity_spec: different colour or capacity options may result in different prices
# - RRP: recommended retail price
(brand, product_name, color, capacity_spec,
 RRP, final_price, free_gift, product_url) = ([] for _ in range(8))

In [10]:
%%time

def _scrape_product(driver, url):
    """Load one product page and return its details as a 7-tuple.

    Returns (brand, product name, colour, capacity, RRP, final price, url).
    The product name is upper-cased, 'HK$ ' and ',' are removed from both
    prices, and RRP is '' when the page has no "old-price" span.

    Any lookup failure propagates to the caller, e.g. NoSuchElementException
    when the "Click to Read More" link or the selected swatch is missing, or
    AttributeError when an expected tag is absent from the parsed HTML.
    """
    driver.get(url)
    driver.refresh()

    #Give the javascript time to render
    time.sleep(5)

    #Use driver.page_source to get the HTML as it appears after javascript has rendered it
    #NOTE(review): this snapshot is taken before "Click to Read More" is clicked,
    #so anything that only appears after the click is not in `soup`
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    #find_element(By..., ...) replaces the find_element_by_* helpers removed in Selenium 4.3
    driver.find_element(By.LINK_TEXT, "Click to Read More").click()
    time.sleep(5)

    brand_text = soup.find("div", class_="brand-name").text
    name_text = soup.find("span", itemprop="name").text.upper()
    capacity_text = soup.find(attrs={"data-th": 'Rom (GB)'}).text

    #colour is read from the live page: the first div whose class contains "selected"
    selected_swatch = driver.find_element(By.XPATH, '//div[contains(@class, "selected")]')
    colour_text = selected_swatch.get_attribute('option-label')

    #RRP is only recorded when the page shows an "old-price" span
    if soup.find("span", class_="old-price") is not None:
        rrp_text = soup.find("span", class_="price-inner").text.replace('HK$ ', '').replace(',', '')
    else:
        rrp_text = ''

    price_text = soup.find("span", class_="price").text.replace('HK$ ', '').replace(',', '')

    return brand_text, name_text, colour_text, capacity_text, rrp_text, price_text, url


#Visit every product page and fill the field lists.
#A product is appended to the lists only after ALL of its fields were read, so the
#lists always stay the same length and row-aligned. Previously a failure halfway
#through a page left some lists one entry longer, and zipping them later paired
#values that belonged to different products.
skipped_urls = []  #(url, error) for every page that could not be scraped

try:
    for product_link in products_urls:
        try:
            record = _scrape_product(driver, product_link)
        except Exception as exc:
            #skip the page, as before, but remember why instead of swallowing it silently;
            #KeyboardInterrupt is deliberately not caught so the run can still be aborted
            skipped_urls.append((product_link, repr(exc)))
            continue

        field_lists = (brand, product_name, color, capacity_spec, RRP, final_price, product_url)
        for field_list, value in zip(field_lists, record):
            field_list.append(value)
finally:
    #quit() ends the whole WebDriver session (close() only closes the current window),
    #and the finally clause runs it even when the loop is interrupted
    driver.quit()

print("Scraping is completed.")
if skipped_urls:
    print("Skipped " + str(len(skipped_urls)) + " product page(s); see skipped_urls for details.")

Scraping is completed.
Wall time: 22min 28s


In [11]:
#Combine the 7 populated field lists row by row (zip stops at the shortest list)
list_of_tuples = list(zip(brand, product_name, color, capacity_spec, RRP, final_price, product_url))

#Build the dataframe and sort it by Product Name ('A-Z') in one go
column_names = ['Brand', 'Product Name', 'Colour', 'Capacity', 'RRP', 'Selling Price', 'Product URL']
df = pd.DataFrame(list_of_tuples, columns=column_names).sort_values('Product Name')

#Put a constant 'Source' column first to mark "Broadway" as the data source
df.insert(0, 'Source', 'Broadway')

#Stamp every row with the current date for visualizing time-series data
df['Date'] = pd.to_datetime('today').date()

### Data Cleaning

In [12]:
#Remove everything from the first '(' to the last ')' in Product Name
#Example: 'HUAWEI MATE20 PRO (6GB/128GB) (BK)' => 'HUAWEI MATE20 PRO'
#regex=True / regex=False is spelled out on every call: pandas 2.0 changed the default
#of Series.str.replace to literal matching, which would silently disable the patterns
df['Product Name'] = df['Product Name'].str.replace(r"\(.*\)", "", regex=True)

#Literal clean-ups of Product Name, applied in this order.
#Order matters: 'MATE20' must become 'MATE 20' before 'MATE 20 X' is joined, and
#'FLIP  TA-1170' must be removed before the shorter 'TA-1170'
name_replacements = [
    ('SMARTPHONE', ''),
    ('VERSION', ''),
    ('GAMING', ''),
    ('CRYSTAL-WH/BL', ''),
    ('SUNRISE-OR', ''),
    ('FLIP  TA-1170', ''),
    ('MOBILE PHONE', ''),
    ('GALAXY', ''),
    ('MATE20', 'MATE 20'),
    ('MATE 20 X', 'MATE 20X'),
    ('NOTE', 'NOTE '),
    ('TA-1164 DS 3/32 HK', ''),
    ('TA-1196 DS 6/128 HK', ''),
    ('TA-1189', ''),
    ('TA-1170', ''),
]
for old_text, new_text in name_replacements:
    df['Product Name'] = df['Product Name'].str.replace(old_text, new_text, regex=False)

#Remove year (e.g. 2018, 2019)
year = ['2017','2018','2019','2020','2021','2022','2023','2024']
year_pattern = '|'.join(re.escape(s) for s in year)
df['Product Name'] = df['Product Name'].str.replace(year_pattern, '', regex=True)

#Trim all spaces from the text string except for single spaces between words:
#collapse whitespace runs left behind by the removals above, then strip both ends
#(previously only the ends were stripped, so doubled inner spaces survived)
df['Product Name'] = df['Product Name'].str.replace(r"\s+", " ", regex=True).str.strip()

#Colour: upper-case, keep only A-Z, 0-9, space and newline, then strip.
#The vectorised .str methods leave missing values as missing, whereas
#re.sub over the column raised TypeError on them
df['Colour'] = (
    df['Colour']
    .str.upper()
    .str.replace('[^A-Z0-9 \n]', '', regex=True)
    .str.strip()
)

#Capacity: remove all spaces and wrap in parentheses; an empty value stays empty
capacity_no_spaces = df['Capacity'].str.replace(' ', '', regex=False)
df['Capacity'] = ('(' + capacity_no_spaces + ')').str.replace('()', '', regex=False)

#To remove line breaks and surrounding whitespace for 'RRP' and 'Selling Price'
df['RRP'] = df['RRP'].str.replace('\n', '', regex=False).str.strip()
df['Selling Price'] = df['Selling Price'].str.replace('\n', '', regex=False).str.strip()

In [13]:
#Get list of unique brand
unique_brand = df['Brand'].unique().tolist()

#Problem found: in some cases, product name does not include brand (e.g.'GALAXY A8+')
#Solution: Align product name display (i.e. brand + name + capacity)
#1. remove brand name in product name first (brands are escaped so they match literally)
p = re.compile('|'.join(map(re.escape, unique_brand)))
df['Product Name'] = [p.sub('', text).strip() for text in df['Product Name']]

#2. concatenate Brand Name, product name and capacity to align product name format.
#   Whitespace is normalised afterwards: an empty Capacity used to leave a trailing
#   space, and an empty product name a double space, in the combined value
combined_name = df['Brand'].map(str) + ' ' + df['Product Name'].map(str) + ' ' + df['Capacity'].map(str)
df['Product Name'] = combined_name.str.replace(r"\s+", " ", regex=True).str.strip()

In [14]:
#Manual overrides for product names that need special handling (exact-match lookup)
dict1 = {
    "SAMSUNG A70 (128GB)": "SAMSUNG A70 (6GB)", #wrongly marked by Broadway
    "NOKIA 7.2 (128GB)": "NOKIA 7.2"
}

#Apply the overrides to the 'Product Name' column only; other columns are untouched
df['Product Name'] = df['Product Name'].replace(dict1)

In [15]:
#Render the Date column as 'YYYY-MM-DD' text.
#Going through pd.to_datetime keeps this cell re-runnable: calling .strftime on each
#value failed on a second run, because the column then already held strings
df['Date'] = pd.to_datetime(df['Date']).dt.strftime('%Y-%m-%d')

#Finalize dataframe column order
df = df[['Date', 'Source', 'Brand', 'Product Name', 'Colour', 'Capacity', 'RRP', 'Selling Price', 'Product URL']]

#Narrower view used for the aggregated price history
df1 = df[['Date', 'Source', 'Brand', 'Product Name', 'RRP', 'Selling Price']]

#Drop duplicates product and keep='first' to keep first of duplicates
#In case one product is selling at different prices for different colours, then only keep first of duplicates for each product name
#(i.e. given that most products are selling the same price for different colours)
cleaned_df1 = df1.drop_duplicates(subset=['Product Name'], keep='first')

In [16]:
# Write the full dataframe to a new, dated Excel file and append the de-duplicated
# rows to 'Aggregated_Broadway_headsets_offer.csv'
CurrentDate = time.strftime("%Y-%m-%d")

root = 'Broadway'

# Make sure the output folder exists; to_excel does not create missing directories
os.makedirs(root, exist_ok=True)

# The `encoding` keyword is no longer passed to to_excel: pandas documents it as
# unused, deprecated it in 1.5 and removed it in 2.0, where it raises TypeError
df.to_excel(root + '/' + CurrentDate + ' Broadway headsets offer' + '.xlsx', sheet_name='Broadway_headsets', index=False)

# Stack the DataFrames on top of 'Aggregated_Broadway_headsets_offer.csv'
# (append mode, no header row written)
cleaned_df1.to_csv('Aggregated_Broadway_headsets_offer.csv', index=False, header=False, mode='a')