# Web scraping with BeautifulSoup and Selenium

### Prerequisites:

#### Python 3.6
#### ChromeDriver <-- the WebDriver executable that Selenium uses to launch and control Chrome when running automated scripts

In [1]:
#Prerequisites

import os
import re
import time

import numpy as np
import pandas as pd
import requests  #for handling HTTP requests
from bs4 import BeautifulSoup  #for parsing HTML
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

In [2]:
#Root of the shop website; product links found on the listing page are appended to it
base_url = "https://www.fortress.com.hk"
#Smartphone category listing page, the starting point of the scrape
url = "https://www.fortress.com.hk/en/shop/mobile-and-communications/smartphones/c/5"

In [3]:
#Create a new headless Chrome session (no browser window is shown)
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options = chrome_options)

#Implicit wait: when an element is not immediately available, WebDriver keeps polling
#for up to 5 seconds before giving up. The setting applies to the whole session,
#so it only needs to be configured once.
driver.implicitly_wait(5)

#Direct the driver to the URL we want to scrape
driver.get(url)

In [4]:
#After opening the url above, keep clicking the "View More" link so that the page
#renders more products. The loop ends when the link can no longer be found.
#find_element(By.LINK_TEXT, ...) is used instead of find_element_by_link_text because the
#find_element_by_* helpers were removed in Selenium 4.3, while this form works on old and new versions.
while True:
    try:
        view_button = driver.find_element(By.LINK_TEXT, "View More")
    except NoSuchElementException:
        #Expected way out: the link is gone, so nothing is left to expand
        print ("There's no more products at the bottom of this page.")
        break

    try:
        view_button.click()
    except Exception as exc:
        #Still best-effort (scrape whatever has been loaded), but report the real reason
        #instead of claiming that the product list is complete
        print ("Stopped expanding the product list because clicking 'View More' failed: " + repr(exc))
        break

    #Give the newly requested products time to load before looking for the link again
    time.sleep(3)

There's no more products at the bottom of this page.


#### To build and maintain an efficient automated web scraper, we first need to understand the structure of the website, e.g. its HTML source code, XPath expressions, etc.
#### (P.S. press F12 in the browser to inspect an element)

In [5]:
#Parse the listing page as currently rendered in the browser and pick out the product anchors
headsets_containers = BeautifulSoup(driver.page_source, 'html.parser')
product_anchors = headsets_containers.find_all('a', class_ = "thumb gtmProductImpressions")

#Keep only anchors that contain text, turn their href into an absolute URL,
#and let np.unique drop the duplicates
products_urls = list(np.unique([base_url + anchor['href'] for anchor in product_anchors if anchor.text]))

fetched_count = len(products_urls)
print("We finally fetched " + str(fetched_count) + " products URLs.")

We finally fetched 94 products URLs.


In [6]:
#preview first 5 product urls
#products_urls[:5]

In [7]:
#One empty list per product attribute; the scraping loop appends one entry per product variant.
#color and capacity_spec are kept separately because different colour/capacity options
#may result in different prices.
brand, product_name, color, capacity_spec = [], [], [], []

#RRP = recommended retail price; selling_price = final selling price
RRP, selling_price = [], []

#free_gift is declared here, but the code that would fill it is commented out in the scraping loop
free_gift, product_url = [], []

In [8]:
%%time

for i in products_urls:
    driver.get(i)
         
    try:
        #find color option(s)
        colour = driver.find_elements_by_xpath('//input[contains(@name, "Colour")]')
   
        for x in colour:
            driver.execute_script("arguments[0].click()", x)
            #Give the javascript time to render
            time.sleep(3)

            #find capacity option(s)
            capacity = driver.find_elements_by_xpath('//input[contains(@name, "Capacity")]')
        
            for elem in capacity:
                if elem.is_enabled():
                    driver.execute_script("arguments[0].click()", elem)
                    time.sleep(3)
                
                    #Use driver.page_source to get the HTML as it appears after javascript has rendered it
                    page_source = driver.page_source
                
                    #Use a parser on the returned HTML
                    soup = BeautifulSoup(page_source, 'html.parser')
                    time.sleep(3)
                
                    #scrap capacity
                    capacity_spec.append(elem.get_attribute('value')) 
            
                    #scrap brand
                    brand.append(soup.find("span", itemprop = "brand").text.upper())
                
                    #scrap product name
                    product_name.append(soup.find("h1", class_ = "h2 name").text.upper())
                
                    #scrap colour
                    color.append(x.get_attribute('value'))
                
                    #refresh the parser
                    page_source = driver.page_source
                    soup = BeautifulSoup(page_source, 'html.parser')
                
                    #scrap RRP
                    RRP0 = soup.find("div", itemprop = "offers").text.replace('RRP: ','').replace('HK$','').replace(',','')
                    RRP.append(re.findall('\d+|$', RRP0)[0])
                
                    #scrap free gift titles(s)
                    #free_gift.append(soup.find("div", class_ = "gifts").text.strip().replace("\n"," ").replace("       ",'// '))
                
                    #scrap product_url
                    product_url.append(i)
                
                    #scrap special price (final selling price)
                    if soup.find("span", itemprop = "offers price") is not None:
                        selling_price.append(soup.find("span", itemprop = "offers price").text.replace('HK$','').replace(',',''))
                    else:
                        selling_price.append(RRP0)
        
            #alternative handling if capacity option is not found in the page                
            if len(capacity) == 0:
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                time.sleep(3)
                
                #scrap capacity
                capacity_spec.append('')
            
                #scrap brand
                brand.append(soup.find("span", itemprop = "brand").text.upper())
            
                #scrap product name
                product_name.append(soup.find("h1", class_ = "h2 name").text.upper())
                
                #scrap colour
                color.append(x.get_attribute('value'))
            
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
            
                #scrap RRP
                RRP0 = soup.find("div", itemprop = "offers").text.replace('RRP: ','').replace('HK$','').replace(',','')
                RRP.append(re.findall('\d+|$', RRP0)[0])
                
                #scrap free gift title(s)            
                #free_gift.append(soup.find("div", class_ = "gifts").text.strip().replace("\n"," ").replace("       ",'// '))
            
                #scrap product url
                product_url.append(i)
            
                #scrap special price (final selling price)
                if soup.find("span", itemprop = "offers price") is not None:
                    selling_price.append(soup.find("span", itemprop = "offers price").text.replace('HK$','').replace(',',''))
                else:
                    selling_price.append(RRP0)
                
    except:
        continue

driver.close()
print ("Scraping is completed.")

Scraping is completed.
Wall time: 48min 5s


In [9]:
#Pair each output column with the list scraped for it (7 lists; free_gift is not included)
column_sources = [
    ('Brand', brand),
    ('Product Name', product_name),
    ('Colour', color),
    ('Capacity', capacity_spec),
    ('RRP', RRP),
    ('Selling Price', selling_price),
    ('Product URL', product_url),
]
column_names = [column_name for column_name, _ in column_sources]

#zip() turns the parallel lists into one tuple per row
list_of_tuples = list(zip(*[values for _, values in column_sources]))
df = pd.DataFrame(list_of_tuples, columns = column_names)

#Indicate "Fortress" as data source (first column)
df.insert(0, 'Source', 'Fortress')

#Populate current date in new column for visualizing time-series data
df['Date'] = pd.to_datetime('today').date()

In [10]:
#Clean 'Product Name': remove all parentheses and stuff within, then drop filler words
#Example: 'HUAWEI MATE20 PRO (6GB/128GB) (BK)' => 'HUAWEI MATE20 PRO'
#regex= is always passed explicitly because the default of Series.str.replace differs
#between pandas versions; relying on it would silently disable the patterns below.
names = df['Product Name'].str.replace(r"\(.*\)", "", regex=True).str.strip()

#' - HONG KONG' has to be removed before the hyphens are stripped, otherwise it can never match
literal_replacements = [
    (' - HONG KONG', ''),
    ('-', ''),
    ('SMARTPHONE', ''),
    ('VERSION', ''),
    ('GAMING', ''),
    ('MOBILE PHONE', ''),
    ('GALAXY', ''),
    ('VIEW20', 'VIEW 20'),
]
for old_text, new_text in literal_replacements:
    names = names.str.replace(old_text, new_text, regex=False).str.strip()

df['Product Name'] = names

#Create dictionary for special handling of (Nokia) product name
dict1 = {
    "NOKIA 8.1 128GB": "NOKIA 8.1 (128GB)",
    "NOKIA 8.1": "NOKIA 8.1 (64GB)",
    "NOKIA 3.2": "NOKIA 3.2 (32GB)",
    "NOKIA 4.2": "NOKIA 4.2 (32GB)",
    "NOKIA 5.1 PLUS": "NOKIA 5.1 PLUS (32GB)",
    "NOKIA NOKIA 9 PUREVIEW": "NOKIA 9 PUREVIEW (128GB)",
    "NOKIA 2720 FLIP": "NOKIA 2720",
    "NOKIA 7.2": "NOKIA 7.2 (128GB)",
    "SONY XPERIA 5": "SONY XPERIA 5 (128GB)"
}

# Remap the values of the dataframe (whole-value matches only)
df.replace({'Product Name': dict1}, inplace=True)

#Remove year (e.g. 2018, 2019)
year = ['2017','2018','2019','2020','2021','2022','2023','2024']
year_pattern = '|'.join(re.escape(s) for s in year)
df['Product Name'] = df['Product Name'].str.replace(year_pattern, '', regex=True)
df['Product Name'] = df['Product Name'].str.strip()

#Clean 'Colour': upper case, keep only A-Z, digits, spaces and newlines, fix a known typo.
#The vectorised .str.replace leaves missing values untouched instead of raising like re.sub would.
df['Colour'] = df['Colour'].str.upper()
df['Colour'] = df['Colour'].str.replace(r'[^A-Z0-9 \n]', '', regex=True)
df['Colour'] = df['Colour'].str.replace('SLIVER', 'SILVER', regex=False)
df['Colour'] = df['Colour'].str.strip()

#To remove all whitespace and add parenthesis for 'Capacity' (an empty capacity stays empty)
df['Capacity'] = df['Capacity'].str.replace(' ', '', regex=False).str.strip()
df['Capacity'] = [('(' + item + ')').replace('()','') for item in df['Capacity']]

In [11]:
#Collect every distinct brand found in the data
unique_brand = df['Brand'].unique().tolist()

#Problem found: in some cases, product name does not include brand (e.g.'GALAXY A8+')
#Solution: give every product name the same layout, i.e. "<brand> <name>"

#1. one pattern matching any of the brands, used to cut the brand out of the product name
p = re.compile('|'.join(re.escape(brand_text) for brand_text in unique_brand))
names_without_brand = df['Product Name'].map(lambda text: p.sub('', text).strip())
df['Product Name'] = names_without_brand

#2. put the row's own brand back in front of the brand-free name
brand_as_text = df['Brand'].map(str)
name_as_text = df['Product Name'].map(str)
df['Product Name'] = brand_as_text + ' ' + name_as_text

In [12]:
#Finally, rename convention of Product Name = Product Name + Capacity
#(EXCEPT product names containing 'NOKIA', which are kept unchanged for easier mapping)
exceptional_brand = 'NOKIA'

#Decide row by row. Assigning the whole column inside a loop would append the capacity
#to the exceptional brand as well, and would create no 'Name' column at all when every
#row belongs to the exceptional brand.
df['Name'] = [
    name if exceptional_brand in name else (name + " " + capacity_text).strip()
    for name, capacity_text in zip(df['Product Name'], df['Capacity'])
]

#Replace the old 'Product Name' column with the newly built one
df.drop('Product Name', axis=1, inplace=True)
df = df.rename({'Name': 'Product Name'}, axis=1)

In [13]:
#Special handling of Samsung product names: whole-name corrections, old name -> new name
dict2 = {
    "SAMSUNG A60 (6GB)": "SAMSUNG A60 (128GB)",   #wrongly marked by Fortress
    "SAMSUNG A80 (8GB)": "SAMSUNG A80 (128GB)",   #wrongly marked by Fortress
    "SAMSUNG A9": "SAMSUNG A9 (128GB)",
}

#Apply the corrections to the 'Product Name' column only
df['Product Name'] = df['Product Name'].replace(dict2)

In [14]:
#Final column order of the detailed dataframe
final_columns = ['Date', 'Source', 'Brand', 'Product Name', 'Colour', 'Capacity', 'RRP', 'Selling Price', 'Product URL']
df = df.loc[:, final_columns]

#Price summary: the same rows without the colour, capacity and URL columns
df1 = df.drop(columns=['Colour', 'Capacity', 'Product URL'])

#Keep one row per product name (the first one encountered).
#If a product sells at different prices for different colours, only the first price is kept
#(i.e. given that most products are selling the same price for different colours)
cleaned_df1 = df1.drop_duplicates(subset=['Product Name'], keep='first')

In [15]:
# Write the detailed dataframe to a new, dated Excel file and append the price summary
# to 'Aggregated_Fortress_headsets_offer.csv'
CurrentDate = time.strftime("%Y-%m-%d")

root = 'Fortress'

# Create the output folder on first use so that to_excel does not fail on a missing directory
os.makedirs(root, exist_ok=True)

# No encoding argument: DataFrame.to_excel no longer accepts one in pandas 2.x
# (passing it raises TypeError there)
excel_path = os.path.join(root, CurrentDate + ' Fortress headsets offer' + '.xlsx')
df.to_excel(excel_path, sheet_name = 'Fortress_headsets', index = False)

# Stack the rows on top of 'Aggregated_Fortress_headsets_offer.csv':
# mode='a' appends, and header=False keeps the column names from being repeated on every run
cleaned_df1.to_csv('Aggregated_Fortress_headsets_offer.csv', index = False, header = False, mode='a', encoding='utf-8-sig')