# Web scraping with BeautifulSoup and Selenium

### Prerequisites:

#### Python 3.6
#### chromedriver <-- the driver executable that lets Selenium launch and control Chrome when running automated scripts

In [1]:
#Prerequisites

import requests  #for handling HTTP requests
from bs4 import BeautifulSoup  #for parsing HTML
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait #for making WebDriver wait for an element to meet expected condition
from selenium.webdriver.support import expected_conditions as EC
from openpyxl import load_workbook #for writing output to excel file
import random
import time
import pandas as pd
import numpy as np
import re

In [2]:
class urls_scraping():
    """Harvest product URLs from a single listing page.

    The class works on three module-level globals that are defined outside
    of it: ``driver`` (the Selenium WebDriver session), ``base_url`` (prefix
    added to every scraped link) and ``products_urls`` (shared result list).
    """

    #Scrolls to the bottom of the page and returns the document height
    _SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

    def __init__(self, url):
        """Store the listing page ``url`` that this instance will scrape."""
        self.url = url

    def fetch_urls(self):
        """Open the listing page, expand it fully and collect its product URLs.

        The URLs found are merged into the global ``products_urls`` list,
        which is left sorted and free of duplicates. Returns ``None``.
        """
        self._page_scroll_down()
        self._urls_scrap()

    def _page_scroll_down(self):
        """Load ``self.url`` and scroll until every product is displayed.

        Keeps scrolling and clicking "View More Products" until the button
        can no longer be found and the page height has stopped growing.
        """
        driver.get(self.url)

        #Simulate pressing 'Esc' to jump out of a pop-up window (if any).
        #This is done after the page is loaded; sent before navigation it would target the previous page.
        webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()

        #Scroll down the page to the bottom
        page_height = driver.execute_script(self._SCROLL_TO_BOTTOM_JS)

        while True:
            previous_height = page_height
            time.sleep(3)  #pause before re-measuring the page height
            page_height = driver.execute_script(self._SCROLL_TO_BOTTOM_JS)

            try:
                driver.find_element_by_link_text("View More Products").click()
            except NoSuchElementException:
                #No button left: stop once scrolling no longer extends the page
                if previous_height == page_height:
                    break

    def _urls_scrap(self):
        """Gather all product URLs on the loaded page into ``products_urls``."""
        global products_urls

        containers = BeautifulSoup(driver.page_source, 'html.parser')

        for tile in containers.find_all("div", class_ = "tile-inner"):
            anchor = tile.a
            href = anchor.get('href') if anchor is not None else None
            #Skip tiles that carry no link instead of crashing on them
            if href:
                products_urls.append(base_url + href)

        #Sorted, de-duplicated list of plain Python strings
        products_urls = sorted(set(products_urls))

In [3]:
#Listing pages to scrape: the "View all Smartphones" listing and the iPhone page
url1 = 'https://shop.smartone.com/en/storefront/handset/listing/View-all-Smartphones/0/'
url2 = 'https://shop.smartone.com/en/storefront/iphone.jsp'

#Site root that is prepended to the links scraped from the listing pages
base_url = "https://shop.smartone.com"

#Shared accumulator for every product URL that gets discovered
products_urls = list()

def main():
    """Start a Chrome session and collect product URLs from both listing pages.

    The WebDriver is stored in the module-level ``driver`` variable so that
    code outside this function can keep using the same browser session.
    """
    global driver
    driver = webdriver.Chrome()

    #Implicit wait: element lookups keep retrying for up to 5 seconds before failing
    driver.implicitly_wait(5)

    for listing_url in (url1, url2):
        urls_scraping(listing_url).fetch_urls()


if __name__ == "__main__":
    main()

In [4]:
#Keep only links that point at product pages; every other URL is discarded
product_prefix = 'https://shop.smartone.com/en/storefront/mobile'
products_urls = list(filter(lambda link: link.startswith(product_prefix), products_urls))

print (f"We finally fetched {len(products_urls)} products URLs.")

We finally fetched 37 products URLs.


In [5]:
#One list per output field; the lists are filled in parallel, one entry per scraped product variant
brand: list = []
product_name: list = []
color: list = []          #colour option (prices may differ between colours)
capacity_spec: list = []  #capacity option (prices may differ between capacities)
RRP: list = []            #recommended retail price
selling_price: list = []
stock_status: list = []
product_url: list = []

In [6]:
%%time

def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the lookup found nothing."""
    return tag.get_text() if tag is not None else ''


def _clean_price(text):
    """Drop the 'HK$' currency prefix and the thousands separators from a price string."""
    return text.replace('HK$', '').replace(',', '')


#Seconds to pause after picking an option so that the page can re-render before its HTML is read
render_wait_seconds = 6

try:
    for url in products_urls:
        driver.get(url)

        #find colour option(s)
        #NOTE: a page without colour swatches or capacity options yields no rows at all
        colour_swatches = driver.find_elements_by_xpath('//div[contains(@class, "st-color-chooser")]/a[contains(@class, "color-swatch")]')

        for swatch in colour_swatches:
            driver.execute_script("arguments[0].click()", swatch)

            #find capacity option(s) offered once this colour is selected
            capacity_options = driver.find_elements_by_xpath('//span[contains(@class, "st-model-wrapper st-box-shadow")]/a[starts-with(@id,"model")]')

            for option in capacity_options:
                driver.execute_script("arguments[0].click()", option)
                time.sleep(render_wait_seconds)

                #Parse the HTML as it appears after javascript has rendered it
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                #Read every field first and append afterwards, so that a failure part-way
                #through a variant cannot leave the eight lists with different lengths
                brand_text = _tag_text(soup.find("h3", itemprop = "brand")).upper()
                name_text = _tag_text(soup.find("h1", itemprop = "name")).upper()
                colour_text = swatch.get_attribute('title')
                capacity_text = option.get_attribute('title')

                #stock status: an empty (or missing) status element is recorded as 'in stock'
                status_text = _tag_text(soup.find('div', class_ = re.compile('^st-stock-status .*')))
                if status_text == '':
                    status_text = 'in stock'

                #selling (final) price
                price_text = _clean_price(_tag_text(soup.find("span", class_ = "st-price st-hs-view-only")))

                #RRP: fall back to the selling price when no original price is shown
                rrp_text = _clean_price(_tag_text(soup.find("span", class_ = "st-ori-price")))
                if rrp_text == '':
                    rrp_text = price_text

                brand.append(brand_text)
                product_name.append(name_text)
                color.append(colour_text)
                capacity_spec.append(capacity_text)
                RRP.append(rrp_text)
                selling_price.append(price_text)
                stock_status.append(status_text)
                product_url.append(url)
finally:
    #quit() ends the whole browser session (close() only closes the window) and runs even if scraping failed
    driver.quit()

print ("Scraping is completed.")

Scraping is completed.
Wall time: 22min 49s


In [7]:
#Combine the eight parallel lists into one row per product variant and build the dataframe
column_names = ['Brand', 'Product Name', 'Colour', 'Capacity', 'RRP', 'Selling Price', 'Stock Status', 'Product URL']
list_of_tuples = list(zip(brand, product_name, color, capacity_spec, RRP, selling_price, stock_status, product_url))
df = pd.DataFrame(list_of_tuples, columns=column_names)

#Tag every row with "SmarTone" as the data source (first column)
df.insert(loc=0, column='Source', value='SmarTone')

#Stamp every row with the current date so the data can be used as a time series
today = pd.to_datetime('today').date()
df['Date'] = today

In [8]:
#Product Name: drop the ' RAM' suffix and the word 'GALAXY', and put a space after 'NOTE'
#(literal replacements, stated explicitly because the pandas default for `regex` differs between versions)
df['Product Name'] = df['Product Name'].str.replace(' RAM', '', regex=False).str.strip()
df['Product Name'] = df['Product Name'].str.replace('GALAXY', '', regex=False).str.strip()
df['Product Name'] = df['Product Name'].str.replace('NOTE', 'NOTE ', regex=False).str.strip()

#Remove year (e.g. 2018, 2019)
#The alternation only works as a regular expression, so regex=True must be requested explicitly
year = ['2017','2018','2019','2020','2021','2022','2023','2024']
year_pattern = '|'.join(re.escape(s) for s in year)
df['Product Name'] = df['Product Name'].str.replace(year_pattern, '', regex=True).str.strip()

#Colour: convert to upper case and remove every character that is not A-Z, 0-9, a space or a newline
#(vectorised, so a missing colour stays missing instead of raising)
df['Colour'] = df['Colour'].str.upper().str.replace('[^A-Z0-9 \n]', '', regex=True).str.strip()

#Capacity: remove all spaces and wrap the value in parentheses; an empty or missing capacity becomes ''
capacity_text = df['Capacity'].fillna('').str.replace(' ', '', regex=False).str.strip()
df['Capacity'] = ('(' + capacity_text + ')').str.replace('()', '', regex=False)

In [9]:
#Brand XIAOMI => MI
df['Brand'] = df['Brand'].str.replace('XIAOMI','MI')

#Missing (brand) data imputation
#Products whose name contains IPHONE get the brand 'APPLE'
#(the brand is not found on the IPHONE landing product page)
is_iphone = df['Product Name'].str.contains('IPHONE')
df.loc[is_iphone, 'Brand'] = 'APPLE'

#Distinct brands, in order of first appearance
unique_brand = df['Brand'].unique().tolist()

#Problem found: in some cases, product name does not include brand (e.g.'GALAXY NOTE9')
#Solution: align the product name format to "brand + name"
#1. strip every known brand from the product name
#NOTE(review): the pattern matches brands as plain substrings, so a short brand such as 'MI'
#is also removed from inside longer words - confirm this is intended before changing it
p = re.compile('|'.join(re.escape(b) for b in unique_brand))
df['Product Name'] = df['Product Name'].map(lambda text: p.sub('', text).strip())

#2. prefix the product name with the row's own brand
df['Product Name'] = df['Brand'].astype(str) + ' ' + df['Product Name'].astype(str)

In [10]:
#Finally, naming convention of Product Name = Product Name + Capacity
#Computed once for the whole column (no per-row loop needed), so it also works on an empty dataframe
full_name = df['Product Name'].str.cat(df['Capacity'], sep=" ").str.strip()

#Replace the old column; the rebuilt 'Product Name' ends up as the last column
df = df.drop('Product Name', axis=1)
df['Product Name'] = full_name

In [11]:
#Collapse any run of whitespace inside the product name into a single space
df['Product Name'] = df['Product Name'].map(lambda name: ' '.join(name.split()))

#Special-case renames for product names (to align naming across other sources)
dict1 = {
    "NOKIA 7.2 (128GB)": "NOKIA 7.2",
    "SAMSUNG A70 (6GB) (128GB)": "SAMSUNG A70 (6GB)",
    "SAMSUNG A70 (8GB) (128GB)": "SAMSUNG A70 (8GB)",
}

#Apply the renames to the 'Product Name' column only (whole-value matches)
df['Product Name'] = df['Product Name'].replace(dict1)

#Brand 'SONY XPERIA' => 'SONY'
sony_fixed = df['Brand'].str.replace('SONY XPERIA','SONY')
df['Brand'] = sony_fixed

In [12]:
#Final column order of the full dataframe
final_columns = ['Date', 'Source', 'Brand', 'Product Name', 'Colour', 'Capacity', 'RRP', 'Selling Price', 'Stock Status', 'Product URL']
df = df.loc[:, final_columns]

#Reduced view holding only the pricing columns
price_columns = ['Date', 'Source', 'Brand', 'Product Name', 'RRP', 'Selling Price']
df1 = df.loc[:, price_columns]

#Keep only the first row for each product name.
#If one product sells at different prices for different colours, only the first variant survives
#(i.e. given that most products are selling the same price for different colours)
is_repeat = df1.duplicated(subset=['Product Name'], keep='first')
cleaned_df1 = df1[~is_repeat]

In [13]:
import os

#Write the full dataframe to a new, dated Excel file inside the 'SmarTone' folder
CurrentDate = time.strftime("%Y-%m-%d")

root = 'SmarTone'

#Create the output folder on first use so that the export does not fail when it is missing
os.makedirs(root, exist_ok=True)

#No `encoding` argument: pandas 2.0 removed it from to_excel
df.to_excel(root + '/' + CurrentDate + ' SmarTone handsets offer' + '.xlsx', sheet_name = 'SmarTone_handsets', index = False)

#Optional (disabled): append the de-duplicated price rows to the aggregated SmarTone CSV
#cleaned_df1.to_csv('Aggregated_SmarTone_headsets_offer.csv', index = False, header = False, mode='a')