### Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd
import time
import csv
import random

In [2]:
option = webdriver.FirefoxOptions()
option.add_argument('headless') # Reduce chance of getting identified

### Scraping world indices prices from Yahoo

In [3]:
browser = webdriver.Firefox()
url = 'https://finance.yahoo.com/quote/%5EGSPC/history?p=%5EGSPC'
browser.get(url)
time.sleep(5) # Let it load

In [4]:
tables = browser.find_element(By.TAG_NAME,'tbody')

elements = tables.find_elements(By.TAG_NAME, 'span') # span contains the data in each cell

### Storing the data

In [5]:
list1 = []
list2 = []
list3 = []
list4 = []
list5 = []
list6 = []
list7 = []

lists = [list1, list2, list3, list4, list5, list6, list7]  # List of all target lists
index = 0  # Starting point

for i, element in enumerate(elements):
    target_list = lists[index]  # When we are iterating the first element in our output, we will simultaneously select list1 because lists[0] is list1
    target_list.append(element.text)  # On the same iteration and with list1 selected, append the iterated element to list1

    # Update the index to cycle through the target lists
    index = (index + 1) % len(lists)

browser.quit()

# Printing out the list.
print("List 1:", list1[:5])
print("List 2:", list2[:5])
print("List 3:", list3[:5])
print("List 4:", list4[:5])
print("List 5:", list5[:5])
print("List 6:", list6[:5])
print("List 7:", list7[:5])

List 1: ['Jun 30, 2023', 'Jun 29, 2023', 'Jun 28, 2023', 'Jun 27, 2023', 'Jun 26, 2023']
List 2: ['4,422.44', '4,374.94', '4,367.48', '4,337.36', '4,344.84']
List 3: ['4,458.48', '4,398.39', '4,390.35', '4,384.42', '4,362.06']
List 4: ['4,422.44', '4,371.97', '4,360.22', '4,335.00', '4,328.08']
List 5: ['4,450.38', '4,396.44', '4,376.86', '4,378.41', '4,328.82']
List 6: ['4,450.38', '4,396.44', '4,376.86', '4,378.41', '4,328.82']
List 7: ['3,923,450,000', '3,696,660,000', '3,739,330,000', '3,573,500,000', '3,415,030,000']


### Format as a dataframe

In [6]:
data = {
    'date': list1,
    'open': list2,
    'high': list3,
    'low': list4,
    'close': list5,
    'adj_close': list6,
    'vol': list7
}

df = pd.DataFrame(data)

df

Unnamed: 0,date,open,high,low,close,adj_close,vol
0,"Jun 30, 2023",4422.44,4458.48,4422.44,4450.38,4450.38,3923450000
1,"Jun 29, 2023",4374.94,4398.39,4371.97,4396.44,4396.44,3696660000
2,"Jun 28, 2023",4367.48,4390.35,4360.22,4376.86,4376.86,3739330000
3,"Jun 27, 2023",4337.36,4384.42,4335.00,4378.41,4378.41,3573500000
4,"Jun 26, 2023",4344.84,4362.06,4328.08,4328.82,4328.82,3415030000
...,...,...,...,...,...,...,...
95,"Feb 13, 2023",4096.62,4138.90,4092.67,4137.29,4137.29,3448620000
96,"Feb 10, 2023",4068.92,4094.36,4060.79,4090.46,4090.46,3891520000
97,"Feb 09, 2023",4144.25,4156.23,4069.67,4081.50,4081.50,4270200000
98,"Feb 08, 2023",4153.47,4156.85,4111.67,4117.86,4117.86,4029820000


##### We can see that the static code only extracts 100 lines of data from the site, however if we access the browser and scroll manually the data spans across one year. 

##### Dynamic website promotes better site loading speed but limits our result if we scrape it statically. We will re-scrape Yahoo by automating a scrolling action using selenium javascript module.

In [8]:
browser = webdriver.Firefox()

url_list = [
    'https://finance.yahoo.com/quote/%5EGSPC/history?p=%5EGSPC',
    'https://finance.yahoo.com/quote/%5EDJI/history?p=%5EDJI',
    'https://finance.yahoo.com/quote/%5EIXIC/history?p=%5EIXIC',
    'https://finance.yahoo.com/quote/%5ENYA/history?p=%5ENYA',
    'https://finance.yahoo.com/quote/%5ERUT/history?p=%5ERUT',
    'https://finance.yahoo.com/quote/%5EFTSE/history?p=%5EFTSE',
    'https://finance.yahoo.com/quote/%5EGDAXI/history?p=%5EGDAXI',
    'https://finance.yahoo.com/quote/%5EFCHI/history?p=%5EFCHI',
    'https://finance.yahoo.com/quote/%5ESTOXX50E/history?p=%5ESTOXX50E',
    'https://finance.yahoo.com/quote/%5EN225/history?p=%5EN225',
    'https://finance.yahoo.com/quote/%5EHSI/history?p=%5EHSI',
    'https://finance.yahoo.com/quote/%5ESTI/history?p=%5ESTI',
    'https://finance.yahoo.com/quote/%5EKLSE/history?p=%5EKLSE'
]

for url in url_list:
    load_time = random.uniform(3, 5)
    wait_time = random.uniform(1, 3)
    # Mimic a random behavior to prevent alerting anti-scraping software on the site.

    filename = url.split('p=%5E')[1] + '.csv' # CSV name for each ticker

    browser.get(url)
    time.sleep(load_time)

    previous_height = browser.execute_script("return document.documentElement.scrollHeight;") # Get initial height

    while True:
        browser.execute_script("window.scrollBy(0, document.documentElement.scrollHeight);") # Simulate scrolling, scroll to bottom to trigger dynamic loading

        time.sleep(wait_time) # wait a moment before each scroll

        current_height = browser.execute_script("return document.documentElement.scrollHeight;") # Get height after scroll

        if current_height == previous_height: # If height values are different, page is dynamic, continue loop. Otherwise break the loop
            break

        previous_height = current_height # Update the height to start a new loop

    tables = browser.find_element(By.TAG_NAME,'tbody') 
    elements = tables.find_elements(By.TAG_NAME, 'span')

    list1 = []
    list2 = []
    list3 = []
    list4 = []
    list5 = []
    list6 = []
    list7 = []

    lists = [list1, list2, list3, list4, list5, list6, list7]
    index = 0 

    for i, element in enumerate(elements):
        target_list = lists[index] 
        target_list.append(element.text) 
        index = (index + 1) % len(lists)

    with open (filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['date', 'open', 'high', 'low', 'close', 'adj_close', 'vol'])
        writer.writerows(zip(list1, list2, list3, list4, list5, list6, list7))

    print(f"Scraped data for index: {url}")
    print(f"Data saved to: {filename}")

browser.quit()

Scraped data for index: https://finance.yahoo.com/quote/%5EGSPC/history?p=%5EGSPC
Data saved to: GSPC.csv
Scraped data for index: https://finance.yahoo.com/quote/%5EDJI/history?p=%5EDJI
Data saved to: DJI.csv
Scraped data for index: https://finance.yahoo.com/quote/%5EIXIC/history?p=%5EIXIC
Data saved to: IXIC.csv
Scraped data for index: https://finance.yahoo.com/quote/%5ENYA/history?p=%5ENYA
Data saved to: NYA.csv
Scraped data for index: https://finance.yahoo.com/quote/%5ERUT/history?p=%5ERUT
Data saved to: RUT.csv
Scraped data for index: https://finance.yahoo.com/quote/%5EFTSE/history?p=%5EFTSE
Data saved to: FTSE.csv
Scraped data for index: https://finance.yahoo.com/quote/%5EGDAXI/history?p=%5EGDAXI
Data saved to: GDAXI.csv
Scraped data for index: https://finance.yahoo.com/quote/%5EFCHI/history?p=%5EFCHI
Data saved to: FCHI.csv
Scraped data for index: https://finance.yahoo.com/quote/%5ESTOXX50E/history?p=%5ESTOXX50E
Data saved to: STOXX50E.csv
Scraped data for index: https://finance.

### Scraping Shopee for 'Trail Running Shoes' keyword.

In [9]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pickle

##### Shopee has a verification process when it detects a new browser instance, this step has to be done manually.

##### After manual verification we will store the cookies and load it to bypass the verification step for subsequent scrape.

##### This step only has to be done ONCE

In [15]:
browser = webdriver.Firefox()

username = '' # insert your own username
password = '' # insert your own password

browser.get('https://shopee.com.my/buyer/login?next=https%3A%2F%2Fshopee.com.my%2F') # Login page

lang = WebDriverWait(browser, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//button[text()='English']"))
)
lang.click() # Click language popup window

browser.find_element(By.CLASS_NAME,'pDzPRp').send_keys(username)

time.sleep(1)
browser.find_element(By.CSS_SELECTOR, '.vkgBkQ > div:nth-child(1) > input:nth-child(1)').send_keys(password)

time.sleep(1)
browser.find_element(By.CSS_SELECTOR, '.wyhvVD').click()

# Manually verify from here on.

##### Run the below code cell once manually verified and ended at the landing page.

In [16]:
pickle.dump(browser.get_cookies(), open("cookies.pkl", "wb")) # Store the cookies

browser.quit()

### Staring a new scraping instance for 'Trail Running Shoes'

##### For subsequent scraping we only have to run the code below since we have cookies stored.

In [18]:
url = 'https://shopee.com.my/search?keyword=trail%20running%20shoes&page=' # replace with own link to interested search keyword 
browser = webdriver.Firefox()
browser.get(url)
cookies = pickle.load(open('cookies.pkl', 'rb')) # This bypasses future verification

for cookie in cookies:
    browser.add_cookie(cookie)

# Elements I'm interested to scrape
description_list = []
price_list = []
unit_sold_list = []
location_list = []
links_list = []

for x in range(0, 10): # I want to scrape 10 pages
    load_time = random.uniform(3, 5)
    wait_time = random.uniform(1, 3)

    page_num = url + str(x)
    browser.get(page_num)
    time.sleep(load_time)

    previous_height = browser.execute_script("return document.documentElement.scrollHeight;")

    while True:
        browser.execute_script("window.scrollBy(0, 1200);")

        time.sleep(wait_time)

        current_height = browser.execute_script("return document.documentElement.scrollHeight;")

        if current_height == previous_height:
            break

        previous_height = current_height

    wait = WebDriverWait(browser, 10) # Wait a max of 10 seconds
    wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'shopee-search-item-result__items'))) # Ensures the entire page has loaded with the elements I want
    full_list = browser.find_elements(By.CLASS_NAME, 'shopee-search-item-result__items')

    for i in full_list:
        description = i.find_elements(By.CLASS_NAME, 'Cve6sh')
        description_text = [desc.text for desc in description]
        description_list.extend(description_text)
        
        price = i.find_elements(By.CLASS_NAME, 'rVLWG6')
        price_text = [pric.text.split()[0] for pric in price]
        price_list.extend(price_text)

        unit_sold = i.find_elements(By.CLASS_NAME, 'r6HknA')
        unit_sold_text = [unit.text for unit in unit_sold]
        unit_sold_list.extend(unit_sold_text)
        
        location = i.find_elements(By.CLASS_NAME, "zGGwiV")
        location_text = [loc.text for loc in location]
        location_list.extend(location_text)

        links = i.find_elements(By.CSS_SELECTOR, ".shopee-search-item-result__item [href]")
        links_text = [link.get_attribute('href') for link in links]
        links_list.extend(links_text)

        print(f'Page {x} scraped and appended')

browser.quit()

df = pd.DataFrame({'description':description_list, 'price': price_list, 'unit_sold':unit_sold_list, 'location':location_list, 'url':links_list})

df['price'] = df['price'].str.replace(',', '').str.split('RM').str[1].astype(float) # Remove any formatting and extract numbers only
df['price'] = df['price'].fillna('')

df['unit_sold'] = df['unit_sold'].str.split().str[0].str.replace('k', '000').str.replace('.', '').astype(float) # Remove any formatting and extract numbers only
df['unit_sold'] = df['unit_sold'].fillna('')

df.to_csv('shopee_trail_shoes.csv', index=True)

print('Data stored in CSV: shopee_trail_shoes.csv')

Page 0 scraped and appended
Page 1 scraped and appended
Page 2 scraped and appended
Page 3 scraped and appended
Page 4 scraped and appended
Page 5 scraped and appended
Page 6 scraped and appended
Page 7 scraped and appended
Page 8 scraped and appended
Page 9 scraped and appended
Data stored in CSV: shopee_trail_shoes.csv


  df['unit_sold'] = df['unit_sold'].str.split().str[0].str.replace('k', '000').str.replace('.', '').astype(float) # Remove any formatting and extract numbers only
