# Some scraping practices I did with Selenium and BeautifulSoup

## Standard imports and notation (Selenium)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [None]:
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# such as "\P" and "\c".
PATH = r'C:\Program Files (x86)\chromedriver.exe'
# NOTE(review): passing the driver path positionally is the older Selenium
# calling convention; newer releases expect a Service object -- confirm
# against the installed Selenium version.
driver = webdriver.Chrome(PATH)

# Load the page to work with (placeholder URL).
driver.get("your website link will go here")

## driver.close() will close the current tab,
## driver.quit() will close the whole browser window
## driver.title will give the name of the page

## On this page, the search bar has the name 's'. send_keys() types text into the search bar, Keys.RETURN acts like pressing Enter to submit the search, and clear() empties the text box

In [None]:
# Step one page back in the browser history.
driver.back()

In [None]:
# Locate the search box by its HTML name attribute. find_element(By.NAME, ...)
# is the locator form that works on both old and new Selenium releases.
search = driver.find_element(By.NAME, "s")
# Empty the box so earlier text does not get mixed into the next query.
search.clear()

In [None]:
# Type the query into the search box located earlier.
search.send_keys("test")
# Keys.RETURN submits the query, like pressing Enter in the box.
search.send_keys(Keys.RETURN)
# search.clear() would empty the box again and remove the "test" text.

## Searching for different items on the webpage by inspecting then locating them by their html tag, ID, etc

## This try/except chunk asks the program to wait up to 10s while the driver locates the element -> the locator can be By.ID, By.LINK_TEXT, etc

In [None]:
try:
    # Wait up to 10 seconds for the element with id "main" to be present.
    main = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "main"))
    )
    # Collect every <time> tag inside that element and print its text.
    articles = main.find_elements(By.TAG_NAME, "time")
    for article in articles:
        print(article.text)

# Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
# can still stop the cell.
except Exception:
    print("error")

In [None]:
# Locate the hyperlink by its exact visible text, then follow it.
link = driver.find_element(By.LINK_TEXT, "Python Programming")
link.click()

In [None]:
try:
    # Wait up to 10 seconds for the link with this exact text to be present,
    # then click it.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, "Beginner Python Tutorials"))
    )
    element.click()

# Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
# can still stop the cell.
except Exception:
    print("error")

In [None]:
try:
    # Wait up to 10 seconds for the button with this id to be present,
    # then click it.
    element = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "sow-button-19310003"))
    )
    element.click()

# Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
# can still stop the cell.
except Exception:
    print("error")

## These are the find_element(s)_by_* locator methods

In [None]:
# Reference list of locator method names; elsewhere in this notebook they are
# called on the driver or on an element, e.g. driver.find_element_by_name("s").
# NOTE(review): as bare names they are not defined anywhere in this notebook,
# so running this cell raises NameError -- it is meant to be read, not run.
# NOTE(review): newer Selenium releases use find_element(By.<KIND>, value)
# instead of these helpers -- confirm against the installed version.
find_element_by_id
find_element_by_name
find_element_by_xpath
# Example of an XPath expression: '//div[@title="buyer-name"]'
find_element_by_link_text
find_element_by_partial_link_text
find_element_by_tag_name
find_element_by_class_name
find_element_by_css_selector
find_elements_by_xpath

## driver.back() and driver.forward() bring you back to the prev page/forward page
## driver.close() closes popups/windows in focus. .quit() completely closes everything

In [None]:
# Step back two pages in the browser history.
driver.back()
driver.back()
# Close the focused window first, then end the whole session. The original
# order called close() after quit(), i.e. on a session that no longer exists.
driver.close()
driver.quit()

## Automating a cookie clicker website

In [None]:
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

In [None]:
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Open the cookie clicker game.
driver.get("https://orteil.dashnet.org/cookieclicker/")

In [None]:
# Let element lookups retry for up to 5 seconds before giving up.
driver.implicitly_wait(5)

# The cookie to click and the counter showing the current cookie total.
cookie = driver.find_element(By.ID, "bigCookie")
cookie_count = driver.find_element(By.ID, "cookies")
# range(1, -1, -1) yields 1 then 0, so productPrice1 comes before
# productPrice0 in the list.
items = [driver.find_element(By.ID, "productPrice" + str(i)) for i in range(1,-1,-1)]

In [None]:
# Queue a single click on the big cookie; perform() replays it on each pass.
actions = ActionChains(driver)
actions.click(cookie)

for _ in range(5000):
    actions.perform()
    # The counter text starts with the number of cookies. Commas are removed
    # before int() because a value such as "1,234" would raise ValueError
    # (assumes the page may show thousands separators -- TODO confirm).
    count = int(cookie_count.text.split(" ")[0].replace(",", ""))
    for item in items:
        value = int(item.text.replace(",", ""))
        if value <= count:
            # Affordable: move the pointer onto the price element and click it.
            ActionChains(driver).move_to_element(item).click().perform()

## Finding the search box, typing inside and entering it

In [None]:
# Locate the search box by its id, type the query and submit it with Enter.
search = driver.find_element(By.ID, "search")
search.send_keys("monitor")
search.send_keys(Keys.RETURN)
# Empty the search box afterwards.
search.clear()

## This is how to click buttons -> find the button name first then use ActionChains to perform

In [None]:
# Create an ActionChains object bound to the current driver; actions are
# queued on it and executed by perform().
actions = ActionChains(driver)

In [None]:
# NOTE(review): `limit` is not defined anywhere in the visible notebook; it
# presumably holds a button element located beforehand -- confirm.
# Queue the click and execute it in one chained call.
actions.click(limit).perform()

In [None]:
# Collect every <option> tag on the page and report how many were found.
options = driver.find_elements(By.TAG_NAME, 'option')
print(len(options))

## Standard notation for opening the page 

In [None]:
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Open the page to scrape (placeholder URL).
page = 'random website link goes here'
driver.get(page)

## This is how I found the search bar and tried to search for laptops on the website

In [None]:
# Locate the search bar by its id and type the query into it.
search = driver.find_element(By.ID, 'q')
search.send_keys('laptop')

## This is how I found details such as Prices, Product Name etc that I want to extract

In [None]:
# Every <span> whose class attribute is exactly "c13VH6"; print each one's text.
prices = driver.find_elements(By.XPATH, '//span[@class="c13VH6"]')
for price in prices:
    print(price.text)

In [None]:
# Every <div> whose class attribute is exactly "c16H9d"; print each one's text.
names = driver.find_elements(By.XPATH, '//div[@class="c16H9d"]')
for name in names:
    print(name.text)

## Combining it all together and writing it into a .csv file...

In [None]:
# Scrape ten result pages and store every product name/price pair found on
# them in one CSV file.
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

filename = "Fourth Webscrape.csv"
headers = 'Product Name, Price\n'

try:
    # Open the file once for the whole run: 'w' starts a fresh file, and the
    # context manager closes it even if a page fails part-way through.
    with open(filename, 'w') as f:
        f.write(headers)

        for i in range(10):
            page = 'https://www.lazada.sg/lenovo/?from=input&page=' + str(i + 1) + '&q=laptop'
            driver.get(page)

            prices = driver.find_elements(By.XPATH, '//span[@class="c13VH6"]')
            names = driver.find_elements(By.XPATH, '//div[@class="c16H9d"]')

            # zip() pairs each name with its price and stops at the shorter
            # list, so a page with fewer matches cannot raise IndexError.
            for name, price in zip(names, prices):
                # The lookups return elements, not strings; .text is the
                # string to clean. Commas are replaced so they cannot be
                # mistaken for column separators.
                f.write(name.text.replace(',', '|') + ',' + price.text.replace(',', ' ') + '\n')
finally:
    # Close the browser window whether or not the scrape succeeded.
    driver.close()

## Testing this on a particular e-commerce store to get price of sneakers

In [None]:
# Location of the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences.
PATH = r'C:\Program Files (x86)\chromedriver.exe'
driver = webdriver.Chrome(PATH)

# Open the store page to scrape (placeholder URL).
driver.get("link goes here")

## This is how I found the brand names and the second cell shows how to print each of the brand name

In [None]:
# Every <span> whose class attribute exactly matches the brand-label classes.
brands = driver.find_elements(By.XPATH, '//span[@class="b-catalogList__itmBrand fsm txtDark uc"]')
# Number of matches found.
print(len(brands))

In [None]:
# Print the visible text of each brand element found above.
for brand in brands:
    print(brand.text)

## This is how I found the model names, and the second cell shows how to print each model name. Use len() to check that the number of items extracted tallies with the website being searched

In [None]:
# Every <em> whose class attribute exactly matches the model-title classes.
models = driver.find_elements(By.XPATH, '//em[@class="b-catalogList__itmTitle fss"]')
# Number of matches found.
print(len(models))

In [None]:
# Print the visible text of each model element found above.
for model in models:
    print(model.text)

## This is how to find the prices

In [None]:
# Every <span> whose class attribute exactly matches the price-box classes.
prices = driver.find_elements(By.XPATH, '//span[@class="b-catalogList__itmPriceBox itm-priceBox fsm txtDark"]')
# Number of matches found.
print(len(prices))

## This is how to write each csv file -> convert them to excel files afterwards

In [None]:
# Write one CSV row per product from the brand, model and price elements
# collected in the earlier cells.
filename = 'Eighth Webscrabe.csv'

headers = 'Brand,Name,Price($)\n'

# The context manager closes the file even if a row fails to write.
with open(filename, 'w') as f:
    f.write(headers)

    # zip() walks the three lists in lockstep and stops at the shortest one,
    # so no fixed item count is needed and a short list cannot raise IndexError.
    for brand_el, model_el, price_el in zip(brands, models, prices):
        # Commas inside a value would be read as column separators.
        brand = brand_el.text.replace(',', " ")
        model = model_el.text.replace(',', " ")
        price = price_el.text.replace(',', " ")
        # A line break inside the price text would start a new CSV row.
        price = price.replace('\n', ' | ')

        f.write(brand + ',' + model + ',' + price + "\n")

## This is how I find the page numbers at the bottom to click to move on to the next page. Check what each number in next_page represents

In [None]:
# Every <a> whose class attribute is exactly "page-item".
next_page = driver.find_elements(By.XPATH, '//a[@class="page-item"]')
# Show the text of the fourth match to check which page it stands for.
print(next_page[3].text)

In [None]:
# Create an ActionChains object bound to the current driver; actions are
# queued on it and executed by perform().
actions = ActionChains(driver)

In [None]:
# Queue a click on the fourth pagination link and execute it in one chained call.
actions.click(next_page[3]).perform()

In [None]:
# Close the browser window this driver currently has in focus.
driver.close()

# Standard imports and notations (BeautifulSoup)

In [None]:
#Beautiful soup will parse the HTML text, URL lib will grab the page
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# Download and parse the four listing pages, one per value of the p= parameter.
for i in range(4):
    my_url = 'https://www.courts.com.sg/computing-mobile/monitors-projectors/monitors?p=' + str(i+1) + '&product_list_limit=32'

    # Open the page and read its contents. The context manager closes the
    # connection even if read() raises.
    with uReq(my_url) as uClient:
        page_html = uClient.read()

    #"html.parser" tells it to parse as a html file
    page_soup = soup(page_html, "html.parser")

    #print(page_soup.h1)
    #print(page_soup.p)
    #print(page_soup.body.span)

    # Grabs each individual product. find_all is the spelling used by the
    # other BeautifulSoup cells in this notebook.
    # NOTE: containers is reassigned on every pass, so only the last page's
    # matches remain once the loop ends.
    containers = page_soup.find_all("div",{"class": "item-container"})

## Testing on Courts

In [None]:
# Scrape four listing pages and write one CSV row per product.
filename = "First Webscrape.csv"
headers = 'Product Name, Price ($), Screen Dimensions \n'

# Open the file once for the whole run; the context manager closes it even if
# a page fails to download or parse.
with open(filename, 'w') as f:
    f.write(headers)

    for i in range(4):
        # NOTE(review): the original URL was a single stadiumgoods.com product
        # page that never used i, so the same page was fetched four times.
        # The paginated Courts monitors URL matches this section's heading and
        # the "Screen Size:" field parsed below -- confirm this is the
        # intended target.
        my_url = 'https://www.courts.com.sg/computing-mobile/monitors-projectors/monitors?p=' + str(i+1) + '&product_list_limit=32'

        # The context manager closes the connection even if read() raises.
        with uReq(my_url) as uClient:
            page_html = uClient.read()

        page_soup = soup(page_html, 'html.parser')

        # One <li> per product.
        containers = page_soup.find_all("li", class_ = "item product product-item")
        #print(len(containers))

        for container in containers:
            # The product name is read from the image's alt text.
            name = container.div.a.img['alt']
            #print(name)

            price = (container.find_all("span", class_ = "price-wrapper"))[0].text
            #print(price)

            # The first entry of the spec list holds the screen size; drop
            # the label and surrounding whitespace.
            size = (container.find_all("ul", class_ = "navision-spec")[0])
            screen = (size.li.text.replace("Screen Size:", " ")).strip()
            #print(screen.strip())

            # Commas inside the name or price would be read as column separators.
            f.write(name.replace(",", " ") + "," + price.replace(","," ") + "," + screen + "\n")

## Testing on bestdenki

In [None]:
# Scrape one listing page and write name, current price, original price and
# a two-digit size guess for every product.
my_url = 'https://www.bestdenki.com.sg/catalog/computer/category/monitor-67'

# The context manager closes the connection even if read() raises.
with uReq(my_url) as uClient:
    page_html = uClient.read()

page_soup = soup(page_html, 'html.parser')

# One <div> per product.
containers = page_soup.find_all("div", class_ = "col-md-3 col-sm-6 col-xs-6 hover-border")
#print(len(containers))

filename = "Second Webscrape.csv"
headers = 'Product Name, Current Price ($), Original Price($), Screen Dimensions\n'

# The context manager closes the file even if a product fails to parse.
with open(filename, 'w') as f:
    f.write(headers)

    for container in containers:
        brand = container.h3.text.strip()
        des = (container.find_all("div", class_ = "title"))[0].h3.text.strip()

        name = brand + " " + des

        # The price text holds one or two whitespace-separated values: the
        # current price and, when present, the original price.
        price = container.find_all("div", class_ = "price")[0].text.strip().split()
        curr_price = price[0]
        old_price = price[1] if len(price) > 1 else "No Discount"

        # The first two digit characters in the description are taken as the
        # screen size.
        size = ''.join([ch for ch in des if ch.isdigit()][:2])

        # Commas inside a value (e.g. a thousands separator in a price) would
        # be read as column separators, so replace them as the other CSV
        # cells in this notebook do.
        f.write(name.replace(',', ' ') + ',' + curr_price.replace(',', ' ') + ',' + old_price.replace(',', ' ') + ',' + size + "\n")