/
scrape.py
71 lines (58 loc) · 2.81 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import argparse
import time
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
def _scroll_results(driver, iterations, pause_seconds=1):
    """Scroll the page down one screen height at a time.

    Scrolling stops after ``iterations`` steps, or earlier once the next
    scroll target would lie beyond the current document height.

    Args:
        driver: WebDriver instance with the target page already loaded.
        iterations: Upper bound on the scroll step counter.
        pause_seconds: Seconds to wait after each scroll before measuring
            the page height again.
    """
    # The screen height is used as the distance of a single scroll step.
    screen_height = driver.execute_script("return window.screen.height;")
    step = 1
    # The context manager closes the progress bar even if a scroll fails.
    with tqdm(total=iterations) as pbar:
        pbar.update(1)
        while step < iterations:
            # Scroll one screen height further than on the previous step.
            driver.execute_script(
                "window.scrollTo(0, {screen_height}*{i});".format(
                    screen_height=screen_height, i=step
                )
            )
            step += 1
            if step % 100 == 0:
                print(step)
            time.sleep(pause_seconds)
            pbar.update(1)
            # Re-read the document height after every scroll, because it can
            # change once the page has been scrolled.
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            # Stop when the next scroll target is past the end of the page.
            if screen_height * step > scroll_height:
                break


def _extract_image_url(style):
    """Return the URL quoted inside an inline ``style`` value, or ``None``.

    The URL is taken as the text between the first pair of double quotes,
    with every ``"_small"`` occurrence removed. ``None`` is returned when the
    style is missing/empty or contains no double-quoted section.
    """
    if not style:
        return None
    parts = style.split('"')
    if len(parts) < 2:
        return None
    return parts[1].replace("_small", "")


def _collect_image_urls(driver):
    """Return the image URLs of all ``.main_image`` elements on the page.

    Elements whose inline style holds no quoted URL are skipped instead of
    aborting the whole run.
    """
    elements = driver.find_elements(By.CSS_SELECTOR, ".main_image")
    candidates = (_extract_image_url(e.get_attribute("style")) for e in elements)
    return [url for url in candidates if url is not None]


def main(args):
    """Scrape image URLs from the Artbreeder browse page into a text file.

    Opens Chrome, selects the category named by ``args.type``, scrolls the
    infinitely-scrolling result list, then writes one image URL per line to
    ``args.output``.

    Args:
        args: Parsed command-line namespace providing ``driver`` (path to a
            chromedriver executable, or a falsy value to let
            ``ChromeDriverManager`` install one), ``type`` (value of the
            category's ``data-name`` attribute), ``iterations`` (maximum
            scroll steps) and ``output`` (destination file path).
    """
    # NOTE(review): passing the driver path directly matches the original
    # code; newer Selenium 4 releases appear to expect a Service object
    # instead -- confirm against the installed Selenium version.
    if args.driver:
        driver = webdriver.Chrome(executable_path=args.driver)
    else:
        driver = webdriver.Chrome(ChromeDriverManager().install())

    try:
        driver.get("https://www.artbreeder.com/browse")
        time.sleep(2)  # Allow 2 seconds for the web page to open
        driver.find_element(By.CSS_SELECTOR, f"div[data-name={args.type}]").click()
        time.sleep(1)
        _scroll_results(driver, args.iterations)
        image_urls = _collect_image_urls(driver)
    finally:
        # Always shut the browser down so no Chrome/chromedriver process is
        # left running, whether scraping succeeded or raised.
        driver.quit()

    with open(args.output, "w", encoding="utf-8") as output:
        output.writelines(f"{url}\n" for url in image_urls)
if __name__ == "__main__":
    # Command-line interface: every option maps to an attribute that main()
    # reads from the parsed namespace.
    cli = argparse.ArgumentParser()
    cli.add_argument("-o", "--output", default="urls.txt", help="Output filename")
    cli.add_argument(
        "-t",
        "--type",
        required=True,
        choices=[
            "general",
            "portraits_sg2",
            "landscapes_sg2_concept",
            "buildings",
            "paintings",
            "sci_bio_art",
            "characters",
            "albums",
            "furries",
            "anime_portraits",
        ],
        help="Image type",
    )
    cli.add_argument("-i", "--iterations", default=1000, type=int, help="Scrolls iterations")
    cli.add_argument("-d", "--driver", default=None, help="Chrome driver path")
    # Parse the command line and hand the resulting namespace to main().
    main(cli.parse_args())