In [None]:
"""
Developed by Miles Danswan
Date: 13/04/21
Project: ReOpen A2I2
"""
import os
import random as rand
import re
import threading as t
import urllib as u
import urllib.parse
from html.parser import HTMLParser

import requests as rq

In [None]:
"""
Simple class inheriting from the Python threading module

Functions as a background task for downloading web scraping results
"""
class DownloadingThread(t.Thread):
    """Background thread that downloads a batch of URLs into OUTPUT_DIR.

    After the thread has finished, ``results`` maps every URL of the batch
    to ``True`` (file written) or ``False`` (any error occurred).
    """

    # Folder the downloaded files are written to (created on demand).
    OUTPUT_DIR = "./Temp/"
    # Seconds to wait for a server before giving up; without a timeout a
    # stalled connection would block this thread (and its join()) forever.
    TIMEOUT = 30

    def __init__(this, batch):
        """Store the batch (an iterable of URL strings) to download."""
        t.Thread.__init__(this)
        this.results = dict()
        this.batch = batch

    def run(this):
        """Download every URL of the batch, recording success per URL."""
        for url in this.batch:
            try:
                # The file name is whatever follows the last "/" of the URL.
                fn = url.rsplit("/", 1)[1]
                path = this.OUTPUT_DIR + fn
                print("Downloading %s into %s" % (fn[0:10] + "...", path))
                r = rq.get(url, timeout=this.TIMEOUT)
                # Treat HTTP errors as failures instead of saving the
                # server's error page as if it were the requested file.
                r.raise_for_status()
                os.makedirs(this.OUTPUT_DIR, exist_ok=True)
                with open(path, mode="wb") as file:
                    file.write(r.content)
                this.results[url] = True
            except Exception as e:
                # Best effort: one bad URL must not stop the rest of the batch.
                print("Error downloading file from %s\n\t%s" % (url, repr(e)[0:100] + "..."))
                this.results[url] = False

In [None]:
"""
Core HTML analysis and breakdown from the designated image source

Inheriting from HTMLParser allows for simple traversal of the DOM to find embedded image URLs
"""
class CommercialFloorPlanHTMLParser(HTMLParser):
    """Collect embedded image URLs from search-result anchor tags.

    For every ``<a class="serp-item__link" href="...">`` start tag whose
    href carries an ``img_url=`` parameter, the percent-decoded value of that
    parameter is added to ``image_urls``; ``total_urls`` counts every such
    hit, duplicates included.
    """

    # Value of the class attribute that marks a result link.
    LINK_CLASS = "serp-item__link"
    # Marker that precedes the encoded image URL inside the href.
    URL_PARAM = "img_url="

    def __init__(this):
        super().__init__()
        this.image_urls = set()
        this.total_urls = 0

    def handle_starttag(this, tag, attrs):
        """Record the image URL embedded in a result link, if there is one."""
        if tag != "a":
            return
        # Look attributes up by name: anchors may have no attributes at all,
        # or carry class/href in any order.
        attributes = dict(attrs)
        if attributes.get("class") != this.LINK_CLASS:
            return
        href = attributes.get("href")
        if not href:
            return
        start = href.find(this.URL_PARAM)
        if start == -1:
            # No embedded image URL in this link; nothing to collect.
            return
        start += len(this.URL_PARAM)
        end = href.find("&", start)
        if end == -1:
            # The parameter is the last one, so it runs to the end of the href.
            end = len(href)
        this.image_urls.add(u.parse.unquote(href[start:end]))
        this.total_urls += 1

"""
Core class for the image web scraping process

Simply acts as a manager for the above tasks
"""
class FloorplanDownloader():
    """Spread a collection of URLs over worker threads and download them.

    After ``download()`` has run, ``results`` maps each URL to the success
    flag reported by the worker thread that handled it.
    """

    # Upper bound on the number of worker threads started by download().
    THREADS = 5

    def __init__(this, floorplan_urls):
        """Store the URLs (a sequence of strings, or None) to download."""
        this.fp_urls = floorplan_urls
        this.results = dict()

    @staticmethod
    def _partition(urls, parts):
        """Split urls into at most `parts` non-empty batches covering every item."""
        # Striding hands out the remainder too, unlike equal-length slices
        # which silently drop the last len(urls) % parts items.
        batches = [urls[offset::parts] for offset in range(parts)]
        return [batch for batch in batches if batch]

    def download(this):
        """Download all stored URLs in random order using worker threads.

        Prints a notice and returns immediately when no URLs were given.
        Blocks until every worker thread has finished.
        """
        if not this.fp_urls:
            print("No URL's Passed")
            return
        # Shuffle a copy so the caller's list is left untouched.
        urls = list(this.fp_urls)
        rand.shuffle(urls)
        threads = []
        for batch in this._partition(urls, this.THREADS):
            thread = DownloadingThread(batch)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        for thread in threads:
            this.results.update(thread.results)
        print("Total Potential Images: %d" % (len(urls)))

In [None]:
# Scrape the image search result pages, then download every image found.
PAGES = 30 # how many result pages are to be searched in the image engine
parser = CommercialFloorPlanHTMLParser()
for i in range(PAGES):
    url = r'https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=' + str(i) # example url
    print("Acquiring: %s\r" % (url))
    try:
        r = rq.get(url, allow_redirects=False, timeout=30)
    except rq.RequestException as e:
        # One unreachable page should not abort the whole scrape.
        print("Error acquiring %s\n\t%s" % (url, repr(e)[0:100] + "..."))
        continue
    # Feed the decoded HTML; str(r.content) would hand the parser the
    # "b'...'" repr of the bytes, escape sequences included.
    parser.feed(r.text)
# Flush any buffered markup before the collected URLs are read.
parser.close()
downloader = FloorplanDownloader(list(parser.image_urls))
downloader.download()