In [1]:
"""
Developed by Miles Danswan
Date: 13/04/21
Project: ReOpen A2I2
"""
import requests as rq
import re
import urllib as u
import random as rand
import threading as t
from html.parser import HTMLParser

In [2]:
"""
Simple class inheriting from the Python threading module

Functions as a background task for downloading web scrapping results
"""
class DownloadingThread(t.Thread):
    def __init__(this, batch):
        t.Thread.__init__(this)
        this.results = dict()
        this.batch = batch
    def run(this):
        for url in this.batch:
            try:
                fn = url.rsplit("/", 1)[1]
                path = "./Temp/" + fn
                print("Downloading %s into %s" % (fn[0:10] + "...", path))
                r = rq.get(url)
                with open(path, mode="wb") as file:
                    file.write(r.content)
                this.results[url] = True
            except Exception as e:
                print("Error downloading file from %s\n\t%s" % (url, repr(e)[0:100] + "..."))
                this.results[url] = False

In [3]:
"""
Core HTML analysis and breakdown from the designated image source

Inheriting from HTMLParser allows for simple traversal of the DOM to find embedded image URLs
"""
class CommercialFloorPlanHTMLParser(HTMLParser):
    def __init__(this):
        super().__init__()
        this.image_urls = set()
        this.total_urls = 0
    def handle_starttag(this, tag, attrs):
        if tag == "a":
            if attrs[0][0] == "class" and attrs[0][1] == "serp-item__link":
                img_url_idx = attrs[1][1].find("img_url") + len("img_url") + 1
                img_url_close_idx = attrs[1][1].find("&", img_url_idx)
                this.image_urls.add(u.parse.unquote(attrs[1][1][img_url_idx:img_url_close_idx]))
                this.total_urls += 1

"""
Core class for the image web scrapping process

Simply acts as a manager for the above tasks
"""
class FloorplanDownloader():
    THREADS = 5
    def __init__(this, floorplan_urls):
        this.fp_urls = floorplan_urls
    def download(this):
        if this.fp_urls == None or len(this.fp_urls) == 0:
            print("No URL's Passed")
            return
        rand.shuffle(this.fp_urls)
        size = int(len(this.fp_urls))
        part = int(size / this.THREADS)
        threads = []
        for idx in range(this.THREADS):
            thread = DownloadingThread(this.fp_urls[idx*part:idx*part + part])
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        for thread in threads:
            print("Total Potential Images: %d" % (size))

In [4]:
PAGES = 30 # how many pages are to be search in the image engine
parser = CommercialFloorPlanHTMLParser()
for i in range(PAGES):
    url = r'https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=' + str(i) # example url
    print("Acquiring: %s\r" % (url))
    r = rq.get(url, allow_redirects=False)
    parser.feed(str(r.content))
downloader = FloorplanDownloader(list(parser.image_urls))
downloader.download()
parser.close()

Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=0
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=1
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=2
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=3
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=4
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=5
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=6
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=7
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=8
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=9
Acquiring: https://yandex.com/images/search?text=school%20floor%20plans&rpt=image&p=10
Acquiring: https://yandex.com/images/search?text=scho

Downloading base-floor... into ./Temp/base-floor-plan-2.jpg
Downloading Screen-sho... into ./Temp/Screen-shot-2015-11-14-at-9.33.19-AM.png
Downloading 3e08b0affc... into ./Temp/3e08b0affca27ce194cd0887ba29f423--primary-school-elementary-schools.jpg
Downloading 8f62a8aa70... into ./Temp/8f62a8aa7036d1945fed97ce1762005c.jpg
Downloading denmark-hi... into ./Temp/denmark-high-school-floor-plans-high-school-floor-plan-master-plan-s-213fa5e804082f7c.jpg
Downloading rdc-2.jpg?... into ./Temp/rdc-2.jpg?1441196627
Downloading stringio.j... into ./Temp/stringio.jpg?1414442368
Downloading DLF-2-BHK-... into ./Temp/DLF-2-BHK-Floor-Plan.jpg
Error downloading file from https://www.openingthebook.com/media/1039/customer-flow.jpg?width=1600&height=900&mode=crop&anchor=center
	OSError(22, 'Invalid argument')...
Downloading 171220_N_h... into ./Temp/171220_N_high-school-plan-1-web.jpg
Error downloading file from https://images.adsttc.com/media/images/5007/025c/28ba/0d41/4800/09e5/large_jpg/stringio.jpg?

Error downloading file from https://images.adsttc.com/media/images/5279/6d34/e8e4/4e86/5400/0038/large_jpg/17_floor_1.jpg?1383689506
	OSError(22, 'Invalid argument')...
Downloading f3ad1677c2... into ./Temp/f3ad1677c287f601caa9cc8d3a6cf882.jpg
Downloading 61ea95e54f... into ./Temp/61ea95e54f7701e0e6157769413791e1.png
Downloading 01_2013-08... into ./Temp/01_2013-0827_1st-floor.jpg
Downloading incredible... into ./Temp/incredible-high-school-floor-plans-middle-school-floor-plans-friv-5-games-middle-school-floor-plans-pics.jpg
Downloading Durango_Hi... into ./Temp/Durango_High_School_-_Ground_Floor%2C_1920.jpg
Downloading RoomSketch... into ./Temp/RoomSketcher-Commercial-Office-Floor-Plan_1000w.jpg
Downloading UHall1stDi... into ./Temp/UHall1stDimensions.jpg
Downloading 1473733168... into ./Temp/147373316861204_plan1F.jpg
Downloading 9_2nd_floo... into ./Temp/9_2nd_floor_plan.jpg?1404264204
Downloading Floor-plan... into ./Temp/Floor-plan-buffet-e1445985762467-1024x461.png
Downloading be

Downloading 1.-SRC-Gym... into ./Temp/1.-SRC-Gym-Floor-Plan.png
Downloading Cairnhill-... into ./Temp/Cairnhill-9-Floor-Plans-Type-F2.jpg
Downloading 44074b9644... into ./Temp/44074b96447a15c0cebe9c17abb663f3.jpg
Downloading Kilburn_2.... into ./Temp/Kilburn_2.jpg
Downloading the-glades... into ./Temp/the-glades-floor-plan-d1.jpg
Downloading 58c5a64dea... into ./Temp/58c5a64dea4f8a8df67efa7f42da935e.png
Downloading floor_(1).... into ./Temp/floor_(1).jpg?1416074113
Downloading metalocus_... into ./Temp/metalocus_cobe-kindergarten_23_1180.png?itok=o8kB6gIW
Downloading e31fa3f647... into ./Temp/e31fa3f647848f9b390dc427369cb57f.gif
Downloading Smith_floo... into ./Temp/Smith_floor_plan_2.jpg
Downloading compound-h... into ./Temp/compound-house-plans-floor-plan_275795.jpg
Error downloading file from https://images.adsttc.com/media/images/5467/9396/e58e/ce12/6900/02f7/large_jpg/floor_(1).jpg?1416074113
	OSError(22, 'Invalid argument')...
Downloading TLV_school... into ./Temp/TLV_school_-_Gr

Downloading floor_(5).... into ./Temp/floor_(5).jpg?1428077049
Downloading 1b86375f4e... into ./Temp/1b86375f4e1cba5930c511f7ab9c047a.jpg
Downloading cordelia_B... into ./Temp/cordelia_Base_684.jpg
Error downloading file from https://images.adsttc.com/media/images/551e/ba11/e58e/ce84/5e00/01a6/large_jpg/floor_(5).jpg?1428077049
	OSError(22, 'Invalid argument')...
Downloading Floor-Plan... into ./Temp/Floor-Plan-RoomsABC.jpg
Downloading elementary... into ./Temp/elementary-school-drawing-30.gif
Downloading bdwzd53383... into ./Temp/bdwzd53383lnmvwn.png?fit=crop&auto=compress%2Cformat&w=615&dpr=3
Downloading 399b1922eb... into ./Temp/399b1922ebee4b6c4e09021bb6aa6137.jpg
Downloading IMG42768.j... into ./Temp/IMG42768.jpg?1384735577
Downloading res-floor.... into ./Temp/res-floor.gif
Error downloading file from https://archinect.imgix.net/uploads/bd/bdwzd53383lnmvwn.png?fit=crop&auto=compress%2Cformat&w=615&dpr=3
	OSError(22, 'Invalid argument')...
Downloading f489ef9a22... into ./Temp/f48

Downloading 396ce390a5... into ./Temp/396ce390a555782e2ba02d4580e432a3.jpg
Downloading 3d-apartme... into ./Temp/3d-apartment-floor-plans-along-with-16-new-custom-floor-plans-for-homes-of-3d-apartment-floor-plans.jpg
Downloading f4e7fe471e... into ./Temp/f4e7fe471e4a73a610f9a18b026c29f8.jpg
Downloading EHS-Stadiu... into ./Temp/EHS-Stadium-Fieldhouse-12-07-15.jpg
Downloading performing... into ./Temp/performing_arts_second_floor_plan_1920.jpg
Downloading south%20fl... into ./Temp/south%20floor%20plans%201.png
{'https://i0.wp.com/www.kellybraundesign.com/wp-content/uploads/2011/06/8-high-school-floor-plan.jpg': True, 'https://s3-eu-west-1.amazonaws.com/emap-nibiru-prod/wp-content/uploads/sites/4/2017/12/20111954/freemansswimmingpool_lowergroundfloorplan2.jpg': True, 'https://www.wallpapermatte.com/wp-content/uploads/2018/01/floor-plans-for-school-buildings-10.jpg': True, 'https://www.sl.nsw.gov.au/sites/default/files/4921_mitchell_building_master_plan_floorplan_map_display_910x910mm_upd