In [None]:
import json
import os
import urllib.request
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.common.by import By

In [None]:
#pip install tqdm
from tqdm import tqdm

In [None]:
#pip install image
import re
import base64
import argparse
from io import BytesIO
from PIL import Image

In [None]:
def get_image_from_base64(codec):
    """
    Decode a base64 image string into an RGB PIL image.

    :param codec: base64 text, optionally prefixed with a
                  ``data:image/...;base64,`` header (the header is stripped)
    :return: PIL image, converted to mode 'RGB' when it is in any other mode
    """
    # Drop the data-URI header, if present, so only the base64 payload remains.
    payload = re.sub('^data:image/.+;base64,', '', codec)
    raw_bytes = base64.b64decode(payload)
    picture = Image.open(BytesIO(raw_bytes))
    return picture if picture.mode == 'RGB' else picture.convert('RGB')

In [None]:
class Crawler:
    """
    Google Web Image Crawler

    Opens a Google image search for ``keyword`` in a headless Chrome session
    driven by Selenium, scrolls the result page, and saves every matched
    ``<img>`` element into a directory named after the keyword.
    """
    def __init__(self, keyword, count):
        """
        :param keyword: search term; also used as the download directory name
        :param count: requested image count; only used to decide how far to scroll
        """
        self.keyword = str(keyword)  # image keyword for searching
        self.count = count  # image count
        # Set here as well so search_url() works even if create_new_directory()
        # has not been called yet.
        self.cwd = os.getcwd()
        self.dirPath = ""  # image stored directory

    def create_new_directory(self):
        """
        Create directory for download if it does not exist
        :return: None
        """
        self.cwd = os.getcwd()
        self.dirPath = os.path.join(self.cwd, self.keyword)

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.dirPath, exist_ok=True)

    def create_url(self):
        """
        Create the url path
        :return: url for searching
        """
        # The keyword must be percent-encoded: spaces, '&', '=' or non-ASCII
        # text would otherwise corrupt the query string.
        return 'https://www.google.com/search?q=' + quote_plus(self.keyword) + '&source=lnms&tbm=isch'

    def _scroll_steps(self):
        """
        Number of 10000px scroll calls to issue for the requested count.
        :return: 1 for counts below 200, otherwise 250 per extra hundred images
        """
        extra_hundreds = self.count // 100 - 1
        # max() keeps counts below 100 from producing a negative (empty) range.
        return max(1, 250 * extra_hundreds)

    def search_url(self, url):
        """
        Search from chrome browser
        :param url: search url
        :return: webdriver positioned on the scrolled result page
        """
        # ========== headless driver options ========== #
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('headless')
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("lang=ko_KR")
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36")
        # ... etc
        # ============================================= #

        # if you don't want headless driver, remove chrome_option argument
        # NOTE(review): the positional driver path and the `chrome_options=`
        # keyword look like the Selenium 3 call style -- confirm against the
        # Selenium version this notebook is pinned to.
        browser = webdriver.Chrome(os.path.join(self.cwd, 'chromedriver'), chrome_options=chrome_options)
        try:
            browser.get(url)
            print(url)

            # scroll by 10000px
            for _ in range(self._scroll_steps()):
                browser.execute_script('window.scrollBy(0, 10000)')
        except Exception:
            # Do not leave a driver session behind when navigation fails.
            browser.quit()
            raise

        return browser

    def download_image(self, browser):
        """
        Download the image
        :param browser: google webdriver; it is always shut down before returning
        :return: None
        """
        try:
            elements = browser.find_elements(By.XPATH, '//img[contains(@class,"rg_i")]')

            progress = tqdm(elements, bar_format='{l_bar}{bar:20}{r_bar}{bar:-10b}')
            for idx, element in enumerate(progress):
                save_path = os.path.join(self.dirPath, str(idx) + '.jpg')
                try:
                    # Fall back to data-src when src is missing or empty.
                    img_src = element.get_attribute('src') or element.get_attribute('data-src')
                    if not img_src:
                        print('exception: ', idx, 'image element has no src or data-src')
                        continue

                    if img_src.startswith('data:image'):
                        get_image_from_base64(img_src).save(save_path, 'JPEG')
                    else:
                        urllib.request.urlretrieve(img_src, save_path)
                except Exception as e:
                    # Best effort: one bad image must not abort the whole crawl.
                    print('exception: ', idx, e)
        finally:
            browser.quit()  # end the session even if the crawl failed

    def run(self):
        """
        main routines
        :return: None
        """
        self.create_new_directory()      # 1. create the directory
        url = self.create_url()          # 2. create the path
        browser = self.search_url(url)   # 3. search image
        self.download_image(browser)     # 4. download and shut the browser down

In [None]:
if __name__ == '__main__':
    # Command-line entry point: read the search keyword and image count.
    parser = argparse.ArgumentParser(description='خزنده تصویر Google v2')
    parser.add_argument(
        '--keyword',
        type=str,
        required=True,
        help='کلید واژه تصویر برای جستجو',
    )
    parser.add_argument(
        '--count',
        type=int,
        default=100,
        required=False,
        help='تعداد تصاویر در 100 واحد',
    )
    args = parser.parse_args()

    # Build the crawler from the parsed options and run the whole pipeline.
    newCrawler = Crawler(keyword=args.keyword, count=args.count)
    newCrawler.run()


In [None]:
import os
import sys

# Root directory of the project
ROOT_DIR = os.path.abspath("./")

# Put the project root on the import path (guarded so re-running the cell
# does not append duplicates).
# NOTE(review): nothing in this cell imports a local module -- confirm whether
# a later cell still needs this.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Directory whose files are listed in the gallery; also the <img src> prefix
IMAGE_DIR = "output"

# Gallery layout: at most GALLERY_ROWS x GALLERY_COLS images are shown.
GALLERY_ROWS = 10
GALLERY_COLS = 6
# NOTE(review): the listing is read starting at index 1, so the first entry
# returned by os.walk is never shown -- confirm whether that skip is intended.
FIRST_IMAGE_INDEX = 1


def build_gallery_html(names, image_dir=IMAGE_DIR, rows=GALLERY_ROWS,
                       cols=GALLERY_COLS, start=FIRST_IMAGE_INDEX):
    """
    Build the HTML gallery table for a list of image file names.

    :param names: file names, relative to ``image_dir``
    :param image_dir: prefix used in every ``<img src>``
    :param rows: maximum number of table rows
    :param cols: number of images per row
    :param start: index of the first name to show
    :return: HTML string; when fewer than ``rows * cols`` names are available
             the table simply ends early (the last row may be partial)
    """
    shown = names[start:start + rows * cols]
    parts = [
        'Mostafa Aliakbarzadeh \n<br>',
        '<table  align="center" border="1">\n',
        "<tr><td> Alireza Asemi </td></tr>",
    ]
    for row_start in range(0, len(shown), cols):
        cells = ''.join(
            '<td><img src= "' + image_dir + '/' + name + '" width="300"></td>'
            for name in shown[row_start:row_start + cols]
        )
        parts.append('<tr>' + cells + "</tr>\n")
    parts.append("</table>")
    return ''.join(parts)


# The default makes a missing directory behave like an empty one instead of
# raising StopIteration. File order is whatever os.walk reports.
file_names = next(os.walk(IMAGE_DIR), (IMAGE_DIR, [], []))[2]
strTable = build_gallery_html(file_names)

if len(file_names) > FIRST_IMAGE_INDEX:
    # The same markup is published as a standalone page and as the README.
    for target in ("results.html", "README.md"):
        with open(target, 'w', encoding='utf-8') as hs:
            hs.write(strTable)
else:
    # Do not overwrite existing files with an empty gallery.
    print('No images to list in "%s"; results.html and README.md were not written.' % IMAGE_DIR)
# print(strTable)