   # Web Scraping Images

In [1]:
import os
import re
import time

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

# Text of a "nothing found" message (it also appears in the recorded DermNet NZ
# output). NOTE(review): not referenced by any visible cell - presumably meant
# to be compared against the scraped result text; confirm before relying on it.
NO_RESULT = "No results for that query"

# Lesion names. Each one is typed into the sites' search boxes as a query and
# is also used as the name of the sub-folder under dataset/ for its images.
lesions = [
    "Basal Cell Carcinoma",
    "Lentigo",
    "Malignant Melanoma",
    "Melanocytic naevus",
    "seborrhoeic keratosis",
    "Wart",
    "Actinic Keratosis",
    "Squamous Cell Carcinoma",
    "Intraepithelial Carcinoma",
    "Pyogenic Granuloma",
    "Haemangioma",
    "Dermatofibroma",
]

In [3]:
# Location of the chromedriver executable handed to Selenium.
chromedriver_path = '/usr/bin/chromedriver'

# One Chrome session, stored in the global `driver` that the scraping cells use.
driver = webdriver.Chrome(chromedriver_path)

## Get images from Dermnet NZ

In [4]:
# Scrape DermNet NZ: search the image library for every lesion name and save
# each image URL found on the results page as dataset/<lesion>/nz-<index>.jpg.
driver.get("https://dermnetnz.org/image-library/")
time.sleep(10)  # fixed 10 s pause before the first search-box lookup

for lesion in lesions:
    print(lesion)
    search_box = driver.find_element_by_xpath("//input[@placeholder='Search by disease...']")
    search_box.clear()
    search_box.send_keys(lesion)
    search_box.send_keys(Keys.RETURN)

    # Allow element lookups to wait up to 5 s. NOTE(review): per the Selenium
    # docs this setting stays active for the rest of the session - confirm that
    # is intended for the later cells too.
    driver.implicitly_wait(5)
    try:
        message = driver.find_element_by_css_selector("h3.colour--primary")
    except NoSuchElementException:
        # Heading absent: fall through and collect the images. Only this
        # specific error is ignored so real failures are no longer hidden.
        pass
    else:
        # Heading present: print its text (the recorded output shows
        # "No results for that query") and skip this lesion.
        print(message.text)
        continue

    # Gather every src="..." value from the inner HTML of the result containers.
    image_urls = []
    for container in driver.find_elements_by_xpath("/html/body/div[4]/div[5]/div/div/div[1]/div[2]/div"):
        inner_html = container.get_attribute('innerHTML')
        image_urls.extend(re.findall(r'src="(.*?)\"', inner_html))

    # exist_ok=True makes the directory creation safe when the folder is
    # already there (it is shared with the other scraping cells).
    lesion_dir = os.path.join("dataset", lesion)
    os.makedirs(lesion_dir, exist_ok=True)

    print(len(image_urls))

    for i, url in enumerate(image_urls):
        image = requests.get(url, allow_redirects=True)
        if not image.ok:
            # Do not store an HTTP error page under a .jpg name.
            print("skipped " + url + " (HTTP " + str(image.status_code) + ")")
            continue
        # `with` closes the file even if the write fails.
        with open(os.path.join(lesion_dir, "nz-" + str(i) + '.jpg'), 'wb') as image_file:
            image_file.write(image.content)

Basal Cell Carcinoma
6
Lentigo
3
Malignant Melanoma
1
Melanocytic naevus
1
seborrhoeic keratosis
2
Wart
2
Actinic Keratosis
7
Squamous Cell Carcinoma
6
Intraepithelial Carcinoma
No results for that query
Pyogenic Granuloma
No results for that query
Haemangioma
No results for that query
Dermatofibroma
1


## Get images from DermIS

In [5]:
# Scrape DermIS: search for every lesion name, visit each link found in the
# result list, and save the images found on those pages as
# dataset/<lesion>/is-<index>.jpg.
driver.get("https://www.dermis.net/dermisroot/en/home/index.htm")

for lesion in lesions:
    print(lesion)
    search_box = driver.find_element_by_id("btnSearch")
    search_box.clear()
    search_box.send_keys(lesion)
    driver.find_element_by_class_name("searchbutton").click()

    # Every href="..." in the result lists. The first five characters of each
    # link are dropped and the remainder is appended to the English site root
    # (presumably stripping a relative-path prefix - verify against the markup).
    result_pages = []
    for result_list in driver.find_elements_by_xpath("//*[@id='ctl00_Main_pnlSearchControl']/tbody/tr/td/table/tbody/tr/td/ul"):
        inner_html = result_list.get_attribute('innerHTML')
        for link in re.findall(r'href="(.*?)\"', inner_html):
            result_pages.append("https://www.dermis.net/dermisroot/en" + link[5:])

    # Visit each result page and collect the src="..." values from the third
    # row of its "diagnoseThumbs" table.
    image_urls = []
    for page_url in result_pages:
        driver.get(page_url)

        for cell in driver.find_elements_by_xpath("//*[@id='ctl00_Main_pnlSearchControl']/tbody/tr/td/table/tbody/tr/td/table[@class='diagnoseThumbs']/tbody/tr[3]/td"):
            inner_html = cell.get_attribute('innerHTML')
            for link in re.findall(r'src="(.*?)\"', inner_html):
                # Drop the first nine characters of the src and prefix the host.
                site = "https://www.dermis.net/" + link[9:]
                # "100px" -> "550px": presumably selects a larger rendition of
                # the thumbnail - TODO confirm against the site's URL scheme.
                image_urls.append(site.replace("100px", "550px"))

    print(len(image_urls))

    # exist_ok=True makes the directory creation safe when the folder is
    # already there (it is shared with the other scraping cells).
    lesion_dir = os.path.join("dataset", lesion)
    os.makedirs(lesion_dir, exist_ok=True)

    for i, url in enumerate(image_urls):
        image = requests.get(url, allow_redirects=True)
        if not image.ok:
            # Do not store an HTTP error page under a .jpg name.
            print("skipped " + url + " (HTTP " + str(image.status_code) + ")")
            continue
        # `with` closes the file even if the write fails.
        with open(os.path.join(lesion_dir, "is-" + str(i) + '.jpg'), 'wb') as image_file:
            image_file.write(image.content)

Basal Cell Carcinoma
145
Lentigo
56
Malignant Melanoma
18
Melanocytic naevus
0
seborrhoeic keratosis
0
Wart
0
Actinic Keratosis
27
Squamous Cell Carcinoma
102
Intraepithelial Carcinoma
0
Pyogenic Granuloma
0
Haemangioma
0
Dermatofibroma
15


## Get images from Atlas Dermatológico

In [6]:
# Scrape Atlas Dermatologico: search for every lesion name, visit each link in
# the result list, and save the files linked from those pages as
# dataset/<lesion>/ad-<index>.jpg.
driver.get("http://www.atlasdermatologico.com.br/")

for lesion in lesions:
    print(lesion)
    # The search input is looked up under one of two ids; presumably the id
    # depends on which page is currently loaded - verify against the site.
    try:
        search_box = driver.find_element_by_id("j_idt12:search_input")
    except NoSuchElementException:
        search_box = driver.find_element_by_id("j_idt13:search_input")
    search_box.clear()
    search_box.send_keys(lesion)
    search_box.send_keys(Keys.RETURN)

    try:
        empty_message = driver.find_element_by_class_name("ui-datalist-empty-message")
    except NoSuchElementException:
        # No empty-list message: fall through and collect the results. Only
        # this specific error is ignored so real failures are no longer hidden.
        pass
    else:
        # Message present: print its text (the recorded output shows
        # "No records found.") and skip this lesion.
        print(empty_message.text)
        continue

    # Every href="..." inside the result list, prefixed with the site host.
    lesion_pages = []
    for result_list in driver.find_elements_by_xpath("//*[@id='j_idt23_list']"):
        inner_html = result_list.get_attribute('innerHTML')
        for link in re.findall(r'href="(.*?)\"', inner_html):
            lesion_pages.append("http://www.atlasdermatologico.com.br" + link)

    # Visit each lesion page and collect the href="..." values found inside the
    # elements with id "j_idt24".
    image_urls = []
    for page_url in lesion_pages:
        driver.get(page_url)

        for gallery in driver.find_elements_by_id("j_idt24"):
            inner_html = gallery.get_attribute('innerHTML')
            for image_path in re.findall(r'href="(.*?)\"', inner_html):
                image_urls.append("http://www.atlasdermatologico.com.br/" + image_path)

    print(len(image_urls))

    # exist_ok=True makes the directory creation safe when the folder is
    # already there (it is shared with the other scraping cells).
    lesion_dir = os.path.join("dataset", lesion)
    os.makedirs(lesion_dir, exist_ok=True)

    for i, url in enumerate(image_urls):
        image = requests.get(url, allow_redirects=True)
        if not image.ok:
            # Do not store an HTTP error page under a .jpg name.
            print("skipped " + url + " (HTTP " + str(image.status_code) + ")")
            continue
        # `with` closes the file even if the write fails.
        with open(os.path.join(lesion_dir, "ad-" + str(i) + '.jpg'), 'wb') as image_file:
            image_file.write(image.content)


Basal Cell Carcinoma
319
Lentigo
22
Malignant Melanoma
No records found.
Melanocytic naevus
No records found.
seborrhoeic keratosis
No records found.
Wart
100
Actinic Keratosis
No records found.
Squamous Cell Carcinoma
132
Intraepithelial Carcinoma
No records found.
Pyogenic Granuloma
No records found.
Haemangioma
18
Dermatofibroma
25
