# Data Collection
Collect plant images from image search engines on the internet (Google Images and Bing Images), and automate the task with a Python script built on Selenium.

## Import Libraries

In [1]:
import os
import time
import urllib
import urllib.error
import urllib.request

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager

## Build the Automation Script

In [2]:
# Let webdriver_manager fetch a chromedriver binary, wrap its path in a
# Service, and start the Chrome session that the scraping cells share.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install())
)

[WDM] - Downloading: 100%|██████████| 6.81M/6.81M [00:01<00:00, 3.88MB/s]


In [12]:
def scrapping(links,file_name,num_files,start_index=0):
    """
        Retrieve images from a search-result page and save them as local files.

        The page is opened with the module-level Selenium ``driver``, scrolled
        to the bottom once, and the ``src`` of every matching <img> element is
        downloaded to ``../RawDatasets/<file_name>/<file_name><n>.jpg``.
        The target folder is created when it does not exist yet.

        Args:
            links : URL of the Google Images or Bing Images result page
            file_name : name of the target sub-folder, also used as the
                file-name prefix
            num_files : maximum number of images to save
            start_index : number given to the first saved file (default 0);
                pass the number of images already stored for this class to
                keep a second run from overwriting them

        Fewer than ``num_files`` images are saved when the page does not
        expose enough downloadable thumbnails; a message is printed then.
    """

    #Get sites links
    driver.get(links)

    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
    # Wait for the page to load
    time.sleep(3)

    #Find the image elements (Google Images or Bing Images thumbnails)
    imgResults = driver.find_elements(By.XPATH,"//img[contains(@class,'rg_i Q4LuWd') or contains(concat(' ', normalize-space(@class), ' '), ' mimg ')]")

    # Keep only elements that actually carry a src value; there is nothing
    # to download for the others.
    src = [url for url in (img.get_attribute('src') for img in imgResults) if url]

    # urlretrieve does not create missing folders, so make sure it exists
    os.makedirs("../RawDatasets/{}".format(file_name), exist_ok=True)

    #Request the files and save them into local files
    saved = 0
    for url in src:
        if saved >= num_files:
            break
        target = "../RawDatasets/{}/{}{}.jpg".format(file_name,file_name,start_index + saved)
        try:
            urllib.request.urlretrieve(str(url),target)
        except (urllib.error.URLError, ValueError) as err:
            # One unreachable or malformed source must not abort the batch;
            # the same file number is reused for the next source.
            print("scrapping: skipped {!r}: {}".format(str(url)[:60], err))
            continue
        saved += 1

    if saved < num_files:
        print("scrapping: saved only {} of {} requested images for '{}'".format(saved,num_files,file_name))
    

**Scrape from Bing Images**

In [44]:
# (Bing Images search URL, target folder / file prefix), one entry per plant class
bing_sources = [
    ("https://www.bing.com/images/search?q=agglonema&form=HDRSC3&first=1", "Agglonema"),
    ("https://www.bing.com/images/search?q=spider+plant&form=HDRSC3&first=1", "Lili Paris"),
    ("https://www.bing.com/images/search?q=alocasia&form=HDRSC3&first=1", "Alocasia"),
    ("https://www.bing.com/images/search?q=+Sansevieria&form=HDRSC3&first=1", "Lidah Mertua"),
    ("https://www.bing.com/images/search?q=Monstera+&form=HDRSC3&first=1", "Janda Bolong"),
    ("https://www.bing.com/images/search?q=Anthurium+plowmanii&form=HDRSC3&first=1", "Gelombang Cinta"),
    ("https://www.bing.com/images/search?q=Adiantum&form=HDRSC3&first=1", "Suplir"),
    ("https://www.bing.com/images/search?q=syzygium+myrtifolium+&form=HDRSC3&first=1", "Pucuk Merah"),
]

# Request 80 images for every class, in the order listed above
for search_url, plant_name in bing_sources:
    scrapping(links=search_url, file_name=plant_name, num_files=80)


**Scrape from Google Images**

In [None]:
# Each call searches Google Images for one plant class and stores the results
# under that class name. The search term must match the target class,
# otherwise wrongly labelled images end up in the dataset.
# NOTE(review): scrapping numbers its files from 0, so these downloads reuse
# the file names written by the Bing cell for the same class.

# Agglonema: previously searched "syzygium+oleana" (the Pucuk Merah query)
scrapping("https://www.google.com/search?q=aglaonema&tbm=isch","Agglonema",num_files=200)
# Lili Paris (spider plant): previously searched "wayang+gatotkaca"
scrapping("https://www.google.com/search?q=spider+plant&tbm=isch","Lili Paris",num_files=200)
# Gelombang Cinta
scrapping("https://www.google.com/search?q=Anthurium+plowmanii&rlz=1C1ONGR_enID973ID973&hl=en-US&source=lnms&tbm=isch&sa=X&ved=2ahUKEwi947zJzPb-AhV22TgGHYh7A2kQ_AUoAXoECAEQAw&biw=1536&bih=754&dpr=1.25","Gelombang Cinta",num_files=200)
# Suplir
scrapping("https://www.google.com/search?q=suplir+tanaman+hias&tbm=isch&ved=2ahUKEwjH8aCWnPf-AhUL_TgGHUzqDfkQ2-cCegQIABAA&oq=suplir+tanaman+hias&gs_lcp=CgNpbWcQAzIFCAAQgAQyBQgAEIAEMgYIABAIEB4yBggAEAgQHjIGCAAQCBAeMgYIABAIEB4yBggAEAgQHjIGCAAQCBAeOgQIIxAnOgQIABAeULcUWNwrYIUtaANwAHgAgAGSAYgBqAqSAQQxNS4ymAEAoAEBqgELZ3dzLXdpei1pbWfAAQE&sclient=img&ei=ehdiZMfMO4v64-EPzNS3yA8","Suplir",num_files=200)
# Pucuk Merah
scrapping("https://www.google.com/search?q=syzygium+oleana&tbm=isch&ved=2ahUKEwir7Lqaq_f-AhVAzXMBHVz3Am4Q2-cCegQIABAA&oq&gs_lcp=CgNpbWcQARgAMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnMgcIIxDqAhAnOgQIIxAnOgUIABCABFDeA1jeA2CiDWgBcAB4AIABQ4gBQ5IBATGYAQCgAQGqAQtnd3Mtd2l6LWltZ7ABCsABAQ&sclient=img&ei=PidiZKvVE8Caz7sP3O6L8AY&bih=754&biw=1488","Pucuk Merah",num_files=200)


**Split Data**

Split the data into train, validation, and test sets (70% / 15% / 15%).

In [5]:
import splitfolders

# Copy the images into ../Dataset using a 70 / 15 / 15 split with a fixed
# seed, so that re-running the cell reproduces the same partition.
# NOTE(review): the input folder is 'RawDataset' (relative to the notebook),
# while scrapping() saves into '../RawDatasets/'; confirm this is the
# intended, separately prepared folder.
splitfolders.ratio(
    'RawDataset',
    output="../Dataset",
    seed=42,
    ratio=(0.7, 0.15, 0.15),
    group_prefix=None,
)

Copying files: 1860 files [00:03, 551.05 files/s]
