# Create Datasets
## Add photos from the Missouri Botanical Garden
- When I wrote the code below, I assumed that the Missouri Botanical Garden site included photos of both perennials and weeds. After scraping the perennial photos, I realized that the site has no weed photos of its own and instead refers to the University of Massachusetts Amherst for all weed photos.
- The code for scraping weed photos from the University of Massachusetts Amherst is in a separate notebook.

In [1]:
# standard imports
import numpy as np
import pandas as pd
import os

# webscrape
import requests
from bs4 import BeautifulSoup
from time import sleep

In [2]:
# Output folders for downloaded photos: ../../data/perennials and ../../data/weeds
# (relative to the current working directory).
perennial_path, weed_path = (
    os.path.join(os.pardir, os.pardir, 'data', folder)
    for folder in ('perennials', 'weeds')
)

In [69]:
# Scrape names of best performing perennials from Midwest Gardening site
perennial_url = 'https://www.midwestgardentips.com/best-performing-perennials-1'
# A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(perennial_url, timeout=30)
html = response.text
soup = BeautifulSoup(html, 'lxml')

# Find bolded (i.e., 'strong') and italicized (i.e., 'em') text:
# keep the text of every <em> found inside a <strong>, skipping <strong> tags without one.
p_list = [a.text for a in (strong.find('em') for strong in soup.find_all('strong')) if a]

# Bold-italic fragments on the page that are not plant names.
# They are filtered out rather than removed with list.remove(), so the cell
# does not raise ValueError if the page changes and a fragment disappears.
mislabeled = {'y.', 'Full to part sun\xa0 Hardy in zones '}

# Keep only the text before the first colon of each entry, then drop the mislabeled ones.
perennials = [
    name
    for name in (entry.split(':')[0] for entry in p_list)
    if name not in mislabeled
]

In [70]:
perennials

['Adnophora Liliifolia',
 'Agastache',
 'Asiatic lily (also Oriental lily)',
 'Aster (hardy)',
 'Astilbe',
 'Azalea, deciduous',
 'Balloon Flower',
 'Bee Balm',
 'Bellflower',
 'Black-Eyed Susan',
 'Bleeding Heart',
 'Centaurea (Perennial Cornflower)',
 'Clematis',
 'Chrysanthemum',
 'Columbine',
 'Coneflower',
 'Coral Bells',
 'Coreopsis',
 'Daylilies',
 'Delphinium',
 'Ferns',
 'Gaillardia (blanket flower)',
 'Geranium, hardy',
 'Hosta',
 'Iris',
 'Lady’s Mantle',
 'Lavender',
 'Lily of the Valley',
 'Lungwort',
 'Mallow',
 'Nepeta',
 'Penstemon',
 'Peonies',
 'Phlox paniculata hybrids',
 'Phlox subulata',
 'Pinks',
 'Poppies',
 'Rudbeckia',
 'Russian Sage',
 'Salvia',
 'Sedum',
 'Spotted Dead Nettle, or Lamium',
 'Veronica (Speedwell)',
 'Yarrow']

In [71]:
# Create list of weed names from the preen site.
# Only the names are collected here (from each thumbnail's alt text); no photos are downloaded.

weed_url = 'https://www.preen.com/weeds/il'
# A timeout keeps the cell from hanging forever if the site never answers.
response = requests.get(weed_url, timeout=30)
html = response.text
soup = BeautifulSoup(html, 'lxml')
div = soup.find(id = 'WeedList')
w_list = div.find_all('a')

# Each link's <img> carries the weed name in its 'alt' attribute.
# Links without an <img> are skipped instead of raising AttributeError.
weeds = [
    img.attrs['alt']
    for img in (link.find('img') for link in w_list)
    if img is not None
]

In [72]:
weeds

['annual bluegrass',
 'annual ryegrass',
 'barnyardgrass',
 'black medic',
 'bristly oxtongue',
 'broadleaf dock',
 'broadleaf plantain',
 'brome',
 'buckhorn plantain',
 'bull thistle',
 'Carolina geranium',
 'carpetweed',
 'catchweed bedstraw',
 'cheat grass',
 'chickweed',
 'common mallow',
 'corn speedwell',
 'Crabgrass',
 'crowfoot grass',
 'curly dock',
 'dallisgrass',
 'dandelion',
 'eclipta',
 'evening primrose',
 'fall panicum',
 'fiddleneck',
 'field bindweed',
 'fleabane',
 'giant foxtail',
 'goosegrass',
 'green foxtail',
 'groundsel',
 'hairy bittercress',
 'hairy galinsoga',
 'henbit',
 'honeyvine milkweed',
 'jimsonweed',
 'Johnsongrass',
 'junglerice',
 'kochia',
 'ladysthumb',
 'lambsquarters',
 'lanceleaf groundcherry',
 'lespedeza',
 'lovegrass',
 'marestail',
 'mayweed',
 'morning glory',
 'mustard',
 'nettleleaf goosefoot',
 'orchardgrass',
 'Pennsylvania smartweed',
 'perennial ryegrass',
 'pineappleweed',
 'pokeweed',
 'prickly lettuce',
 'prickly sida',
 'prostr

In [3]:
def get_soup(url, timeout=30):
    """Download a web page and return it parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests can block indefinitely on an unresponsive host.

    Returns:
        A BeautifulSoup object built from the response text with the 'lxml' parser.

    Raises:
        requests.exceptions.RequestException: On connection problems, timeouts,
            or an HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on error pages rather than parsing them as if they were real content.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

In [4]:
def get_photo(image_url, path):
    """Download the photo shown on an image page and save it to disk.

    The image page is expected to contain an element with id 'FullImage'
    whose 'src' attribute is the address of the picture itself.

    Args:
        image_url: URL of the page that displays the photo.
        path: Destination file path; the image bytes are written in binary mode.

    Raises:
        ValueError: If the page has no 'FullImage' element or it lacks a 'src'.
        requests.exceptions.RequestException: If the image download fails or
            returns an HTTP error status.
    """
    image_soup = get_soup(image_url)
    full_image = image_soup.find(id='FullImage')
    photo = full_image.attrs.get('src') if full_image is not None else None
    if not photo:
        raise ValueError('No FullImage source found on page: ' + str(image_url))

    r = requests.get(photo, timeout=30)
    # Do not save an HTML error page under a .jpg name.
    r.raise_for_status()

    # The context manager closes the file even if the write fails.
    with open(path, 'wb') as file:
        file.write(r.content)

- The Missouri Botanical Garden site shows a primary photo for a plant and sometimes additional photos.
- For each plant, I entered the plant's URL only if the site included at least a primary photo.

In [5]:
def add_plant(plant_url, weed):
    """Download every photo listed on a Missouri Botanical Garden plant page.

    Files are named '<title>_mo_<n>.jpg', where <title> is the page title
    lower-cased with spaces replaced by underscores and apostrophes removed,
    and <n> is 0 for the primary photo and 1, 2, ... for additional photos.
    Nothing is downloaded when the page has no primary photo link.

    Args:
        plant_url: URL of the plant's page.
        weed: If truthy, photos are saved under weed_path; otherwise under
            perennial_path.
    """
    main_url = 'https://www.missouribotanicalgarden.org'
    soup = get_soup(plant_url)
    title = soup.find(id='dnn_srTitle_lblTitle').text
    file_name = title.strip().lower().replace(' ', '_').replace('\'', '') + '_mo'

    plant_path = weed_path if weed else perennial_path

    # Look the link up once and reuse it instead of searching the page twice.
    primary_link = soup.find(id='MainContentPlaceHolder_PrimaryImageLink')
    if primary_link is None:
        return

    # Make sure the destination folder exists before the first file is written.
    os.makedirs(plant_path, exist_ok=True)

    # Get primary photo
    path = os.path.join(plant_path, file_name + '_0.jpg')
    primary_image_url = main_url + primary_link.attrs.get('href')
    get_photo(primary_image_url, path)

    # Check if additional photos exist
    images_list = soup.find(id='MainContentPlaceHolder_ImagesList')
    if images_list is None:
        return

    count = 1
    for each_row in images_list.find_all('tr'):
        for each_photo in each_row.find_all('td'):
            # Empty cells carry no photo.
            if not each_photo.contents:
                continue
            link = each_photo.find('a')
            href = link.attrs.get('href') if link is not None else None
            # Skip cells that have content but no usable link instead of crashing.
            if not href:
                continue
            path = os.path.join(plant_path, file_name + '_' + str(count) + '.jpg')
            get_photo(main_url + href, path)
            count += 1

In [6]:
def enter_data(weed):
    """Interactively collect plant page URLs and download their photos.

    Prompts for a URL, passes it to add_plant, then asks whether to continue.
    The loop repeats while the answer is 'Y' or 'y' (surrounding whitespace
    is ignored); any other answer ends the session.

    Args:
        weed: Forwarded unchanged to add_plant for every URL entered.
    """
    while True:
        # Same prompt on every pass; the earlier follow-up prompt named
        # garden.org, which is not the site add_plant scrapes.
        plant_url = input('Enter url:  ')
        add_plant(plant_url, weed)
        next_plant = input('Next Plant? (Y/N):  ')
        if next_plant.strip().lower() != 'y':
            break

In [None]:
# Start an interactive session; weed=False is passed through to add_plant,
# which then saves the photos under perennial_path.
enter_data(weed=False)