# Create Datasets
## Data Collection
- midwestgardentips.com provides a list of the best performing perennials in the midwest
- preen.com provides a list of common weeds state-by-state
    - will download a list of common weeds in IL
    - will download corresponding photos from the site
- garden.org provides a collection of photographs of various plants
    - will download photos of both perennials and weeds from this site

In [68]:
# standard imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# webscrape
import requests
from requests import get
from bs4 import BeautifulSoup
import urllib
from time import sleep

In [2]:
# Paths to store data
perennial_path = os.path.join(os.pardir, os.pardir, 'data', 'perennials')
weed_path = os.path.join(os.pardir, os.pardir, 'data', 'weeds')

## midwestgardentips.com

In [3]:
# Scrape names of best performing perennials from Midwest Gardening site
perennial_url = 'https://www.midwestgardentips.com/best-performing-perennials-1'
response = get(perennial_url)
html = response.text
soup = BeautifulSoup(html, 'lxml')

# Find bolded (i.e., 'strong') and italicized (i.e., 'em') text
p_list = [a.text for a in (strong.find('em') for strong in soup.find_all('strong')) if a]

perennials = []
for i in range(len(p_list)):
    text = p_list[i].split(':')[0]
    perennials.append(text)

# Remove mislabeled text from list of perennials
perennials.remove('y.')
perennials.remove('Full to part sun\xa0 Hardy in zones ')

In [53]:
perennials

['Adnophora Liliifolia',
 'Agastache',
 'Asiatic lily (also Oriental lily)',
 'Aster (hardy)',
 'Astilbe',
 'Azalea, deciduous',
 'Balloon Flower',
 'Bee Balm',
 'Bellflower',
 'Black-Eyed Susan',
 'Bleeding Heart',
 'Centaurea (Perennial Cornflower)',
 'Clematis',
 'Chrysanthemum',
 'Columbine',
 'Coneflower',
 'Coral Bells',
 'Coreopsis',
 'Daylilies',
 'Delphinium',
 'Ferns',
 'Gaillardia (blanket flower)',
 'Geranium, hardy',
 'Hosta',
 'Iris',
 'Lady’s Mantle',
 'Lavender',
 'Lily of the Valley',
 'Lungwort',
 'Mallow',
 'Nepeta',
 'Penstemon',
 'Peonies',
 'Phlox paniculata hybrids',
 'Phlox subulata',
 'Pinks',
 'Poppies',
 'Rudbeckia',
 'Russian Sage',
 'Salvia',
 'Sedum',
 'Spotted Dead Nettle, or Lamium',
 'Veronica (Speedwell)',
 'Yarrow']

## preen.com

In [5]:
# Create list of weeds and
# Scrape weed photos from preen site

weed_url = 'https://www.preen.com/weeds/il'
response = get(weed_url)
html = response.text
soup = BeautifulSoup(html, 'lxml')
div = soup.find(id = 'WeedList')
w_list = div.find_all('a')
weeds = []
for i in range(len(w_list)):
    # Create list of weed names
    text = w_list[i].find('img').attrs['alt']  
    weeds.append(text)
    
    # Scrape photos from site
    photo_url = 'https://www.preen.com' + w_list[i].attrs['href']
    photo_response = get(photo_url)
    photo_html = photo_response.text
    photo_soup = BeautifulSoup(photo_html, 'lxml')
    photo_div = photo_soup.find(id = 'imagePicker')
    photo_list = photo_div.find_all('a')
    for j in range(len(photo_list)):
        photo_url = 'https:' + photo_list[j].attrs['href'].replace(' ', '%20')
        
        # To account for photos that were removed from the site
        if get(photo_url).status_code != 404:
            
            # "pr" suffix to indicate photos were scraped from preen site
            path = os.path.join(weed_path, text.lower().replace(' ', '_') + '_pr')
            urllib.request.urlretrieve(photo_url, path + '_' + str(j) + '.jpg')

In [3]:
# Create list of weeds and
# Scrape weed photos from preen site

weed_url = 'https://www.preen.com/weeds/il'
response = get(weed_url)
html = response.text
soup = BeautifulSoup(html, 'lxml')
div = soup.find(id = 'WeedList')
w_list = div.find_all('a')
weeds = []
for i in range(len(w_list)):
    # Create list of weed names
    text = w_list[i].find('img').attrs['alt']  
    weeds.append(text)


## garden.org
I have created a function (enter_url) that provides the name of each perennial/weed one at a time.  The first prompt allows the user to skip the perennial/weed if no photos are provided for the plant on garden.org.  If photos exist, the user can enter the url for the plant.  The function scrapes all photos where the plant name is identified in the header or as a common name for the plant.  All plants are stored with the plant name as part of the name of the file.

In [21]:
from itertools import cycle
import traceback
import requests

In [73]:
def get_proxies():
    html = requests.get('https://free-proxy-list.net/').text
    ip_addresses = list(pd.read_html(html)[0].query('Https == "yes"')['IP Address'] + ':' +\
        pd.read_html(html)[0].query('Https == "yes"')['Port'].astype(str))
    return ip_addresses

In [74]:
ip_addresses = get_proxies()
proxies = cycle(ip_addresses)

In [76]:
# Check if plant photos appear on search result page
plant_url = 'https://garden.org/plants/view/75093/Yarrow-Achillea-Moonshine/'
proxy = next(proxies)
response = requests.get(plant_url, headers = {'User-Agent' : 'test'}, proxies = {'http' : proxy, 'https' : proxy})
html = response.text
soup = BeautifulSoup(html, 'lxml')
soup.find_all('div', {'class' : 'plant_thumbbox'})

ProxyError: HTTPSConnectionPool(host='garden.org', port=443): Max retries exceeded with url: /plants/view/75093/Yarrow-Achillea-Moonshine/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 502 Bad Gateway')))

In [63]:
url = 'https://httpbin.org/ip'
for i in range(1,11):
    #Get a proxy from the pool
    proxy = next(proxies)
    print("Request #%d"%i)
    try:
        response = requests.get(url,proxies={"http": proxy, "https": proxy})
        print(response.json())
    except:
        #Most free proxies will often get connection errors. You will have retry the entire request using another proxy to work. 
        #We will just skip retries as its beyond the scope of this tutorial and we are only downloading a single url 
        print("Skipping. Connnection error")

Request #1
Skipping. Connnection error
Request #2
{'origin': '47.57.30.255'}
Request #3
Skipping. Connnection error
Request #4
{'origin': '136.228.141.154'}
Request #5
Skipping. Connnection error
Request #6
{'origin': '92.51.11.167'}
Request #7
{'origin': '34.105.37.21'}
Request #8
{'origin': '194.233.69.41'}
Request #9
{'origin': '52.188.167.61'}
Request #10
Skipping. Connnection error


In [64]:
# Function to get soup from garden.org site
def get_soup(url):
    # Need to "fake a browser visit" by providing a user-agent header for garden.org
    response = requests.get(url, headers = {'User-Agent' : 'test'}, proxies = {'http' : proxy, 'https' : proxy})
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    return soup

# garden.org provides a "results page" when searching for a plant
# Each "result" includes a link that provides photos for the plant
# Function goes to the URL for each result on page, and calls the "add_plant" function
def get_results(result_soup, plant_name, count, weed):
    find_plants_results = result_soup.find('table')
    plants_results = find_plants_results.find_all('tr')
    # Create list of URLs for each result
    for k in range(len(plants_results)): # For each result
        plant_url = 'https://garden.org' + plants_results[k].find('a').attrs['href']
        # Count keeps track of the number of photos for each plant
        count = add_plant(plant_url, plant_name, count, weed)
    sleep(1)
    return (count)

# Function adds all photos from each "result"
# "Results" include plants that contain the search term
# Only plants that match the name of the search term 
# as a "common name" for the plant or in the header of the page are included
def add_plant(plant_url, plant_name, count, weed):
    soup = get_soup(plant_url)
    if weed:
        path = os.path.join(weed_path, plant_name)
    else:
        path = os.path.join(perennial_path, plant_name)
    
    # Create list of common names
    tables = soup.find_all('table')
    common_names_table = None
    for j in range(len(tables)):
        if tables[j].find('caption'):
            if 'common' in tables[j].find('caption').text.lower():
                common_names_table = tables[j]
    common_names_list = []
    if common_names_table:
        common_names = common_names_table.find_all('tr')
        for k in range(len(common_names)):
            common_names_list.append(common_names[k].find('td').findNextSibling().text.strip().lower())
                
    # Add names in header to list of common names
    header_names = soup.find('h1', {'class' : 'page-header'}).text.lower()
    header_names = header_names.replace('(', '→').replace(')', '').split('→')
    common_names_list += header_names
    
    # If search term is in header or list of common names, add photos
    if plant_name.replace('_', ' ') in common_names_list:
        photo_gallery = soup.find_all('div', {'class' : 'plant_thumbbox'})
        for i in range(len(photo_gallery)):
            photo_url = 'https://garden.org' + photo_gallery[i].find('a').find('img').attrs['src']
            if get(photo_url).status_code != 404:
                urllib.request.urlretrieve(photo_url, path + '_' + str(count) + '.jpg')
                count += 1
    return (count)


In [None]:
# Find URLs for results page and pull all plants for each result
# "data" is the list of perennials or weeds
# "weed" indicates whether the plant is a weed (weed=True) or not
def enter_url(data, weed):
    for l in range(len(data)):
        print('Plant:  ', data[l])
        add_photos = input('Add Photos? (Y/N):  ')
        if (add_photos == 'Y') or (add_photos == 'y'):
            plants_url = input('Enter garden.org url:  ')
            plant_name = plants_url.split('=')[-1].replace('+', '_')
            count = 0 # Track number of results to name plant
            
            # Go to URL for each result on page, and add plants from each
            plant_soup = get_soup(plants_url)
            count = get_results(plants_soup, plant_name, count, weed)
            
            # Check if there are additional results pages
            # Will return actual page if one exists. Otherwise, will return nothing.
            query = plant_soup.find('span', {'class' : 'PageActive'})
            if query:
                next_page = query.findNextSibling()
                while next_page:
                    next_url = 'https://garden.org' + next_page.attrs['href'] # Go to next page of results
                    plant_soup = get_soup(next_url)
                    count = get_results(plant_soup, plant_name, count, weed)
                    query = plant_soup.find('span', {'class' : 'PageActive'})
                    if query:
                        next_page = query.findNextSibling()
    return

In [54]:
# To call each plant separately, enter url for main results page for each plant

def pull_data(plants_url, weed):
    plant_name = plants_url.split('=')[-1].replace('+', '_')
    count = 0 # Track number of results to name plant
    # Go to URL for each result on page, and add plants from each
    plant_soup = get_soup(plants_url)
    count = get_results(plant_soup, plant_name, count, weed)
    
    # Check if there are additional results pages
    # Will return actual page if one exists. Otherwise, will return nothing.
    query = plant_soup.find('span', {'class' : 'PageActive'})
    if query:
        next_page = query.findNextSibling()
        while next_page:
            next_url = 'https://garden.org' + next_page.attrs['href'] # Go to next page of results
            plant_soup = get_soup(next_url)
            count = get_results(plant_soup, plant_name, count, weed)
            query = plant_soup.find('span', {'class' : 'PageActive'})
            if query:
                next_page = query.findNextSibling()
    return

In [None]:
plants_url = 'https://garden.org/plants/search/text/?q=Yarrow'
pull_data(plants_url, False)

In [69]:
plants_url = 'https://garden.org/plants/view/75098/Yarrows-Achillea/'
plant_soup = get_soup(plants_url)
plant_name = plants_url.split('=')[-1].replace('+', '_')

In [71]:
# Check if plant photos appear on search result page
plant_url = 'https://garden.org/plants/view/75084/Yarrow-Achillea-millefolium/'
count = 0
add_plant(plant_url, plant_name, count, False)

0

In [None]:
enter_url(perennials, False)

In [24]:
enter_url(weeds, True)

In [13]:
weeds

['buckhorn plantain',
 'bull thistle',
 'Carolina geranium',
 'carpetweed',
 'catchweed bedstraw',
 'cheat grass',
 'chickweed',
 'common mallow',
 'corn speedwell',
 'Crabgrass',
 'crowfoot grass',
 'curly dock',
 'dallisgrass',
 'dandelion',
 'eclipta',
 'evening primrose',
 'fall panicum',
 'fiddleneck',
 'field bindweed',
 'fleabane',
 'giant foxtail',
 'goosegrass',
 'green foxtail',
 'groundsel',
 'hairy bittercress',
 'hairy galinsoga',
 'henbit',
 'honeyvine milkweed',
 'jimsonweed',
 'Johnsongrass',
 'junglerice',
 'kochia',
 'ladysthumb',
 'lambsquarters',
 'lanceleaf groundcherry',
 'lespedeza',
 'lovegrass',
 'marestail',
 'mayweed',
 'morning glory',
 'mustard',
 'nettleleaf goosefoot',
 'orchardgrass',
 'Pennsylvania smartweed',
 'perennial ryegrass',
 'pineappleweed',
 'pokeweed',
 'prickly lettuce',
 'prickly sida',
 'prostrate knotweed',
 'prostrate spurge',
 'puncturevine',
 'purple cudweed',
 'purslane',
 'ragweed',
 'rattail fescue',
 'red sorrel',
 'redroot pigweed

In [5]:
perennials

['Adnophora Liliifolia',
 'Agastache',
 'Asiatic lily (also Oriental lily)',
 'Aster (hardy)',
 'Astilbe',
 'Azalea, deciduous',
 'Balloon Flower',
 'Bee Balm',
 'Bellflower',
 'Black-Eyed Susan',
 'Bleeding Heart',
 'Centaurea (Perennial Cornflower)',
 'Clematis',
 'Chrysanthemum',
 'Columbine',
 'Coneflower',
 'Coral Bells',
 'Coreopsis',
 'Daylilies',
 'Delphinium',
 'Ferns',
 'Gaillardia (blanket flower)',
 'Geranium, hardy',
 'Hosta',
 'Iris',
 'Lady’s Mantle',
 'Lavender',
 'Lily of the Valley',
 'Lungwort',
 'Mallow',
 'Nepeta',
 'Penstemon',
 'Peonies',
 'Phlox paniculata hybrids',
 'Phlox subulata',
 'Pinks',
 'Poppies',
 'Rudbeckia',
 'Russian Sage',
 'Salvia',
 'Sedum',
 'Spotted Dead Nettle, or Lamium',
 'Veronica (Speedwell)',
 'Yarrow']

In [6]:
url = 'https://garden.org/plants/view/180715/Sage-Salvia-nemorosa-Mainacht/'
soup = get_soup(url)
soup.find_all('div', {'class' : 'plant_thumbbox'})

[]