# Plants are Friends Data Acquisition

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as BS
import re

# Gather info then consolidate
This notebook produces a CSV of data scraped from the tables on the plant pages listed at houseplantsexpert.com/a-z-list-of-house-plants. <br>
Some things to consider:
- Different sources may have different information
- Make dataframes from scraped websites until satisfied with the level of data (I chose from 5 websites)
- Compare differences and take the value most likely to be accurate (most agreement)
- Fill in gaps manually if reasonable

## Web scraping

In [None]:
# Fetch the A-Z index page from houseplantsexpert.com.
# A timeout is passed because requests otherwise waits indefinitely, which
# would hang the notebook if the site stops responding.
scrap1 = requests.get(
    'https://houseplantsexpert.com/a-z-list-of-house-plants.html',
    timeout=30,
)
# Shown as the cell output; 200 means the page was retrieved.
scrap1.status_code

### Get a list of plants

In [None]:
# Parse the index page, then collect the link text of every anchor whose href
# matches the houseplant shop-category URL pattern.
# NOTE(review): no parser is named in the BS(...) call, so bs4 chooses one
# itself — confirm the result is the same on every machine this runs on.
plant_soup1 = BS(scrap1.text)
category_href = re.compile(r'https://houseplantsexpert.com/shop/category/houseplants/.*')
plant_list1 = [
    anchor.text
    for anchor in plant_soup1.find_all('a', {'href': category_href})
]
# Shown as the cell output.
plant_list1

### Start collecting the data
First I want an idea of how many plants, and which ones, are on this website. This next bit is not strictly necessary, but it gives a feel for the data. I did this for multiple websites in order to vet them for the project and settled on houseplantsexpert.com.

In [None]:
# Generates list of hrefs for individual plant pages minus all the other links on the first page
# NOTE(review): the [166:409] slice bounds are positions within the anchors
# found on the index page — re-check them if the page layout changes.
plant_list1 = plant_soup1.find_all('a', {'href': re.compile(r'https://houseplantsexpert.com/.*')})[166:409]

name_list1 = [x.text for x in plant_list1]
href_list1 = [x['href'] for x in plant_list1]
source1_df = pd.DataFrame({'common_name': name_list1, 'url': href_list1})

# Remove the rows for which the name is empty. The explicit copy makes the
# filtered result an independent frame, so adding columns to it below cannot
# trigger SettingWithCopyWarning.
source1_df = source1_df.loc[source1_df.common_name != ''].copy()

# Consecutive rows that share a url are two links to the same plant page.
# The first row of such a pair takes the second row's link text as its
# fancy name; the second row is flagged as a duplicate and dropped.
# Rows without a matching next row keep their own name as the fancy name.
same_url_as_next = source1_df['url'] == source1_df['url'].shift(-1)
same_url_as_prev = source1_df['url'] == source1_df['url'].shift(1)
source1_df['fancy_name'] = source1_df['common_name'].shift(-1).where(
    same_url_as_next, source1_df['common_name']
)
source1_df['duplicate'] = same_url_as_prev

# Report how many rows were flagged. print is needed because a notebook cell
# only displays its last expression.
print(source1_df['duplicate'].value_counts())

source1_df = (
    source1_df.loc[~source1_df['duplicate']]
    .drop(columns='duplicate')
    .reset_index(drop=True)
)
# Shown as the cell output.
source1_df

#### Scrape the tables from each of the individual plant pages into a dictionary
This site has two tables that will be very useful. Their fields aren't always in the same order, but they usually have the same names, so I'll build a dictionary for each page. <br>
NOTE: This step can take a while to run

In [None]:
# go get the tables from the urls
# Each page that loads contributes one {field name: value} dict to dict_list;
# each page that fails contributes its url to err_list.
# NOTE(review): later cleanup cells address rows of the resulting frame by
# position, so any entry in err_list shifts those positions.
url_list = source1_df['url'].to_list()
dict_list = []
err_list = []
for page_url in url_list:
    try:
        # The timeout keeps one unresponsive page from stalling the whole run.
        response = requests.get(page_url, timeout=30)
    except requests.RequestException:
        # Connection errors and timeouts count as "link didn't work" too.
        err_list.append(page_url)
        continue
    if response.status_code != 200:
        err_list.append(page_url)
        continue

    page_soup = BS(response.text)
    cells = [td.text for td in page_soup.find_all('td')]
    # Cells alternate label, value. Pairing the even and odd positions with
    # zip skips a trailing label that has no value cell instead of raising
    # IndexError.
    scr_dict = {
        label.strip(':'): value
        for label, value in zip(cells[::2], cells[1::2])
    }
    dict_list.append(scr_dict)

#### Check the output

In [None]:
# should be a list of dictionaries, one per scraped page, mapping each table
# field name to its value
dict_list

In [None]:
# Pages the scraping loop could not retrieve;
# should be empty unless a link didn't work
err_list

### Convert to a data frame

In [None]:
# One row per scraped page. Each distinct field name found in the dicts
# becomes a column; a field that a page lacks is left as a missing value.
plants_df = pd.DataFrame(dict_list)
# Preview the first rows as the cell output.
plants_df.head()

### Cleanup time!
Some of the fields have slightly different names, so they will need to be merged. At some point this website changed the format of its tables, so plants added before and after that point have differences. There are missing data points as well.

In [None]:
# List every column name to spot fields that differ only slightly in naming.
plants_df.columns

In [None]:
# split the names: the text before the ' (common)' marker becomes the common
# name and the text after it the botanical name. The pattern is a raw string
# so that \( and \) reach the regex engine as written instead of being
# invalid string escapes. The split is computed once and reused.
name_parts = plants_df['Names'].str.split(r' \(common\).', expand=True)
plants_df['Common Name'] = name_parts[0]
plants_df['Bot_Name'] = name_parts[1]

# Fill in the names by hand for rows 52 and 101. Columns -2 and -1 are the
# 'Common Name' and 'Bot_Name' columns assigned just above.
# NOTE(review): presumably these rows have no usable 'Names' value and carry
# the common name at the start of column 22 — confirm against the raw data.
plants_df.iloc[52, -2] = plants_df.iloc[52, 22][:15]
plants_df.iloc[52, -1] = 'Ficus lyrata'
plants_df.iloc[101, -2] = plants_df.iloc[101, 22][:11]
plants_df.iloc[101, -1] = 'Dracaena trifasciata'

# drop the empty row at 102
plants_df = plants_df.drop(index=102).reset_index(drop=True)

# fix some soil columns: column 6 is rebuilt from values that sit in other
# columns (22-24) for these two rows. Both rows come before the dropped
# row 102, so their positions are unchanged by the drop.
plants_df.iloc[52, 6] = plants_df.iloc[52, 22] + ', ' + plants_df.iloc[52, 23]
plants_df.iloc[74, 6] = plants_df.iloc[74, 24]

# drop unnecessary columns
plants_df = plants_df.drop(columns=[
    'Pruning and grooming: ',
    'Grooming and pruning',
    'Names',
    'Soil Type',
    'Soil pH',
    'Potting Soil',
    'Resting Period',
    'Family',
    'Leaf Size',
    'Flower',
    'Pruning',
    'Grooming And Pruning',
    'Common Names',
    'Botanical Name',
])
# The scraped 'Botanical Name' column was dropped above, so the name is free
# for the column split out of 'Names'.
plants_df = plants_df.rename(columns={'Bot_Name': 'Botanical Name'})

# result
plants_df.info()

### Uncomment and run this to save to a csv
See the cleanup notebook for further data cleaning processes starting from the csv below.

In [None]:
#plants_df.to_csv('plants1.csv', index=False)