# First try / Rough process

In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
# Common prefix shared by the plant pages being scraped.
base_url = "https://www.shootgardening.co.uk/plant/"
# Page-specific suffix; it is appended to base_url to form the full address.
specific_url = "pachira-aquatica"

In [3]:
# Fetch the page for the chosen plant and parse the response body as HTML.
# NOTE(review): no timeout is passed and the response status is not checked.
page = requests.get(base_url + specific_url)
soup = BeautifulSoup(page.content, "html.parser")

In [4]:
# Keep only the <div> whose class attribute is "box box_selected", then take
# its text content (tags removed).
# NOTE(review): if no such <div> exists, `results` is presumably None and the
# `.text` access below would fail — confirm against the BeautifulSoup docs.
results = soup.find("div", class_="box box_selected")
text_only_results = results.text

In [5]:
# Maps the heading that opens a section to the heading that closes it; the
# text found between the two headings is the value extracted for that section.
sections = {"Other names" : "Genus", #section 1 - other/common names
            "Cultivation" : "Soil type", #section 2 - cultivation info
            "Soil type" : "Soil drainage", #section 3 - soil type
            "Soil drainage" : "Soil pH", #section 4 - soil drain/moisture
            "Soil pH" : "Light", #section 5 - soil pH
            "Light" : "Aspect", #section 6 - light/shade conditions
            "Aspect" : "Exposure", #section 7 - optimal place for growth
            "Exposure" : "UK hardiness", #section 8 - exposure (e.g. "Sheltered")
            }

In [6]:
# Echo every "opening heading -> closing heading" pair, then the pair count.
for key in sections:
    print(f"{key} -> {sections[key]}")
print(f"Dictionary has a total of {len(sections)} entries")

Other names -> Genus
Cultivation -> Soil type
Soil type -> Soil drainage
Soil drainage -> Soil pH
Soil pH -> Light
Light -> Aspect
Aspect -> Exposure
Exposure -> UK hardiness
Dictionary has a total of 8 entries


In [7]:
#append the text between the current section's opening and closing headings
def get_plant_upkeep_info_sections(list):
    """Append the text of the section currently being iterated to ``list``.

    Nothing but the output list is passed in. The function reads three
    notebook globals: ``key`` (heading that opens the section), ``sections``
    (opening heading -> closing heading) and ``text_only_results`` (the text
    being searched). The extracted text is appended unmodified, surrounding
    whitespace included.
    """
    begin = text_only_results.find(key) + len(key)
    finish = text_only_results.find(sections[key])
    list.append(text_only_results[begin:finish])


In [8]:
# Collects one extracted text per entry of `sections`, in dictionary order.
plant_upkeep_as_list = []

# The loop variable has to be called `key`: get_plant_upkeep_info_sections
# reads the global `key` to know which section to extract on each pass.
for key in sections:
    get_plant_upkeep_info_sections(plant_upkeep_as_list)

In [9]:
# Print each extracted section on its own line, followed by the entry count.
for section_text in plant_upkeep_as_list:
    print(section_text)
print(f"List has a total of {len(plant_upkeep_as_list)} entries")


Money tree, Provision tree, Shaving brush tree, Guiana chestnut, Water chestnut


Grow indoors in a partially shaded position. Water regularly, applying a balanced fertiliser each month. Maintain humidity with regular misting.


Loamy


Moist but well-drained, Well-drained


Acid, Alkaline, Neutral


Partial Shade


East, West


Sheltered

List has a total of 8 entries


In [10]:
#draft json structure: hand-written target shape for a single plant, filled in
#with the values scraped for pachira-aquatica
money_tree_dict_test = {
    "Common names" : ["Money tree", "Provision tree", "Shaving brush tree", "Guiana chestnut", "Water chestnut"],
    "Cultivation" : "Grow indoors in a partially shaded position. Water regularly, applying a balanced fertiliser each month. Maintain humidity with regular misting.",
    "Soil" : {
        "Type" : "Loamy",
        "Drainage" : "Moist but well-drained, Well-drained",
        "pH" : ["Acid","Alkaline","Neutral"]
    },
    "Light" : "Partial Shade",
    "Placement": {
        "Aspect": "East, West",
        "Exposure": "Sheltered"
    },
    # Placeholder: no image is scraped yet.
    "Image": ""
}

In [11]:
#split a comma separated string into a list of trimmed entries
def convert_names(string):
    """Split ``string`` on commas and strip whitespace around every piece.

    Empty pieces are kept, so an empty string yields ``[""]``.
    """
    return [piece.strip() for piece in string.split(",")]

In [12]:
#strip surrounding whitespace (including \n) from every string of the list
def clean_list_strings(list):
    """Strip each string of ``list`` in place and return the same list object."""
    for position in range(len(list)):
        list[position] = list[position].strip()
    return list

In [13]:
def make_dict(name):
    """Build ``{name: {...}}`` from the notebook global ``plant_upkeep_as_list``.

    The global list is read by position (0 = common names ... 7 = exposure).
    Common names and soil pH are split into lists with ``convert_names``;
    every other field is stored as the extracted string. ``"Image"`` is left
    as an empty placeholder.
    """
    fields = plant_upkeep_as_list
    entry = {"Common names": convert_names(fields[0])}
    entry["Cultivation"] = fields[1]
    entry["Soil"] = {
        "Type": fields[2],
        "Drainage": fields[3],
        "pH": convert_names(fields[4]),
    }
    entry["Light"] = fields[5]
    entry["Placement"] = {"Aspect": fields[6], "Exposure": fields[7]}
    entry["Image"] = ""
    return {name: entry}

# Clean up / Detailed process

The script takes raw scraped HTML, cleans it up and fits it into a JSON format.
This kind of pipeline is called an ETL process. ETL stands for Extract-Transform-Load.

1.  <b>Extract:</b> Make the request for the plant we want
2.  <b>Transform:</b> Read the html to scrape the data we need
3.  <b>Load:</b> Save the scraped data into the json format we desire 


In [14]:
import requests
from bs4 import BeautifulSoup
import json
import logging

# Configure logging once for the notebook: threshold DEBUG, and every record
# is rendered as "<timestamp> <message>".
# NOTE(review): the DEBUG threshold is presumably why connection-level lines
# appear next to our own messages in the cell outputs below — confirm.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(message)s')
# Logger used by the functions defined in this notebook.
logger = logging.getLogger(__name__)

## Extract



-  we create a url specific to some plant
-  then we make a request to that url 
-  we then soupify the response we got back from that url
-  then we filter the html soup we have to get only the contents of 
   `.find("div", class_="box box_selected")`

In [None]:
# Below I pasted the code you used earlier to create the
# `text_only_results` variable
# Step 1: build the address of the plant page.
base_url = "https://www.shootgardening.co.uk/plant/"
specific_url = "pachira-aquatica"

# Step 2: request the page and parse the response body as HTML.
page = requests.get(base_url + specific_url)
soup = BeautifulSoup(page.content, "html.parser")

# Step 3: keep only the text of the <div class="box box_selected"> element.
results = soup.find("div", class_="box box_selected")
text_only_results = results.text

What we need to do is encapsulate the above behaviour into a function.

The function will take the plant name as a string as a parameter.<br>
The function will return the ```text_only_results``` for the page.

In [15]:
def get_html_text_for_plant(plant_name, timeout=10):
    """Download a plant page and return the text of its details box.

    Parameters
    ----------
    plant_name : str
        Page slug appended to the site's ``/plant/`` address,
        e.g. ``"pachira-aquatica"``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    str
        Text content of the ``<div class="box box_selected">`` element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page does not contain the expected ``<div>``.
    """
    base_url = "https://www.shootgardening.co.uk/plant/"
    url = base_url + plant_name
    logger.info(f"reading url {url}")

    # Without a timeout a stalled connection would block the notebook forever.
    page = requests.get(url, timeout=timeout)
    # Stop on an error status instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    results = soup.find("div", class_="box box_selected")
    if results is None:
        # find() yields None when nothing matches; name the page in the error
        # rather than failing later with an AttributeError on `.text`.
        raise ValueError(f"No plant details box found at {url}")
    return results.text
    

In [16]:
plant_specific_text_result = get_html_text_for_plant("pachira-aquatica")

2022-03-30 16:23:15,076 reading url https://www.shootgardening.co.uk/plant/pachira-aquatica
2022-03-30 16:23:15,079 Starting new HTTPS connection (1): www.shootgardening.co.uk:443
2022-03-30 16:23:15,449 https://www.shootgardening.co.uk:443 "GET /plant/pachira-aquatica HTTP/1.1" 200 None


## Transform

-  For the transform, we first initialize a `sections` variable that maps
   the heading that starts a section to the heading that ends it.
   - For example, to extract the text between `"Other names"` and `"Genus"`, the entry is `"Other names" : "Genus"`.
-  We iterate through the `sections`, making a call to `get_plant_upkeep_info_sections` on each iteration.
-  The `get_plant_upkeep_info_sections` takes in a list that it updates on each iteration of looping through `sections`
-  We create a variable `plant_upkeep_as_list` that is updated by the above step of iterating through `sections`
-  The final result is that `plant_upkeep_as_list` is a list of the items we want for the plant

#### Original

In [None]:
# Maps the heading that opens a section to the heading that closes it; the
# text found between the two headings is the value extracted for that section.
sections = {"Other names" : "Genus", #section 1 - other/common names
            "Cultivation" : "Soil type", #section 2 - cultivation info
            "Soil type" : "Soil drainage", #section 3 - soil type
            "Soil drainage" : "Soil pH", #section 4 - soil drain/moisture
            "Soil pH" : "Light", #section 5 - soil pH
            "Light" : "Aspect", #section 6 - light/shade conditions
            "Aspect" : "Exposure", #section 7 - optimal place for growth
            "Exposure" : "UK hardiness", #section 8 - exposure (e.g. "Sheltered")
            }

In [None]:
#append the trimmed text between the current section's two headings
def get_plant_upkeep_info_sections(list):
    """Append the stripped text of the section being iterated to ``list``.

    Nothing but the output list is passed in. The function reads three
    notebook globals: ``key`` (heading that opens the section), ``sections``
    (opening heading -> closing heading) and ``text_only_results`` (the text
    being searched).
    """
    begin = text_only_results.find(key) + len(key)
    finish = text_only_results.find(sections[key])
    section_body = text_only_results[begin:finish]
    list.append(section_body.strip())


In [None]:
# Collects one extracted text per entry of `sections`, in dictionary order.
plant_upkeep_as_list = []

# The loop variable has to be called `key`: get_plant_upkeep_info_sections
# reads the global `key` to know which section to extract on each pass.
for key in sections:
    get_plant_upkeep_info_sections(plant_upkeep_as_list)

In [None]:
# Bare expression: the notebook shows the list as this cell's output.
plant_upkeep_as_list

### Update

There are a few things I want to clean up in the previous Transform solution.

Similar to what was done for the Extract portion, we are going to work to incorporate the Transform portion into a function.

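If we look at the original `get_plant_upkeep_info_sections`, there are a few things to improve:
-  It takes in a parameter called `list`. In Python, `list` is the name of a built-in type, and a variable called `list` shadows it (inside that function, `list(...)` can no longer be used to build a list). Avoid using `list` as a variable name.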
-  The function relies on several variables that it does not take in as parameters.
   - It uses the variables `text_only_results`, `key`, and `sections`
-  In a setting outside of Jupyter, we would run into an issue because those variables would not be in scope.


The updated approach follows the same steps as before, but every input is now passed in explicitly:
-  We want to create a list of the upkeep_items
-  We will iterate through the same sections
-  We now have a function that takes in the key, value, and full text that we are searching.
   This function returns the subtext that we were looking for.
-  We then take this returned output and append to a list

In [22]:
def get_sections():
    """Return a new dict mapping each section's opening heading to its closing heading.

    The text found between the two headings is the value of that section.
    Entries are listed in the order the sections are extracted.
    """
    return {
        "Other names": "Genus",            # 1 - other/common names
        "Cultivation": "Soil type",        # 2 - cultivation info
        "Soil type": "Soil drainage",      # 3 - soil type
        "Soil drainage": "Soil pH",        # 4 - soil drainage/moisture
        "Soil pH": "Light",                # 5 - soil pH
        "Light": "Aspect",                 # 6 - light/shade conditions
        "Aspect": "Exposure",              # 7 - optimal place for growth
        "Exposure": "UK hardiness",        # 8 - exposure (e.g. "Sheltered")
    }

In [23]:
def get_text_for_bounded_section(key, value, text_result):
    """Return the text that sits between two headings of ``text_result``.

    Parameters
    ----------
    key : str
        Heading that opens the section. Its first occurrence is used.
    value : str
        Heading that closes the section. Only occurrences located after
        ``key`` are considered.
    text_result : str
        Full text to search.

    Returns
    -------
    str
        The text between the two headings with surrounding whitespace
        removed. ``""`` when ``key`` is absent. When ``value`` does not
        occur after ``key``, everything after ``key`` is returned.
    """
    key_position = text_result.find(key)
    if key_position == -1:
        # str.find reports "not found" as -1; slicing from that offset would
        # return unrelated text, so report the miss and return nothing.
        logger.warning(f"Heading '{key}' was not found; returning an empty section")
        return ""

    start = key_position + len(key)
    # Search only past the opening heading, so an earlier mention of `value`
    # cannot close the section before it has started.
    end = text_result.find(value, start)
    if end == -1:
        logger.warning(f"Closing heading '{value}' was not found after '{key}'; using the rest of the text")
        end = len(text_result)

    subtext = text_result[start:end].strip()
    logger.info(f"Between {key} and {value} the text '{subtext}' was extracted")
    return subtext

In [24]:
def create_list_of_plant_upkeep_items(text_result):
    """Extract every section listed by ``get_sections`` from ``text_result``.

    Returns a list with one extracted text per section, in the order the
    sections are defined. Each text is obtained by calling
    ``get_text_for_bounded_section`` with the opening heading, the closing
    heading and ``text_result``.
    """
    return [
        get_text_for_bounded_section(opening_heading, closing_heading, text_result)
        for opening_heading, closing_heading in get_sections().items()
    ]

## Load

-  For the Load we are using the `plant_upkeep_as_list` variable we created to populate a dictionary
-  Then we plan on using that dictionary to populate a json file

#### Original

In [29]:
#turn a comma separated string into a list of whitespace-trimmed entries
def convert_names(string):
    """Split ``string`` on commas and trim every resulting piece.

    Empty pieces are kept, so an empty string yields ``[""]``.
    """
    comma_separated_pieces = string.split(",")
    return list(map(str.strip, comma_separated_pieces))

In [30]:
def make_dict(name):
    """Return ``{name: record}`` built from the global ``plant_upkeep_as_list``.

    The global list is read by position (0 = common names ... 7 = exposure).
    Positions 0 and 4 (common names, soil pH) are turned into lists with
    ``convert_names``; the remaining positions are stored as extracted.
    ``"Image"`` is an empty placeholder.
    """
    upkeep = plant_upkeep_as_list
    common_names = convert_names(upkeep[0])
    cultivation = upkeep[1]
    soil = {
        "Type": upkeep[2],
        "Drainage": upkeep[3],
        "pH": convert_names(upkeep[4]),
    }
    light = upkeep[5]
    placement = {"Aspect": upkeep[6], "Exposure": upkeep[7]}
    return {
        name: {
            "Common names": common_names,
            "Cultivation": cultivation,
            "Soil": soil,
            "Light": light,
            "Placement": placement,
            "Image": "",
        }
    }

#### Update

In [31]:
#turning long comma separated strings into a list
def convert_names(string):
    """Split a comma separated string into a list of trimmed, non-empty entries.

    Parameters
    ----------
    string : str
        Text such as ``"Money tree, Provision tree"``.

    Returns
    -------
    list of str
        One entry per comma separated piece, whitespace removed. Pieces that
        are empty after trimming (blank input, doubled or trailing commas) are
        left out, so a blank section gives ``[]`` rather than ``[""]``.
    """
    trimmed_pieces = (piece.strip() for piece in string.split(","))
    return [piece for piece in trimmed_pieces if piece]

In [26]:
def make_json_from_upkeep_list(plant_upkeep_list):
    """Arrange the extracted section texts into the nested per-plant record.

    ``plant_upkeep_list`` is read by position: 0 common names, 1 cultivation,
    2 soil type, 3 soil drainage, 4 soil pH, 5 light, 6 aspect, 7 exposure.
    Common names and soil pH are split into lists with ``convert_names``;
    the other values are stored unchanged. ``"Image"`` is an empty placeholder.
    """
    common_names = convert_names(plant_upkeep_list[0])
    cultivation = plant_upkeep_list[1]
    soil = {
        "Type": plant_upkeep_list[2],
        "Drainage": plant_upkeep_list[3],
        "pH": convert_names(plant_upkeep_list[4]),
    }
    light = plant_upkeep_list[5]
    placement = {
        "Aspect": plant_upkeep_list[6],
        "Exposure": plant_upkeep_list[7],
    }
    return {
        "Common names": common_names,
        "Cultivation": cultivation,
        "Soil": soil,
        "Light": light,
        "Placement": placement,
        "Image": "",
    }

## Full Process

In [34]:
# End-to-end run for a single plant, one statement per stage.
plant_name = "pachira-aquatica"
# Extract: download the page and keep the text of the details box.
plant_specific_text_result = get_html_text_for_plant(plant_name)
# Transform: slice that text into the individual upkeep sections...
upkeep_items = create_list_of_plant_upkeep_items(plant_specific_text_result)
# ...and arrange them into the nested record.
upkeep_json = make_json_from_upkeep_list(upkeep_items)
# Key the record by the plant's slug.
plant_json = {plant_name : upkeep_json}


2022-03-30 16:30:44,785 reading url https://www.shootgardening.co.uk/plant/pachira-aquatica
2022-03-30 16:30:44,790 Starting new HTTPS connection (1): www.shootgardening.co.uk:443
2022-03-30 16:30:45,252 https://www.shootgardening.co.uk:443 "GET /plant/pachira-aquatica HTTP/1.1" 200 None
2022-03-30 16:30:45,315 Between Other names and Genus the text 'Money tree, Provision tree, Shaving brush tree, Guiana chestnut, Water chestnut' was extracted
2022-03-30 16:30:45,316 Between Cultivation and Soil type the text 'Grow indoors in a partially shaded position. Water regularly, applying a balanced fertiliser each month. Maintain humidity with regular misting.' was extracted
2022-03-30 16:30:45,316 Between Soil type and Soil drainage the text 'Loamy' was extracted
2022-03-30 16:30:45,317 Between Soil drainage and Soil pH the text 'Moist but well-drained, Well-drained' was extracted
2022-03-30 16:30:45,318 Between Soil pH and Light the text 'Acid, Alkaline, Neutral' was extracted
2022-03-30 16:

In [39]:
def write_plant_json_to_file(plant_json, filename="data.json"):
    """Write ``plant_json`` to ``filename`` as JSON with a 2-space indent.

    The file is opened in ``'w'`` mode, so an existing file of the same name
    is replaced.
    """
    with open(filename, 'w') as output_file:
        output_file.write(json.dumps(plant_json, indent=2))

## Wrapping it up

In [40]:
def get_json_for_plant(plant_name):
    """Run the whole pipeline for one plant and return ``{plant_name: record}``.

    The page text comes from ``get_html_text_for_plant``, is split into
    sections by ``create_list_of_plant_upkeep_items`` and arranged into the
    nested record by ``make_json_from_upkeep_list``.
    """
    page_text = get_html_text_for_plant(plant_name)
    record = make_json_from_upkeep_list(
        create_list_of_plant_upkeep_items(page_text)
    )
    return {plant_name: record}

In [41]:
# Scrape one plant and save the result; no filename is given, so the
# function's default "data.json" is used.
plant_json = get_json_for_plant("pachira-aquatica")
write_plant_json_to_file(plant_json)

2022-03-30 16:33:32,240 reading url https://www.shootgardening.co.uk/plant/pachira-aquatica
2022-03-30 16:33:32,242 Starting new HTTPS connection (1): www.shootgardening.co.uk:443
2022-03-30 16:33:32,659 https://www.shootgardening.co.uk:443 "GET /plant/pachira-aquatica HTTP/1.1" 200 None
2022-03-30 16:33:32,744 Between Other names and Genus the text 'Money tree, Provision tree, Shaving brush tree, Guiana chestnut, Water chestnut' was extracted
2022-03-30 16:33:32,744 Between Cultivation and Soil type the text 'Grow indoors in a partially shaded position. Water regularly, applying a balanced fertiliser each month. Maintain humidity with regular misting.' was extracted
2022-03-30 16:33:32,744 Between Soil type and Soil drainage the text 'Loamy' was extracted
2022-03-30 16:33:32,745 Between Soil drainage and Soil pH the text 'Moist but well-drained, Well-drained' was extracted
2022-03-30 16:33:32,745 Between Soil pH and Light the text 'Acid, Alkaline, Neutral' was extracted
2022-03-30 16:

In [42]:
# Same pipeline for a different plant.
# NOTE: no filename is passed, so this writes to the default "data.json" in
# 'w' mode and replaces whatever an earlier call stored there.
plant = "calathea-makoyana"
plant_json = get_json_for_plant(plant)
write_plant_json_to_file(plant_json)

2022-03-30 16:34:55,482 reading url https://www.shootgardening.co.uk/plant/calathea-makoyana
2022-03-30 16:34:55,486 Starting new HTTPS connection (1): www.shootgardening.co.uk:443
2022-03-30 16:34:55,956 https://www.shootgardening.co.uk:443 "GET /plant/calathea-makoyana HTTP/1.1" 200 None
2022-03-30 16:34:56,022 Between Other names and Genus the text 'Peacock plant, Cathedral windows, Brain plant' was extracted
2022-03-30 16:34:56,023 Between Cultivation and Soil type the text 'Avoid direct sunlight. Enjoys humid, draught-free conditions with a consistent temperature of at least 16 °C. Water during the growing season, applying a liquid feed monthly and keep the compost moist but not wet during winter.' was extracted
2022-03-30 16:34:56,024 Between Soil type and Soil drainage the text 'Chalky, Clay, Loamy' was extracted
2022-03-30 16:34:56,025 Between Soil drainage and Soil pH the text 'Moist but well-drained, Well-drained' was extracted
2022-03-30 16:34:56,025 Between Soil pH and Ligh

In [55]:
# Scrape several plants and merge the results into one dictionary.
plants = ["calathea-makoyana", "pachira-aquatica", "aloe-any-succulent-variety"]
plants_json = {}
for plant in plants:
    # Each call returns a single-entry dict keyed by the plant's slug...
    plant_json = get_json_for_plant(plant)
    # ...so update() adds (or replaces) exactly that plant's record.
    plants_json.update(plant_json)


2022-03-30 16:38:02,661 reading url https://www.shootgardening.co.uk/plant/calathea-makoyana
2022-03-30 16:38:02,665 Starting new HTTPS connection (1): www.shootgardening.co.uk:443
2022-03-30 16:38:03,205 https://www.shootgardening.co.uk:443 "GET /plant/calathea-makoyana HTTP/1.1" 200 None
2022-03-30 16:38:03,273 Between Other names and Genus the text 'Peacock plant, Cathedral windows, Brain plant' was extracted
2022-03-30 16:38:03,273 Between Cultivation and Soil type the text 'Avoid direct sunlight. Enjoys humid, draught-free conditions with a consistent temperature of at least 16 °C. Water during the growing season, applying a liquid feed monthly and keep the compost moist but not wet during winter.' was extracted
2022-03-30 16:38:03,274 Between Soil type and Soil drainage the text 'Chalky, Clay, Loamy' was extracted
2022-03-30 16:38:03,274 Between Soil drainage and Soil pH the text 'Moist but well-drained, Well-drained' was extracted
2022-03-30 16:38:03,275 Between Soil pH and Ligh