In [2]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# (plant name, FAO code) pairs, one per line; the code is the id used to look up each plant
plants = [
    ("Apples", 1407),
    ("Bananas", 2483),
    ("Barley", 1232),
    ("Cotton", 1158),
    ("Dates", 1673),
    ("Eggplants", 1965),
    ("Grapes", 2160),
    ("Maize (Corn)", 2175),
    ("Mango", 1416),
    ("Olives", 1553),
    ("Onions and Shallots", 364),
    ("Oranges", 720),
    ("Peanut", 2199),
    ("Potatoes", 1971),
    ("Rice", 1574),
    ("Sorghum", 48747),
    ("Soybean", 1150),
    ("Sugar Beet", 514),
    ("Sugar Cane", 1884),
    ("Sunflowerseed", 1191),
    ("Tomatoes", 1379),
    ("Watermelon", 708),
    ("Wheat", 2114),
]

In [4]:
# Address builder for FAO EcoCrop data sheets
def get_fao_url(plant_id):
    """Return the EcoCrop data-sheet URL whose ``id`` query parameter is ``plant_id``."""
    data_sheet_template = "https://ecocrop.apps.fao.org/ecocrop/srv/en/dataSheet?id={}"
    return data_sheet_template.format(plant_id)

In [63]:
# Scraping of one EcoCrop data sheet: cell layouts, private helpers, entry point

# (key suffix, <td> index) pairs naming which cells of an Ecology-table row are kept
_RANGE_COLUMNS = (
    (' Optimal Min', 0),
    (' Optimal Max', 1),
    (' Absolute Min', 2),
    (' Absolute Max', 3),
)
_RIGHT_HALF_COLUMNS = ((' Optimal', 4), (' Absolute', 5))
_HIGH_ROW_COLUMNS = ((' Optimal', 5), (' Absolute', 6))


def _store_row(target, row, header_index, columns):
    """Copy selected <td> texts of ``row`` into ``target``.

    Each key is 'Eco ' + the stripped text of the row's ``header_index``-th
    <th> + a suffix from ``columns``; each value is the stripped text of the
    <td> at the index paired with that suffix.

    Raises:
        IndexError: if the row has no <th>/<td> at a requested index.
    """
    label = 'Eco ' + row.find_all('th')[header_index].text.strip()
    cells = row.find_all('td')
    for suffix, cell_index in columns:
        target[label + suffix] = cells[cell_index].text.strip()


def _parse_description(table):
    """Return {'Desc <th text>': <td text>} for every row after the header row."""
    desc_data = {}
    for row in table.find_all('tr')[1:]:  # Skip header row
        for key, value in zip(row.find_all('th'), row.find_all('td')):
            desc_data['Desc ' + key.text.strip()] = value.text.strip()
    return desc_data


def _parse_ecology(main_table, extra_table):
    """Return the 'Eco ...' attributes read from the two Ecology tables.

    Keys are inserted in reading order (high row, middle rows, low row, then
    the second table), which fixes the column order of the resulting record.

    Raises:
        IndexError: if a table has fewer rows or cells than are read here.
    """
    eco_data = {}

    rows = main_table.find_all('tr')
    # high row: first header, values taken from td[5] and td[6]
    _store_row(eco_data, rows[2], 0, _HIGH_ROW_COLUMNS)
    # middle rows: min/max range under the first header (left half),
    # optimal/absolute pair under the second header (right half)
    for row in rows[3:8]:
        _store_row(eco_data, row, 0, _RANGE_COLUMNS)
        _store_row(eco_data, row, 1, _RIGHT_HALF_COLUMNS)
    # low row: only the left-hand min/max range
    _store_row(eco_data, rows[8], 0, _RANGE_COLUMNS)

    # Remaining attributes: plain header -> value pairs, two per row
    rows = extra_table.find_all('tr')
    for row in rows[:3]:
        _store_row(eco_data, row, 0, (('', 0),))
        _store_row(eco_data, row, 1, (('', 1),))
    # last row carries a single header/value pair
    _store_row(eco_data, rows[3], 0, (('', 0),))

    return eco_data


def scrape_fao_data(plant_name, plant_id, timeout=30):
    """Download and parse the EcoCrop data sheet of one plant.

    The raw response body is written to ``htmls/<plant_name>_<plant_id>.html``
    (the directory is created when missing) before the status code is
    checked, so it is saved for non-200 responses as well.

    Args:
        plant_name: Display name; used in the saved file name, in messages
            and as the "Plant Name" value of the result.
        plant_id: Identifier handed to ``get_fao_url`` and stored as "ID".
        timeout: Seconds ``requests.get`` may wait on the server. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        A dict with "ID", "Plant Name" and "Scientific Name" (stripped text
        of the first <h2>, or None when the page has none), followed by the
        'Desc ...' and 'Eco ...' entries. Returns None, after printing a
        message, when the request raises, the status code is not 200, or the
        page lacks the tables/cells that are read.
    """
    url = get_fao_url(plant_id)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network errors like any other failed fetch so that one bad
        # request does not abort a whole batch.
        print(f"Failed to fetch data for {plant_name} (ID: {plant_id}): {exc}")
        return None

    os.makedirs('htmls', exist_ok=True)
    with open(f'htmls/{plant_name}_{plant_id}.html', 'wb') as f:
        f.write(response.content)

    if response.status_code != 200:
        print(f"Failed to fetch data for {plant_name} (ID: {plant_id})")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extracting plant scientific name
    heading = soup.find('h2')
    sci_name = heading.text.strip() if heading is not None else None

    # Table 0 is the "Description" table; tables 1 and 2 hold the "Ecology" data
    tables = soup.find_all('table')
    try:
        desc_data = _parse_description(tables[0])
        eco_data = _parse_ecology(tables[1], tables[2])
    except IndexError:
        print(f"Unexpected page layout for {plant_name} (ID: {plant_id})")
        return None

    # Returning extracted data
    return {
        "ID": plant_id,
        "Plant Name": plant_name,
        "Scientific Name": sci_name,
        **desc_data,
        **eco_data
    }

In [64]:
# Destination of the combined table and pause (in seconds) between requests
OUTPUT_CSV = "D:\\fao_plant_data.csv"
PAUSE_SECONDS = 1

# Scrape each plant in turn, keeping only the records that came back non-empty
data = []
for plant_name, plant_id in plants:
    plant_data = scrape_fao_data(plant_name, plant_id)
    if plant_data:
        data.append(plant_data)
    # Wait after every request so the server is not overwhelmed
    time.sleep(PAUSE_SECONDS)

# One row per scraped plant, written to disk without the index column
df = pd.DataFrame(data)
df.to_csv(OUTPUT_CSV, index=False)

print("Data extraction completed! CSV file saved.")

Data extraction completed! CSV file saved.
