In [None]:
import requests
import xml.etree.ElementTree as ET

class InPortValidator():
    """
    Checks an InPort DMP XML document against the values listed in NOAA's
    published sample DMP XML.

    The document to validate is given either as a file path or as an
    already-parsed tree (exactly one of the two). On construction the sample
    XML is downloaded and the list of valid DMP types is read from it.
    """

    def __init__(self, dmp_xml_file_path: str = None,
                 dmp_xml_element_tree: ET.ElementTree = None):
        """
        Args:
            dmp_xml_file_path: Path to the DMP XML file to validate.
            dmp_xml_element_tree: Parsed DMP XML (an ElementTree, or its root
                Element) to validate.

        Raises:
            ValueError: If neither or both arguments are given, or if the
                valid DMP types cannot be read from the sample XML.
        """
        self.dmp_xml_file_path = dmp_xml_file_path
        self.dmp_xml_element_tree = dmp_xml_element_tree
        self._handle_xml_file()
        self.inport_sample_url = "https://www.fisheries.noaa.gov/inport/help/xml-loader/inport-xml-sample-dmp.xml"
        self.sample_root = self.load_xml_from_url(self.inport_sample_url)
        self._set_valid_values_from_sample_xml()

    @staticmethod
    def _as_root(xml):
        # Accept either an ElementTree (which has getroot()) or a root Element.
        return xml.getroot() if hasattr(xml, "getroot") else xml

    def _handle_xml_file(self):
        """
        Loads self.root from whichever XML source was given to __init__.

        Raises:
            ValueError: If neither or both XML sources were provided.
        """
        has_path = self.dmp_xml_file_path is not None
        has_tree = self.dmp_xml_element_tree is not None
        if not has_path and not has_tree:
            raise ValueError("Either dmp_xml_file_path or dmp_xml_element_tree must be provided")
        # Raised explicitly (not asserted) so the check survives `python -O`.
        if has_path and has_tree:
            raise ValueError(
                "Either dmp_xml_file_path or dmp_xml_element_tree must be provided, but not both."
            )

        if has_path:
            # Load XML from file path
            self.root = ET.parse(self.dmp_xml_file_path).getroot()
        else:
            # Use the provided tree's root element
            self.root = self._as_root(self.dmp_xml_element_tree)

    def load_xml_from_url(self, url, timeout=30):
        """
        Fetches XML content from a URL and parses it into an ElementTree root
        element.

        Args:
            url: Address of the XML document.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The root Element, or None if the request or the parse failed (the
            error is printed).
        """
        try:
            # A timeout keeps the call from hanging on an unresponsive server
            response = requests.get(url, timeout=timeout)
            # Raise an exception if the request was unsuccessful
            response.raise_for_status()

            # Parse the raw bytes so the XML declaration's encoding is honored
            return ET.fromstring(response.content)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching URL: {e}")
            return None
        except ET.ParseError as e:
            print(f"Error parsing XML: {e}")
            return None

    def _set_valid_values_from_sample_xml(self):
        """Populates every valid-value list that is read from the sample XML."""
        self._set_valid_dmp_types()

    def _set_valid_dmp_types(self):
        """
        Extracts the list of valid DMP types from the sample XML and stores it
        in self.valid_dmp_types.

        The sample's dmp-type element holds text of the form
        "<label>: type A, type B, ... ."; everything after the first colon is
        split on commas.

        Raises:
            ValueError: If the sample XML is unavailable or has no dmp-type
                text.
        """
        if self.sample_root is None:
            # load_xml_from_url returns None on failure; fail with a clear
            # message instead of an AttributeError on None.
            raise ValueError("The InPort sample XML could not be loaded; valid DMP types are unavailable.")
        dmp_type_element = self.sample_root.find("./general-information/dmp-type")
        if dmp_type_element is None or not dmp_type_element.text:
            raise ValueError("The InPort sample XML has no ./general-information/dmp-type text.")

        description = dmp_type_element.text
        listed = description[description.find(":") + 1:].split(",")
        # Trim surrounding whitespace (including newlines) and the final period
        cleaned = [entry.strip(" .\t\r\n") for entry in listed]
        self.valid_dmp_types = [entry for entry in cleaned if entry]

    def _validate_dmp_type(self, dmp_type: str) -> bool:
        """Returns True if dmp_type is one of the valid DMP types."""
        return dmp_type in self.valid_dmp_types

    def validate_xml(self, xml_file_path: str = None, xml: ET.ElementTree = None) -> bool:
        """
        Validates a DMP XML document against the sample XML.

        Currently this checks only the DMP type.
        NOTE(review): assumes the document stores its DMP type at
        ./general-information/dmp-type, the same location as in the sample -
        confirm against the InPort schema.

        Args:
            xml_file_path: Optional path of a document to validate.
            xml: Optional parsed document (ElementTree or root Element). Takes
                precedence over xml_file_path when both are given.
                When neither is given, the document passed to __init__ is
                validated.

        Returns:
            True if the document's DMP type is valid, False if it is missing
            or not one of the valid types.
        """
        if xml is not None:
            root = self._as_root(xml)
        elif xml_file_path is not None:
            root = ET.parse(xml_file_path).getroot()
        else:
            root = self.root

        dmp_type_element = root.find("./general-information/dmp-type")
        if dmp_type_element is None or dmp_type_element.text is None:
            return False
        return self._validate_dmp_type(dmp_type_element.text.strip())

In [26]:

# URL of NOAA's published sample DMP XML
inport_sample_url = "https://www.fisheries.noaa.gov/inport/help/xml-loader/inport-xml-sample-dmp.xml"

# InPortValidator raises ValueError unless it is given a document to validate
# (a file path or a parsed tree), so pass it the sample document itself.
sample_response = requests.get(inport_sample_url, timeout=30)
sample_response.raise_for_status()
validator = InPortValidator(
    dmp_xml_element_tree=ET.ElementTree(ET.fromstring(sample_response.content))
)
# Print the tag of the root element
print(f'Root tag: {validator.sample_root.tag}')
# Print the tag and attributes of each child element
for child in validator.sample_root:
    print(child.tag, child.attrib)

Root tag: inport-metadata
item-identification {}
general-information {}
program-information {}
coverage {}
data-acquisition {}
data-collection {}
contact-information {}
resources {}
data-protection {}
data-lineage {}
data-documentation {}
data-access {}
long-term-preservation {}
comments {}
related-items {'mode': 'replace'}
catalog-details {}


In [27]:
# Show the valid DMP type names the validator read from the sample XML.
print(validator.valid_dmp_types)

['Initial DMP', 'Updated DMP', 'Final DMP', 'Funding Proposal DMP (Grants/CAs)', 'Updated DMP (Grants/CAs)', 'Final DMP (Grants/CAs)', 'Program-level DMP', 'No Data DMP']


In [18]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup

def get_max_page_number_from_inport(url: str = "") -> int:
    """
    Reads the total page count from a paginated InPort help listing.

    The page is searched for text containing "Page 1 of <N>", and N is
    returned.

    Args:
        url: Address of the first page of the listing. When empty, the
            programs listing is used.

    Returns:
        The number of pages, or 0 if no "Page 1 of <N>" text was found.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched.
    """
    if not url:
        # Fall back to the programs listing only when the caller gave no URL
        url = "https://www.fisheries.noaa.gov/inport/help/components/programs?page=0"
    r = requests.get(url, timeout=30)
    # Do not try to parse an error page for a page count
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    found_partial = soup.find(string=re.compile('Page 1 of '))
    # Capture only the digits so text after the number cannot break int()
    page_count = re.search(r'Page 1 of\s*(\d+)', found_partial) if found_partial else None
    if page_count:
        max_page = int(page_count.group(1))
        print(f"Max page: {max_page}")
        return max_page
    print("Max page not found")
    return 0

def get_programs_table() -> pd.DataFrame:
    """
    Scrapes every page of the InPort programs listing into one DataFrame.

    Returns:
        The first HTML table of each page, concatenated in page order with a
        fresh index. An empty DataFrame is returned if the page count could
        not be determined.
    """
    base_url = "https://www.fisheries.noaa.gov/inport/help/components/programs"
    # Get the maximum page number from the first page of the listing
    max_page = get_max_page_number_from_inport(url=f"{base_url}?page=0")
    if max_page <= 0:
        # pd.concat raises on an empty list, so return an empty table instead
        return pd.DataFrame()

    # pd.read_html downloads the page itself and returns every table on it;
    # the programs table is the first one. Pages are numbered from 0.
    dfs = [
        pd.read_html(f"{base_url}?page={curr_page}")[0]
        for curr_page in range(max_page)
    ]

    # Concatenate all per-page DataFrames into a single DataFrame
    return pd.concat(dfs, ignore_index=True)

# Scrape all pages; as the cell's last expression the combined table is displayed.
get_programs_table()

Max page: 3


Unnamed: 0,Org Group,Program
0,NESDIS,"National Environmental Satellite, Data, and In..."
1,NESDIS,NESDIS > Chief of Staff (COS)
2,NESDIS,NESDIS > Cooperative Institute for Climate and...
3,NESDIS,NESDIS > Cooperative Institute for Meteorologi...
4,NESDIS,NESDIS > Cooperative Institute for Research in...
...,...,...
199,OMAO,OMAO > Marine Operations (MO)
200,OMAO,OMAO > NOAA Commissioned Corps (NOAA Corps)
201,OMAO,OMAO > NOAA Diving Program (NDP)
202,OMAO,OMAO > Small Boat Program (SBP)
