<a href="https://colab.research.google.com/github/m-wessler/oper-scrape/blob/main/AFD_Word_Scrape_weather_gov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import requests
import pandas as pd
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

# Model / guidance names whose mentions are tallied in each forecast discussion.
# Multi-word entries are matched as whole phrases by the scraping loop.
search_terms = [
    "GFS",
    "GFS ENSEMBLE",
    "GEFS",
    "ECMWF",
    "ECMWF ENSEMBLE",
    "EPS",
    "HRRR",
    "HREF",
    "ENSEMBLE",
    "EFI",
    "NBM",
    "NATIONAL BLEND",
    "CLUSTER",
]

# Wording related to uncertainty / confidence that is tallied alongside the
# model names.
precision_terms = [
    "PERCENTILE",
    "PROBABILITY",
    "POSSIBLE",
    "EXPECTED",
    "CHANCE",
    "LIKELY",
]

# Three-letter identifiers of the forecast offices (WFOs) to scrape.
WFO = [
    "BYZ", "BOI", "LKN", "EKA", "FGZ", "GGW",
    "TFX", "VEF", "LOX", "MFR", "MSO", "PDT",
    "PSR", "PIH", "PQR", "REV", "STO", "SLC",
    "SGX", "MTR", "HNX", "SEW", "OTX", "TWC",
]

# Per-office tallies, all starting at zero. dict.fromkeys() is called once per
# office, so every office owns an independent inner dict.
wfo_search_term_counts = {site: dict.fromkeys(search_terms, 0) for site in WFO}
wfo_precision_term_counts = {site: dict.fromkeys(precision_terms, 0) for site in WFO}

# Seconds to wait on weather.gov before abandoning a single request. Without a
# timeout, a stalled connection would block the whole scrape indefinitely.
REQUEST_TIMEOUT = 30

# Compile every whole-word, case-insensitive pattern once up front rather than
# rebuilding it for each term on each downloaded page.
search_patterns = {
    term: re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
    for term in search_terms
}
precision_patterns = {
    term: re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
    for term in precision_terms
}

# Every request targets the same host, so share one session and let it reuse
# the underlying connection instead of opening a new one per page.
with requests.Session() as session:
    # Iterate over each WFO site
    for wfo in tqdm(WFO, desc='WFO Sites'):
        # Iterate over the most recent 15 versions
        for version in tqdm(range(1, 16), desc=f'Versions for {wfo}', leave=False):
            # Fetch the content from the URL
            url = f'https://forecast.weather.gov/product.php?site=NWS&issuedby={wfo}&product=AFD&format=CI&version={version}&glossary=1'
            try:
                response = session.get(url, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                # Best effort: report the failure (timeouts included) and move
                # on to the next version; this page adds nothing to the counts.
                print(f"Failed to retrieve version {version} for WFO {wfo}: {e}")
                continue

            content = response.text

            # Parse the HTML content to get the text
            soup = BeautifulSoup(content, 'html.parser')
            text_content = soup.get_text()

            # Add this page's whole-word occurrences to the office's running totals
            for term, pattern in search_patterns.items():
                wfo_search_term_counts[wfo][term] += len(pattern.findall(text_content))

            for term, pattern in precision_patterns.items():
                wfo_precision_term_counts[wfo][term] += len(pattern.findall(text_content))

# Turn each nested {office: {term: count}} mapping into a table with one row
# per office and one column per term.
search_term_df, precision_term_df = (
    pd.DataFrame(tallies).transpose()
    for tallies in (wfo_search_term_counts, wfo_precision_term_counts)
)

# Place the two tables side by side under a two-level column header.
combined_df = pd.concat(
    [search_term_df, precision_term_df],
    axis=1,
    keys=['Search Terms', 'Precision Terms'],
)

# Save the combined table to disk.
combined_df.to_csv('./test.csv')

# Announce the table, then leave it as the cell's final expression so the
# notebook renders it.
print("Combined Term Counts:")
combined_df

In [None]:
# Bare expression: evaluates `combined_df` so the notebook displays it as this cell's output.
combined_df

Unnamed: 0_level_0,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Search Terms,Precision Terms,Precision Terms,Precision Terms,Precision Terms,Precision Terms,Precision Terms
Unnamed: 0_level_1,GFS,GFS ENSEMBLE,GEFS,ECMWF,ECMWF ENSEMBLE,EPS,HRRR,HREF,ENSEMBLE,EFI,NBM,NATIONAL BLEND,CLUSTER,PERCENTILE,PROBABILITY,POSSIBLE,EXPECTED,CHANCE,LIKELY
BYZ,20,0,0,5,0,0,2,0,6,0,11,3,1,4,3,35,66,94,15
BOI,2,0,0,2,0,0,0,0,2,0,0,0,0,0,2,34,26,74,4
LKN,4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,15,51,8,16
EKA,4,0,0,7,3,0,1,18,26,3,8,0,0,2,5,51,75,71,59
FGZ,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,27,67,11,22
GGW,2,0,0,0,0,0,1,3,0,0,15,0,0,6,2,13,38,10,5
TFX,0,0,0,0,0,0,0,0,11,0,4,0,0,0,19,6,27,30,0
VEF,0,0,0,0,0,4,0,2,5,0,16,0,9,0,4,44,103,15,28
LOX,0,0,0,0,0,2,6,2,6,0,2,0,0,0,0,64,89,103,79
MFR,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,71,16,73
