### **NWS SPC Mesoscale Discussion Scraper**
#### **Created by Tyson Stewart for use in EAE 598/698**
##### Data courtesy of Iowa Environmental Mesonet

Import necessary packages and read CSV of MDs and their relevant details.

In [1]:
import geopandas as gp
import pandas as pd
import urllib.request

csv = pd.read_csv("central_ok_mds.csv")

#Create new column called "Watch Parameter" that by default will be set to all zeroes unless a watch is present
csv["WATCH PARAMETER"] = [0] * len(csv["CONCERN"])

csv

Unnamed: 0,SPC PRODUCT ID,YEAR,ISSUE,SPC PRODUCT NUM,MONTH,DAY,TIME (UTC),NUM,CONFIDEN,CONCERN,WATCH PARAMETER
0,201401102125-KWNS-ACUS11-SWOMCD,2014,2.014010e+11,12,1,10,2125,12,,,0
1,201402020948-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,52,2,2,948,52,,,0
2,201402021349-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,53,2,2,1349,53,,,0
3,201402021609-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,54,2,2,1609,54,,,0
4,201402040934-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,61,2,4,934,61,,,0
...,...,...,...,...,...,...,...,...,...,...,...
1068,202411041812-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2203,11,4,1812,2203,,,0
1069,202411082136-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2222,11,8,2136,2222,,,0
1070,202411180802-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2229,11,18,802,2229,,,0
1071,202411181009-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2231,11,18,1009,2231,,,0


MD CSV does not contain the actual text, and is inconsistent regarding watch probability and MD type. To fix this we can use IEM's API to scrape each MD for the relevant details.

In [2]:
# Lists to have our scrapped data inserted into
concern_list = []
prob_list = []

#List of terms in the "Concerning" line that indicate that the MD is NOT a watch
watch_tag = ["Unlikely", "Likely", "Possible", "UNLIKELY", "LIKELY", "POSSIBLE", "unlikely", "likely", "possible", "needed soon", "NEEDED SOON"]

c = 0

for i in range(len(csv["SPC PRODUCT ID"])):
    #Pull the MD text from IEM API
    prod_id = csv["SPC PRODUCT ID"][i]
    link = f"https://mesonet.agron.iastate.edu/api/1/nwstext/{prod_id}"
    text = urllib.request.urlopen(link)
    #Create/reset flag variables for Watch/MD processing
    c_flag = 0
    w_flag = 0
    c += 1
    
    #Check every line in the MD Text to find "concerning" and watch probability lines
    for line in text:
        if ("CONCERNING" in str(line) or "Concerning" in str(line)) and c_flag == 0:
            d = 0
            concern = str(line[13:])
            concern_strip = concern.strip("\n")[2:-4]

            concern_list.append(concern_strip)
            #print(f"{prod_id} is {concern_strip}")

            if ("WATCH" in concern_strip) or ("Watch" in concern_strip):
                verify_flag = 1
                for i in watch_tag:
                    #Check if any of our watch disqualifying terms (watch_tag) are in the concerning tag.
                    if watch_tag[d] in concern_strip:
                        verify_flag = 0
                        break
                    d += 1
                
                # If "Watch" is in the concerning tag, and there is none of the watch disqualifying terms are present, it must be an active watch.
                if verify_flag == 1:
                    w_flag = 1
                    prob_list.append(100)
                    #print("ACTIVE WATCH!")

            c_flag = 1
        
        if "PROBABILITY OF WATCH ISSUANCE" in str(line) or "Probability of Watch Issuance" in str(line):
            watch_prob = str(line)[34:-11]
            #print(f"Watch prob is {watch_prob}%")
            prob_list.append(watch_prob)
            w_flag = 1
    
    #If no watch probability data is present, assume that it is 0%.
    if w_flag == 0:
        watch_prob = 0
        #print(f"Watch prob is {watch_prob}%")
        prob_list.append(watch_prob)

#concern_list

***Unit Testing:*** Verify that the watch probability and MD type lists match in length.

In [3]:
print(len(concern_list))
print(len(prob_list))

1073
1073


Iterates through each list and applies the value into the CSV. Also check if the watch probability is 100% and if so set the "Watch Parameter" value to 1. (Watch Parameter will serve as our binary classifier for the ML model).

In [4]:
for i in range(len(prob_list)):
    csv.loc[i, "CONFIDEN"] = prob_list[i]

    if csv["CONFIDEN"][i] == 100:
        csv.loc[i, "WATCH PARAMETER"] = 1

for i in range(len(prob_list)):
    csv.loc[i, "CONCERN"] = concern_list[i]
csv

  csv.loc[i, "CONFIDEN"] = prob_list[i]
  csv.loc[i, "CONCERN"] = concern_list[i]


Unnamed: 0,SPC PRODUCT ID,YEAR,ISSUE,SPC PRODUCT NUM,MONTH,DAY,TIME (UTC),NUM,CONFIDEN,CONCERN,WATCH PARAMETER
0,201401102125-KWNS-ACUS11-SWOMCD,2014,2.014010e+11,12,1,10,2125,12,5,SEVERE POTENTIAL...WATCH UNLIKELY,0
1,201402020948-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,52,2,2,948,52,0,WINTER MIXED PRECIPITATION,0
2,201402021349-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,53,2,2,1349,53,0,WINTER MIXED PRECIPITATION,0
3,201402021609-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,54,2,2,1609,54,0,HEAVY SNOW,0
4,201402040934-KWNS-ACUS11-SWOMCD,2014,2.014020e+11,61,2,4,934,61,0,HEAVY SNOW,0
...,...,...,...,...,...,...,...,...,...,...,...
1068,202411041812-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2203,11,4,1812,2203,100,Tornado Watch 704..,1
1069,202411082136-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2222,11,8,2136,2222,100,Tornado Watch 708..,1
1070,202411180802-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2229,11,18,802,2229,80,Severe potential...Watch likely,0
1071,202411181009-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2231,11,18,1009,2231,100,Tornado Watch 710..,1


Filter the newly modified CSV file to only contain MDs concerning severe thunderstorm and tornado threats.

In [5]:
sub_csv = csv[(csv["CONCERN"].str.contains("SEVERE")) | (csv["CONCERN"].str.contains("Severe")) | (csv["CONCERN"].str.contains("TORNADO")) | (csv["CONCERN"].str.contains("Tornado"))]

sub_csv

Unnamed: 0,SPC PRODUCT ID,YEAR,ISSUE,SPC PRODUCT NUM,MONTH,DAY,TIME (UTC),NUM,CONFIDEN,CONCERN,WATCH PARAMETER
0,201401102125-KWNS-ACUS11-SWOMCD,2014,2.014010e+11,12,1,10,2125,12,5,SEVERE POTENTIAL...WATCH UNLIKELY,0
8,201404020118-KWNS-ACUS11-SWOMCD,2014,2.014040e+11,244,4,2,118,244,100,SEVERE THUNDERSTORM WATCH 45..,1
9,201404020242-KWNS-ACUS11-SWOMCD,2014,2.014040e+11,245,4,2,242,245,100,SEVERE THUNDERSTORM WATCH 45..,1
10,201404021919-KWNS-ACUS11-SWOMCD,2014,2.014040e+11,248,4,2,1919,248,80,SEVERE POTENTIAL...TORNADO WATCH LIKELY,0
11,201404022300-KWNS-ACUS11-SWOMCD,2014,2.014040e+11,250,4,2,2300,250,100,TORNADO WATCH 46..,1
...,...,...,...,...,...,...,...,...,...,...,...
1068,202411041812-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2203,11,4,1812,2203,100,Tornado Watch 704..,1
1069,202411082136-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2222,11,8,2136,2222,100,Tornado Watch 708..,1
1070,202411180802-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2229,11,18,802,2229,80,Severe potential...Watch likely,0
1071,202411181009-KWNS-ACUS11-SWOMCD,2024,2.024110e+11,2231,11,18,1009,2231,100,Tornado Watch 710..,1


***Unit Testing:*** Evaluate CSV fields to ensure that there are no null data points.

In [6]:
sub_csv.reset_index(drop=True, inplace=True)

key_list = list(sub_csv.keys())

# Flag variable to raise if a null value is detected
null_flag = 0

#Test each key to ensure all data is present
for i in key_list:
    for j in range(0, (len(sub_csv[i]) - 1)):
        if sub_csv.loc[j, i] == "NaN" or sub_csv.loc[j, i] == "Null" or sub_csv.loc[j, i] == None:
            null_flag = 1
            print(f"NULL VALUE DETECTED AT INDEX {j}!")

if null_flag == 0:
    print("No null values detected. Check passed.")

No null values detected. Check passed.


Write the newly filtered CSV to disk.

In [7]:
sub_csv.to_csv("central_ok_mds_FINAL.csv")