#### Set styling for plotting

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout

# Matplotlib figure defaults: square 6x6 figures, white axes background,
# black axes border.
plt.rcParams["figure.figsize"] = (6, 6)
plt.rcParams["axes.facecolor"] = "white"
plt.rcParams["axes.edgecolor"] = "black"

# Cycle line/marker colours through seaborn's colour-blind-friendly palette.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))

# Global font settings applied to every figure.
font = dict(family='sans-serif', weight='normal', size=14)
plt.rc('font', **font)

# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()

#### Step 1: save environment file

In [17]:
# Shell command (IPython `!`): write the active conda environment's package
# specification to combined_met_environment.yml in the working directory.
!conda env export > combined_met_environment.yml

#### Step 2: import modules

In [18]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
import pytz

## INFORMATION
This dataset, in Excel format, provides the full year of 2023 adverse weather data.


#### Step 3: import Excel file

In [19]:
# Location of the 2023 national-warnings workbook. Despite the variable name,
# this is the path of the .xlsx file itself, not of a directory.
data_directory_xl = "/mnt/hgfs/shared/project_data/met_eireann/National warnings from pdfs_2023.xlsx"
# Absolute form of the path; the next cell passes this to pd.read_excel.
full_path_xl = os.path.abspath(data_directory_xl)

In [20]:
# Read the 2023 excel file
# No sheet/header options are passed, so pandas applies its defaults
# (first sheet, first spreadsheet row used as column names).
df_xl = pd.read_excel(full_path_xl)

In [21]:
# Preview the first five rows of the raw sheet as loaded.
df_xl.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,unprotect cafo,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Ulster,Unnamed: 40,Unnamed: 41,Unnamed: 42
0,Total,,,Yellow,Orange,Red,Named,,Ireland,,...,MH,OY,WH,WX,WW,,All,DL,CN,MN
1,2023-01-01 00:00:00,Fog/Ice,Level: Yellow,243,50,6,0,,0,,...,0,0,0,0,0,,0,0,0,0
2,,,Type: Fog / Ice,0,0,0,0,,0,,...,0,0,0,0,0,,0,0,0,0
3,,,Message: Icy stretches along with patches of f...,0,0,0,0,,0,,...,0,0,0,0,0,,0,0,0,0
4,,,Affected Regions: ireland,0,0,0,0,,1,,...,0,0,0,0,0,,0,0,0,0


In [22]:
# Boolean masks for rows whose yellow ('Unnamed: 3') or orange ('Unnamed: 4')
# flag cell equals 1.
yellow_mask = df_xl['Unnamed: 3'] == 1
orange_mask = df_xl['Unnamed: 4'] == 1

# Text cells ('unprotect cafo') belonging to the flagged rows.
yellow_events = df_xl.loc[yellow_mask, 'unprotect cafo']
orange_events = df_xl.loc[orange_mask, 'unprotect cafo']

# Uncomment to list every yellow text; the cell output is the number of
# non-null yellow texts.
#print(yellow_events.to_string(index=False))
yellow_events.count()

244

In [23]:
# Keep only the yellow-flagged texts that lack the expected "Level: Yellow"
# marker (missing texts are treated as not containing it).
has_yellow_label = yellow_events.str.contains("Level: Yellow", na=False)
yellow_events_filtered = yellow_events.loc[~has_yellow_label]

# Show the remaining anomalies, one per line, without the index column.
print(
    "\nAnomalous yellow events (without 'Level: Yellow'):",
    yellow_events_filtered.to_string(index=False),
    sep="\n",
)


Anomalous yellow events (without 'Level: Yellow'):
                                            Yellow
                                            yellow


In [24]:
# Keep only the orange-flagged texts that lack the expected "Level: Orange"
# marker (missing texts are treated as not containing it).
orange_events_filtered = orange_events[~orange_events.str.contains("Level: Orange", na=False)]

# Display the filtered events (the anomalies). The heading names orange
# events: this cell inspects `orange_events`, not the yellow ones.
print("\nAnomalous orange events (without 'Level: Orange'):")
print(orange_events_filtered.to_string(index=False))


Anomalous yellow events (without 'Level: Orange'):


#### Step 4: process Excel data 

In [30]:
def standardize_datetime(date_str):
    """Convert a free-text day-first date/time into an ISO 8601 UTC string.

    Parameters
    ----------
    date_str : str
        Text such as ``"Friday 13/01/2023 10:00"``. A trailing
        ``"... / Updated <other date>"`` remark is ignored. Non-string
        input (``None``, NaN, ...) is treated as "no date".

    Returns
    -------
    str or None
        ``YYYY-MM-DDTHH:MM:SS+00:00`` on success. ``None`` when the input is
        not text, has fewer than two whitespace-separated tokens, or cannot
        be parsed (a message is printed when parsing raises).

    Notes
    -----
    The parsed wall-clock time is labelled as UTC without any conversion.
    NOTE(review): if the source times are Irish local time this is off by
    one hour during summer time - confirm against the data provider.
    """
    if not isinstance(date_str, str):
        return None

    try:
        text = date_str.strip()

        # Drop an "updated ..." remark. Cutting at the marker (rather than at
        # the first '/') keeps the slashes that belong to the date itself.
        marker = text.lower().find('updated')
        if marker != -1:
            text = text[:marker].rstrip(' \t/-(,;:')

        token_count = len(text.split())
        if token_count < 2:  # A lone weekday, date or time is incomplete
            return None

        formats = (
            "%A %d/%m/%Y %H:%M",
            "%H:%M %A %d/%m/%Y",
            "%A %d/%m/%Y %H.%M",
            "%d/%m/%Y %H:%M",
            "%d/%m/%Y %H.%M",
            "%A %d/%m/%Y",  # Format without time
        )
        for fmt in formats:
            try:
                parsed = datetime.strptime(text, fmt)
            except ValueError:
                continue
            return parsed.replace(tzinfo=timezone.utc).isoformat()

        # The fuzzy fallback fills in missing parts with defaults, so it is
        # only trusted for strings with at least three tokens.
        if token_count < 3:
            return None

        # Last resort - pandas parsing; day-first because the explicit
        # formats above are all dd/mm/yyyy.
        stamp = pd.to_datetime(text, dayfirst=True)
        if pd.isna(stamp):
            return None
        if stamp.tzinfo is None:
            stamp = stamp.tz_localize('UTC')
        else:
            # tz_localize raises on tz-aware values; convert instead.
            stamp = stamp.tz_convert('UTC')
        return stamp.isoformat()
    except Exception as e:
        # Deliberate best-effort: report the offending value and carry on.
        print(f"Error converting date {date_str}: {str(e)}")
        return None

# County names, in the order they appear as output columns.
_COUNTY_NAMES = (
    'Carlow', 'Cavan', 'Clare', 'Cork', 'Donegal',
    'Dublin', 'Galway', 'Kerry', 'Kildare', 'Kilkenny',
    'Laois', 'Leitrim', 'Limerick', 'Longford', 'Louth',
    'Mayo', 'Meath', 'Monaghan', 'Offaly', 'Roscommon',
    'Sligo', 'Tipperary', 'Waterford', 'Westmeath',
    'Wexford', 'Wicklow',
)

# Two-letter sheet labels -> county name.
_COUNTY_ABBREVIATIONS = {
    'CK': 'Cork', 'CE': 'Clare', 'KY': 'Kerry', 'LK': 'Limerick',
    'TY': 'Tipperary', 'WD': 'Waterford', 'GY': 'Galway', 'LM': 'Leitrim',
    'MO': 'Mayo', 'RN': 'Roscommon', 'SO': 'Sligo', 'CW': 'Carlow',
    'DN': 'Dublin', 'KE': 'Kildare', 'KK': 'Kilkenny', 'LS': 'Laois',
    'LD': 'Longford', 'LH': 'Louth', 'MH': 'Meath', 'OY': 'Offaly',
    'WH': 'Westmeath', 'WX': 'Wexford', 'WW': 'Wicklow', 'DL': 'Donegal',
    'CN': 'Cavan', 'MN': 'Monaghan'
}

_EVENT_START_COLUMN = 'Unnamed: 1'   # non-null on the first row of an event
_TEXT_COLUMN = 'unprotect cafo'      # free-text line of every row
# (output key, source column, colour name) for the three level flags.
_FLAG_COLUMNS = (
    ('flag_yellow', 'Unnamed: 3', 'Yellow'),
    ('flag_orange', 'Unnamed: 4', 'Orange'),
    ('flag_red', 'Unnamed: 5', 'Red'),
)


def _resolve_label_columns(df, labels):
    """Map each label to the DataFrame column that carries its values.

    A label that is a real column name maps to itself. Otherwise, if the
    first row is not an event row, its string cells are treated as embedded
    headers and the label maps to the column holding that cell. Labels found
    in neither place are omitted.
    """
    header_cells = {}
    if len(df) > 0 and pd.isna(df.iloc[0][_EVENT_START_COLUMN]):
        for column, cell in df.iloc[0].items():
            if isinstance(cell, str):
                header_cells.setdefault(cell.strip(), column)

    resolved = {}
    for label in labels:
        if label in df.columns:
            resolved[label] = label
        elif label in header_cells:
            resolved[label] = header_cells[label]
    return resolved


def _is_set(value):
    """Return True when a flag cell is numerically equal to 1."""
    try:
        return float(value) == 1
    except (TypeError, ValueError):
        return False


def _flag_value(value):
    """Return the cell as float, or 0 when it is missing or not numeric."""
    if pd.isna(value):
        return 0
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0


def _strip_label(text, label):
    """Remove the first case-insensitive occurrence of `label` and trim."""
    position = text.lower().find(label.lower())
    if position < 0:
        return text.strip()
    return (text[:position] + text[position + len(label):]).strip()


def _mark_all_counties(event):
    """Flag every county of `event` as affected."""
    for county in _COUNTY_NAMES:
        event[county] = 1


def _apply_text_line(event, text_line, row, county_columns):
    """Store the field carried by one text line (if any) on `event`."""
    lowered = text_line.lower()
    if 'type:' in lowered:
        event['Warning Element'] = _strip_label(text_line, 'Type:')
    elif 'message:' in lowered:
        event['Warning Text'] = _strip_label(text_line, 'Message:')
    elif 'issue time:' in lowered:
        event['Issue Time'] = standardize_datetime(_strip_label(text_line, 'Issue Time:'))
    elif 'expected onset:' in lowered:
        event['Valid From'] = standardize_datetime(_strip_label(text_line, 'Expected Onset:'))
    elif 'expires:' in lowered:
        event['Valid To'] = standardize_datetime(_strip_label(text_line, 'Expires:'))
    elif 'affected regions:' in lowered:
        regions = _strip_label(text_line, 'Affected Regions:')
        if 'ireland' in regions.lower():
            _mark_all_counties(event)
        else:
            # Per-county flags sit on the same row as the regions text.
            for abbr, full_name in _COUNTY_ABBREVIATIONS.items():
                column = county_columns.get(abbr)
                if column is not None and _is_set(row[column]):
                    event[full_name] = 1


def _finalize_event(event, text_lines):
    """Attach the combined text and colour; return None for advisories."""
    full_text = " ".join(text_lines)
    # Keep the full combined text for debugging.
    event['Combined Text'] = full_text

    lowered = full_text.lower()
    if "advisory" in lowered:
        return None

    # First colour whose flag is positive and whose name occurs in the text.
    # NOTE(review): substring match, so "red" also matches e.g. "scattered".
    for key, _column, colour in _FLAG_COLUMNS:
        if event.get(key, 0) > 0 and colour.lower() in lowered:
            event['Warning Colour'] = colour
            break
    return event


def transform_warnings_data(df):
    """Collapse the multi-row warnings sheet into one row per warning event.

    An event starts on every row whose 'Unnamed: 1' cell is non-null and
    continues until the next such row. Text lines from 'unprotect cafo' are
    scanned for "Type:", "Message:", "Issue Time:", "Expected Onset:",
    "Expires:" and "Affected Regions:" labels (case-insensitive). Events
    whose combined text contains "advisory" are dropped.

    The 'Ireland' and county-abbreviation flag columns are located either by
    column name or, when the sheet's labels ended up in the first data row
    (columns named 'Unnamed: N'), by the matching cell of that first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet with at least 'Unnamed: 1' and 'unprotect cafo' columns.
        'Unnamed: 3'/'Unnamed: 4'/'Unnamed: 5' are read as the yellow,
        orange and red flags when present.

    Returns
    -------
    pandas.DataFrame
        One row per retained event with 'Warning Element', one 0/1 column per
        county, 'flag_yellow'/'flag_orange'/'flag_red', 'Combined Text' and,
        where found, 'Warning Text', 'Issue Time', 'Valid From', 'Valid To'
        and 'Warning Colour'.
    """
    label_columns = _resolve_label_columns(df, ['Ireland', *_COUNTY_ABBREVIATIONS])
    ireland_column = label_columns.get('Ireland')

    processed_data = []
    current_event = None
    event_texts = []  # Text lines accumulated for the current event

    for _, row in df.iterrows():
        if pd.notna(row[_EVENT_START_COLUMN]):
            # Close the previous event before opening a new one.
            if current_event is not None:
                finished = _finalize_event(current_event, event_texts)
                if finished is not None:
                    processed_data.append(finished)

            # The Type: line, if present, later overrides this initial value.
            current_event = {'Warning Element': row[_EVENT_START_COLUMN]}
            current_event.update(dict.fromkeys(_COUNTY_NAMES, 0))
            for key, column, _colour in _FLAG_COLUMNS:
                current_event[key] = _flag_value(row.get(column))
            event_texts = []
        elif current_event is None:
            # Rows before the first event (e.g. an embedded header row).
            continue

        raw_text = row[_TEXT_COLUMN]
        text_line = str(raw_text) if pd.notna(raw_text) else ""
        event_texts.append(text_line)
        _apply_text_line(current_event, text_line, row, label_columns)

        # A set country-wide flag on any row of the event marks all counties.
        if ireland_column is not None and _is_set(row[ireland_column]):
            _mark_all_counties(current_event)

    # Finalize the last event (if any)
    if current_event is not None:
        finished = _finalize_event(current_event, event_texts)
        if finished is not None:
            processed_data.append(finished)

    return pd.DataFrame(processed_data)

In [31]:
# Collapse the raw multi-row sheet into one row per warning event.
df_xlsx = transform_warnings_data(df_xl)

#### Step 5 : Test and remove provinces, mapping to HSE regions instead

In [32]:
# List the processed columns with their non-null counts and dtypes.
df_xlsx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 1   Carlow           299 non-null    int64  
 2   Cavan            299 non-null    int64  
 3   Clare            299 non-null    int64  
 4   Cork             299 non-null    int64  
 5   Donegal          299 non-null    int64  
 6   Dublin           299 non-null    int64  
 7   Galway           299 non-null    int64  
 8   Kerry            299 non-null    int64  
 9   Kildare          299 non-null    int64  
 10  Kilkenny         299 non-null    int64  
 11  Laois            299 non-null    int64  
 12  Leitrim          299 non-null    int64  
 13  Limerick         299 non-null    int64  
 14  Longford         299 non-null    int64  
 15  Louth            299 non-null    int64  
 16  Mayo             299 non-null    int64  
 17  Meath            299 non-null    int64  
 18  Monaghan        

No provinces are listed among the columns after processing.

In [33]:
# Tally how many processed events carry each warning colour
# (events without a colour are not counted by value_counts).
warning_counts = df_xlsx['Warning Colour'].value_counts()
print("Warning Counts:", warning_counts, sep="\n")

Yellow    240
Orange     48
Red         6
Name: count, dtype: int64


In [35]:
# Write the processed events to CSV, omitting the DataFrame index column.
df_xlsx.to_csv('/mnt/hgfs/shared/project_data/met_eireann/xl_warnings_2023.csv', index=False)