#### Set styling for plotting

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# Matplotlib figure defaults: 6x6 figure size, white axes background, black axes edge.
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
# Make seaborn's 'colorblind' palette the default colour cycle for plain matplotlib plots too.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
# Default font for all figure text.
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# Register pandas' datetime converters with matplotlib so pandas date types plot correctly.
pd.plotting.register_matplotlib_converters()

#### Step 1: save environment file

In [14]:
# Snapshot the active conda environment to a YAML file in the working directory.
!conda env export > combined_met_environment.yml

#### Step 2: import modules

In [15]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv

## INFORMATION

## ODS - Met Éireann old (manual) system for recording adverse weather
Start date: 2012-04-25 12:00:00
End date: 2021-02-17 09:00:00
Only 1654 events in total, which seems low compared to the new RSS/XML system, even with advisories removed.
Will use data from 2013 to when the new system starts in 2018.

## XLSX 
Some of 2023 was missing from the XML-style data, so Met Éireann sent the full 2023 record in this Excel format.

#### Step 3: import data files (XML archive, ODS and XLSX)

In [16]:
# Locations of the three raw inputs. Only the first is a directory (of XML files);
# the ODS and XLSX entries point at single files despite the "data_directory_" prefix.
# NOTE(review): these are absolute, machine-specific paths — consider a configurable base directory.
data_directory_xml = "/mnt/hgfs/shared/weather_warnings/archive_warnings/archive"
data_directory_ods = "/mnt/hgfs/shared/project_data/met_eireann/Archived_Wx_Warnings_25April2012_17February2021.ods"
data_directory_xl = "/mnt/hgfs/shared/project_data/met_eireann/National warnings from pdfs_2023.xlsx"
# Normalise each location to an absolute path.
full_path_xml = os.path.abspath(data_directory_xml)
full_path_ods = os.path.abspath(data_directory_ods)
full_path_xl = os.path.abspath(data_directory_xl)

In [17]:
# Read the legacy .ods archive (needs the 'odf' engine); the three timestamp columns are parsed as datetimes.
df_ods = pd.read_excel(full_path_ods, engine='odf', parse_dates=['Issue Time', 'Valid From', 'Valid To'])
# Read the 2023 Excel workbook with default settings.
# NOTE(review): the recorded preview of df_xl shows mostly "Unnamed: N" columns, so this sheet
# probably needs header/skiprows handling before it can be combined with the other sources — confirm.
df_xl = pd.read_excel(full_path_xl)

In [18]:
# Preview the first few rows of the ODS archive.
print(df_ods.head())

# Column dtypes and non-null counts. DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() also emits a trailing "None".
print(df_ods.info())

# Overall shape of the frame.
print(f"Number of rows: {df_ods.shape[0]}")
print(f"Number of columns: {df_ods.shape[1]}")

# Earliest and latest issue time. Despite the "Filtered" label, no filter has been applied yet at this point.
print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

0 2012-04-25 12:00:00 2012-04-25 12:00:00 2012-04-26 12:00:00         Yellow   
1 2012-06-01 21:00:00 2012-06-02 12:00:00 2012-06-03 21:00:00         Yellow   
2 2012-06-02 14:00:00 2012-06-02 14:00:00 2012-06-03 12:00:00         Orange   
3 2012-06-08 10:00:00 2012-06-08 10:00:00 2012-06-08 23:59:00         Yellow   
4 2012-06-14 20:00:00 2012-06-14 20:00:00 2012-06-16 12:00:00         Yellow   

0            Rain                               Munster and Leinster   
1            Rain                     Munster, Connacht and Leinster   
2            Rain                               Munster and Leinster   
3            Rain                              Connacht and Leinster   
4            Rain  Munster, Leinster, Connacht, Donegal, Monaghan...   

0  Heavy rain moving into Southern coastal counti...     True   True   True   
1  Between 25 and 65 mm of rain possible, (heavie...     True   True   True   
2  Between 25mm & 65mm of rain expected over Lein...     True   True   True   
3

In [19]:
# Preview the first few rows of the 2023 Excel sheet.
print(df_xl.head())

# Column dtypes and non-null counts. DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() also emits a trailing "None".
print(df_xl.info())

# Overall shape of the frame.
print(f"Number of rows: {df_xl.shape[0]}")
print(f"Number of columns: {df_xl.shape[1]}")

# Date-range check left disabled.
# NOTE(review): the recorded preview shows no 'Issue Time' column in df_xl as loaded — confirm before re-enabling.
#print("ODS Filtered Date Range:")
#print(f"Start date: {df_xl['Issue Time'].min()}")
#print(f"End date: {df_xl['Issue Time'].max()}")

            Unnamed: 0 Unnamed: 1  \
0                Total        NaN   
1  2023-01-01 00:00:00    Fog/Ice   
2                  NaN        NaN   
3                  NaN        NaN   
4                  NaN        NaN   

                                      unprotect cafo Unnamed: 3 Unnamed: 4  \
0                                                NaN     Yellow     Orange   
1                                     Level: Yellow         244         50   
2                                   Type: Fog / Ice           0          0   
3  Message: Icy stretches along with patches of f...          0          0   
4                          Affected Regions: ireland          0          0   

  Unnamed: 5 Unnamed: 6  Unnamed: 7 Unnamed: 8  Unnamed: 9  ... Unnamed: 33  \
0        Red      Named         NaN    Ireland         NaN  ...          MH   
1        6.5          0         NaN          0         NaN  ...           0   
2          0          0         NaN          0         NaN  ...        

#### Step 4: process XML data 

In [20]:
def severity_to_color(severity):
    """Translate a CAP severity label into its warning colour.

    'Extreme' -> 'Red', 'Severe' -> 'Orange', 'Moderate' -> 'Yellow'.
    Any other hashable value (including None) yields 'notmapped'.
    """
    colour_by_severity = dict(Extreme='Red', Severe='Orange', Moderate='Yellow')
    try:
        return colour_by_severity[severity]
    except KeyError:
        # Unknown or missing severity: use the sentinel rather than failing.
        return 'notmapped'

def get_element_text(element, path, namespace):
    """Return the text of the first sub-element of `element` matching `path`.

    `namespace` is the prefix-to-URI mapping passed through to `find`.
    Returns None when nothing matches (a matching element with no text also gives None).
    """
    match = element.find(path, namespace)
    if match is None:
        return None
    return match.text

def parse_xml(file_path_xml):
    """Turn one CAP XML warning file into a flat row dict.

    The row holds the seven descriptive fields ('Issue Time', 'Valid From',
    'Valid To', 'Warning Element', 'Warning Text', 'WhereToText',
    'Warning Colour') followed by one 0/1 flag per county / local-authority
    column.

    Returns None when:
      * the file cannot be read or parsed (any exception is swallowed),
      * the alert has no <info> or no <area> element,
      * the first 'awareness_type' parameter value contains '22' (advisories),
      * the area carries no <geocode> entries.
    """
    cap_ns = {'cap': 'urn:oasis:names:tc:emergency:cap:1.2'}

    # FIPS geocode value -> output column name.
    fips_to_county = {
        'EI01': 'Carlow', 'EI02': 'Cavan', 'EI03': 'Clare', 'EI04': 'Cork', 'EI32': 'Cork City',
        'EI06': 'Donegal', 'EI33': 'Dublin City', 'EI34': 'Dún Laoghaire-Rathdown', 'EI35': 'Fingal',
        'EI10': 'Galway', 'EI36': 'Galway City', 'EI11': 'Kerry', 'EI12': 'Kildare', 'EI13': 'Kilkenny',
        'EI15': 'Laois', 'EI14': 'Leitrim', 'EI42': 'Limerick', 'EI37': 'Limerick City', 'EI18': 'Longford',
        'EI19': 'Louth', 'EI20': 'Mayo', 'EI21': 'Meath', 'EI22': 'Monaghan', 'EI23': 'Offaly',
        'EI24': 'Roscommon', 'EI25': 'Sligo', 'EI39': 'South Dublin', 'EI43': 'Tipperary',
        'EI44': 'Waterford', 'EI29': 'Westmeath', 'EI30': 'Wexford', 'EI31': 'Wicklow'
    }

    try:
        # Undecodable bytes are dropped rather than aborting the read.
        with codecs.open(file_path_xml, 'r', encoding='utf-8', errors='ignore') as handle:
            raw_text = handle.read()
        alert = ET.fromstring(raw_text)

        info = alert.find('cap:info', cap_ns)
        if info is None:
            return None

        # Only the first 'awareness_type' parameter is consulted; a value
        # containing '22' marks an advisory, which is excluded.
        for parameter in info.findall('cap:parameter', cap_ns):
            if get_element_text(parameter, 'cap:valueName', cap_ns) != 'awareness_type':
                continue
            awareness = get_element_text(parameter, 'cap:value', cap_ns)
            if awareness and '22' in awareness:
                return None
            break

        area = info.find('cap:area', cap_ns)
        if area is None:
            return None

        geocodes = area.findall('cap:geocode', cap_ns)
        if not geocodes:
            return None

        # Prefer <effective>; fall back to <onset> when it is missing or empty.
        valid_from = get_element_text(info, 'cap:effective', cap_ns)
        if not valid_from:
            valid_from = get_element_text(info, 'cap:onset', cap_ns)

        row = {
            'Issue Time': get_element_text(alert, 'cap:sent', cap_ns),
            'Valid From': valid_from,
            'Valid To': get_element_text(info, 'cap:expires', cap_ns),
            'Warning Element': get_element_text(info, 'cap:event', cap_ns),
            'Warning Text': get_element_text(info, 'cap:description', cap_ns),
            'WhereToText': get_element_text(area, 'cap:areaDesc', cap_ns),
            'Warning Colour': severity_to_color(get_element_text(info, 'cap:severity', cap_ns)),
        }

        # Every county column starts at 0 ...
        row.update(dict.fromkeys(fips_to_county.values(), 0))

        # ... and is switched to 1 for each recognised FIPS geocode.
        for geocode in geocodes:
            if get_element_text(geocode, 'cap:valueName', cap_ns) != 'FIPS':
                continue
            fips_code = get_element_text(geocode, 'cap:value', cap_ns)
            if fips_code in fips_to_county:
                row[fips_to_county[fips_code]] = 1

        return row

    except Exception:
        # Covers ET.ParseError as well as I/O and any other failure:
        # the caller only needs to know that no row was produced.
        return None

def process_files(data_directory):
    """Parse every ``*.xml`` file in `data_directory` into a warnings DataFrame.

    Parameters
    ----------
    data_directory : str
        Directory that directly contains the CAP XML files (not searched recursively).

    Returns
    -------
    tuple
        ``(df_xml, error_files, excluded_files)`` where

        * ``df_xml`` has one row per file for which ``parse_xml`` returned a row,
          with 'Issue Time', 'Valid From' and 'Valid To' converted to UTC
          datetimes (unparseable values become NaT);
        * ``error_files`` lists files that could not be re-read/parsed as XML;
        * ``excluded_files`` lists well-formed files for which ``parse_xml``
          returned None.
    """
    # glob returns files in arbitrary order; sort so the row order is
    # reproducible between runs and machines.
    file_list = sorted(glob.glob(os.path.join(data_directory, '*.xml')))

    data = []
    error_files = []
    excluded_files = []

    for file in tqdm(file_list, desc="Processing files"):
        row = parse_xml(file)
        if row is not None:
            data.append(row)
            continue

        # parse_xml returns None both for broken files and for skipped
        # warnings; re-parse the file to tell the two cases apart.
        try:
            with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
                ET.parse(f)
        except Exception:
            # `except Exception` rather than a bare `except:` so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop this long loop.
            error_files.append(file)
        else:
            excluded_files.append(file)

    df_xml = pd.DataFrame(data)

    # With no parsed rows the frame has no columns, hence the membership check.
    for col in ('Issue Time', 'Valid From', 'Valid To'):
        if col in df_xml.columns:
            df_xml[col] = pd.to_datetime(df_xml[col], utc=True, errors='coerce')

    return df_xml, error_files, excluded_files

def save_error_files(file_list, filename):
    """Write `file_list` to `filename` as a CSV with a single 'file' column and no index."""
    listing = pd.DataFrame({'file': file_list})
    listing.to_csv(filename, index=False)


# Run the XML extraction over the whole archive directory.
# The recorded progress bar shows roughly 4 minutes for 10,697 files.
df_xml, error_files, excluded_files = process_files(data_directory_xml)

# Save results
# df_xml.to_csv('weather_warnings.csv', index=False)
# Write the lists of unparseable and skipped files to the working directory for later inspection.
save_error_files(error_files, 'error_files.csv')
save_error_files(excluded_files, 'excluded_files.csv')

Processing files: 100%|███████████████████| 10697/10697 [04:10<00:00, 42.68it/s]


In [21]:
# Preview the first few rows of the parsed XML warnings.
print(df_xml.head())

# Column dtypes and non-null counts. DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() also emits a trailing "None".
print(df_xml.info())

# Overall shape of the frame.
print(f"Number of rows: {df_xml.shape[0]}")
print(f"Number of columns: {df_xml.shape[1]}")

# Earliest and latest issue time. Despite the "Filtered" label, no date filter has been applied here.
print("XML Filtered Date Range:")
print(f"Start date: {df_xml['Issue Time'].min()}")
print(f"End date: {df_xml['Issue Time'].max()}")
# Write the parsed warnings to output.csv in the current working directory.
# NOTE(review): the consolidation step later reads an output.csv from an absolute path —
# presumably this same file; confirm the working directory matches that path.
df_xml.to_csv("output.csv", index=False)

                 Issue Time                Valid From  \
0 2018-03-29 19:46:16+00:00 2018-03-22 23:00:01+00:00   
1 2018-03-29 19:50:05+00:00 2018-03-29 19:00:01+00:00   
2 2018-03-29 21:56:16+00:00 2018-03-29 19:00:01+00:00   
3 2018-03-29 23:10:21+00:00 2018-03-29 19:00:01+00:00   
4 2018-03-30 04:15:33+00:00 2018-03-29 19:00:01+00:00   


0  Heavy showery rain continuing overnight with s...     Ireland   
1  Heavy showery rain overnight with some wintry ...     Ireland   
2  Heavy showery rain overnight with some wintry ...     Ireland   
3  Heavy showery rain overnight with some wintry ...     Ireland   
4  Heavy showery rain overnight with some wintry ...     Ireland   

0         Yellow       1      1      1  ...         1       1          1   
1         Yellow       1      1      1  ...         1       1          1   
2         Yellow       1      1      1  ...         1       1          1   
3         Yellow       1      1      1  ...         1       1          1   
4         Y

#### Step 5: removing unneeded data and missing data from ODS 

In [22]:
# Province columns are not needed; HSE regions will be added after the sources are combined.
# NOTE: this reassigns df_ods, so re-running the cell without reloading the data raises KeyError
# (the columns are already gone).
df_ods= df_ods.drop(['Connacht', 'Leinster', 'Munster', 'Ulster'], axis=1)

In [23]:
# Take an explicit copy before adding/removing columns (no filtering has been applied at this point).
df_ods = df_ods.copy()

# It is not clear what 'Tipperary SR' represents and the XML data has no equivalent column,
# so fold it into 'Tipperary': a warning counts for Tipperary if either column is flagged.
df_ods['Tipperary'] = df_ods[['Tipperary', 'Tipperary SR']].max(axis=1)
# NOTE: not re-runnable as-is — a second run fails because 'Tipperary SR' no longer exists.
df_ods = df_ods.drop('Tipperary SR', axis=1)

In [70]:
###XML data has a lot of duplicates for the same event just issued multiple times, so this will consolidate into single events 
import pandas as pd
from datetime import datetime

def load_weather_warnings(file_path):
    """
    Read the warnings CSV at `file_path` and return it as a DataFrame.

    All columns from the file are kept; 'Issue Time', 'Valid From' and
    'Valid To' are converted with pd.to_datetime.
    """
    warnings = pd.read_csv(file_path)

    # Replace the three timestamp columns with parsed datetimes (column order is unchanged).
    parsed = {
        name: pd.to_datetime(warnings[name])
        for name in ('Issue Time', 'Valid From', 'Valid To')
    }
    return warnings.assign(**parsed)

def consolidate_warnings(df):
    """
    Collapse repeated issues of the same warning into one row per event.

    Rows are treated as the same event when they share 'Valid To',
    'Warning Element', 'Warning Colour' and 'Warning Text'; those four values
    are joined with '_' to form the 'event_key'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Issue Time', 'Valid From', 'Valid To', 'Warning Element',
        'Warning Text', 'WhereToText' and 'Warning Colour'. Every other column
        (apart from a pre-existing 'event_key') is carried through as a
        county column. The input frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        One row per event, sorted by 'event_key', with columns: 'event_key',
        'Issue Time', 'issue_count', 'first_issue', 'last_issue', 'Valid From',
        'Valid To', 'warning_type', 'warning_text', 'location',
        'warning_colour', then the county columns. Descriptive and county
        values come from the first non-null entry in each group (row order);
        'first_issue' / 'last_issue' are the min / max 'Issue Time'.
    """
    base_columns = ['Issue Time', 'Valid From', 'Valid To', 'Warning Element',
                    'Warning Text', 'WhereToText', 'Warning Colour']
    # A stale 'event_key' (e.g. from an earlier run) is not a county column;
    # including it in the aggregation would clash with the grouping key.
    county_columns = [col for col in df.columns
                      if col not in base_columns and col != 'event_key']

    # Work on a copy so the caller's frame does not gain an 'event_key' column.
    work = df.copy()
    work['event_key'] = [
        f"{valid_to}_{element}_{colour}_{text}"
        for valid_to, element, colour, text in zip(
            work['Valid To'], work['Warning Element'],
            work['Warning Colour'], work['Warning Text'])
    ]

    # Flat aggregation spec: keep the first value per event for every carried column.
    agg_dict = {col: 'first' for col in base_columns + county_columns}

    grouped = work.groupby('event_key')
    df_consolidated = grouped.agg(agg_dict).reset_index()

    # Re-issue metrics, computed per event and merged back on the key.
    issue_counts = grouped.size().reset_index(name='issue_count')
    first_issues = grouped['Issue Time'].min().reset_index(name='first_issue')
    last_issues = grouped['Issue Time'].max().reset_index(name='last_issue')

    df_consolidated = (df_consolidated
                       .merge(issue_counts, on='event_key')
                       .merge(first_issues, on='event_key')
                       .merge(last_issues, on='event_key'))

    # Rename the descriptive columns to the output naming scheme.
    df_consolidated = df_consolidated.rename(columns={
        'Warning Element': 'warning_type',
        'Warning Text': 'warning_text',
        'WhereToText': 'location',
        'Warning Colour': 'warning_colour'
    })

    column_order = [
        'event_key',
        'Issue Time',
        'issue_count',
        'first_issue',
        'last_issue',
        'Valid From',
        'Valid To',
        'warning_type',
        'warning_text',
        'location',
        'warning_colour'
    ] + county_columns

    # Only select columns that exist.
    final_columns = [col for col in column_order if col in df_consolidated.columns]
    return df_consolidated[final_columns]

def analyze_warnings(df):
    """
    Generate summary statistics for a consolidated warnings frame.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per event, with at least 'event_key', 'warning_type',
        'location' and 'issue_count' columns.

    Returns
    -------
    dict
        'total_warnings'       -- number of warnings issued, i.e. the sum of 'issue_count'
        'unique_events'        -- number of distinct 'event_key' values
        'warning_types'        -- {warning_type: number of events}
        'most_reissued'        -- {'warning_type', 'location', 'issue_count'} of the event
                                  with the highest 'issue_count' (placeholder values for
                                  an empty frame)
        'avg_issues_per_event' -- mean 'issue_count' (NaN for an empty frame)
    """
    if df.empty:
        # nlargest(...)[0] would raise IndexError on an empty frame; return a
        # placeholder with the same keys so callers can still format it.
        most_reissued = {'warning_type': None, 'location': None, 'issue_count': 0}
    else:
        most_reissued = (df.nlargest(1, 'issue_count')
                         [['warning_type', 'location', 'issue_count']]
                         .to_dict('records')[0])

    return {
        # Each row is already one consolidated event, so len(df) would just
        # repeat 'unique_events'; the issued total is the sum of the re-issue counts.
        'total_warnings': int(df['issue_count'].sum()),
        'unique_events': df['event_key'].nunique(),
        'warning_types': df['warning_type'].value_counts().to_dict(),
        'most_reissued': most_reissued,
        'avg_issues_per_event': df['issue_count'].mean()
    }

def process_weather_warnings(input_file, output_file):
    """
    Run the full consolidation pipeline on one warnings CSV.

    Loads `input_file`, consolidates repeated issues into events, prints a
    summary of the result, writes the consolidated frame to `output_file`
    (CSV, no index) and returns it.
    """
    print(f"Loading data from {input_file}...")
    raw_warnings = load_weather_warnings(input_file)

    print("Consolidating warnings...")
    consolidated = consolidate_warnings(raw_warnings)

    print("Analyzing results...")
    stats = analyze_warnings(consolidated)

    # --- summary report ---
    print("\nWeather Warnings Analysis Summary:")
    print(f"Total warnings issued: {stats['total_warnings']}")
    print(f"Number of unique events: {stats['unique_events']}")

    print("\nWarning types frequency:")
    for type_name, n_events in stats['warning_types'].items():
        print(f"- {type_name}: {n_events}")

    print("\nMost reissued warning:")
    top_event = stats['most_reissued']
    print(f"- Type: {top_event['warning_type']}")
    print(f"- Location: {top_event['location']}")
    print(f"- Times issued: {top_event['issue_count']}")

    print(f"\nAverage issues per event: {stats['avg_issues_per_event']:.2f}")

    # --- export ---
    print(f"\nExporting consolidated data to {output_file}...")
    consolidated.to_csv(output_file, index=False)
    print("Export complete!")

    return consolidated

# NOTE(review): Path is imported here but not used in this cell.
from pathlib import Path
# Input is the raw XML-derived CSV; output is the one-row-per-event CSV.
# NOTE(review): absolute, machine-specific paths. input_file is presumably the "output.csv"
# written by the XML inspection cell — confirm that cell's working directory matches this path.
input_file = "/mnt/hgfs/shared/ul_project_Msc_AI/data/met_eireann/eda/output.csv"
output_file = "/mnt/hgfs/shared/ul_project_Msc_AI/data/met_eireann/eda/consolidated_weather_warnings.csv"
df_xml_consolidated = process_weather_warnings(input_file, output_file)

Loading data from /mnt/hgfs/shared/ul_project_Msc_AI/data/met_eireann/eda/output.csv...
Analyzing results...

Number of unique events: 1353

- Yellow Wind: 129
- Yellow Rain: 123
- Yellow Thunderstorm: 103
- Yellow Snow/Ice: 27
- Orange Wind: 21
- Yellow fog: 20
- Yellow Low Temperature/Ice: 20
- Orange Rain: 15
- Orange Thunderstorm: 13
- Yellow High Temperature: 10
- Orange Snow/Ice: 6
- Red Wind: 5
- Orange Low Temperature/Ice: 5
- Orange High Temperature: 2
- Rain: 1
- Orange Fog: 1

- Location: Ireland
- Times issued: 49

Average issues per event: 5.94

Export complete!


In [71]:
# Count consolidated XML events issued in a single year.
# The year is taken from 'Issue Time', which consolidate_warnings fills with the first row per event.
year_to_count = 2017
events_in_year = len(df_xml_consolidated[df_xml_consolidated['Issue Time'].dt.year == year_to_count])

print(f"Number of events in {year_to_count}: {events_in_year}")

# Distribution of consolidated XML events across all years, in chronological order.
year_counts = df_xml_consolidated['Issue Time'].dt.year.value_counts().sort_index()
print("\nEvents per year:")
print(year_counts)

Number of events in 2017: 91

Events per year:
Issue Time
2017     91
2018    258
2019    176
2020    270
2021    206
2022    212
2023    140
Name: count, dtype: int64


In [63]:
# Same per-year count as the XML cell, applied to the ODS archive for comparison.
# NOTE(review): this duplicates the XML cell with only the frame changed — a small helper taking
# the frame as a parameter would avoid the copy-paste.
year_to_count = 2017
events_in_year = len(df_ods[df_ods['Issue Time'].dt.year == year_to_count])

print(f"Number of events in {year_to_count}: {events_in_year}")

# Distribution of ODS events across all years, in chronological order.
year_counts = df_ods['Issue Time'].dt.year.value_counts().sort_index()
print("\nEvents per year:")
print(year_counts)

Number of events in 2017: 193

Events per year:
Issue Time
2012     14
2013    135
2014    180
2015    255
2016    122
2017    193
2018    249
2019    187
2020    277
2021     42
Name: count, dtype: int64


#### Step 6: confirm date/time settings, check date ranges and select appropriate ranges to combine

In [19]:
# Make the three timestamp columns timezone-aware UTC in both frames so they can be
# compared and combined consistently.
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_ods[datetime_cols] = df_ods[datetime_cols].apply(pd.to_datetime, utc=True)
df_xml_consolidated[datetime_cols] = df_xml_consolidated[datetime_cols].apply(pd.to_datetime, utc=True)

In [21]:
# Issue-time span of each source, used to choose the cut-over dates below.
# Despite the "Filtered" labels, both frames are still unfiltered at this point.
print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

print("XML Filtered Date Range:")
print(f"Start date: {df_xml_consolidated['Issue Time'].min()}")
print(f"End date: {df_xml_consolidated['Issue Time'].max()}")

ODS Filtered Date Range:
Start date: 2012-04-25 12:00:00+00:00
End date: 2021-02-17 09:00:00+00:00
XML Filtered Date Range:
Start date: 2017-09-25 04:34:48+00:00
End date: 2023-08-05 11:33:52+00:00


In [24]:
# On examination, the XML data has duplicate entries for the same event, e.g. a warning may be
# issued multiple times during one event.
# The ODS data has only individual events, so use as much ODS data as possible and take the
# (consolidated, de-duplicated) XML data only for the period after it.
# Filter ODS data from the start of 2013 to the end of 2020.
# TODO: check plots around the date change and check for duplicates.
df_ods_filtered = df_ods[
    (df_ods['Issue Time'] >= '2013-01-01') & 
    (df_ods['Issue Time'] <= '2020-12-31 23:59:59')
]

# Filter XML data from the start of 2021 to 2023-08-05, so the two ranges do not overlap.
df_xml_filtered = df_xml_consolidated[
    (df_xml_consolidated['Issue Time'] >= '2021-01-01') & 
    (df_xml_consolidated['Issue Time'] <= '2023-08-05 23:59:59')
]


In [4]:
# Export the filtered XML events to the working directory.
# NOTE(review): the filename says 2020, but df_xml_filtered is cut at >= 2021-01-01 in the filter cell.
# The recorded NameError shows this cell was run in a kernel where the filter cell had not been
# executed; run the notebook top to bottom before exporting.
df_xml_filtered.to_csv('xml_warnings_2020_2023_08.csv', index=False)

NameError: name 'df_xml_filtered' is not defined