In [1]:
##save env file
!conda env export > combined_met_environment.yml

In [2]:
##import all modules here 
import pandas as pd
import matplotlib as plt
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv

In [3]:
##set data directories
data_directory_xml = "/mnt/hgfs/shared/weather_warnings/archive_warnings/archive"
data_directory_ods = "/mnt/hgfs/shared/ul_project_Msc_AI/data/met_eireann/eda/Archived_Wx_Warnings_25April2012_17February2021.ods"
full_path_xml = os.path.abspath(data_directory_xml)
full_path_ods = os.path.abspath(data_directory_ods)


if os.path.exists(data_directory_xml):
    print(f"XML format directory exists: {data_directory_xml}")
else:
    print(f"XML format directory does not exist: {data_directory_xml}")
print(os.getcwd())

if os.path.exists(data_directory_ods):
    print(f"ODS format directory exists: {data_directory_ods}")
else:
    print(f"ODS format directory does not exist: {data_directory_ods}")
print(os.getcwd())

/mnt/hgfs/shared/ul_project_Msc_AI/data/met_eireann/eda
/mnt/hgfs/shared/ul_project_Msc_AI/data/met_eireann/eda


In [4]:
# Read the .ods file
df_ods = pd.read_excel(full_path_ods, engine='odf', parse_dates=['Issue Time', 'Valid From', 'Valid To'])

# check the first few rows
print(df_ods.head())

# look at dataframe info
print(df_ods.info())

# check the df shape
print(f"Number of rows: {df_ods.shape[0]}")
print(f"Number of columns: {df_ods.shape[1]}")

print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

0 2012-04-25 12:00:00 2012-04-25 12:00:00 2012-04-26 12:00:00         Yellow   
1 2012-06-01 21:00:00 2012-06-02 12:00:00 2012-06-03 21:00:00         Yellow   
2 2012-06-02 14:00:00 2012-06-02 14:00:00 2012-06-03 12:00:00         Orange   
3 2012-06-08 10:00:00 2012-06-08 10:00:00 2012-06-08 23:59:00         Yellow   
4 2012-06-14 20:00:00 2012-06-14 20:00:00 2012-06-16 12:00:00         Yellow   

0            Rain                               Munster and Leinster   
1            Rain                     Munster, Connacht and Leinster   
2            Rain                               Munster and Leinster   
3            Rain                              Connacht and Leinster   
4            Rain  Munster, Leinster, Connacht, Donegal, Monaghan...   

0  Heavy rain moving into Southern coastal counti...     True   True   True   
1  Between 25 and 65 mm of rain possible, (heavie...     True   True   True   
2  Between 25mm & 65mm of rain expected over Lein...     True   True   True   
3

In [8]:
df_ods.head()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
0,2012-04-25 12:00:00,2012-04-25 12:00:00,2012-04-26 12:00:00,Yellow,Rain,Munster and Leinster,Heavy rain moving into Southern coastal counti...,True,True,True,...,True,True,False,False,False,False,False,False,False,False
1,2012-06-01 21:00:00,2012-06-02 12:00:00,2012-06-03 21:00:00,Yellow,Rain,"Munster, Connacht and Leinster","Between 25 and 65 mm of rain possible, (heavie...",True,True,True,...,True,True,False,False,False,True,True,True,True,True
2,2012-06-02 14:00:00,2012-06-02 14:00:00,2012-06-03 12:00:00,Orange,Rain,Munster and Leinster,Between 25mm & 65mm of rain expected over Lein...,True,True,True,...,True,True,False,False,False,False,False,False,False,False
3,2012-06-08 10:00:00,2012-06-08 10:00:00,2012-06-08 23:59:00,Yellow,Rain,Connacht and Leinster,Further persistent and sometimes heavy rain to...,False,False,False,...,True,True,False,False,False,True,True,True,True,True
4,2012-06-14 20:00:00,2012-06-14 20:00:00,2012-06-16 12:00:00,Yellow,Rain,"Munster, Leinster, Connacht, Donegal, Monaghan...","Further spells of rain, persistant and heavy a...",True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [6]:
(df_ods.isna().sum()/df_ods.shape[0] * 100).sort_values()

Issue Time         0.000000
Longford           0.000000
Louth              0.000000
Meath              0.000000
Offaly             0.000000
Westmeath          0.000000
Wexford            0.000000
Wicklow            0.000000
Ulster             0.000000
Cavan              0.000000
Donegal            0.000000
Monaghan           0.000000
Connacht           0.000000
Galway             0.000000
Leitrim            0.000000
Mayo               0.000000
Laois              0.000000
Kilkenny           0.000000
Kildare            0.000000
Dublin             0.000000
Valid From         0.000000
Valid To           0.000000
Munster            0.000000
Roscommon          0.000000
Cork               0.000000
Clare              0.000000
Limerick           0.000000
Tipperary          0.000000
Tipperary SR       0.000000
Waterford          0.000000
Leinster           0.000000
Carlow             0.000000
Kerry              0.000000
Sligo              0.000000
WhereToText        0.181378
dtype: float64

## ODS - MEt Eirean old system (manual) for recording Adverse weather
Start date: 2012-04-25 12:00:00
End date: 2021-02-17 09:00:00
only 1654 events in totat which seems low compared to the new rss xml system even with Advisories removed
will use data from 2013 to when the new system starts in 2018 


In [7]:
##we dont need provinces will add hse regions after combined
df_ods= df_ods.drop(['Connacht', 'Leinster', 'Munster', 'Ulster'], axis=1)

In [11]:
df_ods.head()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
0,2012-04-25 12:00:00,2012-04-25 12:00:00,2012-04-26 12:00:00,Yellow,Rain,Munster and Leinster,Heavy rain moving into Southern coastal counti...,True,True,True,...,True,True,False,False,False,False,False,False,False,False
1,2012-06-01 21:00:00,2012-06-02 12:00:00,2012-06-03 21:00:00,Yellow,Rain,"Munster, Connacht and Leinster","Between 25 and 65 mm of rain possible, (heavie...",True,True,True,...,True,True,False,False,False,True,True,True,True,True
2,2012-06-02 14:00:00,2012-06-02 14:00:00,2012-06-03 12:00:00,Orange,Rain,Munster and Leinster,Between 25mm & 65mm of rain expected over Lein...,True,True,True,...,True,True,False,False,False,False,False,False,False,False
3,2012-06-08 10:00:00,2012-06-08 10:00:00,2012-06-08 23:59:00,Yellow,Rain,Connacht and Leinster,Further persistent and sometimes heavy rain to...,False,False,False,...,True,True,False,False,False,True,True,True,True,True
4,2012-06-14 20:00:00,2012-06-14 20:00:00,2012-06-16 12:00:00,Yellow,Rain,"Munster, Leinster, Connacht, Donegal, Monaghan...","Further spells of rain, persistant and heavy a...",True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [13]:
# Convert boolean counties to binary (0/1)
df_ods = df_ods.astype({col: int for col in df_ods.select_dtypes(include=['bool']).columns})

In [25]:
# Create a copy of the filtered dataf
df_ods = df_ods.copy()

# Now do the merge
df_ods['Tipperary'] = df_ods[['Tipperary', 'Tipperary SR']].max(axis=1)
df_ods = df_ods.drop('Tipperary SR', axis=1)

# XML data parsing

In [9]:
def severity_to_color(severity):
    """Map severity levels to warning colors"""
    mapping = {
        'Extreme': 'Red',
        'Severe': 'Orange',
        'Moderate': 'Yellow'
    }
    return mapping.get(severity, 'Yellowish')  # Default to Yellow if unknown

def get_element_text(element, path, namespace):
    """Safely get text from an XML element"""
    found = element.find(path, namespace)
    return found.text if found is not None else None

def parse_xml(file_path_xml):
    """Parse an XML file and extract relevant information."""
    try:
        # Read the file content
        with codecs.open(file_path_xml, 'r', encoding='utf-8', errors='ignore') as file:
            xml_content = file.read()
        
        # Parse the XML content
        root = ET.fromstring(xml_content)
        namespace = {'cap': 'urn:oasis:names:tc:emergency:cap:1.2'}
        
        # Extract data from the info element first to check if we should process this warning
        info = root.find('cap:info', namespace)
        if info is None:
            return None
            
        # Check for advisory warnings (type 22) - exclude them
        parameters = info.findall('cap:parameter', namespace)
        for param in parameters:
            if get_element_text(param, 'cap:valueName', namespace) == 'awareness_type':
                awareness_type = get_element_text(param, 'cap:value', namespace)
                if awareness_type and '22' in awareness_type:
                    return None
                break
        
        # Check for county information
        area = info.find('cap:area', namespace)
        if area is None:
            return None
            
        geocodes = area.findall('cap:geocode', namespace)
        if not geocodes:
            return None

        # Initialize row with the old column format
        row = {
            'Issue Time': get_element_text(root, 'cap:sent', namespace),
            'Valid From': (get_element_text(info, 'cap:effective', namespace) or 
                         get_element_text(info, 'cap:onset', namespace)),
            'Valid To': get_element_text(info, 'cap:expires', namespace),
            'Warning Element': get_element_text(info, 'cap:event', namespace),
            'Warning Text': get_element_text(info, 'cap:description', namespace),
            'WhereToText': get_element_text(area, 'cap:areaDesc', namespace),
            'Warning Colour': severity_to_color(get_element_text(info, 'cap:severity', namespace))
        }
        
        # Initialize all county columns to 0
        county_info = {
            'EI01': 'Carlow', 'EI02': 'Cavan', 'EI03': 'Clare', 'EI04': 'Cork', 'EI32': 'Cork City',
            'EI06': 'Donegal', 'EI33': 'Dublin City', 'EI34': 'Dún Laoghaire-Rathdown', 'EI35': 'Fingal',
            'EI10': 'Galway', 'EI36': 'Galway City', 'EI11': 'Kerry', 'EI12': 'Kildare', 'EI13': 'Kilkenny',
            'EI15': 'Laois', 'EI14': 'Leitrim', 'EI42': 'Limerick', 'EI37': 'Limerick City', 'EI18': 'Longford',
            'EI19': 'Louth', 'EI20': 'Mayo', 'EI21': 'Meath', 'EI22': 'Monaghan', 'EI23': 'Offaly',
            'EI24': 'Roscommon', 'EI25': 'Sligo', 'EI39': 'South Dublin', 'EI43': 'Tipperary',
            'EI44': 'Waterford', 'EI29': 'Westmeath', 'EI30': 'Wexford', 'EI31': 'Wicklow'
        }
        
        # Initialize all counties to 0
        for county_name in county_info.values():
            row[county_name] = 0
            
        # Set affected counties to 1
        for gc in geocodes:
            if get_element_text(gc, 'cap:valueName', namespace) == 'FIPS':
                county_code = get_element_text(gc, 'cap:value', namespace)
                if county_code in county_info:
                    row[county_info[county_code]] = 1
        
        return row
    
    except ET.ParseError as e:
        return None
    except Exception as e:
        return None

def process_files(data_directory):
    """Process all XML files in the given directory and extract weather warning data."""
    file_pattern = os.path.join(data_directory, '*.xml')
    file_list = glob.glob(file_pattern)

    data = []
    error_files = []
    excluded_files = []

    for file in tqdm(file_list, desc="Processing files"):
        row = parse_xml(file)
        if row is not None:
            data.append(row)
        else:
            try:
                with codecs.open(file, 'r', encoding='utf-8', errors='ignore') as f:
                    ET.parse(f)
                excluded_files.append(file)
            except:
                error_files.append(file)

    # Create DataFrame
    df_xml = pd.DataFrame(data)

    # Convert date fields to datetime
    date_columns = ['Issue Time', 'Valid From', 'Valid To']
    for col in date_columns:
        if col in df_xml.columns:
            df_xml[col] = pd.to_datetime(df_xml[col], utc=True, errors='coerce')

    return df_xml, error_files, excluded_files

def save_error_files(file_list, filename):
    """Save list of error files to CSV"""
    pd.DataFrame({'file': file_list}).to_csv(filename, index=False)


df_xml, error_files, excluded_files = process_files(data_directory_xml)

# Save results
# df_xml.to_csv('weather_warnings.csv', index=False)
save_error_files(error_files, 'error_files.csv')
save_error_files(excluded_files, 'excluded_files.csv')

Processing files: 100%|██████████| 10697/10697 [03:26<00:00, 51.77it/s]


In [10]:
# check the first few rows
print(df_xml.head())

# look at dataframe info
print(df_xml.info())

# check the df shape
print(f"Number of rows: {df_xml.shape[0]}")
print(f"Number of columns: {df_xml.shape[1]}")

print("XML Filtered Date Range:")
print(f"Start date: {df_xml['Issue Time'].min()}")
print(f"End date: {df_xml['Issue Time'].max()}")

                 Issue Time                Valid From  \
0 2018-03-29 19:46:16+00:00 2018-03-22 23:00:01+00:00   
1 2018-03-29 19:50:05+00:00 2018-03-29 19:00:01+00:00   
2 2018-03-29 21:56:16+00:00 2018-03-29 19:00:01+00:00   
3 2018-03-29 23:10:21+00:00 2018-03-29 19:00:01+00:00   
4 2018-03-30 04:15:33+00:00 2018-03-29 19:00:01+00:00   


0  Heavy showery rain continuing overnight with s...     Ireland   
1  Heavy showery rain overnight with some wintry ...     Ireland   
2  Heavy showery rain overnight with some wintry ...     Ireland   
3  Heavy showery rain overnight with some wintry ...     Ireland   
4  Heavy showery rain overnight with some wintry ...     Ireland   

0         Yellow       1      1      1  ...         1       1          1   
1         Yellow       1      1      1  ...         1       1          1   
2         Yellow       1      1      1  ...         1       1          1   
3         Yellow       1      1      1  ...         1       1          1   
4         Y

In [14]:
##make sure datetime is the same in both dataframes
# Ensure datetime columns are consistently UTC
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_ods[datetime_cols] = df_ods[datetime_cols].apply(pd.to_datetime, utc=True)
df_xml[datetime_cols] = df_xml[datetime_cols].apply(pd.to_datetime, utc=True)

In [35]:
print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

print("XML Filtered Date Range:")
print(f"Start date: {df_xml['Issue Time'].min()}")
print(f"End date: {df_xml['Issue Time'].max()}")

ODS Filtered Date Range:
Start date: 2012-04-25 12:00:00+00:00
End date: 2021-02-17 09:00:00+00:00
XML Filtered Date Range:
Start date: 2017-09-25 04:34:48+00:00
End date: 2023-08-05 11:33:52+00:00


In [36]:
## filter data use as much as possibel from the newer xml type
# Filter ODS data from 2013 to end of 2017
df_ods_filtered = df_ods[
    (df_ods['Issue Time'] >= '2013-01-01') & 
    (df_ods['Issue Time'] <= '2017-09-24 23:59:59')
]

# Filter XML data from start of 2018 to 2023
df_xml_filtered = df_xml[
    (df_xml['Issue Time'] >= '2017-09-25') & 
    (df_xml['Issue Time'] <= '2022-12-31 23:59:59')
]

In [37]:
##deal with cities and dublin broken up 

df_xml_filtered = df_xml_filtered.copy()

# Merge Cork City into Cork
df_xml_filtered['Cork'] = df_xml_filtered[['Cork', 'Cork City']].max(axis=1)

# Merge all Dublin areas into Dublin
df_xml_filtered['Dublin'] = df_xml_filtered[['Dublin City', 'Dún Laoghaire-Rathdown', 'Fingal', 'South Dublin']].max(axis=1)

# Merge Galway City into Galway
df_xml_filtered['Galway'] = df_xml_filtered[['Galway', 'Galway City']].max(axis=1)

# Merge Limerick City into Limerick
df_xml_filtered['Limerick'] = df_xml_filtered[['Limerick', 'Limerick City']].max(axis=1)

# Drop the city columns
city_columns = ['Cork City', 'Dublin City', 'Dún Laoghaire-Rathdown', 'Fingal', 'South Dublin', 'Galway City', 'Limerick City']
df_xml_filtered = df_xml_filtered.drop(city_columns, axis=1)

In [38]:
# Combine the dataframes
df_combined = pd.concat([df_ods_filtered, df_xml_filtered], axis=0, ignore_index=True)

# Basic checks
print("Combined dataset shape:", df_combined.shape)
print("\nDate range:")
print("Start date:", df_combined['Issue Time'].min())
print("End date:", df_combined['Issue Time'].max())
print("\nNumber of warnings per year:")
print(df_combined['Issue Time'].dt.year.value_counts().sort_index())
print("\nCheck for any missing values:")
#print(df_combined.isnull().sum())
#print(df_xml_filtered.isnull().sum())
#print(df_ods_filtered.isnull().sum())

Combined dataset shape: (8311, 33)

Date range:
Start date: 2013-02-21 18:00:00+00:00
End date: 2022-12-30 05:10:24+00:00

Issue Time
2013     135
2014     180
2015     255
2016     122
2017     439
2018    2230
2019    1116
2020    1936
2021    1003
2022     895
Name: count, dtype: int64

Check for any missing values:


In [44]:
print(df_combined.isnull().any().any())  # Returns True if any NaN exists

# Get count of NaN values per column
print(df_combined.isnull().sum())

# Or if you want to see what percentage of each column is NaN
print(df_combined.isnull().sum() / len(df_combined) * 100)

# To find rows with any NaN values
print(df_combined[df_combined.isnull().any(axis=1)])

True
Issue Time         0
Valid From         0
Valid To           1
WhereToText        0
Clare              0
Cork               0
Kerry              0
Limerick           0
Tipperary          0
Waterford          0
Carlow             0
Dublin             0
Kildare            0
Kilkenny           0
Laois              0
Longford           0
Louth              0
Meath              0
Offaly             0
Westmeath          0
Wexford            0
Wicklow            0
Cavan              0
Donegal            0
Monaghan           0
Galway             0
Leitrim            0
Mayo               0
Roscommon          0
Sligo              0
dtype: int64
Issue Time         0.000000
Valid From         0.000000
Valid To           0.012032
WhereToText        0.000000
Clare              0.000000
Cork               0.000000
Kerry              0.000000
Limerick           0.000000
Tipperary          0.000000
Waterford          0.000000
Carlow             0.000000
Dublin             0.000000
Kildare         

In [45]:
print(df_combined[df_combined['Valid To'].isnull()][['Issue Time', 'Valid From', 'Valid To', 'Warning Element', 'Warning Text']])


                   Issue Time                Valid From Valid To  \
993 2017-10-16 11:44:33+00:00 2017-10-16 11:44:33+00:00      NaT   



In [46]:
# Get the index of the row with missing Valid To
missing_idx = df_combined[df_combined['Valid To'].isnull()].index

# Set Valid To to 24 hours after Valid From for that row
df_combined.loc[missing_idx, 'Valid To'] = df_combined.loc[missing_idx, 'Valid From'] + pd.Timedelta(hours=24)

In [52]:
df_combined.to_csv('weather_warnings_combined.csv', index=False)

In [48]:
# Get the unique combinations and sort them
warning_elements = df_combined[['Warning Colour', 'Warning Element']].drop_duplicates().sort_values(['Warning Colour', 'Warning Element'])

# Print each combination
for _, row in warning_elements.iterrows():
    print(f"{row['Warning Colour']} - {row['Warning Element']}")

Orange - Fog (or freezing fog)
Orange - Low Temperature/Ice
Orange - Orange Fog
Orange - Orange High Temperature
Orange - Orange Low Temperature/Ice
Orange - Orange Rain
Orange - Orange Snow/Ice
Orange - Orange Thunderstorm
Orange - Orange Wind
Orange - Rain
Orange - Snow/Ice
Orange - Thunderstorm
Orange - Wind
Red - Rain
Red - Red Wind
Red - Wind
Yellow - Fog (or freezing fog)
Yellow - High Temperature
Yellow - Low Temperature/Ice
Yellow - Rain
Yellow - Snow/Ice
Yellow - Thunderstorm
Yellow - Wind
Yellow - Yellow High Temperature
Yellow - Yellow Low Temperature/Ice
Yellow - Yellow Rain
Yellow - Yellow Snow/Ice
Yellow - Yellow Thunderstorm
Yellow - Yellow Wind
Yellow - Yellow fog
Yellowish - Rain


In [60]:
##cleanup
# Get list of county columns (excluding the non-county columns)
county_cols = [col for col in df_combined.columns if col not in ['Issue Time', 'Valid From', 'Valid To', 'Warning Colour', 'Warning Element', 'WhereToText', 'Warning Text']]

# Drop rows meeting all conditions
df_combined = df_combined[~((df_combined['Warning Colour'] == 'Orange') & 
                          (df_combined['Warning Element'].str.contains('Severe warning', na=False)) & 
                          (df_combined['Warning Text'].str.contains('unknown', na=False, case=False)) &
                          (df_combined[county_cols].sum(axis=1) == 0))]

# Verify
print(f"Number of rows after filtering: {len(df_combined)}")

Number of rows after filtering: 8311


In [63]:
##cleanup - 
# Get list of county columns (excluding the non-county columns)
county_cols = [col for col in df_combined.columns if col not in ['Issue Time', 'Valid From', 'Valid To', 'Warning Colour', 'Warning Element', 'WhereToText', 'Warning Text']]

# Drop rows meeting all conditions
df_combined = df_combined[~((df_combined['Warning Colour'] == 'Yellow') & 
                          (df_combined['Warning Element'].str.contains('Moderate', na=False)) & 
                          (df_combined['Warning Text'].str.contains('unknown', na=False, case=False)) &
                          (df_combined[county_cols].sum(axis=1) == 0))]

# Verify
print(f"Number of rows after filtering: {len(df_combined)}")

Number of rows after filtering: 8311


In [62]:
##cleanup - 
# Get list of county columns (excluding the non-county columns)
county_cols = [col for col in df_combined.columns if col not in ['Issue Time', 'Valid From', 'Valid To', 'Warning Colour', 'Warning Element', 'WhereToText', 'Warning Text']]

# Drop rows meeting all conditions
df_combined = df_combined[~((df_combined['Warning Colour'] == 'Red') & 
                          (df_combined['Warning Element'].str.contains('Extreme', na=False)) & 
                          (df_combined['Warning Text'].str.contains('unknown', na=False, case=False)) &
                          (df_combined[county_cols].sum(axis=1) == 0))]

# Verify
print(f"Number of rows after filtering: {len(df_combined)}")

Number of rows after filtering: 8311


In [64]:
# Check each condition separately
print("Rows with Warning Colour 'Red':", 
      sum(df_combined['Warning Colour'] == 'Red'))

print("\nRows with 'Extreme  warning':", 
      sum(df_combined['Warning Element'].str.contains('Extreme  warning', na=False)))

print("\nRows with 'unknown' in Warning Text:", 
      sum(df_combined['Warning Text'].str.contains('unknown', na=False, case=False)))

print("\nRows with all counties as 0:", 
      sum(df_combined[county_cols].sum(axis=1) == 0))

# Check exact pattern of some matching rows
print("\nSample of rows with Red warnings:")
print(df_combined[df_combined['Warning Colour'] == 'Red'][['Warning Colour', 'Warning Element', 'Warning Text']].head())




Rows with all counties as 0: 76

103            Red            Wind   
104            Red            Wind   
124            Red            Wind   
125            Red            Wind   
181            Red            Wind   

124  UpdateBecoming very stormy this evening with g...  
125  UpdateBecoming very stormy this evening with g...  
181  Stormy conditions will affect Cork and Kerry d...  


In [67]:
df_combined.to_csv('weather_warnings_combined1.csv', index=False)