#### Set styling for plotting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# Matplotlib figure defaults: (6, 6) figure size, white axes background, black axes edges.
figure_defaults = {
    "figure.figsize": (6, 6),
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
}
plt.rcParams.update(figure_defaults)
# Cycle plot colours through seaborn's 'colorblind' palette.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
# Default font applied through the 'font' rc group.
font = dict(family='sans-serif', weight='normal', size=14)
plt.rc('font', **font)
# Register pandas' datetime converters with matplotlib.
pd.plotting.register_matplotlib_converters()

#### Step 1: save environment file

In [2]:
# Shell escape: write the output of `conda env export` to combined_met_environment.yml
# in the current working directory, recording the environment used for this notebook.
!conda env export > combined_met_environment.yml

#### Step 2: import modules

In [3]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv

## INFORMATION

##load the 3 processed .csv files for combination into a single complete dataset

#### Step 3: import csv files

In [4]:
# Locations of the three processed warning CSVs (one per original source format).
data_directory_xml = "/home/paulharford/college/project/project_data/met_eireann/xml_warnings_2018_2023_08.csv"
data_directory_ods = "/home/paulharford/college/project/project_data/met_eireann/ods_warnings_2017_2020.csv"
data_directory_xl = "/home/paulharford/college/project/project_data/met_eireann/xl_warnings_2023.csv"

# Normalise every location to an absolute path in one pass.
full_path_xml, full_path_ods, full_path_xl = map(
    os.path.abspath,
    (data_directory_xml, data_directory_ods, data_directory_xl),
)

In [5]:
# Load the three processed warning tables (XML-, ODS- and Excel-derived) from their CSV files.
df_xml, df_ods, df_xl = (
    pd.read_csv(csv_path)
    for csv_path in (full_path_xml, full_path_ods, full_path_xl)
)

In [6]:
# Parse the issue timestamps up front so the calendar year can be extracted below.
for warnings_frame in (df_xml, df_ods):
    warnings_frame['Issue Time'] = pd.to_datetime(warnings_frame['Issue Time'])

# Number of rows issued per calendar year in each dataset, ordered by year.
xml_year_counts = df_xml['Issue Time'].dt.year.value_counts().sort_index()
ods_year_counts = df_ods['Issue Time'].dt.year.value_counts().sort_index()

# Align both yearly counts on the year index and keep only 2018-2020 (inclusive).
comparison_df = pd.DataFrame(
    {'XML Dataset': xml_year_counts, 'ODS Dataset': ods_year_counts}
).loc[2018:2020]

# Absolute gap between the datasets (positive when the XML dataset has more rows).
comparison_df['Difference'] = comparison_df['XML Dataset'] - comparison_df['ODS Dataset']

print("\nComparison of Events (2018-2020):")
print(comparison_df)

# Relative gap, expressed as a percentage of the ODS count and rounded to 2 decimals.
comparison_df['Percentage Difference'] = (
    comparison_df['Difference'] / comparison_df['ODS Dataset'] * 100
).round(2)

print("\nWith Percentage Difference:")
print(comparison_df)


Comparison of Events (2018-2020):
            XML Dataset  ODS Dataset  Difference
Issue Time                                      
2018              264.0        249.0        15.0
2019              179.0        187.0        -8.0
2020              274.0        277.0        -3.0

With Percentage Difference:
            XML Dataset  ODS Dataset  Difference  Percentage Difference
Issue Time                                                             
2018              264.0        249.0        15.0                   6.02
2019              179.0        187.0        -8.0                  -4.28
2020              274.0        277.0        -3.0                  -1.08


#### Step 6: confirm date/time settings, check date ranges and select appropriate ranges to combine

In [7]:
# Parse the three timestamp columns of every dataset as timezone-aware UTC so the
# sources share one datetime representation.
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
for warnings_frame in (df_ods, df_xml, df_xl):
    warnings_frame[datetime_cols] = warnings_frame[datetime_cols].apply(pd.to_datetime, utc=True)

In [8]:
# Check the columns of the ODS-derived frame: names, non-null counts and dtypes.
df_ods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1598 entries, 0 to 1597
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Issue Time       1598 non-null   datetime64[ns, UTC]
 1   Valid From       1598 non-null   datetime64[ns, UTC]
 2   Valid To         1598 non-null   datetime64[ns, UTC]
 5   WhereToText      1596 non-null   object             
 7   Clare            1598 non-null   int64              
 8   Cork             1598 non-null   int64              
 9   Kerry            1598 non-null   int64              
 10  Limerick         1598 non-null   int64              
 11  Tipperary        1598 non-null   int64              
 12  Waterford        1598 non-null   int64              
 13  Carlow           1598 non-null   int64              
 14  Dublin           1598 non-null   int64              
 15  Kildare          1598 non-null   int64              
 16  Kilkenny         1

In [9]:
# The 'WhereToText' column is not required, so remove it.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell safe
# to re-run: once the column is gone, running it again is a no-op rather than a KeyError.
df_ods = df_ods.drop(columns="WhereToText", errors="ignore")

In [10]:
# Check the columns of the XML-derived frame: names, non-null counts and dtypes.
df_xml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Issue Time       2016 non-null   datetime64[ns, UTC]
 1   Valid From       2016 non-null   datetime64[ns, UTC]
 2   Valid To         2016 non-null   datetime64[ns, UTC]
 5   WhereToText      2016 non-null   object             
 7   Carlow           2016 non-null   int64              
 8   Cavan            2016 non-null   int64              
 9   Clare            2016 non-null   int64              
 10  Cork             2016 non-null   int64              
 11  Donegal          2016 non-null   int64              
 12  Galway           2016 non-null   int64              
 13  Kerry            2016 non-null   int64              
 14  Kildare          2016 non-null   int64              
 15  Kilkenny         2016 non-null   int64              
 16  Laois            2

In [11]:
# Remove the 'WhereToText' column from the XML-derived frame.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell safe
# to re-run: once the column is gone, running it again is a no-op rather than a KeyError.
df_xml = df_xml.drop(columns="WhereToText", errors="ignore")

In [12]:
# Check the columns of the Excel-derived frame: names, non-null counts and dtypes.
df_xl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 1   Carlow           299 non-null    int64              
 2   Cavan            299 non-null    int64              
 3   Clare            299 non-null    int64              
 4   Cork             299 non-null    int64              
 5   Donegal          299 non-null    int64              
 6   Dublin           299 non-null    int64              
 7   Galway           299 non-null    int64              
 8   Kerry            299 non-null    int64              
 9   Kildare          299 non-null    int64              
 10  Kilkenny         299 non-null    int64              
 11  Laois            299 non-null    int64              
 12  Leitrim          299 non-null    int64              
 13  Limerick         299 non-null    int64              
 14  Longford         299

In [13]:
# Target column layout: the six warning-description fields first ...
warning_detail_cols = [
    "Issue Time", "Valid From", "Valid To",
    "Warning Colour", "Warning Element", "Warning Text",
]
# ... followed by the 26 county columns, in this exact order.
county_cols = [
    "Clare", "Cork", "Kerry", "Limerick", "Tipperary", "Waterford",
    "Carlow", "Dublin", "Kildare", "Kilkenny", "Laois", "Longford",
    "Louth", "Meath", "Offaly", "Westmeath", "Wexford", "Wicklow",
    "Cavan", "Donegal", "Monaghan",
    "Galway", "Leitrim", "Mayo", "Roscommon", "Sligo",
]
desired_order = warning_detail_cols + county_cols

# Keep only these columns, in this order. The label-based selection raises a
# KeyError if df_xl is missing any of them.
df_xl_ro = df_xl.loc[:, desired_order]

In [14]:
# Check the reordered Excel-derived frame: column order, non-null counts and dtypes.
df_xl_ro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Issue Time       286 non-null    datetime64[ns, UTC]
 1   Valid From       287 non-null    datetime64[ns, UTC]
 2   Valid To         285 non-null    datetime64[ns, UTC]
 6   Clare            299 non-null    int64              
 7   Cork             299 non-null    int64              
 8   Kerry            299 non-null    int64              
 9   Limerick         299 non-null    int64              
 10  Tipperary        299 non-null    int64              
 11  Waterford        299 non-null    int64              
 12  Carlow           299 non-null    int64              
 13  Dublin           299 non-null    int64              
 14  Kildare          299 non-null    int64              
 15  Kilkenny         299 non-null    int64              
 16  Laois            299

In [15]:
### date ranges
# Print the earliest and latest 'Issue Time' of each source frame.
# Each heading names the frame being summarised: the Excel-derived frame
# (df_xl_ro) is labelled "XL" so it cannot be mistaken for the XML frame.
for source_label, warnings_frame in (("ODS", df_ods), ("XML", df_xml), ("XL", df_xl_ro)):
    issue_times = warnings_frame['Issue Time']
    print(f"{source_label} Filtered Date Range:")
    print(f"Start date: {issue_times.min()}")
    print(f"End date: {issue_times.max()}")

ODS Filtered Date Range:
Start date: 2013-02-21 18:00:00+00:00
End date: 2020-12-30 14:00:00+00:00
XML Filtered Date Range:
Start date: 2018-01-01 14:02:14+00:00
End date: 2023-08-04 11:49:46+00:00
XML Filtered Date Range:
Start date: 2023-01-01 13:22:00+00:00
End date: 2023-12-29 19:12:00+00:00


In [16]:
# Give each source its own window of issue dates (both bounds inclusive):
#   ODS frame   -> 2014-01-01 through 2017-12-31 23:59:59
#   XML frame   -> 2018-01-01 through 2022-12-31 23:59:59
#   Excel frame -> 2023-01-01 through 2023-12-29 23:59:59
# Rows whose 'Issue Time' is missing (NaT) satisfy neither bound and are left out.
df_ods_filtered = df_ods[df_ods['Issue Time'].between('2014-01-01', '2017-12-31 23:59:59')]
df_xml_filtered = df_xml[df_xml['Issue Time'].between('2018-01-01', '2022-12-31 23:59:59')]
df_xl_filtered = df_xl_ro[df_xl_ro['Issue Time'].between('2023-01-01', '2023-12-29 23:59:59')]

In [17]:
# Stack the three filtered frames (ODS, then XML, then Excel) into one frame;
# ignore_index=True discards the original row labels and renumbers rows from 0.
df_combined = pd.concat([df_ods_filtered, df_xml_filtered, df_xl_filtered], ignore_index=True)

In [18]:
# Report the earliest and latest 'Issue Time' in the combined frame.
combined_issue_times = df_combined['Issue Time']
print("Combined Filtered Date Range:")
print(f"Start date: {combined_issue_times.min()}")
print(f"End date: {combined_issue_times.max()}")

Combined Filtered Date Range:
Start date: 2014-01-02 09:00:00+00:00
End date: 2023-12-29 19:12:00+00:00


In [19]:
# Distinct raw labels found in 'Warning Element', in order of first appearance.
unique_warning_elements = df_combined['Warning Element'].unique()
print("Unique Warning Elements:", unique_warning_elements, sep="\n")

# The same labels again, this time in sorted order.
print("Sorted Unique Warning Elements:", sorted(unique_warning_elements), sep="\n")


['Wind' 'Snow/Ice' 'Rain' 'Fog (or freezing fog)' 'Thunderstorm'
 'Yellow fog' 'Orange Wind' 'Yellow Thunderstorm'
 'Yellow High Temperature' 'Orange High Temperature' 'Orange Thunderstorm'
 'Yellow Snow/Ice' 'Orange Snow/Ice' 'Orange Low Temperature/Ice'
 'Orange Fog' 'Fog / Ice' 'Rainfall' 'Fog' 'wind']


In [20]:
def normalize_warning(warning_element: str) -> str:
    """Map a raw warning label to the form "<Severity> <Phenomenon> warning".

    Matching is case-insensitive. The first matching rule wins in each step,
    so the order of the checks below is significant.

    Parameters
    ----------
    warning_element : str
        Raw label such as "Orange Wind" or "Snow/Ice". A value that is not a
        string (for example the float NaN pandas uses for a missing cell) is
        treated as empty text instead of raising AttributeError.

    Returns
    -------
    str
        ``f"{severity} {phenomenon} warning"`` where severity is one of
        "Extreme", "Severe", "Moderate", "Minor" and phenomenon is one of
        "Hail", "Wind", "Fog", "Snow-ice", "High-Temperature",
        "Low-Temperature", "Thunder", "Rainfall", "Unknown".

    Notes
    -----
    * Severity falls back to "Moderate" when the label names no severity or
      colour. NOTE(review): only the label text is inspected here; if the
      severity lives in a separate column the caller must account for that.
    * Labels mentioning "ice" (e.g. "Orange Low Temperature/Ice") resolve to
      "Snow-ice" because that rule is evaluated before the low-temperature
      rule. NOTE(review): confirm this precedence is intended.
    """
    text = warning_element.lower() if isinstance(warning_element, str) else ""

    # Whole words of the label (split on anything that is not a letter or digit).
    # The short tokens "red" and "ice" are matched against these words so they do
    # not fire inside longer words such as "scattered" or "notice".
    words = set("".join(ch if ch.isalnum() else " " for ch in text).split())

    # Step 1: Determine severity
    if "extreme" in text or "red" in words:
        severity = "Extreme"
    elif "severe" in text or "orange" in text:
        severity = "Severe"
    elif "moderate" in text or "yellow" in text:
        severity = "Moderate"
    elif "minor" in text:
        severity = "Minor"
    else:
        # No severity keyword found: default to "Moderate"
        severity = "Moderate"

    # Step 2: Determine phenomenon
    if "hail" in text:
        phenomenon = "Hail"
    elif "small-craft" in text or "wind" in text:
        # "small-craft" warnings are grouped with wind
        phenomenon = "Wind"
    elif "fog" in text:
        phenomenon = "Fog"
    elif "snow" in text or "ice" in words:
        # covers "Snow/Ice", "Snow-ice", etc.
        phenomenon = "Snow-ice"
    elif "high temperature" in text or "high-temperature" in text:
        phenomenon = "High-Temperature"
    elif "low temperature" in text or "low-temperature" in text:
        phenomenon = "Low-Temperature"
    elif "thunder" in text:
        phenomenon = "Thunder"
    elif "rain" in text:
        # "rain" also matches "rainfall"
        phenomenon = "Rainfall"
    else:
        # fallback if unknown
        phenomenon = "Unknown"

    return f"{severity} {phenomenon} warning"


In [21]:
# Derive the normalised label for every row by passing each raw
# 'Warning Element' value through normalize_warning.
df_combined["Normalized Warning"] = df_combined["Warning Element"].map(normalize_warning)



In [27]:
# Flag rows whose raw label mentions "minor" (case-insensitive; missing labels never match).
has_minor_label = df_combined["Warning Element"].str.contains("minor", case=False, na=False)
minor_rows = df_combined.loc[has_minor_label]

# Show the filtered rows
print(minor_rows)

                    Issue Time                Valid From  \
1652 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1653 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1654 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1655 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1656 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1657 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1658 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1659 2021-08-10 16:38:53+00:00 2021-08-10 16:38:40+00:00   
1660 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   
1661 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   
1662 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   
1663 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   
1664 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   
1665 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   
1666 2021-08-10 16:38:53+00:00 2021-08-10 15:30:00+00:00   

1652 2021-08-13 17:00:00+00:00      not

In [30]:
# Remove every row whose raw label mentions "minor" (case-insensitive);
# rows with a missing label do not match and are therefore kept.
mentions_minor = df_combined["Warning Element"].str.contains("minor", case=False, na=False)
df_combined = df_combined.loc[~mentions_minor]


In [31]:
# County columns that feed each HSE region column. A county may be listed under
# more than one region (Dublin, Wicklow and Tipperary are).
hse_regions = {
    'HSE Dublin and North East': ['Cavan', 'Monaghan', 'Louth', 'Meath', 'Dublin'],
    'HSE Dublin and Midlands': ['Kildare', 'Wicklow', 'Laois', 'Offaly', 'Longford', 'Westmeath', 'Dublin'],
    'HSE Dublin and South East': ['Carlow', 'Kilkenny', 'Tipperary', 'Waterford', 'Wexford', 'Wicklow', 'Dublin'],
    'HSE Mid West': ['Clare', 'Limerick', 'Tipperary'],
    'HSE South West': ['Cork', 'Kerry'],
    'HSE West and North West': ['Donegal', 'Sligo', 'Leitrim', 'Mayo', 'Galway', 'Roscommon']
}

# One 0/1 column per region: 1 when the row's values across the region's county
# columns sum to more than 0, otherwise 0.
for region_name, member_counties in hse_regions.items():
    county_total = df_combined[member_counties].sum(axis=1)
    df_combined[region_name] = county_total.gt(0).astype(int)

# Verify the results
region_columns = list(hse_regions)
print("Sample of rows with HSE region columns:")
print(df_combined[region_columns].head())

# Check distribution of regional warnings
print("\nNumber of warnings per HSE region:")
for region_name in region_columns:
    print(f"{region_name}: {df_combined[region_name].sum()}")

Sample of rows with HSE region columns:
   HSE Dublin and North East  HSE Dublin and Midlands  \
0                          0                        0   
1                          1                        1   
2                          1                        1   
3                          1                        1   
4                          1                        1   

   HSE Dublin and South East  HSE Mid West  HSE South West  \
0                          0             1               1   
1                          1             1               0   
2                          1             1               1   
3                          1             1               1   
4                          1             1               1   

   HSE West and North West  
0                        1  
1                        0  
2                        1  
3                        1  
4                        1  

HSE Dublin and North East: 1159
HSE Dublin and Midlands: 1271
HSE Dub

In [32]:
## Add an event-length feature
# Coerce both validity bounds to datetimes before subtracting them.
for bound_col in ('Valid From', 'Valid To'):
    df_combined[bound_col] = pd.to_datetime(df_combined[bound_col])

# Length of each validity window ('Valid To' minus 'Valid From'), converted from seconds to hours.
validity_window = df_combined['Valid To'] - df_combined['Valid From']
df_combined['Duration_hours'] = validity_window.dt.total_seconds() / 3600

# Preview the two bounds next to the derived column.
print(df_combined[['Valid From', 'Valid To', 'Duration_hours']].head())


                 Valid From                  Valid To  Duration_hours
0 2014-01-02 17:00:00+00:00 2014-01-03 14:00:00+00:00            21.0
1 2014-01-02 17:00:00+00:00 2014-01-03 14:00:00+00:00            21.0
2 2014-01-03 14:00:00+00:00 2014-01-03 19:00:00+00:00             5.0
3 2014-01-04 03:00:00+00:00 2014-01-04 14:00:00+00:00            11.0
4 2014-01-05 08:00:00+00:00 2014-01-05 20:00:00+00:00            12.0


In [33]:
# Write the combined frame to CSV; index=False leaves the row index out of the file.
df_combined.to_csv('/home/paulharford/college/project/project_data/processed/WEATHERED_warnings_2014-2023_cleaned.csv', index=False)