#### Set styling for plotting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()

### Step 1: save environment file

In [2]:
!conda env export > combined_met_environment.yml

### Step 2: import modules

In [3]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
import pandas as pd
from datetime import timedelta

## INFORMATION

##load the 3 processed .csv fiels for combination in a single complete dataset 

### Step 3: import csv files

In [4]:
data_directory_xml = "/home/paulharford/college/project/project_data/met_eireann/WEATHERED_xml_warnings_2018_2023_08_v1.csv"
data_directory_ods = "/home/paulharford/college/project/project_data/met_eireann/ods_warnings_2017_2020.csv"
data_directory_xl = "/home/paulharford/college/project/project_data/met_eireann/xl_warnings_2023.csv"
full_path_xml = os.path.abspath(data_directory_xml)
full_path_ods = os.path.abspath(data_directory_ods)
full_path_xl = os.path.abspath(data_directory_xl)

In [5]:
# Read the 2023 excel file
df_xml = pd.read_csv(full_path_xml)
df_ods = pd.read_csv(full_path_ods)
df_xl = pd.read_csv(full_path_xl)

### Step 4 Compare overlapping dates ranegs from different datasets as an accuracy check of processing output

In [6]:
# First, ensure our datetime conversions are correct
df_xml['Issue Time'] = pd.to_datetime(df_xml['Issue Time'])
df_ods['Issue Time'] = pd.to_datetime(df_ods['Issue Time'])

# Get counts for specific years (2018-2020) for both datasets
xml_year_counts = df_xml['Issue Time'].dt.year.value_counts().sort_index()
ods_year_counts = df_ods['Issue Time'].dt.year.value_counts().sort_index()

# Create a DataFrame to display the years side by side
comparison_df = pd.DataFrame({
    'XML Dataset': xml_year_counts,
    'ODS Dataset': ods_year_counts
})

# Filter 2018-2020
comparison_df = comparison_df.loc[2018:2020]

# Calculate the difference between datasets
comparison_df['Difference'] = comparison_df['XML Dataset'] - comparison_df['ODS Dataset']

# Display the comparison
print("\nComparison of Events (2018-2020):")
print(comparison_df)

# Optional: Create a percentage difference column to see relative changes
comparison_df['Percentage Difference'] = (
    (comparison_df['XML Dataset'] - comparison_df['ODS Dataset']) / 
    comparison_df['ODS Dataset'] * 100
).round(2)

print("\nWith Percentage Difference:")
print(comparison_df)


Comparison of Events (2018-2020):
            XML Dataset  ODS Dataset  Difference
Issue Time                                      
2018              270.0        249.0        21.0
2019              186.0        187.0        -1.0
2020              289.0        277.0        12.0

With Percentage Difference:
            XML Dataset  ODS Dataset  Difference  Percentage Difference
Issue Time                                                             
2018              270.0        249.0        21.0                   8.43
2019              186.0        187.0        -1.0                  -0.53
2020              289.0        277.0        12.0                   4.33


#### Step:6 confirm date/time settings, check date ranges and select appropriate rnages to combine 

In [7]:
##make sure datetime is the same in both dataframes
# Ensure datetime columns are consistently UTC
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_ods[datetime_cols] = df_ods[datetime_cols].apply(pd.to_datetime, utc=True)
df_xml[datetime_cols] = df_xml[datetime_cols].apply(pd.to_datetime, utc=True)
df_xl[datetime_cols] = df_xl[datetime_cols].apply(pd.to_datetime, utc=True)

### STEP 5

### Step 5.1 Drop unnecessary columns and reorder columns

In [8]:
##where to text column not required
df_ods.drop("WhereToText", axis=1, inplace=True)

In [9]:
df_xml.drop("WhereToText", axis=1, inplace=True)

In [10]:
# Define the desired order
desired_order = [
    "Issue Time", "Valid From", "Valid To", "Warning Colour", "Warning Element",
    "Warning Text", "Clare", "Cork", "Kerry", "Limerick", 
    "Tipperary", "Waterford", "Carlow", "Dublin", "Kildare", "Kilkenny", 
    "Laois", "Longford", "Louth", "Meath", "Offaly", "Westmeath", "Wexford", 
    "Wicklow", "Cavan", "Donegal", "Monaghan", "Galway", "Leitrim", "Mayo", 
    "Roscommon", "Sligo"
]

# This will select (and order) only those columns; note that if your second DataFrame 
# is missing any of these, a KeyError will be raised.
df_xl_ro = df_xl[desired_order]

In [11]:
df_xl_ro.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 32 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Issue Time       286 non-null    datetime64[ns, UTC]
 1   Valid From       287 non-null    datetime64[ns, UTC]
 2   Valid To         285 non-null    datetime64[ns, UTC]
 6   Clare            299 non-null    int64              
 7   Cork             299 non-null    int64              
 8   Kerry            299 non-null    int64              
 9   Limerick         299 non-null    int64              
 10  Tipperary        299 non-null    int64              
 11  Waterford        299 non-null    int64              
 12  Carlow           299 non-null    int64              
 13  Dublin           299 non-null    int64              
 14  Kildare          299 non-null    int64              
 15  Kilkenny         299 non-null    int64              
 16  Laois            299

### STEP 5.2 - filter date ranges before combining

In [12]:
### date ranges
print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")
print("XML Filtered Date Range:")
print(f"Start date: {df_xml['Issue Time'].min()}")
print(f"End date: {df_xml['Issue Time'].max()}")
print("XML Filtered Date Range:")
print(f"Start date: {df_xl_ro['Issue Time'].min()}")
print(f"End date: {df_xl_ro['Issue Time'].max()}")

ODS Filtered Date Range:
Start date: 2013-02-21 18:00:00+00:00
End date: 2020-12-30 14:00:00+00:00
XML Filtered Date Range:
Start date: 2017-09-25 04:34:48+00:00
End date: 2023-08-05 11:33:52+00:00
XML Filtered Date Range:
Start date: 2023-01-01 13:22:00+00:00
End date: 2023-12-29 19:12:00+00:00


In [13]:
df_ods_filtered = df_ods[
    (df_ods['Issue Time'] >= '2014-01-01') & 
    (df_ods['Issue Time'] <= '2017-12-31 23:59:59')
]
df_xml_filtered = df_xml[
    (df_xml['Issue Time'] >= '2018-01-01') & 
    (df_xml['Issue Time'] <= '2022-12-31 23:59:59')
]
df_xl_filtered = df_xl_ro[
    (df_xl_ro['Issue Time'] >= '2023-01-01') & 
    (df_xl_ro['Issue Time'] <= '2023-12-29 23:59:59')
]

In [14]:
## count events from each dataset befor processing 
# Count events by severity in each dataset
xml_counts = df_xml_filtered['Warning Colour'].value_counts()
ods_counts = df_ods_filtered['Warning Colour'].value_counts() 
xl_counts = df_xl_filtered['Warning Colour'].value_counts()

# Print the counts for each dataset
print("XML dataset counts:")
print(xml_counts)
print("\nODS dataset counts:")
print(ods_counts)
print("\nXL dataset counts:")
print(xl_counts)

# If you want combined counts across all datasets
combined_counts = pd.concat([df_xml_filtered['Warning Colour'], df_ods_filtered['Warning Colour'], df_xl_filtered['Warning Colour']]).value_counts()
print("\nCombined counts across all datasets:")
print(combined_counts)

XML dataset counts:
Yellow       1457
Orange        239
Red            38
notmapped      16
Name: count, dtype: int64

ODS dataset counts:
Yellow    557
Orange    175
Red        18
Name: count, dtype: int64

XL dataset counts:
Yellow    231
Orange     48
Red         6
Name: count, dtype: int64

Combined counts across all datasets:
Yellow       2245
Orange        462
Red            62
notmapped      16
Name: count, dtype: int64


### STEP 5.3 Create combined dataset

In [15]:
df_combined = pd.concat([df_ods_filtered, df_xml_filtered, df_xl_filtered], ignore_index=True)

In [16]:
print("Combined Filtered Date Range:")
print(f"Start date: {df_combined['Issue Time'].min()}")
print(f"End date: {df_combined['Issue Time'].max()}")

Combined Filtered Date Range:
Start date: 2014-01-02 09:00:00+00:00
End date: 2023-12-29 19:12:00+00:00


In [17]:
df_combined.isnull().sum()

Issue Time         0
Valid From         0
Valid To           2
Clare              0
Cork               0
Kerry              0
Limerick           0
Tipperary          0
Waterford          0
Carlow             0
Dublin             0
Kildare            0
Kilkenny           0
Laois              0
Longford           0
Louth              0
Meath              0
Offaly             0
Westmeath          0
Wexford            0
Wicklow            0
Cavan              0
Donegal            0
Monaghan           0
Galway             0
Leitrim            0
Mayo               0
Roscommon          0
Sligo              0
dtype: int64

### STEP 5.4 Remove events that have no "Warning Colour" or "Valid To" time

In [18]:
color_missing = df_combined[df_combined['Warning Colour'].isna()]
print(f"Number of rows with missing Warning Colour: {len(color_missing)}")
color_missing.head(5)



Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Clare,Cork,Kerry,Limerick,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
2755,2023-12-09 11:44:00+00:00,2023-12-10 14:00:00+00:00,2023-12-10 21:00:00+00:00,,Wind,Storm Fergus will generate very strong and gus...,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_combined = df_combined.dropna(subset=['Warning Colour'])

In [20]:
df_combined = df_combined.dropna(subset=['Valid To'])

In [21]:
df_combined['Warning Text'] = df_combined['Warning Text'].fillna('no_warning_text')

In [22]:
df_combined.head()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Clare,Cork,Kerry,Limerick,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
0,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,1,1,1,1,...,0,0,0,1,0,1,1,1,1,1
1,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,0,0,0,0,...,1,1,1,0,1,0,0,0,0,0
2,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### STEP 5.5 Standardize the weather types across severities 

In [23]:
# List unique Warning Elements
unique_warning_elements = df_combined['Warning Element'].unique()
print("Unique Warning Elements:")
print(unique_warning_elements)

# Optionally, if you'd like them sorted:
print("Sorted Unique Warning Elements:")
print(sorted(unique_warning_elements))

['Wind' 'Snow/Ice' 'Rain' 'Fog (or freezing fog)' 'Thunderstorm'
 'Yellow fog' 'Orange Wind' 'Yellow Thunderstorm'
 'Yellow High Temperature' 'Orange High Temperature' 'Orange Thunderstorm'
 'Yellow Snow/Ice' 'Orange Snow/Ice' 'Orange Low Temperature/Ice'
 'Orange Fog' 'Fog / Ice' 'Rainfall' 'Fog' 'wind']


In [24]:
def normalize_warning(warning_element: str, warning_color: str = None) -> tuple:
    """
    Normalize weather warning types to a consistent format, separating phenomenon and severity.
    
    Args:
        warning_element: The warning text to normalize
        warning_color: Optional color from separate column (Red, Orange, Yellow)
        
    Returns:
        Tuple of (phenomenon, severity)
    """
    text = warning_element.lower().strip()
    
    #Determine severity using color column if available
    if warning_color and warning_color.strip():
        if warning_color.lower() in ['red', 'extreme']:
            severity = "Red"
        elif warning_color.lower() in ['orange', 'severe']:
            severity = "Orange"
        elif warning_color.lower() in ['yellow', 'moderate']:
            severity = "Yellow"
        else:
            severity = "Unknown"
    else:
        if any(term in text for term in ["extreme", "red"]):
            severity = "Red"
        elif any(term in text for term in ["severe", "orange"]):
            severity = "Orange"
        elif any(term in text for term in ["moderate", "yellow"]):
            severity = "Yellow"
        elif "minor" in text:
            severity = "Minor"
        else:
            severity = "Unknown"    

    # Create a hierarchical priority order to ensure consistent classification
    if "thunder" in text or "tstorm" in text or "thunderstorm" in text:
        phenomenon = "Thunder"
    elif "hail" in text:
        phenomenon = "Hail"
    elif "snow" in text and "ice" in text:
        phenomenon = "Snow_Ice"
    elif "snow" in text:
        phenomenon = "Snow"
    elif "ice" in text and ("low" in text or "temperature" in text):
        phenomenon = "Low-Temperature_Ice"
    elif "ice" in text:
        phenomenon = "Ice"
    elif "fog" in text and "ice" in text:
        phenomenon = "Ice_Fog"
    elif "fog" in text:
        phenomenon = "Fog"
    elif ("rain" in text or "rainfall" in text) and "thunder" in text:
        phenomenon = "Thunder_Rainfall"
    elif "rain" in text or "rainfall" in text:
        phenomenon = "Rainfall"
    elif "high" in text and "temperature" in text:
        phenomenon = "High-Temperature"
    elif "low" in text and "temperature" in text:
        phenomenon = "Low-Temperature"
    elif "small-craft" in text or "wind" in text or "gust" in text or "storm" in text:
        phenomenon = "Wind"
    else:
        phenomenon = "Unknown"
    
    return phenomenon, severity


def update_dataframe_with_normalized_warnings(df, warning_column, color_column=None):
    """
    Update dataframe with normalized warning categories, separating phenomenon and severity.
    
    Args:
        df: Pandas DataFrame containing warnings
        warning_column: Column name containing the warning text
        color_column: Optional column name containing color information
    
    Returns:
        DataFrame with added 'warning_phenomenon' and 'warning_severity' columns
    """
    # Apply the normalization function to get both phenomenon and severity
    if color_column and color_column in df.columns:
        result = df.apply(
            lambda row: normalize_warning(row[warning_column], row[color_column]), 
            axis=1
        )
    else:
        result = df[warning_column].apply(normalize_warning)
    
    # Split the result into separate columns
    df['warning_phenomenon'] = [r[0] for r in result]
    df['warning_severity'] = [r[1] for r in result]
    
    # For compatibility with existing code, you can optionally create a combined column
    df['weather_type'] = df['warning_phenomenon'] + " warning"
    
    return df


In [25]:
df_weather_cat = update_dataframe_with_normalized_warnings(
    df_combined, 
    warning_column='Warning Element',  
    color_column='Warning Colour' 
)



In [26]:
# List unique Warning Elements
unique_warning_elements = df_weather_cat['weather_type'].unique()
print("Unique Warning Elements:")
print(unique_warning_elements)

# Optionally, if you'd like them sorted:
print("Sorted Unique Warning Elements:")
print(sorted(unique_warning_elements))



In [27]:
minor_rows = df_combined[df_combined["weather_type"].str.contains("minor", case=False, na=False)]

# Show the filtered rows
minor_rows.head()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Clare,Cork,Kerry,Limerick,...,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo,warning_phenomenon,warning_severity,weather_type


In [28]:
df_weather_cat.isnull().sum()

Issue Time            0
Valid From            0
Valid To              0
Clare                 0
Cork                  0
Kerry                 0
Limerick              0
Tipperary             0
Waterford             0
Carlow                0
Dublin                0
Kildare               0
Kilkenny              0
Laois                 0
Longford              0
Louth                 0
Meath                 0
Offaly                0
Westmeath             0
Wexford               0
Wicklow               0
Cavan                 0
Donegal               0
Monaghan              0
Galway                0
Leitrim               0
Mayo                  0
Roscommon             0
Sligo                 0
weather_type          0
dtype: int64

### STEP 5.6 Check All counties have at least 1 selected (if not its deemed an all of Ireland event)

In [29]:
county_cols = [
    "Clare", "Cork", "Kerry", "Limerick", "Tipperary", "Waterford",
    "Carlow", "Dublin", "Kildare", "Kilkenny", "Laois", "Longford",
    "Louth", "Meath", "Offaly", "Westmeath", "Wexford", "Wicklow",
    "Cavan", "Donegal", "Monaghan", "Galway", "Leitrim", "Mayo",
    "Roscommon", "Sligo"
]

rows_all_zero = df_weather_cat[(df_weather_cat[county_cols] == 0).all(axis=1)]
row_count = len(rows_all_zero)
print(row_count)
rows_all_zero.head(10)

309


Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Clare,Cork,Kerry,Limerick,...,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo,warning_phenomenon,warning_severity,weather_type
14,2014-01-06 21:00:00+00:00,2014-01-06 21:00:00+00:00,2014-01-07 05:00:00+00:00,Yellow,Wind,Strong to gale force south to southwest winds ...,0,0,0,0,...,0,0,0,0,0,0,0,Wind,Yellow,Wind warning
40,2014-02-10 12:00:00+00:00,2014-02-10 22:00:00+00:00,2014-02-11 03:00:00+00:00,Yellow,Wind,"Strong to gale force south to southeast winds,...",0,0,0,0,...,0,0,0,0,0,0,0,Wind,Yellow,Wind warning
112,2014-08-09 23:00:00+00:00,2014-08-09 23:00:00+00:00,2014-08-10 15:00:00+00:00,Yellow,Rain,"Heavy rain this evening, overnight and on Sund...",0,0,0,0,...,0,0,0,0,0,0,0,Rainfall,Yellow,Rainfall warning
315,2015-08-02 14:00:00+00:00,2015-08-03 09:00:00+00:00,2015-08-03 21:00:00+00:00,Yellow,Wind,Very strong southerly winds developing during ...,0,0,0,0,...,0,0,0,0,0,0,0,Wind,Yellow,Wind warning
389,2015-12-04 11:00:00+00:00,2015-12-04 11:00:00+00:00,2015-12-05 18:00:00+00:00,Orange,Wind,Southwest winds with mean speeds of 55 to 75km...,0,0,0,0,...,0,0,0,0,0,0,0,Wind,Orange,Wind warning
527,2016-11-20 13:00:00+00:00,2016-11-20 13:00:00+00:00,2016-11-21 09:00:00+00:00,Orange,Fog (or freezing fog),The fog will become fairly widespread this eve...,0,0,0,0,...,0,0,0,0,0,0,0,Fog,Orange,Fog warning
621,2017-07-26 16:00:00+00:00,2017-07-26 16:00:00+00:00,2017-07-26 20:00:00+00:00,Orange,Thunderstorm,Thunderstorm activity this evening expected ov...,0,0,0,0,...,0,0,0,0,0,0,0,Thunder,Orange,Thunder warning
626,2017-08-22 09:00:00+00:00,2017-08-22 12:00:00+00:00,2017-08-22 23:59:00+00:00,Orange,Thunderstorm,Widespread thundery activity expected this aft...,0,0,0,0,...,0,0,0,0,0,0,0,Thunder,Orange,Thunder warning
627,2017-08-22 11:00:34+00:00,2017-08-22 12:00:00+00:00,2017-08-22 23:00:00+00:00,Orange,Thunderstorm,Widespread thundery activity expected this aft...,0,0,0,0,...,0,0,0,0,0,0,0,Thunder,Orange,Thunder warning
1087,2019-04-26 04:04:26+00:00,2019-04-26 21:00:01+00:00,2019-04-27 01:00:01+00:00,Orange,Severe Wind warning,In addition to the yellow wind alert for these...,0,0,0,0,...,0,0,0,0,0,0,0,Wind,Orange,Wind warning


In [30]:
##from looking at the various datasets whne all th ecounties are 0 is an all ireland event so i will set all those counties to 1
# Identify rows where all county columns are 0 (all-Ireland events)
rows_all_zero = df_weather_cat[(df_weather_cat[county_cols] == 0).all(axis=1)]
print(f"Found {len(rows_all_zero)} all-Ireland events (all counties set to 0)")

# Create a copy of the DataFrame to avoid warnings
df_updated = df_weather_cat.copy()

# Get indices of rows where all counties are 0
all_zero_indices = rows_all_zero.index

# For these rows, set all county columns to 1
for idx in all_zero_indices:
    df_updated.loc[idx, county_cols] = 1

# Verify the change
verification = df_updated.loc[all_zero_indices]
all_ones_check = (verification[county_cols] == 1).all(axis=1).all()
print(f"Successfully updated all counties to 1: {all_ones_check}")

Found 309 all-Ireland events (all counties set to 0)
Successfully updated all counties to 1: True


In [31]:
##add length of event feature
# Ensure the 'Valid From' and 'Valid To' columns are in datetime format
df_weather_cat['Valid From'] = pd.to_datetime(df_weather_cat['Valid From'])

df_weather_cat['Valid To'] = pd.to_datetime(df_weather_cat['Valid To'])

# Calculate the duration in hours and create a new column
df_weather_cat['Duration_hours'] = (df_weather_cat['Valid To'] - df_weather_cat['Valid From']).dt.total_seconds() / 3600

# Check the first few rows to verify the new column
print(df_weather_cat[['Valid From', 'Valid To', 'Duration_hours']].head())


                 Valid From                  Valid To  Duration_hours
0 2014-01-02 17:00:00+00:00 2014-01-03 14:00:00+00:00            21.0
1 2014-01-02 17:00:00+00:00 2014-01-03 14:00:00+00:00            21.0
2 2014-01-03 14:00:00+00:00 2014-01-03 19:00:00+00:00             5.0
3 2014-01-04 03:00:00+00:00 2014-01-04 14:00:00+00:00            11.0
4 2014-01-05 08:00:00+00:00 2014-01-05 20:00:00+00:00            12.0


In [32]:
df_weather_cat.head()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Clare,Cork,Kerry,Limerick,...,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo,warning_phenomenon,warning_severity,weather_type,Duration_hours
0,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,1,1,1,1,...,0,1,1,1,1,1,Wind,Orange,Wind warning,21.0
1,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,0,0,0,0,...,1,0,0,0,0,0,Wind,Yellow,Wind warning,21.0
2,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,1,1,1,1,...,1,1,1,1,1,1,Wind,Yellow,Wind warning,5.0
3,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,1,1,1,1,...,1,1,1,1,1,1,Snow_Ice,Yellow,Snow_Ice warning,11.0
4,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,1,1,1,1,...,1,1,1,1,1,1,Wind,Yellow,Wind warning,12.0


### STEP 5.7 Catogorise all events into their repspective HSE region

In [33]:
# Define your HSE region mapping.
hse_regions = {
    'HSE Dublin and North East': ['Cavan', 'Monaghan', 'Louth', 'Meath', 'Dublin'],
    'HSE Dublin and Midlands': ['Kildare', 'Wicklow', 'Laois', 'Offaly', 'Longford', 'Westmeath', 'Dublin'],
    'HSE Dublin and South East': ['Carlow', 'Kilkenny', 'Tipperary', 'Waterford', 'Wexford', 'Wicklow', 'Dublin'],
    'HSE Mid West': ['Clare', 'Limerick', 'Tipperary'],
    'HSE South West': ['Cork', 'Kerry'],
    'HSE West and North West': ['Donegal', 'Sligo', 'Leitrim', 'Mayo', 'Galway', 'Roscommon']
}

# List of all county columns
county_cols = [
    "Clare", "Cork", "Kerry", "Limerick", "Tipperary", "Waterford",
    "Carlow", "Dublin", "Kildare", "Kilkenny", "Laois", "Longford",
    "Louth", "Meath", "Offaly", "Westmeath", "Wexford", "Wicklow",
    "Cavan", "Donegal", "Monaghan", "Galway", "Leitrim", "Mayo",
    "Roscommon", "Sligo"
]


In [34]:
df_weather_cat['date'] = pd.to_datetime(df_weather_cat['Valid From']).dt.date

In [35]:
##check counts 
## count events from each dataset befor processing 
# Count events by severity in each dataset
all_counts = df_combined['Warning Colour'].value_counts()

# Print the counts for each dataset
print("Combined dataset counts:")
print(all_counts)


Combined dataset counts:
Yellow       2244
Orange        461
Red            62
notmapped      16
Name: count, dtype: int64


In [36]:
def process_weather_data_final(df_weather, hse_regions, county_cols):
    """
    Process weather data by:
    1. Adding region information
    2. Removing exact duplicates
    3. Keeping only highest severity warnings
    4. Adding county count per region
    5. Adding inverse county count weighting
    
    Args:
        df_weather: DataFrame containing weather data
        hse_regions: Dictionary mapping regions to counties
        county_cols: List of county column names
        
    Returns:
        Clean processed DataFrame
    """
    # Ensure date column
    df = df_weather.copy()
    if 'date' not in df.columns:
        df['date'] = pd.to_datetime(df['Valid From']).dt.date
    
    # Add region information
    def get_regions(row):
        triggered_counties = [county for county in county_cols if row.get(county, 0) == 1]
        if not triggered_counties:
            return list(hse_regions.keys())
        regions = set()
        for region, counties in hse_regions.items():
            if any(county in triggered_counties for county in counties):
                regions.add(region)
        return list(regions)
    
    df['region_list'] = df.apply(get_regions, axis=1)
    df_expanded = df.explode('region_list').rename(columns={'region_list': 'region'})
    
    # Remove exact duplicates
    duplicate_cols = ['region', 'date', 'weather_type', 'warning_severity', 'Valid From', 'Valid To', 'Warning Text']
    df_clean = df_expanded.drop_duplicates(subset=duplicate_cols, keep='first')
    
    # Step 4: Keep only the highest severity for each region/date/weather_type
    # Define severity order (highest to lowest)
    severity_order = {'Red': 3, 'Orange': 2, 'Yellow': 1, 'None': 0}
    
    # Create a numeric severity column
    df_clean['severity_value'] = df_clean['warning_severity'].map(
        lambda x: severity_order.get(x, 0) if isinstance(x, str) else 0
    )
    
    # Group by region, date, weather_type and keep only highest severity
    df_highsev = df_clean.sort_values('severity_value', ascending=False)
    df_highsev = df_highsev.drop_duplicates(subset=['region', 'date', 'weather_type'], keep='first')
    
    # Drop temporary column
    df_highsev = df_highsev.drop('severity_value', axis=1)
    
    # Add county count per region as a covariate -1
    region_county_counts = {region: len(counties) for region, counties in hse_regions.items()}
    df_highsev['counties_in_region'] = df_highsev['region'].map(region_county_counts)
    
    # Calculate weather event density per county
    df_highsev['weather_event_density'] = df_highsev['weather_event'] / df_highsev['counties_in_region'] \
        if 'weather_event' in df_highsev.columns else 1
    
    # Add inverse county count weighting (approach #7)
    df_highsev['county_weight'] = 1 / df_highsev['counties_in_region']
    
    # Count counties affected by the specific weather event (if county data available)
    def count_affected_counties(row):
        triggered_counties = [county for county in county_cols if row.get(county, 0) == 1]
        region_counties = hse_regions.get(row['region'], [])
        return sum(1 for county in triggered_counties if county in region_counties)
    
    # Only calculate this if we have county-level information
    if len(county_cols) > 0:
        df_highsev['affected_counties_count'] = df_highsev.apply(count_affected_counties, axis=1)
        
        # Calculate proportion of counties in the region affected
        df_highsev['affected_counties_proportion'] = df_highsev['affected_counties_count'] / df_highsev['counties_in_region']
    
    return df_highsev

In [37]:
df_weather_final = process_weather_data_final(df_weather_cat,hse_regions,county_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [38]:
# Count the rows before filtering
total_before = len(df_weather_final)

# Filter out rows where Warning Colour is 'notmapped'
df_filtered = df_weather_final[df_weather_final['Warning Colour'] != 'notmapped'].copy()

# Count the rows after filtering
total_after = len(df_weather_final)
removed = total_before - total_after

# Print the results
print(f"Removed {removed} rows with 'notmapped' Warning Colour")
print(f"This represents {removed/total_before*100:.2f}% of the original data")

This represents 0.00% of the original data


In [39]:
# Count rows where warning_severity is 'Unknown'
unknown_count = (df_weather_final['warning_severity'] == 'Unknown').sum()
print(f"Total 'Unknown' entries in warning_severity: {unknown_count}")

# See what percentage of total records this represents
total_records = len(df_weather_final)
unknown_percentage = (unknown_count / total_records) * 100
print(f"'Unknown' entries represent {unknown_percentage:.2f}% of total records")

# You can also look at a few examples of these rows
unknown_examples = df_weather_final[df_weather_final['warning_severity'] == 'Unknown'].head(5)
print("\nExample rows with 'Unknown' warning severity:")
print(unknown_examples[['date', 'weather_type', 'warning_severity', 'Warning Colour', 'Warning Text']])

'Unknown' entries represent 1.17% of total records




In [40]:
##remove Minor Warnings
# Check how many rows contain "Minor" in Warning Text
minor_count = df_weather_final['Warning Text'].str.contains('Minor', case=True, na=False).sum()
print(f"Found {minor_count} rows containing 'Minor' in Warning Text")

# Remove rows with "Minor" in Warning Text
processed_df_cleaned = df_weather_final[~df_weather_final['Warning Text'].str.contains('Minor', case=True, na=False)]

# Check how many rows were removed
rows_removed = len(df_weather_final) - len(processed_df_cleaned)
print(f"Removed {rows_removed} rows containing 'Minor'")

Removed 48 rows containing 'Minor'


In [41]:
# Initialize a dictionary to store results
unknown_counts = {}
total_rows = len(processed_df_cleaned)

# Loop through each column
for col in processed_df_cleaned.columns:
    # Only check string/object columns
    if processed_df_cleaned[col].dtype == 'object':
        # Count 'Unknown' values
        count = (processed_df_cleaned[col] == 'Unknown').sum()
        
        # If any found, save the count
        if count > 0:
            unknown_counts[col] = count

# Print results
print(f"Found 'Unknown' values in {len(unknown_counts)} columns:")
for col, count in unknown_counts.items():
    percentage = (count / total_rows) * 100
    print(f"  - {col}: {count} rows ({percentage:.2f}%)")

Found 'Unknown' values in 1 columns:


In [42]:
# Check how many rows have "Unknown" severity before removing
unknown_count = (processed_df_cleaned['warning_severity'] == 'Unknown').sum()
print(f"Found {unknown_count} rows with 'Unknown' warning severity")

# Remove rows with "Unknown" severity
processed_df_cleaned = processed_df_cleaned[processed_df_cleaned['warning_severity'] != 'Unknown']




In [43]:
# Filter to just the HSE Mid West region
#HSE Dublin and North East
#HSE Dublin and Midlands
#HSE Dublin and South East
#HSE Mid West
#HSE South West
#HSE West and North West

midwest_df = processed_df_cleaned[processed_df_cleaned['region'] == 'HSE Dublin and Midlands'].copy()

# Convert date to string for consistent comparison if it's not already
if pd.api.types.is_datetime64_dtype(midwest_df['date']):
    midwest_df['date_str'] = midwest_df['date'].dt.strftime('%Y-%m-%d')
else:
    midwest_df['date_str'] = midwest_df['date'].astype(str)

# Define columns that identify a unique event (exact duplicates)
exact_dupe_cols = ['date_str', 'weather_type', 'Valid From', 'Valid To', 'Warning Text']

# Mark exact duplicates
exact_dup_mask = midwest_df.duplicated(subset=exact_dupe_cols, keep=False)

# Get exactly duplicated rows and sort them
exact_dups = midwest_df[exact_dup_mask].sort_values(exact_dupe_cols)

# Show the count
exact_dupe_count = exact_dup_mask.sum()
total_count = len(midwest_df)
print(f"Exact duplicates in HSE Mid West: {exact_dupe_count} out of {total_count} rows ({round(exact_dupe_count/total_count*100, 1)}%)")

# Show duplicates with key information
columns_to_show = ['date_str', 'weather_type', 'warning_severity', 'Valid From', 'Valid To', 'Warning Text']
midwest_exact_dups = exact_dups[columns_to_show]
print(midwest_exact_dups.head(20))

Exact duplicates in HSE Mid West: 0 out of 734 rows (0.0%)
Empty DataFrame
Index: []


In [44]:
##check counts 
## count events from each dataset befor processing 
# Count events by severity in each dataset
all_counts_2 = processed_df_cleaned['Warning Colour'].value_counts()

# Print the counts for each dataset
print("Combined dataset counts:")
print(all_counts_2)


Combined dataset counts:
Yellow    3744
Orange     760
Red         67
Name: count, dtype: int64


### Step 6 Save combined datasets to csv

In [45]:
processed_df_cleaned.to_csv('/home/paulharford/college/project/project_data/processed/WEATHERED_warnings_2014-2023_cleaned_v4.csv', index=False)