#### Set styling for plotting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()

#### Step 1: save environment file

In [2]:
!conda env export > ods_met_environment.yml

#### Step 2: import modules

In [3]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv

## INFORMATION

## ODS - Met Eirean old system (manual) for recording Adverse weather
Start date: 2012-04-25 12:00:00
End date: 2021-02-17 09:00:00
only 1654 events in totat which seems low compared to the new rss xml system even with Advisories removed
will use data from 2013 to when the new system starts in 2018 


#### Step 3: import file

In [4]:
data_directory_ods = "/mnt/hgfs/shared/project_data/met_eireann/Archived_Wx_Warnings_25April2012_17February2021.ods"
full_path_ods = os.path.abspath(data_directory_ods)


In [5]:
# Read the .ods file
df_ods = pd.read_excel(full_path_ods, engine='odf', parse_dates=['Issue Time', 'Valid From', 'Valid To'])

In [6]:
# check the first few rows
print(df_ods.head())

# look at dataframe info
print(df_ods.info())

# check the df shape
print(f"Number of rows: {df_ods.shape[0]}")
print(f"Number of columns: {df_ods.shape[1]}")

print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

0 2012-04-25 12:00:00 2012-04-25 12:00:00 2012-04-26 12:00:00         Yellow   
1 2012-06-01 21:00:00 2012-06-02 12:00:00 2012-06-03 21:00:00         Yellow   
2 2012-06-02 14:00:00 2012-06-02 14:00:00 2012-06-03 12:00:00         Orange   
3 2012-06-08 10:00:00 2012-06-08 10:00:00 2012-06-08 23:59:00         Yellow   
4 2012-06-14 20:00:00 2012-06-14 20:00:00 2012-06-16 12:00:00         Yellow   

0            Rain                               Munster and Leinster   
1            Rain                     Munster, Connacht and Leinster   
2            Rain                               Munster and Leinster   
3            Rain                              Connacht and Leinster   
4            Rain  Munster, Leinster, Connacht, Donegal, Monaghan...   

0  Heavy rain moving into Southern coastal counti...     True   True   True   
1  Between 25 and 65 mm of rain possible, (heavie...     True   True   True   
2  Between 25mm & 65mm of rain expected over Lein...     True   True   True   
3

#### Step 4: Cleaning and processing data into a common format for combination

##### Step 4.1: deal with provinces

In [9]:
##check if countires are also set to true when the province is seleted 
# list of counties
counties = ['Clare', 'Cork', 'Kerry', 'Limerick', 'Tipperary', 'Waterford']

# Filter where 'Munster' is True
df_munster = df_ods[df_ods['Munster']]

# Check if all specified counties are True in the filtered DataFrame
all_counties_true = df_munster[counties].all(axis=1)

# Identify rows where not all counties are True
rows_with_false_counties = df_munster[~all_counties_true]

# Display the results
print(f"Total rows where 'Munster' is True: {df_munster.shape[0]}")
print(f"Rows where 'Munster' is True but not all counties are True: {rows_with_false_counties.shape[0]}")

if not rows_with_false_counties.empty:
    print("\nSample rows with inconsistencies:")
    print(rows_with_false_counties.head())
else:
    print("All county columns are True when 'Munster' is True.")

Total rows where 'Munster' is True: 478
Rows where 'Munster' is True but not all counties are True: 0
All county columns are True when 'Munster' is True.


In [11]:
##check if countires are also set to true when the province is seleted 
# list of counties
counties = ['Carlow', 'Dublin', 'Kildare', 'Kilkenny', 'Laois', 'Longford', 'Louth', 'Meath', 'Offaly', 'Westmeath', 'Wexford','Wicklow']

# Filter where 'Leinster' is True
df_leinster = df_ods[df_ods['Leinster']]

# Check if all specified counties are True in the filtered DataFrame
all_counties_true = df_leinster[counties].all(axis=1)

# Identify rows where not all counties are True
rows_with_false_counties = df_leinster[~all_counties_true]

# Display the results
print(f"Total rows where 'Leinster' is True: {df_leinster.shape[0]}")
print(f"Rows where 'Leinster' is True but not all counties are True: {rows_with_false_counties.shape[0]}")

if not rows_with_false_counties.empty:
    print("\nSample rows with inconsistencies:")
    print(rows_with_false_counties.head())
else:
    print("All county columns are True when 'Leinster' is True.")

Total rows where 'Leinster' is True: 513
Rows where 'Leinster' is True but not all counties are True: 0
All county columns are True when 'Leinster' is True.


In [12]:
##After checking 2 provinces, when a province is true all the counties are also true so i can drop the provinces 
df_ods= df_ods.drop(['Connacht', 'Leinster', 'Munster', 'Ulster'], axis=1)

In [28]:
##checking if ods data has an Ireland columns whihc rekates to an all of ireland warning (present in other datasets) 
df_ods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1654 entries, 0 to 1653
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   Issue Time       1654 non-null   datetime64[ns, UTC]
 1   Valid From       1654 non-null   datetime64[ns, UTC]
 2   Valid To         1654 non-null   datetime64[ns, UTC]
 5   WhereToText      1651 non-null   object             
 7   Clare            1654 non-null   int64              
 8   Cork             1654 non-null   int64              
 9   Kerry            1654 non-null   int64              
 10  Limerick         1654 non-null   int64              
 11  Tipperary        1654 non-null   int64              
 12  Waterford        1654 non-null   int64              
 13  Carlow           1654 non-null   int64              
 14  Dublin           1654 non-null   int64              
 15  Kildare          1654 non-null   int64              
 16  Kilkenny         1

##### Step 4.2 Merge Tipperary SR 
there are only 3 events in Tipperary SR so i will merge it with Tipperary 

In [13]:
df_tippsr = df_ods[df_ods['Tipperary SR']]
df_tippsr.head()


Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
838,2017-11-23 12:00:00,2017-11-24 11:00:00,2017-11-25 23:59:00,Yellow,Snow/Ice,Ireland,Update: Scattered snow showers across the coun...,True,True,True,...,True,True,True,True,True,True,True,True,True,True
840,2017-11-24 11:34:50,2017-11-24 11:00:00,2017-11-25 23:59:00,Yellow,Snow/Ice,Ireland,Update: Scattered snow showers across the coun...,True,True,True,...,True,True,True,True,True,True,True,True,True,True
841,2017-11-25 09:56:08,2017-11-25 10:00:00,2017-11-26 10:00:00,Yellow,Snow/Ice,Ireland,Update: Scattered wintry showers across the co...,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [14]:
# Create a copy of the filtered data
df_ods = df_ods.copy()

## Its not clear what Tipperary SR actually is and is not the same in the XML data so i'm going to merge tipperary and tipperary SR 
df_ods['Tipperary'] = df_ods[['Tipperary', 'Tipperary SR']].max(axis=1)
df_ods = df_ods.drop('Tipperary SR', axis=1)

##### Step 4.3 convert true/false to 0/1


In [16]:
# Identify all boolean columns in the DataFrame
bool_cols = df_ods.select_dtypes(include=['bool']).columns

# Convert boolean columns to integers (True -> 1, False -> 0)
df_ods[bool_cols] = df_ods[bool_cols].astype(int)

# Verify the changes by displaying data types
print("Data types after conversion:")
print(df_ods.dtypes)

Data types after conversion:
Issue Time         datetime64[ns]
Valid From         datetime64[ns]
Valid To           datetime64[ns]
WhereToText                object
Clare                       int64
Cork                        int64
Kerry                       int64
Limerick                    int64
Tipperary                   int64
Waterford                   int64
Carlow                      int64
Dublin                      int64
Kildare                     int64
Kilkenny                    int64
Laois                       int64
Longford                    int64
Louth                       int64
Meath                       int64
Offaly                      int64
Westmeath                   int64
Wexford                     int64
Wicklow                     int64
Cavan                       int64
Donegal                     int64
Monaghan                    int64
Galway                      int64
Leitrim                     int64
Mayo                        int64
Roscommon          

##### Step 4.4 confirm date/time settings, check date ranges and select appropriate rnages to combine 

In [17]:
##make sure datetime is the same in both dataframes
# Ensure datetime columns are consistently UTC
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_ods[datetime_cols] = df_ods[datetime_cols].apply(pd.to_datetime, utc=True)

In [18]:
print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

ODS Filtered Date Range:
Start date: 2012-04-25 12:00:00+00:00
End date: 2021-02-17 09:00:00+00:00


##### Step 4.5 filter data for dates of interest 2013 to 2020 (full years of data)

In [19]:
 # Filter ODS data from 2013 to end of 2020
### check plots on date change check for duplicates 
df_ods_filtered = df_ods[
    (df_ods['Issue Time'] >= '2013-01-01') & 
    (df_ods['Issue Time'] <= '2020-12-31 23:59:59')
]



#####Step 5 check for any missing values and duplicates and drop unneeded columns

In [20]:
  total_missing = df_ods_filtered.isnull().sum().sum()
  percent_missing = (total_missing / df_ods_filtered.size) * 100
  print(f"Total missing values: {total_missing}")
  print(f"Percentage of missing values: {percent_missing:.2f}%")



Total missing values: 5
Percentage of missing values: 0.01%


In [22]:
# All rows with any missing values
rows_with_missing = df_ods[df_ods.isnull().any(axis=1)]
rows_with_missing.head()


Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
10,2012-09-25 09:00:00+00:00,2012-09-25 09:00:00+00:00,2012-09-26 06:00:00+00:00,Yellow,Rain,,1. Further spells of heavy rain today with a r...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
859,2017-12-10 09:00:00+00:00,2017-12-10 09:00:00+00:00,2017-12-10 09:00:00+00:00,Yellow,Snow/Ice,"Wexford, Cork, Kerry, Limerick and Waterford",,0,1,1,...,1,0,0,0,0,0,0,0,0,0
893,2017-12-30 08:00:00+00:00,2017-12-30 21:00:00+00:00,2017-12-31 10:00:00+00:00,Yellow,Wind,,Update of warning:Tonight and Sunday morning s...,1,0,0,...,0,0,1,1,1,1,1,1,1,1
997,2018-03-02 00:00:00+00:00,2018-03-02 00:00:00+00:00,2018-03-02 00:00:00+00:00,Red,Snow/Ice,"Munster, Leinster and Galway",,1,1,1,...,1,1,0,0,0,1,0,0,0,0
1094,2018-10-10 11:00:00+00:00,2018-10-10 11:00:00+00:00,2018-10-10 11:00:00+00:00,Yellow,Wind,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
##look for duplicates
duplicate_rows = df_ods_filtered.duplicated()
duplicate_rows.sum()
df_ods_filtered[duplicate_rows]

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo


In [25]:
##drop columns
columns_to_drop = ['WhereToText', 'Warning Text']
df_ods_filtered = df_ods_filtered.drop(columns=columns_to_drop)


In [26]:
df_ods_filtered.head(10)

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Clare,Cork,Kerry,Limerick,Tipperary,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
14,2013-02-21 18:00:00+00:00,2013-02-21 18:00:00+00:00,2013-02-22 09:00:00+00:00,Yellow,Low Temperature/Ice,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
15,2013-02-23 11:00:00+00:00,2013-02-23 11:00:00+00:00,2013-02-24 12:00:00+00:00,Yellow,Snow/Ice,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
16,2013-02-24 11:00:00+00:00,2013-02-24 18:00:00+00:00,2013-02-25 09:00:00+00:00,Yellow,Low Temperature/Ice,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
17,2013-02-25 20:00:00+00:00,2013-02-25 20:00:00+00:00,2013-02-26 09:00:00+00:00,Yellow,Low Temperature/Ice,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
18,2013-03-10 12:00:00+00:00,2013-03-10 18:00:00+00:00,2013-03-11 18:00:00+00:00,Orange,Snow/Ice,0,1,1,0,1,...,1,1,0,1,0,0,0,1,0,0
19,2013-03-11 09:00:00+00:00,2013-03-11 09:00:00+00:00,2013-03-11 21:00:00+00:00,Yellow,Low Temperature/Ice,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
20,2013-03-15 11:00:00+00:00,2013-03-15 21:00:00+00:00,2013-03-16 07:00:00+00:00,Yellow,Low Temperature/Ice,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
21,2013-03-15 19:00:00+00:00,2013-03-15 23:00:00+00:00,2013-03-16 09:00:00+00:00,Yellow,Low Temperature/Ice,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
22,2013-03-18 15:00:00+00:00,2013-03-18 21:00:00+00:00,2013-03-19 12:00:00+00:00,Yellow,Snow/Ice,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
23,2013-03-18 22:00:00+00:00,2013-03-18 22:00:00+00:00,2013-03-19 12:00:00+00:00,Yellow,Snow/Ice,0,0,0,0,0,...,0,1,1,1,1,0,1,0,0,1


### Step 5 save data
With the provinces removed - the whetertotext and text also dropped

In [29]:
df_ods_filtered.to_csv('/mnt/hgfs/shared/project_data/met_eireann/ods_warnings_2017_2020.csv', index=False)