### Set styling for plotting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()

### Step 1: save environment file

In [2]:
!conda env export > ods_met_environment.yml

### Step 2: import modules

In [10]:
!pip install odfpy

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting odfpy
  Downloading odfpy-1.4.1.tar.gz (717 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m717.0/717.0 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: odfpy
  Building wheel for odfpy (setup.py) ... [?25done
[?25h  Created wheel for odfpy: filename=odfpy-1.4.1-py2.py3-none-any.whl size=160718 sha256=fe2379f2b274ddd2d7d83a5c34c0ddc61c294e17f4ff37f88aa25c314ba266bc
  Stored in directory: /tmp/pip-ephem-wheel-cache-vo9juapr/wheels/36/5d/63/8243a7ee78fff0f944d638fd0e66d7278888f5e2285d7346b6
Successfully built odfpy
Installing collected packages: odfpy
Successfully installed odfpy-1.4.1


In [12]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv

### Step 3: import file

In [17]:
data_directory_ods = "/home/paulharford/college/project/project_data/met_eireann/Archived_Wx_Warnings_25April2012_17February2021.ods"
full_path_ods = os.path.abspath(data_directory_ods)


In [19]:
# Read the .ods file
df_ods = pd.read_excel(full_path_ods, engine='odf', parse_dates=['Issue Time', 'Valid From', 'Valid To'])

### Step 4: Cleaning and processing data into a common format for combination

### Step 4.1: Remove provinces

In [7]:
##check if countires are also set to true when the province is seleted 
# list of counties
counties = ['Clare', 'Cork', 'Kerry', 'Limerick', 'Tipperary', 'Waterford']

# Filter where 'Munster' is True
df_munster = df_ods[df_ods['Munster']]

# Check if all specified counties are True in the filtered DataFrame
all_counties_true = df_munster[counties].all(axis=1)

# Identify rows where not all counties are True
rows_with_false_counties = df_munster[~all_counties_true]

# Display the results
print(f"Total rows where 'Munster' is True: {df_munster.shape[0]}")
print(f"Rows where 'Munster' is True but not all counties are True: {rows_with_false_counties.shape[0]}")

if not rows_with_false_counties.empty:
    print("\nSample rows with inconsistencies:")
    print(rows_with_false_counties.head())
else:
    print("All county columns are True when 'Munster' is True.")

Total rows where 'Munster' is True: 478
Rows where 'Munster' is True but not all counties are True: 0
All county columns are True when 'Munster' is True.


In [8]:
##check if countires are also set to true when the province is seleted 
# list of counties
counties = ['Carlow', 'Dublin', 'Kildare', 'Kilkenny', 'Laois', 'Longford', 'Louth', 'Meath', 'Offaly', 'Westmeath', 'Wexford','Wicklow']

# Filter where 'Leinster' is True
df_leinster = df_ods[df_ods['Leinster']]

# Check if all specified counties are True in the filtered DataFrame
all_counties_true = df_leinster[counties].all(axis=1)

# Identify rows where not all counties are True
rows_with_false_counties = df_leinster[~all_counties_true]

# Display the results
print(f"Total rows where 'Leinster' is True: {df_leinster.shape[0]}")
print(f"Rows where 'Leinster' is True but not all counties are True: {rows_with_false_counties.shape[0]}")

if not rows_with_false_counties.empty:
    print("\nSample rows with inconsistencies:")
    print(rows_with_false_counties.head())
else:
    print("All county columns are True when 'Leinster' is True.")

Total rows where 'Leinster' is True: 513
Rows where 'Leinster' is True but not all counties are True: 0
All county columns are True when 'Leinster' is True.


In [9]:
##After checking 2 provinces, when a province is true all the counties are also true so i can drop the provinces 
df_ods= df_ods.drop(['Connacht', 'Leinster', 'Munster', 'Ulster'], axis=1)

In [10]:
##checking if ods data has an Ireland columns whihc rekates to an all of ireland warning (present in other datasets) 
df_ods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1654 entries, 0 to 1653
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Issue Time       1654 non-null   datetime64[ns]
 1   Valid From       1654 non-null   datetime64[ns]
 2   Valid To         1654 non-null   datetime64[ns]
 5   WhereToText      1651 non-null   object        
 7   Clare            1654 non-null   bool          
 8   Cork             1654 non-null   bool          
 9   Kerry            1654 non-null   bool          
 10  Limerick         1654 non-null   bool          
 11  Tipperary        1654 non-null   bool          
 12  Tipperary SR     1654 non-null   bool          
 13  Waterford        1654 non-null   bool          
 14  Carlow           1654 non-null   bool          
 15  Dublin           1654 non-null   bool          
 16  Kildare          1654 non-null   bool          
 17  Kilkenny         1654 non-null   bool   

##### Step 4.2 Merge Tipperary SR 
there are only 3 events in Tipperary SR so i will merge it with Tipperary 

In [11]:
df_tippsr = df_ods[df_ods['Tipperary SR']]
df_tippsr.head()


Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
838,2017-11-23 12:00:00,2017-11-24 11:00:00,2017-11-25 23:59:00,Yellow,Snow/Ice,Ireland,Update: Scattered snow showers across the coun...,True,True,True,...,True,True,True,True,True,True,True,True,True,True
840,2017-11-24 11:34:50,2017-11-24 11:00:00,2017-11-25 23:59:00,Yellow,Snow/Ice,Ireland,Update: Scattered snow showers across the coun...,True,True,True,...,True,True,True,True,True,True,True,True,True,True
841,2017-11-25 09:56:08,2017-11-25 10:00:00,2017-11-26 10:00:00,Yellow,Snow/Ice,Ireland,Update: Scattered wintry showers across the co...,True,True,True,...,True,True,True,True,True,True,True,True,True,True


### Step 4.2:  Merge Tipperary SR (3 events) to simplify county breakdown

In [12]:
# Create a copy of the filtered data
df_ods = df_ods.copy()

## Its not clear what Tipperary SR actually is and is not the same in the XML data so i'm going to merge tipperary and tipperary SR 
df_ods['Tipperary'] = df_ods[['Tipperary', 'Tipperary SR']].max(axis=1)
df_ods = df_ods.drop('Tipperary SR', axis=1)

##### Step 4.3 convert true/false to 0/1


In [13]:
# Identify all boolean columns in the DataFrame
bool_cols = df_ods.select_dtypes(include=['bool']).columns

# Convert boolean columns to integers (True -> 1, False -> 0)
df_ods[bool_cols] = df_ods[bool_cols].astype(int)

# Verify the changes by displaying data types
print("Data types after conversion:")
print(df_ods.dtypes)

Data types after conversion:
Issue Time         datetime64[ns]
Valid From         datetime64[ns]
Valid To           datetime64[ns]
WhereToText                object
Clare                       int64
Cork                        int64
Kerry                       int64
Limerick                    int64
Tipperary                   int64
Waterford                   int64
Carlow                      int64
Dublin                      int64
Kildare                     int64
Kilkenny                    int64
Laois                       int64
Longford                    int64
Louth                       int64
Meath                       int64
Offaly                      int64
Westmeath                   int64
Wexford                     int64
Wicklow                     int64
Cavan                       int64
Donegal                     int64
Monaghan                    int64
Galway                      int64
Leitrim                     int64
Mayo                        int64
Roscommon          

#### Step 4.4 confirm date/time settings, check date ranges and select appropriate rnages to combine 

In [14]:
##make sure datetime is the same in both dataframes
# Ensure datetime columns are consistently UTC
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_ods[datetime_cols] = df_ods[datetime_cols].apply(pd.to_datetime, utc=True)

In [15]:
print("ODS Filtered Date Range:")
print(f"Start date: {df_ods['Issue Time'].min()}")
print(f"End date: {df_ods['Issue Time'].max()}")

ODS Filtered Date Range:
Start date: 2012-04-25 12:00:00+00:00
End date: 2021-02-17 09:00:00+00:00


#### Step 4.5 filter data for dates of interest 2013 to 2020 (full years of data)

In [16]:
 # Filter ODS data from 2013 to end of 2020
### check plots on date change check for duplicates 
df_ods_filtered = df_ods[
    (df_ods['Issue Time'] >= '2013-01-01') & 
    (df_ods['Issue Time'] <= '2020-12-31 23:59:59')
]



#### Step 5 check for any missing values and duplicates and drop unneeded columns

In [17]:
  total_missing = df_ods_filtered.isnull().sum().sum()
  percent_missing = (total_missing / df_ods_filtered.size) * 100
  print(f"Total missing values: {total_missing}")
  print(f"Percentage of missing values: {percent_missing:.2f}%")



Total missing values: 5
Percentage of missing values: 0.01%


In [18]:
# All rows with any missing values
rows_with_missing = df_ods[df_ods.isnull().any(axis=1)]
rows_with_missing.head()


Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
10,2012-09-25 09:00:00+00:00,2012-09-25 09:00:00+00:00,2012-09-26 06:00:00+00:00,Yellow,Rain,,1. Further spells of heavy rain today with a r...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
859,2017-12-10 09:00:00+00:00,2017-12-10 09:00:00+00:00,2017-12-10 09:00:00+00:00,Yellow,Snow/Ice,"Wexford, Cork, Kerry, Limerick and Waterford",,0,1,1,...,1,0,0,0,0,0,0,0,0,0
893,2017-12-30 08:00:00+00:00,2017-12-30 21:00:00+00:00,2017-12-31 10:00:00+00:00,Yellow,Wind,,Update of warning:Tonight and Sunday morning s...,1,0,0,...,0,0,1,1,1,1,1,1,1,1
997,2018-03-02 00:00:00+00:00,2018-03-02 00:00:00+00:00,2018-03-02 00:00:00+00:00,Red,Snow/Ice,"Munster, Leinster and Galway",,1,1,1,...,1,1,0,0,0,1,0,0,0,0
1094,2018-10-10 11:00:00+00:00,2018-10-10 11:00:00+00:00,2018-10-10 11:00:00+00:00,Yellow,Wind,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
##look for duplicates
duplicate_rows = df_ods_filtered.duplicated()
duplicate_rows.sum()
df_ods_filtered[duplicate_rows]

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo


In [20]:
##drop columns
#columns_to_drop = ['WhereToText', 'Warning Text']
#df_ods_filtered = df_ods_filtered.drop(columns=columns_to_drop)


In [21]:
df_ods_filtered.head(10)

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Clare,Cork,Kerry,...,Wexford,Wicklow,Cavan,Donegal,Monaghan,Galway,Leitrim,Mayo,Roscommon,Sligo
14,2013-02-21 18:00:00+00:00,2013-02-21 18:00:00+00:00,2013-02-22 09:00:00+00:00,Yellow,Low Temperature/Ice,"Connacht, Leinster, Cavan, Monaghan and Donegal",Temperatures will fall to minus 3 tonight in p...,0,0,0,...,1,1,1,1,1,1,1,1,1,1
15,2013-02-23 11:00:00+00:00,2013-02-23 11:00:00+00:00,2013-02-24 12:00:00+00:00,Yellow,Snow/Ice,Leinster,Scattered snow showers over Leinster with a ri...,0,0,0,...,1,1,0,0,0,0,0,0,0,0
16,2013-02-24 11:00:00+00:00,2013-02-24 18:00:00+00:00,2013-02-25 09:00:00+00:00,Yellow,Low Temperature/Ice,Ireland,Very cold and frosty overnight with temperatur...,1,1,1,...,1,1,1,1,1,1,1,1,1,1
17,2013-02-25 20:00:00+00:00,2013-02-25 20:00:00+00:00,2013-02-26 09:00:00+00:00,Yellow,Low Temperature/Ice,"Munster, Leinster, Connacht, Donegal, Monaghan...",Cold tonight with air temperatures as low as -...,1,1,1,...,1,1,1,1,1,1,1,1,1,1
18,2013-03-10 12:00:00+00:00,2013-03-10 18:00:00+00:00,2013-03-11 18:00:00+00:00,Orange,Snow/Ice,"Dublin, Carlow, Kildare, Kilkenny, Laois, Lout...","Scattered snow showers this evening, tonight a...",0,1,1,...,1,1,0,1,0,0,0,1,0,0
19,2013-03-11 09:00:00+00:00,2013-03-11 09:00:00+00:00,2013-03-11 21:00:00+00:00,Yellow,Low Temperature/Ice,"Munster, Leinster, Connacht, Donegal, Monaghan...","Abnormally low temperatures, combined with str...",1,1,1,...,1,1,1,1,1,1,1,1,1,1
20,2013-03-15 11:00:00+00:00,2013-03-15 21:00:00+00:00,2013-03-16 07:00:00+00:00,Yellow,Low Temperature/Ice,"Munster, Leinster, Connacht, Donegal, Monaghan...","Widespread subzero temperatures tonight, as lo...",1,1,1,...,1,1,1,1,1,1,1,1,1,1
21,2013-03-15 19:00:00+00:00,2013-03-15 23:00:00+00:00,2013-03-16 09:00:00+00:00,Yellow,Low Temperature/Ice,"Munster, Leinster, Connacht, Donegal, Monaghan...",Very cold and frosty overnight with icy roads....,1,1,1,...,1,1,1,1,1,1,1,1,1,1
22,2013-03-18 15:00:00+00:00,2013-03-18 21:00:00+00:00,2013-03-19 12:00:00+00:00,Yellow,Snow/Ice,"Cavan, Monaghan, Donegal and Louth",Snow showers tonight (Monday) and Tuesday morn...,0,0,0,...,0,0,1,1,1,0,0,0,0,0
23,2013-03-18 22:00:00+00:00,2013-03-18 22:00:00+00:00,2013-03-19 12:00:00+00:00,Yellow,Snow/Ice,"Cavan, Monaghan, Donegal, Dublin, Longford, Lo...",Snow showers expected overnight and on Tuesday...,0,0,0,...,0,1,1,1,1,0,1,0,0,1


### Step 6 save data
With the provinces removed - the whetertotext and text also dropped

In [22]:
df_ods_filtered.to_csv('/mnt/hgfs/shared/project_data/met_eireann/ods_warnings_2017_2020.csv', index=False)