#### Set styling for plotting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()

#### Step 1: save environment file

In [2]:
!conda env export > combined_met_environment.yml

#### Step 2: import modules

In [3]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv

## INFORMATION

##load the 3 processed .csv fiels for combination in a single complete dataset 

#### Step 3: import csv files

In [12]:
data_directory_xml = "/mnt/hgfs/shared/project_data/met_eireann/xml_warnings_2018_2023_08.csv"
data_directory_ods = "/mnt/hgfs/shared/project_data/met_eireann/ods_warnings_2017_2020.csv"
#data_directory_xl = "/mnt/hgfs/shared/project_data/met_eireann/National warnings from pdfs_2023.xlsx"
full_path_xml = os.path.abspath(data_directory_xml)
full_path_ods = os.path.abspath(data_directory_ods)
#full_path_xl = os.path.abspath(data_directory_xl)

In [15]:
# Read the 2023 excel file
df_xml = pd.read_csv(full_path_xml)
df_ods = pd.read_csv(full_path_ods)

In [16]:
# First, ensure our datetime conversions are correct
df_xml['Issue Time'] = pd.to_datetime(df_xml['Issue Time'])
df_ods['Issue Time'] = pd.to_datetime(df_ods['Issue Time'])

# Get counts for specific years (2018-2020) for both datasets
xml_year_counts = df_xml['Issue Time'].dt.year.value_counts().sort_index()
ods_year_counts = df_ods['Issue Time'].dt.year.value_counts().sort_index()

# Create a DataFrame to display the years side by side
comparison_df = pd.DataFrame({
    'XML Dataset': xml_year_counts,
    'ODS Dataset': ods_year_counts
})

# Filter 2018-2020
comparison_df = comparison_df.loc[2018:2020]

# Calculate the difference between datasets
comparison_df['Difference'] = comparison_df['XML Dataset'] - comparison_df['ODS Dataset']

# Display the comparison
print("\nComparison of Events (2018-2020):")
print(comparison_df)

# Optional: Create a percentage difference column to see relative changes
comparison_df['Percentage Difference'] = (
    (comparison_df['XML Dataset'] - comparison_df['ODS Dataset']) / 
    comparison_df['ODS Dataset'] * 100
).round(2)

print("\nWith Percentage Difference:")
print(comparison_df)


Comparison of Events (2018-2020):
            XML Dataset  ODS Dataset  Difference
Issue Time                                      
2018              227.0        249.0       -22.0
2019              162.0        187.0       -25.0
2020              265.0        277.0       -12.0

With Percentage Difference:
            XML Dataset  ODS Dataset  Difference  Percentage Difference
Issue Time                                                             
2018              227.0        249.0       -22.0                  -8.84
2019              162.0        187.0       -25.0                 -13.37
2020              265.0        277.0       -12.0                  -4.33


In [22]:
df_xml['event_id'] = df_xml['Valid From'].astype(str) + '_' + df_xml['Warning Element'] + '_' + df_xml['Warning Colour']
df_ods['event_id'] = df_ods['Valid From'].astype(str) + '_' + df_ods['Warning Element'] + '_' + df_ods['Warning Colour']

# Find events that are only in ODS dataset
ods_only_events = df_ods[~df_ods['event_id'].isin(df_xml['event_id'])]

# Display sample of unique events for each year
for year in [2018, 2019, 2020]:
    year_events = ods_only_events[ods_only_events['Issue Time'].dt.year == year]
    print(f"\nSample of events unique to ODS dataset in {year}:")
    print(year_events[['Issue Time', 'Warning Element', 'Warning Colour']].head())


Sample of events unique to ODS dataset in 2018:
885 2018-01-01 13:00:00+00:00            Wind         Yellow
886 2018-01-01 19:00:00+00:00            Wind         Orange
887 2018-01-02 05:00:00+00:00            Wind         Yellow
888 2018-01-02 05:03:21+00:00            Wind         Orange
889 2018-01-02 12:00:00+00:00            Rain         Yellow

Sample of events unique to ODS dataset in 2019:
1134 2019-01-15 20:00:00+00:00            Wind         Yellow
1135 2019-01-15 19:00:00+00:00            Wind         Yellow
1136 2019-01-15 19:00:00+00:00            Wind         Yellow
1137 2019-01-16 13:00:00+00:00            Wind         Yellow
1138 2019-01-21 11:00:00+00:00        Snow/Ice         Yellow

Sample of events unique to ODS dataset in 2020:
1321 2020-01-05 09:00:00+00:00            Wind         Yellow
1322 2020-01-05 10:00:00+00:00            Wind         Yellow
1323 2020-01-06 08:00:00+00:00            Wind         Yellow
1324 2020-01-06 15:00:00+00:00            Wind      

#### Step:6 confirm date/time settings, check date ranges and select appropriate rnages to combine 

In [19]:
##make sure datetime is the same in both dataframes
# Ensure datetime columns are consistently UTC
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_ods[datetime_cols] = df_ods[datetime_cols].apply(pd.to_datetime, utc=True)
df_xml_consolidated[datetime_cols] = df_xml_consolidated[datetime_cols].apply(pd.to_datetime, utc=True)