#### Set styling for plotting

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()

### Step 1: save environment file

In [4]:
!conda env export > xml_met_environment.yml

### Step 2: import modules

In [6]:
import os
import glob
import shutil
import codecs
import logging
from datetime import datetime
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm.notebook import tqdm

### Step 3: process XML data 

In [8]:
class WeatherDataProcessor:
    """Process weather warning XML files, handling advisories, errors, and valid weather events."""
    
    def __init__(self, data_directory):
        """Initialize the processor with directory setup and configurations."""
        self.data_directory = data_directory
        
        # Set up directory paths first
        self.advisories_dir = os.path.join(self.data_directory, 'advisories')
        self.error_dir = os.path.join(self.data_directory, 'error')
        self.log_dir = os.path.join(self.data_directory, 'logs')
        
        # Create directories
        for directory in [self.advisories_dir, self.error_dir, self.log_dir]:
            os.makedirs(directory, exist_ok=True)
        
        # Set up logger
        self.setup_logger()
        
        # Log initialization
        self.logger.info(f"WeatherDataProcessor initialized for directory: {data_directory}")
        self.logger.info("Directory structure created successfully")
        
        # Initialize processing statistics
        self.stats = {
            'total_files': 0,
            'advisory_files': 0,
            'error_files': 0,
            'area_missing_files': 0,
            'valid_files': 0
        }
        
        # Define county mapping
        self.county_info = {
            'EI01': 'Carlow', 'EI02': 'Cavan', 'EI03': 'Clare', 'EI04': 'Cork',
            'EI32': 'Cork City', 'EI06': 'Donegal', 'EI33': 'Dublin City',
            'EI34': 'Dún Laoghaire-Rathdown', 'EI35': 'Fingal', 'EI10': 'Galway',
            'EI36': 'Galway City', 'EI11': 'Kerry', 'EI12': 'Kildare',
            'EI13': 'Kilkenny', 'EI15': 'Laois', 'EI14': 'Leitrim',
            'EI42': 'Limerick', 'EI37': 'Limerick City', 'EI18': 'Longford',
            'EI19': 'Louth', 'EI20': 'Mayo', 'EI21': 'Meath', 'EI22': 'Monaghan',
            'EI23': 'Offaly', 'EI24': 'Roscommon', 'EI25': 'Sligo',
            'EI39': 'South Dublin', 'EI43': 'Tipperary', 'EI44': 'Waterford',
            'EI29': 'Westmeath', 'EI30': 'Wexford', 'EI31': 'Wicklow'
        }

    def setup_logger(self):
        """Configure logging to both file and console"""
        log_file = os.path.join(self.log_dir, f'weather_processing_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log')
        
        # Create a logger
        self.logger = logging.getLogger('WeatherProcessor')
        self.logger.setLevel(logging.INFO)
        
        # Remove any existing handlers
        if self.logger.handlers:
            self.logger.handlers.clear()
        
        # handlers
        file_handler = logging.FileHandler(log_file)
        console_handler = logging.StreamHandler()
        
        # formatting style that works in Jupyter
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        console_handler.setFormatter(formatter)
        
        # Add handlers to logger
        self.logger.addHandler(file_handler)
        self.logger.addHandler(console_handler)

    def get_element_text(self, element, path, namespace):
        """Safely extract text from an XML element."""
        found = element.find(path, namespace)
        return found.text if found is not None else None

    def check_advisory(self, info, namespace):
        """Check if the weather event is an advisory (type 22)."""
        parameters = info.findall('cap:parameter', namespace)
        for param in parameters:
            if self.get_element_text(param, 'cap:valueName', namespace) == 'awareness_type':
                awareness_type = self.get_element_text(param, 'cap:value', namespace)
                if awareness_type and '22' in awareness_type:
                    return True, awareness_type
        return False, None

    def move_file_to_directory(self, file_path, target_dir):
        """Move a file to the specified directory and return the new path."""
        filename = os.path.basename(file_path)
        target_path = os.path.join(target_dir, filename)
        shutil.move(file_path, target_path)
        return target_path

    def process_single_file(self, file_path):
        """Process a single XML file and categorize it appropriately."""
        try:
            with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                xml_content = file.read()
            
            root = ET.fromstring(xml_content)
            namespace = {'cap': 'urn:oasis:names:tc:emergency:cap:1.2'}
            
            info = root.find('cap:info', namespace)
            if info is None:
                raise ValueError("Missing info element")

            # Check if it's an advisory
            is_advisory, awareness_type = self.check_advisory(info, namespace)
            if is_advisory:
                self.stats['advisory_files'] += 1
                new_path = self.move_file_to_directory(file_path, self.advisories_dir)
                return 'advisory', new_path, awareness_type

            # Check area description
            area = info.find('cap:area', namespace)
            if area is None or not area.findall('cap:geocode', namespace):
                self.stats['area_missing_files'] += 1
                new_path = self.move_file_to_directory(file_path, self.error_dir)
                return 'error', new_path, "Missing area description"

            # File is valid
            self.stats['valid_files'] += 1
            return 'valid', file_path, None

        except Exception as e:
            self.stats['error_files'] += 1
            new_path = self.move_file_to_directory(file_path, self.error_dir)
            return 'error', new_path, str(e)

    def severity_to_color(self, severity):
        """Map severity levels to warning colors."""
        mapping = {
            'Extreme': 'Red',
            'Severe': 'Orange',
            'Moderate': 'Yellow'
        }
        return mapping.get(severity, 'notmapped')

    def parse_valid_file(self, file_path):
        """Parse a valid weather warning XML file."""
        try:
            with codecs.open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                xml_content = file.read()
            
            root = ET.fromstring(xml_content)
            namespace = {'cap': 'urn:oasis:names:tc:emergency:cap:1.2'}
            info = root.find('cap:info', namespace)
            area = info.find('cap:area', namespace)
            
            row = {
                'Issue Time': self.get_element_text(root, 'cap:sent', namespace),
                'Valid From': (self.get_element_text(info, 'cap:onset', namespace) or 
                             self.get_element_text(info, 'cap:effective', namespace)),
                'Valid To': self.get_element_text(info, 'cap:expires', namespace),
                'Warning Element': self.get_element_text(info, 'cap:event', namespace),
                'Warning Text': self.get_element_text(info, 'cap:description', namespace),
                'WhereToText': self.get_element_text(area, 'cap:areaDesc', namespace),
                'Warning Colour': self.severity_to_color(
                    self.get_element_text(info, 'cap:severity', namespace)
                )
            }
            
            # Initialize all counties to 0
            for county_name in self.county_info.values():
                row[county_name] = 0
                
            # Set affected counties to 1
            geocodes = area.findall('cap:geocode', namespace)
            for gc in geocodes:
                if self.get_element_text(gc, 'cap:valueName', namespace) == 'FIPS':
                    county_code = self.get_element_text(gc, 'cap:value', namespace)
                    if county_code in self.county_info:
                        row[self.county_info[county_code]] = 1
            
            return row
            
        except Exception as e:
            self.logger.error(f"Error parsing valid file {file_path}: {str(e)}")
            return None

    def process_files(self):
        """Process all XML files in the data directory."""
        file_pattern = os.path.join(self.data_directory, '*.xml')
        file_list = glob.glob(file_pattern)
        self.stats['total_files'] = len(file_list)
        
        advisory_data = []
        error_data = []
        valid_data = []

        self.logger.info(f"Starting to process {len(file_list)} files")
        
        # Using tqdm.notebook for progress bar
        for file_path in tqdm(file_list, desc="Processing files"):
            category, new_path, additional_info = self.process_single_file(file_path)
            
            if category == 'advisory':
                advisory_data.append({
                    'filename': os.path.basename(new_path),
                    'awareness_type': additional_info
                })
            elif category == 'error':
                error_data.append({
                    'filename': os.path.basename(new_path),
                    'error_message': additional_info
                })
            elif category == 'valid':
                row = self.parse_valid_file(new_path)
                if row:
                    valid_data.append(row)

        # Save results
        self.save_results(advisory_data, error_data, valid_data)
        self.save_statistics()
        
        return pd.DataFrame(valid_data)

    def save_results(self, advisory_data, error_data, valid_data):
        """Save processing results to files."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save advisory files list
        if advisory_data:
            advisory_df = pd.DataFrame(advisory_data)
            advisory_path = os.path.join(self.advisories_dir, f'advisory_files_{timestamp}.xlsx')
            advisory_df.to_excel(advisory_path, index=False)
            self.logger.info(f"Saved {len(advisory_data)} advisory files to {advisory_path}")
        
        # Save error files list
        if error_data:
            error_df = pd.DataFrame(error_data)
            error_path = os.path.join(self.error_dir, f'error_files_{timestamp}.xlsx')
            error_df.to_excel(error_path, index=False)
            self.logger.info(f"Saved {len(error_data)} error files to {error_path}")
        
        # Save valid data
        if valid_data:
            df_valid = pd.DataFrame(valid_data)
            # Convert date fields to datetime
            date_columns = ['Issue Time', 'Valid From', 'Valid To']
            for col in date_columns:
                if col in df_valid.columns:
                    df_valid[col] = pd.to_datetime(df_valid[col], utc=True, errors='coerce')
            
            valid_path = os.path.join(self.data_directory, f'weather_warnings_{timestamp}.csv')
            df_valid.to_csv(valid_path, index=False)
            self.logger.info(f"Saved {len(valid_data)} valid weather warnings to {valid_path}")

    def save_statistics(self):
        """Save processing statistics to a file and display them."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        stats_file = os.path.join(self.log_dir, f'processing_stats_{timestamp}.txt')
        
        stats_text = [
            "Weather Data Processing Statistics",
            "=================================",
            f"Processing completed at: {datetime.now()}",
            "",
            f"Total files processed: {self.stats['total_files']}",
            f"Advisory files found: {self.stats['advisory_files']}",
            f"Files with missing area description: {self.stats['area_missing_files']}",
            f"Files with XML parsing errors: {self.stats['error_files']}",
            f"Valid files processed: {self.stats['valid_files']}"
        ]
        
        # Save to file
        with open(stats_file, 'w') as f:
            f.write('\n'.join(stats_text))
        
        # Log statistics
        self.logger.info("Processing Statistics:")
        for stat in stats_text[4:]:  # Skip the header lines
            self.logger.info(stat)


In [9]:
# initialize the processor:
data_directory = "/home/paulharford/college/project/weather_warnings/archive_warnings/archive"
processor = WeatherDataProcessor(data_directory)

2025-04-28 09:23:33,866 - INFO - Directory structure created successfully


In [10]:
df_xml = processor.process_files()

2025-04-28 09:23:33,933 - INFO - Starting to process 8037 files


Processing files:   0%|          | 0/8037 [00:00<?, ?it/s]

2025-04-28 09:23:35,425 - INFO - Processing Statistics:
2025-04-28 09:23:35,426 - INFO - Total files processed: 8037
2025-04-28 09:23:35,426 - INFO - Advisory files found: 0
2025-04-28 09:23:35,426 - INFO - Files with missing area description: 0
2025-04-28 09:23:35,426 - INFO - Files with XML parsing errors: 0
2025-04-28 09:23:35,426 - INFO - Valid files processed: 8037


In [11]:
print("\nProcessed Data Summary:")
print("-----------------------")
print(f"Total rows: {len(df_xml)}")
print("\nWarning Elements distribution:")
print(df_xml['Warning Element'].value_counts())
print("\nWarning Colors distribution:")
print(df_xml['Warning Colour'].value_counts())


Processed Data Summary:
-----------------------
Total rows: 8037

Yellow Wind                           641
Yellow Rain                           593
Yellow Thunderstorm                   205
Orange Wind                           130
Yellow Snow/Ice                       123
Yellow Low Temperature/Ice            107
Orange Rain                            61
Yellow fog                             50
Red Wind                               39
Yellow High Temperature                35
Orange Thunderstorm                    24
Orange Low Temperature/Ice             24
Orange Snow/Ice                        22
Orange Fog                              3
Orange High Temperature                 3
Rain                                    1
Name: count, dtype: int64

Yellow       6349
Orange       1388
Red           284
notmapped      16
Name: count, dtype: int64


In [12]:
# check the first few rows
print(df_xml.head())

# look at dataframe info
print(df_xml.info())

# check the df shape
print(f"Number of rows: {df_xml.shape[0]}")
print(f"Number of columns: {df_xml.shape[1]}")

print("XML Filtered Date Range:")
print(f"Start date: {df_xml['Issue Time'].min()}")
print(f"End date: {df_xml['Issue Time'].max()}")

                  Issue Time                 Valid From  \
0  2018-03-29T19:46:16+00:00  2018-03-22T23:01:01+00:00   
1  2018-03-29T19:50:05+00:00  2018-03-29T19:00:01+00:00   
2  2018-03-29T21:56:16+00:00  2018-03-29T19:00:01+00:00   
3  2018-03-29T23:10:21+00:00  2018-03-29T19:00:01+00:00   
4  2018-03-30T04:15:33+00:00  2018-03-29T19:00:01+00:00   


0  Heavy showery rain continuing overnight with s...     Ireland   
1  Heavy showery rain overnight with some wintry ...     Ireland   
2  Heavy showery rain overnight with some wintry ...     Ireland   
3  Heavy showery rain overnight with some wintry ...     Ireland   
4  Heavy showery rain overnight with some wintry ...     Ireland   

0         Yellow       1      1      1  ...         1       1          1   
1         Yellow       1      1      1  ...         1       1          1   
2         Yellow       1      1      1  ...         1       1          1   
3         Yellow       1      1      1  ...         1       1          1   

### Step 4: Cleaning and processing data into a common format for combination

### Step 4.1 check for provinces 

In [15]:
##list all columns (check does xml have provinces)
df_xml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8037 entries, 0 to 8036
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Issue Time              8037 non-null   object
 1   Valid From              8037 non-null   object
 2   Valid To                8036 non-null   object
 5   WhereToText             8037 non-null   object
 7   Carlow                  8037 non-null   int64 
 8   Cavan                   8037 non-null   int64 
 9   Clare                   8037 non-null   int64 
 10  Cork                    8037 non-null   int64 
 11  Cork City               8037 non-null   int64 
 12  Donegal                 8037 non-null   int64 
 13  Dublin City             8037 non-null   int64 
 14  Dún Laoghaire-Rathdown  8037 non-null   int64 
 15  Fingal                  8037 non-null   int64 
 16  Galway                  8037 non-null   int64 
 17  Galway City             8037 non-null   int64 
 18  Kerr

##### data has no provinces or specifc ireland columns

### Step 4.2 check for missing values and duplicates

In [18]:
total_missing = df_xml.isnull().sum().sum()
percent_missing = (total_missing / df_xml.size) * 100
print(f"Total missing values: {total_missing}")
print(f"Percentage of missing values: {percent_missing:.2f}%")


Total missing values: 1
Percentage of missing values: 0.00%


In [19]:
# All rows with any missing values
rows_with_missing = df_xml[df_xml.isnull().any(axis=1)]
rows_with_missing.head()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Element,Warning Text,WhereToText,Warning Colour,Carlow,Cavan,Clare,...,Monaghan,Offaly,Roscommon,Sligo,South Dublin,Tipperary,Waterford,Westmeath,Wexford,Wicklow
212,2017-10-16T11:44:33+00:00,2017-10-16T11:44:33+00:00,,Severe Wind warning,Cyclonic variable Storm force 10 to Hurricane ...,Ireland,Orange,1,1,1,...,1,1,1,1,0,0,0,1,1,1


In [20]:
##look for duplicates
duplicate_rows = df_xml.duplicated()
duplicate_rows.sum()
df_xml[duplicate_rows]

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Element,Warning Text,WhereToText,Warning Colour,Carlow,Cavan,Clare,...,Monaghan,Offaly,Roscommon,Sligo,South Dublin,Tipperary,Waterford,Westmeath,Wexford,Wicklow
163,2017-10-12T14:10:06+00:00,2017-10-12T13:00:00+01:00,2017-10-17T05:00:00+01:00,Moderate Advisory warning,A combination of a vigorous Atlantic weather s...,Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,0,1,1,1
170,2017-10-13T08:44:24+00:00,2017-10-16T05:00:00+01:00,2017-10-17T05:00:00+01:00,Moderate Advisory warning,"On Monday, an Atlantic storm from the remnants...",Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,0,1,1,1
405,2017-12-28T10:16:26+00:00,2017-12-28T15:00:01+00:00,2017-12-29T03:00:01+00:00,Moderate Snow-ice warning,Frost and icy conditions will develop again th...,"Leinster, Cavan, Monaghan and Donegal",Yellow,1,1,0,...,1,1,0,0,0,0,0,1,1,1
787,2018-02-09T06:03:39+00:00,2018-02-09T05:00:01+00:00,2018-02-09T12:00:01+00:00,Moderate Snow-ice warning,"Update\nThis morning, scattered snow showers m...",Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,0,1,1,1
1190,2018-02-28T20:53:25+00:00,2018-02-28T20:00:01+00:00,2018-03-01T12:00:01+00:00,Extreme Snow-ice warning,Update\nFurther disruptive heavy snow showers ...,"Dublin, Kildare, Louth, Wexford, Wicklow, Meat...",Red,0,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5777,2021-02-08T17:19:36+00:00,2021-02-08T18:00:01+00:00,2021-02-09T18:00:01+00:00,Moderate Snow-ice warning,Snow accumulations of 2 to 5 cm in places.,"Dublin, Kildare, Louth, Wicklow, Meath and Mon...",Yellow,0,0,0,...,1,0,0,0,0,0,0,0,0,1
5779,2021-02-08T23:16:58+00:00,2021-02-08T18:00:01+00:00,2021-02-09T18:00:01+00:00,Moderate Snow-ice warning,Snow accumulations of 2 to 5 cm in places.,"Dublin, Kildare, Louth, Wicklow, Meath and Mon...",Yellow,0,0,0,...,1,0,0,0,0,0,0,0,0,1
5781,2021-02-08T23:18:07+00:00,2021-02-08T18:00:01+00:00,2021-02-09T18:00:01+00:00,Moderate Snow-ice warning,Snow accumulations of 2 to 5 cm in places.,"Dublin, Kildare, Louth, Wicklow, Meath and Mon...",Yellow,0,0,0,...,1,0,0,0,0,0,0,0,0,1
5783,2021-02-09T05:21:27+00:00,2021-02-08T18:00:01+00:00,2021-02-09T18:00:01+00:00,Moderate Snow-ice warning,Snow accumulations of 2 to 5 cm in places.,"Dublin, Kildare, Louth, Wicklow, Meath and Mon...",Yellow,0,0,0,...,1,0,0,0,0,0,0,0,0,1


### Step 4.3:  Aggregate duplicate events into individual per county events

In [22]:
#### Duplictes could be due to the expanded list of location compared to the ods data and additionally multiple notificatiosn of the same event 
#### consolidate the data into single events 
### Also as xml is a feed from Met Eireann there are a lot of duplicates events most likely just multiple notificatiosn or updates in some cases. 
### so if an event has exacl;ty teh same information in these variables i class it as a duplictze and remove from the datsest
### variables for the duplicate checks:
###'Valid From'
###'Valid To'
###'Warning Colour'
###'Warning Element'
###'WhereToText'
###'Warning Text'
###
###
def identify_and_aggregate_unique_events(df):
    df_processed = df.copy()
    
    try:
        # full timestamp instead of just date
        for column in ['Valid From', 'Valid To']:
            # Using timestamp conversion instead of date
            df_processed[f'{column}_timestamp'] = pd.to_datetime(
                df_processed[column], 
                utc=True,
                errors='coerce'
            )
            
            # Format with time information included
            df_processed[f'{column}_formatted'] = df_processed[f'{column}_timestamp'].apply(
                lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else 'unknown'
            )
            
            # Clean up intermediate column
            df_processed = df_processed.drop(columns=[f'{column}_timestamp'])
    
    except Exception as e:
        print(f"\nError processing timestamps for column: {column}")
        print(f"Error details: {str(e)}")
        raise
    
    # CHANGED: Using formatted timestamps in event identifier
    df_processed['event_id'] = (
        df_processed['Valid From_formatted'] + '_' +  # includes time
        df_processed['Valid To_formatted'] + '_' +    # includes time
        df_processed['Warning Colour'].fillna('unknown').astype(str) + '_' +
        df_processed['Warning Element'].fillna('unknown').astype(str) + '_' +
        df_processed['WhereToText'].fillna('unknown').astype(str) + '_' +
        df_processed['Warning Text'].fillna('unknown').astype(str) 
    )
    
    def aggregate_regions_row(row):
        row['Dublin'] = int(
            row['Dublin City'] | 
            row['Dún Laoghaire-Rathdown'] | 
            row['South Dublin'] | 
            row['Fingal']
        )
        row['Limerick'] = int(row['Limerick'] | row['Limerick City'])
        row['Cork'] = int(row['Cork'] | row['Cork City'])
        row['Galway'] = int(row['Galway'] | row['Galway City'])
        return row
    
    df_processed = df_processed.apply(aggregate_regions_row, axis=1)
    
    columns_to_drop = [
        'Dublin City', 'Dún Laoghaire-Rathdown', 'South Dublin', 'Fingal',
        'Limerick City', 'Cork City', 'Galway City',
        'Valid From_formatted', 'Valid To_formatted'  # Changed from *date to *formatted
    ]
    
    df_processed = df_processed.drop(columns=columns_to_drop)
    
    # Before dropping duplicates, identify and count them using expanded criteria
    duplicate_mask = df_processed.duplicated(
        subset=['Valid From', 'Valid To', 'Warning Colour', 'Warning Element', 'WhereToText', 'Warning Text'], 
        keep=False
    )
    duplicates = df_processed[duplicate_mask].copy()
    
    # Add a count of duplicates
    if not duplicates.empty:
        duplicate_counts = duplicates.groupby(
            ['Valid From', 'Valid To', 'Warning Colour', 'Warning Element', 'WhereToText', 'Warning Text']
        ).size().reset_index(name='duplicate_count')
        print(f"Found {len(duplicate_counts)} events with duplicates")
        print(f"Total duplicate entries: {len(duplicates)}")
        
        # Optional: display the first few duplicates
        if len(duplicate_counts) > 0:
            print("\nSample of duplicate events:")
            print(duplicate_counts.head())
    else:
        print("No duplicates found based on criteria")
    
    # Continue with deduplication as before
    df_unique = df_processed.drop_duplicates(subset=['event_id'])
    df_unique = df_unique.drop(columns=['event_id'])
    
    return df_unique

def verify_aggregation(original_df, aggregated_df):
    print(f"Original number of rows: {len(original_df)}")
    print(f"Aggregated number of rows: {len(aggregated_df)}")
    
    temp_df = original_df.copy()
    
    try:
        # use full timestamps
        for column in ['Valid From', 'Valid To']:
            temp_datetime = pd.to_datetime(temp_df[column], utc=True, errors='coerce')
            #Using timestamp instead of just date
            temp_df[f'{column}_formatted'] = temp_datetime.apply(
                lambda x: x.strftime('%Y-%m-%d %H:%M:%S') if pd.notnull(x) else 'unknown'
            )
        
        # Use formatted timestamps in groupby
        original_events = temp_df.groupby([
            'Valid From_formatted',
            'Valid To_formatted',
            'Warning Colour',
            'Warning Element',
            #'WhereToText',
            'Warning Text'
        ]).size().reset_index(name='count')
        
        print(f"Number of unique events (including time): {len(original_events)}")
        print("\nSample of unique events with timestamps:")
        print(original_events.head())
        
    except Exception as e:
        print(f"Error during verification: {str(e)}")
        raise

In [23]:
# For each dataset
df_xml_unique = identify_and_aggregate_unique_events(df_xml)

# Verify aggregations
verify_aggregation(df_xml, df_xml_unique)




Found 1003 events with duplicates
Total duplicate entries: 6866

Sample of duplicate events:
0  2017-09-25T04:00:01+01:00  2017-09-25T11:00:01+01:00         Orange   
1  2017-09-26T20:00:01+01:00  2017-09-27T20:00:01+01:00         Yellow   
2  2017-09-26T20:00:01+01:00  2017-09-27T20:00:01+01:00         Yellow   
3  2017-09-27T08:00:01+01:00  2017-09-27T21:00:01+01:00         Yellow   
4  2017-09-28T13:00:01+01:00  2017-09-29T13:00:01+01:00         Yellow   


                                         WhereToText  \
0                                            Ireland   
1  Connacht,Kilkenny,Donegal,Clare,Limerick,Tippe...   
2                                         Cork,Kerry   
3                                            Ireland   
4                    Donegal,Galway,Mayo,Clare,Kerry   

0      Widespread fog this morning, dense in places.                2  
1  Rainfall accumulations between 30mm and 50mm e...                7  
2  Rainfall accumulations between 30mm and 50mm e...  

In [24]:
####check for duplicates again after processing 
df_final = df_xml_unique.duplicated()
duplicate_rows.sum()
df_final[duplicate_rows]

Series([], dtype: bool)

### Step 4.4: convert true/false to 0/1

In [26]:
# Identify all boolean columns in the DataFrame
bool_cols = df_xml_unique.select_dtypes(include=['bool']).columns

# Convert boolean columns to integers (True -> 1, False -> 0)
df_xml_unique[bool_cols] = df_xml_unique[bool_cols].astype(int)

# Verify the changes by displaying data types
print("Data types after conversion:")
print(df_xml_unique.dtypes)

Data types after conversion:
Issue Time         object
Valid From         object
Valid To           object
WhereToText        object
Carlow              int64
Cavan               int64
Clare               int64
Cork                int64
Donegal             int64
Galway              int64
Kerry               int64
Kildare             int64
Kilkenny            int64
Laois               int64
Leitrim             int64
Limerick            int64
Longford            int64
Louth               int64
Mayo                int64
Meath               int64
Monaghan            int64
Offaly              int64
Roscommon           int64
Sligo               int64
Tipperary           int64
Waterford           int64
Westmeath           int64
Wexford             int64
Wicklow             int64
Dublin              int64
dtype: object


In [27]:
df_final = df_xml_unique.copy()

### Step 4.5: confirm date/time settings, check date ranges

In [29]:
##make sure datetime is the same in both dataframes
# Ensure datetime columns are consistently UTC
datetime_cols = ['Issue Time', 'Valid From', 'Valid To']
df_final[datetime_cols] = df_final[datetime_cols].apply(pd.to_datetime, utc=True)

### Step 4.6: filter data for dates of interest 2013 to 2020 (full years of data)

In [31]:
print("XML Filtered Date Range:")
print(f"Start date: {df_final['Issue Time'].min()}")
print(f"End date: {df_final['Issue Time'].max()}")

XML Filtered Date Range:
Start date: 2017-09-25 04:34:48+00:00
End date: 2023-08-05 11:33:52+00:00


In [32]:
### Filter XML data from 2018 to end of 2022
### Although we have data in the ODS format from 2017 to 2020 using the years from both datasets 2018,2019,2020 can help validate my processing steps  
df_xml_filtered = df_final[
    (df_final['Issue Time'] >= '2018-01-01') & 
    (df_final['Issue Time'] <= '2023-08-04 23:59:59')
]

In [33]:
####quick final check 
df_final.head(10)

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Element,Warning Text,WhereToText,Warning Colour,Carlow,Cavan,Clare,...,Monaghan,Offaly,Roscommon,Sligo,Tipperary,Waterford,Westmeath,Wexford,Wicklow,Dublin
0,2018-03-29 19:46:16+00:00,2018-03-22 23:01:01+00:00,2018-03-23 09:00:01+00:00,Moderate Hail warning,Heavy showery rain continuing overnight with s...,Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,1,1,1,0
1,2018-03-29 19:50:05+00:00,2018-03-29 19:00:01+00:00,2018-03-30 09:00:01+00:00,Moderate Hail warning,Heavy showery rain overnight with some wintry ...,Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,1,1,1,0
3,2018-03-29 23:10:21+00:00,2018-03-29 19:00:01+00:00,2018-03-30 09:00:01+00:00,Moderate Snow-ice warning,Heavy showery rain overnight with some wintry ...,Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,1,1,1,0
5,2018-03-31 17:10:37+00:00,2018-04-01 17:00:01+00:00,2018-04-02 11:00:01+00:00,Moderate Rainfall warning,"Heavy rain, especially near the coast, with to...","Wexford, Wicklow, Cork and Waterford",Yellow,0,0,0,...,0,0,0,0,0,0,0,1,1,0
15,2018-04-04 12:10:08+00:00,2018-04-05 13:00:01+00:00,2018-04-06 13:00:01+00:00,Moderate Rainfall warning,In the twenty four hour period from Thursday a...,"Carlow, Kilkenny, Wexford, Cork, Kerry, Limeri...",Yellow,1,0,0,...,0,0,0,0,0,0,0,1,0,0
19,2018-04-04 12:58:25+00:00,2018-04-04 20:00:01+00:00,2018-04-05 08:00:01+00:00,Moderate Low-Temperature warning,"Tonight, temperatures will fall to -3 or -4 de...",Ireland,Yellow,1,1,1,...,1,1,1,1,0,0,1,1,1,0
21,2018-04-04 16:27:37+00:00,2018-04-05 17:00:01+00:00,2018-04-06 14:00:01+00:00,Moderate Rainfall warning,30 to 50 mm of rain expected from Thursday eve...,"Carlow, Kilkenny, Wexford, Cork, Kerry, Limeri...",Yellow,1,0,0,...,0,0,0,0,0,0,0,1,0,0
37,2018-04-05 14:51:09+00:00,2018-04-05 17:00:01+00:00,2018-04-06 14:00:01+00:00,Moderate Rainfall warning,Update on previous warning.\n30 to 50 mm of ra...,"Cork, Kerry and Waterford",Yellow,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38,2018-04-05 14:53:15+00:00,2018-04-05 17:00:01+00:00,2018-04-06 14:00:01+00:00,Moderate Rainfall warning,Update on previous warning.\n25 to 30mm of rai...,"Kilkenny, Wexford, Clare, Limerick and Tipperary",Yellow,0,0,1,...,0,0,0,0,0,0,0,1,0,0
60,2018-04-06 05:57:24+00:00,2018-04-06 09:00:01+00:00,2018-04-06 17:00:01+00:00,Moderate Wind warning,"From late morning through this afternoon, sout...","Galway, Mayo, Clare and Kerry",Yellow,0,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Step 5 save filtered data to file 

In [35]:
df_final.to_csv('/home/paulharford/college/project/project_data/met_eireann/WEATHERED_xml_warnings_2018_2023_08_v1.csv', index=False)