In [1]:
## Look at the .ods file to see whether the info is better populated

import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime, timezone
import glob
import os
from tqdm import tqdm
import codecs
import csv

import pandas as pd

# Path to the archived weather-warnings spreadsheet (OpenDocument .ods)
file_path = 'Archived_Wx_Warnings_25April2012_17February2021.ods'

# Read the .ods file; the three timestamp columns are parsed as datetimes so
# that the `.dt` accessor can be used on them further down.
df_new = pd.read_excel(file_path, engine='odf', parse_dates=['Issue Time', 'Valid From', 'Valid To'])

# check the first few rows
print(df_new.head())

# look at dataframe info
# NOTE: DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df_new.info()

# check the df shape
print(f"Number of rows: {df_new.shape[0]}")
print(f"Number of columns: {df_new.shape[1]}")

# List column names
print("\nColumn names:")
print(df_new.columns.tolist())

# look for any missing values (count of nulls per column)
print("\nMissing values:")
print(df_new.isnull().sum())

# Display summary statistics for numeric columns
print("\nSummary statistics:")
print(df_new.describe())

# Check unique values in 'Warning Colour' and 'Warning Element' columns
print("\nUnique Warning Colours:")
print(df_new['Warning Colour'].unique())
print("\nUnique Warning Elements:")
print(df_new['Warning Element'].unique())

# Temporal Analysis
# All year/month/weekday breakdowns below are based on 'Issue Time'.
print("\nTemporal Analysis:")

# Check the range of years
min_year = df_new['Issue Time'].dt.year.min()
max_year = df_new['Issue Time'].dt.year.max()
print(f"Data spans from year {min_year} to {max_year}")

# Number of warnings per year
warnings_per_year = df_new['Issue Time'].dt.year.value_counts().sort_index()
print("\nNumber of warnings per year:")
print(warnings_per_year)

# Check for any gaps in the data: a year inside the covered span that has no
# warnings at all does not appear in the value_counts index.
all_years = range(min_year, max_year + 1)
missing_years = [year for year in all_years if year not in warnings_per_year.index]
if missing_years:
    print(f"\nWarning: No data for years: {missing_years}")

# Analysis of months
warnings_per_month = df_new['Issue Time'].dt.month.value_counts().sort_index()
print("\nNumber of warnings per month:")
print(warnings_per_month)

# Analysis of weekdays
warnings_per_weekday = df_new['Issue Time'].dt.dayofweek.value_counts().sort_index()
print("\nNumber of warnings per day of week (0 = Monday, 6 = Sunday):")
print(warnings_per_weekday)

# Average duration of warnings
# Adds a 'Warning Duration' column to df_new: validity window length in hours.
df_new['Warning Duration'] = (df_new['Valid To'] - df_new['Valid From']).dt.total_seconds() / 3600  # in hours
print(f"\nAverage warning duration: {df_new['Warning Duration'].mean():.2f} hours")

# Distribution of warning durations
print("\nWarning duration distribution (in hours):")
print(df_new['Warning Duration'].describe())

# Check for any warnings with unusual durations (e.g., very short or very long)
short_warnings = df_new[df_new['Warning Duration'] < 1]
long_warnings = df_new[df_new['Warning Duration'] > 72]
# A negative duration means 'Valid To' is earlier than 'Valid From', i.e. a
# data problem rather than a genuinely short warning. Such rows are also part
# of short_warnings, so they are reported separately to make them visible.
negative_warnings = df_new[df_new['Warning Duration'] < 0]

if not short_warnings.empty:
    print(f"\nNumber of warnings shorter than 1 hour: {len(short_warnings)}")
if not long_warnings.empty:
    print(f"Number of warnings longer than 72 hours: {len(long_warnings)}")
if not negative_warnings.empty:
    print(f"Number of warnings with negative duration (Valid To before Valid From): {len(negative_warnings)}")

# Count yellow, orange, and red events per year
# Rows are issue years, columns are every value found in 'Warning Colour'.
warning_levels_per_year = df_new.groupby([df_new['Issue Time'].dt.year, 'Warning Colour']).size().unstack(fill_value=0)

# Ensure all years and colors are represented, even if count is zero.
# Reindexing onto the full year span inserts zero rows for years without any
# warnings and leaves the rows in ascending year order.
warning_levels_per_year = warning_levels_per_year.reindex(range(min_year, max_year + 1), fill_value=0)
for color in ['Yellow', 'Orange', 'Red']:
    if color not in warning_levels_per_year.columns:
        warning_levels_per_year[color] = 0

print("\nNumber of yellow, orange, and red events per year:")
print(warning_levels_per_year)

# Calculate percentages (share of each colour within a year).
# A year with no warnings at all has a row total of 0, so its percentages
# come out as NaN rather than 0.
warning_levels_percentage = warning_levels_per_year.div(warning_levels_per_year.sum(axis=1), axis=0) * 100

print("\nPercentage of yellow, orange, and red events per year:")
print(warning_levels_percentage.round(2))

# Total counts for each warning level
total_counts = warning_levels_per_year.sum()
print("\nTotal counts for each warning level:")
print(total_counts)

# Overall percentages
overall_percentages = (total_counts / total_counts.sum()) * 100
print("\nOverall percentages for each warning level:")
print(overall_percentages.round(2))

# Identify years with highest number of each warning level
max_yellow_year = warning_levels_per_year['Yellow'].idxmax()
max_orange_year = warning_levels_per_year['Orange'].idxmax()
max_red_year = warning_levels_per_year['Red'].idxmax()

print()
for color, peak_year in [('Yellow', max_yellow_year), ('Orange', max_orange_year), ('Red', max_red_year)]:
    peak_count = warning_levels_per_year.loc[peak_year, color]
    if peak_count > 0:
        print(f"Year with most {color} warnings: {peak_year} ({peak_count} warnings)")
    else:
        # idxmax() on an all-zero column just returns the first year, which
        # would be reported as the "year with most" warnings of that colour.
        print(f"No {color} warnings in the data")

0 2012-04-25 12:00:00 2012-04-25 12:00:00 2012-04-26 12:00:00         Yellow   
1 2012-06-01 21:00:00 2012-06-02 12:00:00 2012-06-03 21:00:00         Yellow   
2 2012-06-02 14:00:00 2012-06-02 14:00:00 2012-06-03 12:00:00         Orange   
3 2012-06-08 10:00:00 2012-06-08 10:00:00 2012-06-08 23:59:00         Yellow   
4 2012-06-14 20:00:00 2012-06-14 20:00:00 2012-06-16 12:00:00         Yellow   

0            Rain                               Munster and Leinster   
1            Rain                     Munster, Connacht and Leinster   
2            Rain                               Munster and Leinster   
3            Rain                              Connacht and Leinster   
4            Rain  Munster, Leinster, Connacht, Donegal, Monaghan...   

0  Heavy rain moving into Southern coastal counti...     True   True   True   
1  Between 25 and 65 mm of rain possible, (heavie...     True   True   True   
2  Between 25mm & 65mm of rain expected over Lein...     True   True   True   
3

In [2]:
# Preview the first 20 rows of df_new.
df_new.head(20)

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,WhereToText,Warning Text,Munster,Clare,Cork,...,Cavan,Donegal,Monaghan,Connacht,Galway,Leitrim,Mayo,Roscommon,Sligo,Warning Duration
0,2012-04-25 12:00:00,2012-04-25 12:00:00,2012-04-26 12:00:00,Yellow,Rain,Munster and Leinster,Heavy rain moving into Southern coastal counti...,True,True,True,...,False,False,False,False,False,False,False,False,False,24.0
1,2012-06-01 21:00:00,2012-06-02 12:00:00,2012-06-03 21:00:00,Yellow,Rain,"Munster, Connacht and Leinster","Between 25 and 65 mm of rain possible, (heavie...",True,True,True,...,False,False,False,True,True,True,True,True,True,33.0
2,2012-06-02 14:00:00,2012-06-02 14:00:00,2012-06-03 12:00:00,Orange,Rain,Munster and Leinster,Between 25mm & 65mm of rain expected over Lein...,True,True,True,...,False,False,False,False,False,False,False,False,False,22.0
3,2012-06-08 10:00:00,2012-06-08 10:00:00,2012-06-08 23:59:00,Yellow,Rain,Connacht and Leinster,Further persistent and sometimes heavy rain to...,False,False,False,...,False,False,False,True,True,True,True,True,True,13.983333
4,2012-06-14 20:00:00,2012-06-14 20:00:00,2012-06-16 12:00:00,Yellow,Rain,"Munster, Leinster, Connacht, Donegal, Monaghan...","Further spells of rain, persistant and heavy a...",True,True,True,...,True,True,True,True,True,True,True,True,True,40.0
5,2012-06-15 06:00:00,2012-06-15 06:00:00,2012-06-15 21:00:00,Yellow,Rain,"Munster, Leinster, Connacht, Donegal, Monaghan...","Further spells of heavy, showery rain may lead...",True,True,True,...,True,True,True,True,True,True,True,True,True,15.0
6,2012-07-04 09:00:00,2012-07-04 09:00:00,2012-07-04 21:00:00,Yellow,Rain,Munster and Connacht,"Heavy, thundery showers will affect Munster an...",True,True,True,...,False,False,False,True,True,True,True,True,True,12.0
7,2012-07-05 18:00:00,2012-07-06 12:00:00,2012-07-07 06:00:00,Yellow,Rain,"Leinster, Tipperary and Waterford",A spell of heavy and possibly thundery rain wi...,False,False,False,...,False,False,False,False,False,False,False,False,False,18.0
8,2012-08-09 00:00:00,2012-08-09 00:01:00,2012-08-09 09:00:00,Yellow,Fog (or freezing fog),"Munster, Connacht and Leinster",Dense fog and very poor visibilities overnight...,True,True,True,...,False,False,False,True,True,True,True,True,True,8.983333
9,2012-08-15 14:00:00,2012-08-15 14:00:00,2012-08-15 23:59:00,Orange,Rain,"Munster, Leinster, Connacht, Donegal, Monaghan...",Warning updateBlustery this afternoon and even...,True,True,True,...,True,True,True,True,True,True,True,True,True,9.983333


In [3]:
# Show the dtype of every column in df_new.
df_new.dtypes

Issue Time          datetime64[ns]
Valid From          datetime64[ns]
Valid To            datetime64[ns]
WhereToText                 object
Munster                       bool
Clare                         bool
Cork                          bool
Kerry                         bool
Limerick                      bool
Tipperary                     bool
Tipperary SR                  bool
Waterford                     bool
Leinster                      bool
Carlow                        bool
Dublin                        bool
Kildare                       bool
Kilkenny                      bool
Laois                         bool
Longford                      bool
Louth                         bool
Meath                         bool
Offaly                        bool
Westmeath                     bool
Wexford                       bool
Wicklow                       bool
Ulster                        bool
Cavan                         bool
Donegal                       bool
Monaghan            

In [4]:
# Print the per-column summary of df_new (non-null counts and dtypes).
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1654 entries, 0 to 1653
Data columns (total 39 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Issue Time        1654 non-null   datetime64[ns]
 1   Valid From        1654 non-null   datetime64[ns]
 2   Valid To          1654 non-null   datetime64[ns]
 5   WhereToText       1651 non-null   object        
 7   Munster           1654 non-null   bool          
 8   Clare             1654 non-null   bool          
 9   Cork              1654 non-null   bool          
 10  Kerry             1654 non-null   bool          
 11  Limerick          1654 non-null   bool          
 12  Tipperary         1654 non-null   bool          
 13  Tipperary SR      1654 non-null   bool          
 14  Waterford         1654 non-null   bool          
 15  Leinster          1654 non-null   bool          
 16  Carlow            1654 non-null   bool          
 17  Dublin            1654 n

In [5]:
# Fraction of missing values per column, sorted ascending.
# The mean of the boolean isna() mask is the null count divided by the row count.
df_new.isna().mean().sort_values()

Issue Time          0.000000
Longford            0.000000
Louth               0.000000
Meath               0.000000
Offaly              0.000000
Westmeath           0.000000
Wexford             0.000000
Wicklow             0.000000
Laois               0.000000
Ulster              0.000000
Donegal             0.000000
Monaghan            0.000000
Connacht            0.000000
Galway              0.000000
Leitrim             0.000000
Mayo                0.000000
Roscommon           0.000000
Cavan               0.000000
Sligo               0.000000
Kilkenny            0.000000
Dublin              0.000000
Valid From          0.000000
Valid To            0.000000
Munster             0.000000
Kildare             0.000000
Cork                0.000000
Clare               0.000000
Limerick            0.000000
Tipperary           0.000000
Tipperary SR        0.000000
Waterford           0.000000
Leinster            0.000000
Carlow              0.000000
Kerry               0.000000
WhereToText   

In [7]:
# Summary statistics of df_new.
# NOTE(review): the output below shows a minimum 'Warning Duration' of -1.0
# hours, so at least one row has 'Valid To' earlier than 'Valid From'.
df_new.describe()

Unnamed: 0,Issue Time,Valid From,Valid To,Warning Duration
count,1654,1654,1654,1654.0
mean,2017-05-23 21:25:35.203748608,2017-05-24 08:13:14.039903488,2017-05-25 00:24:03.443772672,16.18039
min,2012-04-25 12:00:00,2012-04-25 12:00:00,2012-04-26 12:00:00,-1.0
25%,2015-03-01 22:00:00,2015-03-01 22:00:00,2015-03-03 18:00:00,9.0
50%,2017-10-19 20:00:00,2017-10-20 23:00:30,2017-10-21 19:30:00,14.0
75%,2019-07-31 07:45:00,2019-08-01 00:15:00,2019-08-01 10:45:00,22.0
max,2021-02-17 09:00:00,2021-02-17 23:00:00,2021-02-18 05:00:00,157.983333
std,,,,11.127839


Distribution of headline categories for rows with empty param_awareness_type:
headline_category
Other       1434
Advisory      18
Hail           8

The headline is not a great source for filling in empty awareness_type values.
What does the "Advisory" category mean?
