In [8]:
import pickle
import pandas as pd
import numpy as np
import datetime

# Show floats with six decimal places in every DataFrame/Series display.
pd.set_option('display.float_format', '{:.6f}'.format)

In [9]:
from FormatEvents_Functions import *

In [10]:
# Base directory that every input pickle and output CSV path below is built from.
home_dir = '/nfs/a319/gy17m2a/PhD/'
# NOTE(review): home_dir2 is not referenced anywhere in the visible part of this
# notebook — confirm it is still needed.
home_dir2 = '/nfs/a161/gy17m2a/PhD/'

In [11]:
# Sampling durations, kept as strings: later cells compare str(duration) against
# the entries stored in the 'dur_for_which_this_is_amax' column.
durations = ['0.5', '1', '2', '3', '6', '12', '24']

### UKCP18 data

### Join together lists for different ensemble members

In [35]:
# Pool the event-property records of every present-climate ensemble member
# into one flat list, then turn that list into a DataFrame.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load pickles written by this project's own processing scripts.
events_props_dict_present = []
ems_present = ['bc005', 'bc006', 'bc007', 'bc009', 'bc010', 'bc011', 'bc012', 'bc013', 'bc015', 'bc016', 'bc017', 'bc018']
for em in ems_present:
    with open(home_dir +  f"ProcessedData/AMAX_Events/UKCP18_30mins/Present/event_props_dict_{em}.pickle", 'rb') as handle:
        one_events_props_dict_present = pickle.load(handle)
    # Extend in place: `a = a + b` re-copies the whole accumulated list on every pass.
    events_props_dict_present.extend(one_events_props_dict_present)

## Join into one dataframe
present = pd.DataFrame(events_props_dict_present)
# Tag the rows so they stay identifiable once combined with the future events.
present['Climate'] = 'Present'

In [14]:
# Pool the event-property records of every future-climate ensemble member
# into one flat list, then turn that list into a DataFrame.
# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load pickles written by this project's own processing scripts.
# NOTE(review): 11 members are listed here versus 12 in ems_present — confirm
# that no future ensemble member is missing from this list.
events_props_dict_future = []
ems_future = ['bb195', 'bb192', 'bb198', 'bb208', 'bb225','bb222', 'bb201', 'bb204', 'bb216', 'bb219', 'bb211']
for em in ems_future:
    with open(home_dir +  f"ProcessedData/AMAX_Events/UKCP18_30mins/Future/event_props_dict_{em}.pickle", 'rb') as handle:
        one_events_props_dict_future = pickle.load(handle)
    # Extend in place: `a = a + b` re-copies the whole accumulated list on every pass.
    events_props_dict_future.extend(one_events_props_dict_future)

## Join into one dataframe
future = pd.DataFrame(events_props_dict_future)
# Tag the rows so they stay identifiable once combined with the present events.
future['Climate'] = 'Future'

## Make a check on number of files (could shift this to the checking script)
NB - the method of searching on 'part1' doesn't work, because the filename only represents one of the files that is represented by that event


24529 is 19 * 1291 and is the number we expect with no part1s for one ensemble member.  
For 12 ems it becomes 24529 * 12 = 294348

In [34]:
# filtered_df = present[present['filename'].str.contains('part1')]

In [37]:
# Wrap any bare value in a one-element list so that every cell of the column
# is a list. This rewrites the column of `present` in place.
present['dur_for_which_this_is_amax'] = present['dur_for_which_this_is_amax'].apply(
    lambda x: [x] if not isinstance(x, list) else x
)

# Collect every entry of every list into one flat list ...
all_numbers = []
for amax_durations in present['dur_for_which_this_is_amax']:
    all_numbers.extend(amax_durations)

# ... and tally how many times each distinct entry occurs.
number_counts = pd.Series(all_numbers).value_counts()

# Show the result
print(number_counts)

24     325393
12     296965
6      294401
0.5    294348
1      294348
2      294348
3      294348
dtype: int64


### Create one dataframe containing both present and future

In [39]:
# Stack the present and future events into one long frame; the 'Climate'
# column set on each input tells them apart. ignore_index=True gives every row
# a unique label — without it the labels of `present` and `future` (each built
# from a plain list, so both start at 0) would repeat in the combined frame.
df_long = pd.concat([present, future], ignore_index=True)

# Add D variable (day of year) and date
# theta is rescaled so that a full 2*pi corresponds to 365.25 days.
df_long['D'] = (df_long['theta'] * 365.25) / (2 * np.pi)
# date_from_D and get_season are not defined in this notebook; presumably they
# are provided by the star import from FormatEvents_Functions.
df_long['date'] = df_long.apply(lambda row: date_from_D(row['D'], row['year']), axis=1)
df_long['season'] = df_long['date'].apply(get_season)

### Check the number of files for each duration
NB: Number of files for 24h duration is larger due to compound events  
Checked this by filtering only rows with part0

### Remove entries which are less than 1.5 hours

In [40]:
# Keep only the events whose 'duration' is at least 1.5, applying the same cut
# to the combined frame and to each per-climate frame.
df_long = df_long.loc[df_long['duration'].ge(1.5)]
present = present.loc[present['duration'].ge(1.5)]
future = future.loc[future['duration'].ge(1.5)]
# nan_rows = df_long[df_long['D50'].isna()]

In [41]:
# Write the combined present + future event table to disk (row labels omitted).
df_long.to_csv(
    home_dir + "ProcessedData/AMAX_Events/UKCP18_30mins/all_events_characteristics.csv",
    index=False,
)

### NIMROD data

In [42]:
# Load the pickled NIMROD event-property records.
nimrod_pickle_path = home_dir + "ProcessedData/AMAX_Events/NIMROD_30mins/event_props_dict_nimrod.pickle"
with open(nimrod_pickle_path, 'rb') as handle:
    events_props_dict_nimrod = pickle.load(handle)

In [43]:
# Build the NIMROD event table from the loaded records.
nimrod = pd.DataFrame(events_props_dict_nimrod)

# Add D variable (day of year) and date
# theta is rescaled so that a full 2*pi corresponds to 365.25 days.
nimrod['D'] = nimrod['theta'].mul(365.25).div(2 * np.pi)
nimrod['date'] = nimrod.apply(
    lambda row: date_from_D(row['D'], row['year']),
    axis=1,
)
nimrod['season'] = nimrod['date'].apply(get_season)

In [44]:
# Wrap any bare value in a one-element list so that every cell of the column
# is a list. This rewrites the column of `nimrod` in place.
nimrod['dur_for_which_this_is_amax'] = nimrod['dur_for_which_this_is_amax'].apply(
    lambda x: [x] if not isinstance(x, list) else x
)

# Collect every entry of every list into one flat list ...
all_numbers = []
for amax_durations in nimrod['dur_for_which_this_is_amax']:
    all_numbers.extend(amax_durations)

# ... and tally how many times each distinct entry occurs.
number_counts = pd.Series(all_numbers).value_counts()

# Show the result
print(number_counts)

24     19451
0.5    19335
12     14591
6      11438
2       8251
1       7675
3       6972
dtype: int64


In [None]:
# Write the NIMROD event table to disk (row labels omitted).
nimrod.to_csv(
    home_dir + "ProcessedData/AMAX_Events/NIMROD_30mins/all_events_characteristics.csv",
    index=False,
)

## Create datasets with just events for each sampling duration

In [None]:
# For each sampling duration, count the rows whose 'dur_for_which_this_is_amax'
# either is a list containing that duration label or equals the label itself.
for duration in durations:
    target = str(duration)
    is_amax_for_duration = df_long['dur_for_which_this_is_amax'].apply(
        lambda x: (isinstance(x, list) and target in x) or x == target
    )
    df_long_this_dur = df_long[is_amax_for_duration]
    print(duration, len(df_long_this_dur))

In [None]:
def _events_with_amax_for(events, sampling_duration):
    """Return the rows of `events` flagged for the given sampling duration.

    A row is kept when its 'dur_for_which_this_is_amax' value is a list that
    contains str(sampling_duration), or is itself equal to that string.
    """
    label = str(sampling_duration)
    is_match = events['dur_for_which_this_is_amax'].apply(
        lambda x: (isinstance(x, list) and label in x) or x == label)
    return events[is_match]


# One subset per sampling duration. The 0.5 h subset is included because later
# cells use df_long_05hr; it was never defined, so those cells raised NameError.
df_long_05hr = _events_with_amax_for(df_long, 0.5)

df_long_1hr = _events_with_amax_for(df_long, 1)

df_long_2hr = _events_with_amax_for(df_long, 2)

df_long_3hr = _events_with_amax_for(df_long, 3)

df_long_6hr = _events_with_amax_for(df_long, 6)

df_long_12hr = _events_with_amax_for(df_long, 12)

df_long_24hr = _events_with_amax_for(df_long, 24)

In [None]:
# Print the row count of each per-duration subset, longest duration first.
subsets_longest_first = [
    df_long_24hr,
    df_long_12hr,
    df_long_6hr,
    df_long_3hr,
    df_long_2hr,
    df_long_1hr,
    df_long_05hr,
]
for df in subsets_longest_first:
    print(len(df))

In [None]:
# Row counts of the present-climate and future-climate event tables, in that order.
for climate_events in (present, future):
    print(len(climate_events))

In [None]:
# Frequency table of the distinct 'duration' values in `present`, most frequent first.
# NOTE(review): the column label 'conts' looks like a typo for 'counts'; it is
# left unchanged here because it is a runtime label.
vals, counts = np.unique(present['duration'], return_counts=True)
my_df = pd.DataFrame(dict(values=vals, conts=counts))
my_df.sort_values('conts', ascending=False)

In [None]:
# Frequency table of the distinct 'duration' values in `future`, most frequent first.
# NOTE(review): the column label 'conts' looks like a typo for 'counts'; it is
# left unchanged here because it is a runtime label.
vals, counts = np.unique(future['duration'], return_counts=True)
my_df = pd.DataFrame(dict(values=vals, conts=counts))
my_df.sort_values('conts', ascending=False)

In [None]:
# Frequency table of the distinct 'D50' values in `future`, most frequent first.
# NOTE(review): the column label 'conts' looks like a typo for 'counts'; it is
# left unchanged here because it is a runtime label.
vals, counts = np.unique(future['D50'], return_counts=True)
my_df = pd.DataFrame(dict(values=vals, conts=counts))
my_df.sort_values('conts', ascending=False)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the 'D50' column of the present-climate events, in 25 bins.
# Title and axis labels are set so the figure can be read on its own.
plt.hist(present['D50'], bins=25)
plt.title('Distribution of D50 (present climate)')
plt.xlabel('D50')
plt.ylabel('Number of events')
plt.show()

In [None]:
# Per-(Climate, gauge) summary over all events, followed by the change values
# derived from it. Both helpers are defined outside this notebook; the third
# argument 'All' is passed through to find_change_values_in_groups_new as-is.
group_by_columns = ['Climate', 'gauge_num']
grouped_by_gauge_allevents = group_data_calc_means(
    df_long,
    group_by_columns,
)
grouped_by_gauge_allevents_changes = find_change_values_in_groups_new(
    grouped_by_gauge_allevents,
    group_by_columns,
    'All',
)

# grouped_by_gauge_allevents_changes.to_csv(home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_samplingdur_changes_allevents.csv", index=False)

In [None]:
# Per-(Climate, gauge, season) summary over all events, the change values
# derived from it, and the CSV export of those changes.
group_by_columns = ['Climate', 'gauge_num', 'season']
grouped_by_gauge_season_allevents = group_data_calc_means(
    df_long,
    group_by_columns,
)
grouped_by_gauge_season_allevents_changes = find_change_values_in_groups_new(
    grouped_by_gauge_season_allevents,
    group_by_columns,
    'All',
)

grouped_by_gauge_season_allevents_changes.to_csv(
    home_dir + "ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_samplingdur_season_changes_allevents.csv",
    index=False,
)

In [None]:
# Apply group_data_calc_means to each per-duration subset, grouping by
# Climate and gauge. Results are unpacked in the same order as the inputs.
group_by_columns = ['Climate', 'gauge_num']
(
    grouped_by_gauge_samplingdur_05,
    grouped_by_gauge_samplingdur_1,
    grouped_by_gauge_samplingdur_2,
    grouped_by_gauge_samplingdur_3,
    grouped_by_gauge_samplingdur_6,
    grouped_by_gauge_samplingdur_12,
    grouped_by_gauge_samplingdur_24,
) = (
    group_data_calc_means(events, group_by_columns)
    for events in (
        df_long_05hr,
        df_long_1hr,
        df_long_2hr,
        df_long_3hr,
        df_long_6hr,
        df_long_12hr,
        df_long_24hr,
    )
)

In [None]:
# Apply group_data_calc_means to each per-duration subset, grouping by
# Climate, gauge and season. Results are unpacked in the same order as the inputs.
group_by_columns = ['Climate', 'gauge_num', 'season']
(
    grouped_by_gauge_samplingdur_season_05,
    grouped_by_gauge_samplingdur_season_1,
    grouped_by_gauge_samplingdur_season_2,
    grouped_by_gauge_samplingdur_season_3,
    grouped_by_gauge_samplingdur_season_6,
    grouped_by_gauge_samplingdur_season_12,
    grouped_by_gauge_samplingdur_season_24,
) = (
    group_data_calc_means(events, group_by_columns)
    for events in (
        df_long_05hr,
        df_long_1hr,
        df_long_2hr,
        df_long_3hr,
        df_long_6hr,
        df_long_12hr,
        df_long_24hr,
    )
)

In [None]:
# Derive the change values for each per-duration (Climate, gauge) summary.
# Each summary is paired with the sampling-duration value that is passed as
# the third argument of find_change_values_in_groups_new.
group_by_columns = ['Climate', 'gauge_num']
(
    grouped_by_gauge_samplingdur_05_changes,
    grouped_by_gauge_samplingdur_1_changes,
    grouped_by_gauge_samplingdur_2_changes,
    grouped_by_gauge_samplingdur_3_changes,
    grouped_by_gauge_samplingdur_6_changes,
    grouped_by_gauge_samplingdur_12_changes,
    grouped_by_gauge_samplingdur_24_changes,
) = (
    find_change_values_in_groups_new(grouped, group_by_columns, sampling_duration)
    for grouped, sampling_duration in (
        (grouped_by_gauge_samplingdur_05, float(0.5)),
        (grouped_by_gauge_samplingdur_1, 1),
        (grouped_by_gauge_samplingdur_2, 2),
        (grouped_by_gauge_samplingdur_3, 3),
        (grouped_by_gauge_samplingdur_6, 6),
        (grouped_by_gauge_samplingdur_12, 12),
        (grouped_by_gauge_samplingdur_24, 24),
    )
)

In [None]:
# Derive the change values for each per-duration (Climate, gauge, season)
# summary. Each summary is paired with the sampling-duration value that is
# passed as the third argument of find_change_values_in_groups_new.
group_by_columns = ['Climate', 'gauge_num', 'season']
(
    grouped_by_gauge_samplingdur_season_05_changes,
    grouped_by_gauge_samplingdur_season_1_changes,
    grouped_by_gauge_samplingdur_season_2_changes,
    grouped_by_gauge_samplingdur_season_3_changes,
    grouped_by_gauge_samplingdur_season_6_changes,
    grouped_by_gauge_samplingdur_season_12_changes,
    grouped_by_gauge_samplingdur_season_24_changes,
) = (
    find_change_values_in_groups_new(grouped, group_by_columns, sampling_duration)
    for grouped, sampling_duration in (
        (grouped_by_gauge_samplingdur_season_05, float(0.5)),
        (grouped_by_gauge_samplingdur_season_1, 1),
        (grouped_by_gauge_samplingdur_season_2, 2),
        (grouped_by_gauge_samplingdur_season_3, 3),
        (grouped_by_gauge_samplingdur_season_6, 6),
        (grouped_by_gauge_samplingdur_season_12, 12),
        (grouped_by_gauge_samplingdur_season_24, 24),
    )
)

In [None]:
# Derive the change values for each per-duration (Climate, gauge) summary.
# NOTE(review): this cell recomputes the same *_changes variables as the
# (Climate, gauge) changes cell two cells above — confirm whether one of the
# two can be removed.
group_by_columns = ['Climate', 'gauge_num']
(
    grouped_by_gauge_samplingdur_05_changes,
    grouped_by_gauge_samplingdur_1_changes,
    grouped_by_gauge_samplingdur_2_changes,
    grouped_by_gauge_samplingdur_3_changes,
    grouped_by_gauge_samplingdur_6_changes,
    grouped_by_gauge_samplingdur_12_changes,
    grouped_by_gauge_samplingdur_24_changes,
) = (
    find_change_values_in_groups_new(grouped, group_by_columns, sampling_duration)
    for grouped, sampling_duration in (
        (grouped_by_gauge_samplingdur_05, float(0.5)),
        (grouped_by_gauge_samplingdur_1, 1),
        (grouped_by_gauge_samplingdur_2, 2),
        (grouped_by_gauge_samplingdur_3, 3),
        (grouped_by_gauge_samplingdur_6, 6),
        (grouped_by_gauge_samplingdur_12, 12),
        (grouped_by_gauge_samplingdur_24, 24),
    )
)

In [None]:
# Stack the per-duration (Climate, gauge) change tables, shortest duration first.
total = pd.concat(
    [
        grouped_by_gauge_samplingdur_05_changes,
        grouped_by_gauge_samplingdur_1_changes,
        grouped_by_gauge_samplingdur_2_changes,
        grouped_by_gauge_samplingdur_3_changes,
        grouped_by_gauge_samplingdur_6_changes,
        grouped_by_gauge_samplingdur_12_changes,
        grouped_by_gauge_samplingdur_24_changes,
    ]
)

In [None]:
# Stack the per-duration (Climate, gauge, season) change tables, shortest duration first.
total_season = pd.concat(
    [
        grouped_by_gauge_samplingdur_season_05_changes,
        grouped_by_gauge_samplingdur_season_1_changes,
        grouped_by_gauge_samplingdur_season_2_changes,
        grouped_by_gauge_samplingdur_season_3_changes,
        grouped_by_gauge_samplingdur_season_6_changes,
        grouped_by_gauge_samplingdur_season_12_changes,
        grouped_by_gauge_samplingdur_season_24_changes,
    ]
)

In [None]:
# group_by_columns = ['Climate', 'gauge_num', 'duration', 'season']
# grouped_by_gauge_samplingdur_season = group_data_calc_means(df_long, group_by_columns)
# grouped_by_gauge_samplingdur_season_changes = find_change_values_in_groups_new(grouped_by_gauge_samplingdur_season, group_by_columns)

# group_by_columns = ['Climate', 'gauge_num', 'duration']
# grouped_by_gauge_samplingdur = group_data_calc_means(df_long, group_by_columns)
# grouped_by_gauge_samplingdur_changes = find_change_values_in_groups_new(grouped_by_gauge_samplingdur, group_by_columns)

In [None]:
# grouped_by_gauge_samplingdur_season_changes.to_csv(home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_samplingdur_season_changes_new.csv", index=False)
# Write the stacked per-duration change tables to disk (row labels omitted).
total.to_csv(
    home_dir + "ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_samplingdur_changes_new.csv",
    index=False,
)

total_season.to_csv(
    home_dir + "ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_samplingdur_season_changes_new.csv",
    index=False,
)