In [1]:
import pickle
import pandas as pd
import numpy as np
import datetime
import json

# Show floats with six decimal places whenever pandas objects are displayed.
pd.set_option('display.float_format', '{:.6f}'.format)

In [2]:
from FormatEvents_Functions import *

In [3]:
# Base directories for project data. Every path built in the cells shown here
# is rooted at home_dir.
home_dir = '/nfs/a319/gy17m2a/PhD/'
# NOTE(review): home_dir2 is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
home_dir2 = '/nfs/a161/gy17m2a/PhD/'

In [4]:
# Durations processed one at a time by the per-duration loop further down.
# They are kept as strings: the loop compares str(duration) against the entries
# of 'dur_for_which_this_is_amax' and passes float(duration) to
# find_change_values_in_groups_new.
# presumably these are hours — TODO confirm
durations = ['0.5', '1', '2', '3', '6', '12', '24']

### UKCP18 data

### Join together lists for different ensemble members

In [5]:
# Gather the event-property records of every present-climate ensemble member
# into one flat list, then turn that list into a dataframe.
# NOTE: pickle.load can execute arbitrary code while unpickling; only read
# pickles that were produced by this project's own processing scripts.
events_props_dict_present = []
ems_present = ['bc005', 'bc006', 'bc007', 'bc009', 'bc010', 'bc011', 'bc012', 'bc013', 'bc015', 'bc016', 'bc017', 'bc018']
for em in ems_present:
    with open(home_dir +  f"ProcessedData/AMAX_Events/UKCP18_30mins/Present/event_props_dict_{em}.pickle", 'rb') as handle:
        one_events_props_dict_present = pickle.load(handle)
    # extend() appends in place; `a = a + b` rebuilt the whole accumulated list
    # for every ensemble member.
    events_props_dict_present.extend(one_events_props_dict_present)

## Join into one dataframe
present = pd.DataFrame(events_props_dict_present)
# Tag every row so present and future rows can be told apart after concatenation
present['Climate'] = 'Present'

In [6]:
# Gather the event-property records of every future-climate ensemble member
# into one flat list, then turn that list into a dataframe.
# NOTE: pickle.load can execute arbitrary code while unpickling; only read
# pickles that were produced by this project's own processing scripts.
events_props_dict_future = []
# ems_future = ['bb195', 'bb192', 'bb198', 'bb208', 'bb225','bb222', 'bb201', 'bb204', 'bb216', 'bb219', 'bb211']
ems_future = ['bb192', 'bb208', 'bb225','bb222', 'bb201', 'bb204', 'bb216', 'bb219', 'bb211', 'bb189'] #bb195, #bb198
for em in ems_future:
    with open(home_dir +  f"ProcessedData/AMAX_Events/UKCP18_30mins/Future/event_props_dict_{em}.pickle", 'rb') as handle:
        one_events_props_dict_future = pickle.load(handle)
    # extend() appends in place; `a = a + b` rebuilt the whole accumulated list
    # for every ensemble member.
    events_props_dict_future.extend(one_events_props_dict_future)

## Join into one dataframe
future = pd.DataFrame(events_props_dict_future)
# Tag every row so present and future rows can be told apart after concatenation
future['Climate'] = 'Future'

## Make a check on number of files (could shift this to the checking script)
NB - the method of searching on part1 doesn't work, because the filename only represents one of the files that is represented by that event


24529 is 19 * 1291 and is the number we expect with no part1s for one ensemble member.  
For 12 ems it becomes 24529 * 12 = 294348

In [7]:
# import matplotlib.pyplot as plt

# # Plot histograms for present and future D50
# plt.hist(future['D50'], bins=30, alpha=0.5, label='Present', color='blue')
# plt.hist(future['D50_new'], bins=30, alpha=0.5, label='Future', color='orange')

# # Add labels and title
# plt.xlabel('D50')
# plt.ylabel('Frequency')
# plt.title('Distribution of D50 - Present vs Future')
# plt.legend()

# # Show plot
# plt.show();

In [8]:
# filtered_df = present[present['filename'].str.contains('part1')]

In [9]:
# Normalise the column so that every entry is a list; a bare value becomes a
# one-element list.
present['dur_for_which_this_is_amax'] = present['dur_for_which_this_is_amax'].apply(
    lambda entry: entry if isinstance(entry, list) else [entry])

# Pool the entries of all the lists and count how often each value occurs
all_numbers = []
for dur_list in present['dur_for_which_this_is_amax']:
    all_numbers.extend(dur_list)
number_counts = pd.Series(all_numbers).value_counts()

# Display the counts
print(number_counts)

24     327523
12     297624
6      294422
0.5    294348
1      294348
2      294348
3      294348
dtype: int64


In [10]:
# Normalise the column so that every entry is a list; a bare value becomes a
# one-element list.
future['dur_for_which_this_is_amax'] = future['dur_for_which_this_is_amax'].apply(
    lambda entry: entry if isinstance(entry, list) else [entry])

# Pool the entries of all the lists and count how often each value occurs
all_numbers = []
for dur_list in future['dur_for_which_this_is_amax']:
    all_numbers.extend(dur_list)
number_counts = pd.Series(all_numbers).value_counts()

# Display the counts
print(number_counts)

24     270114
12     247512
6      245335
0.5    245290
1      245290
2      245290
3      245290
dtype: int64


### Create one dataframe containing both present and future

In [11]:
# Stack the present and future events, then derive the calendar columns in one
# chain. assign() evaluates its keyword arguments in order, so 'date' can use
# 'D' and 'season' can use 'date'.
df_long = pd.concat([present, future]).assign(
    # D (day of year): theta rescaled by 365.25 / (2 * pi)
    D=lambda frame: (frame['theta'] * 365.25) / (2 * np.pi),
    # date: built row by row from D and the event's year
    date=lambda frame: frame.apply(lambda row: date_from_D(row['D'], row['year']), axis=1),
    # season: looked up from the date
    season=lambda frame: frame['date'].apply(get_season),
)

### Check the number of files for each duration
NB: Number of files for 24h duration is larger due to compound events  
Checked this by filtering only rows with part0

### Remove entries which are less than 1.5 hours

In [12]:
# Snapshot of the combined table taken before the duration filter in the next
# cell, so the unfiltered events can still be exported at the end of the notebook.
df_long_with_short_durations_kept = df_long.copy()

In [13]:
# Keep only events whose duration is at least 1.5 (hours, per the section heading).
# .copy() makes the filtered df_long an independent frame, so later column
# assignments on it cannot trigger pandas' SettingWithCopyWarning.
df_long = df_long[df_long['duration'] >=1.5].copy()
# present and future are only read after this point in the cells shown here,
# so plain boolean-mask selections are enough for them.
present = present[present['duration'] >=1.5]
future = future[future['duration'] >=1.5]
# nan_rows = df_long[df_long['D50'].isna()]

### NIMROD data

In [14]:
# Read the pickled NIMROD event-property records.
nimrod_pickle_path = home_dir +  f"ProcessedData/AMAX_Events/NIMROD_30mins/event_props_dict.pickle"
with open(nimrod_pickle_path, 'rb') as handle:
    events_props_dict_nimrod = pickle.load(handle)

In [15]:
# Build the NIMROD event table and derive the calendar columns in one chain.
# assign() evaluates its keyword arguments in order, so 'date' can use 'D' and
# 'season' can use 'date'.
nimrod = pd.DataFrame(events_props_dict_nimrod).assign(
    # D (day of year): theta rescaled by 365.25 / (2 * pi)
    D=lambda frame: (frame['theta'] * 365.25) / (2 * np.pi),
    # date: built row by row from D and the event's year
    date=lambda frame: frame.apply(lambda row: date_from_D(row['D'], row['year']), axis=1),
    # season: looked up from the date
    season=lambda frame: frame['date'].apply(get_season),
)

In [16]:
# Normalise the column so that every entry is a list; a bare value becomes a
# one-element list.
nimrod['dur_for_which_this_is_amax'] = nimrod['dur_for_which_this_is_amax'].apply(
    lambda entry: entry if isinstance(entry, list) else [entry])

# Pool the entries of all the lists and count how often each value occurs
all_numbers = []
for dur_list in nimrod['dur_for_which_this_is_amax']:
    all_numbers.extend(dur_list)
number_counts = pd.Series(all_numbers).value_counts()

# Display the counts
print(number_counts)

24     19462
0.5    19363
12     14560
6      11379
2       8136
1       7536
3       6830
dtype: int64


In [17]:
# NOTE(review): this cell repeats the preceding NIMROD count cell; it looks
# like a leftover duplicate and is a candidate for removal.
# Wrap any non-list entry in a one-element list (a no-op once the column has
# already been normalised).
nimrod['dur_for_which_this_is_amax'] = nimrod['dur_for_which_this_is_amax'].apply(
    lambda entry: entry if isinstance(entry, list) else [entry])

# Pool the entries of all the lists and count how often each value occurs
all_numbers = []
for dur_list in nimrod['dur_for_which_this_is_amax']:
    all_numbers.extend(dur_list)
number_counts = pd.Series(all_numbers).value_counts()

# Display the counts
print(number_counts)

24     19462
0.5    19363
12     14560
6      11379
2       8136
1       7536
3       6830
dtype: int64


In [18]:
# Write the NIMROD event table (with the derived D, date and season columns) to CSV, without the index.
# NOTE(review): 'dur_for_which_this_is_amax' holds Python lists at this point and is written as-is, whereas the
# UKCP18 export at the end of the notebook JSON-encodes that column first — confirm that downstream readers
# expect the two files to use different encodings.
nimrod.to_csv(home_dir + f"ProcessedData/AMAX_Events/NIMROD_30mins/all_events_characteristics.csv", index=False)

# Create grouped results, for all events (no duplicates for durations)
### Group by gauge, climate

In [19]:
# Summarise D50_new per climate and gauge over all events, derive the change
# values between groups, and save them.
# (group_data_calc_means and find_change_values_in_groups_new come from
# FormatEvents_Functions; their exact outputs are not visible in this notebook.)
group_by_columns = ['Climate', 'gauge_num']
grouped_by_gauge_allevents = group_data_calc_means(
    df_long, 'D50_new', group_by_columns)
grouped_by_gauge_allevents_changes = find_change_values_in_groups_new(
    grouped_by_gauge_allevents, group_by_columns, 'All')

gauge_changes_csv = home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_changes_allevents.csv"
grouped_by_gauge_allevents_changes.to_csv(gauge_changes_csv, index=False)

### Group by season, gauge, climate

In [20]:
# Summarise D50_new per climate, gauge and season over all events, derive the
# change values between groups, and save them.
# (group_data_calc_means and find_change_values_in_groups_new come from
# FormatEvents_Functions; their exact outputs are not visible in this notebook.)
group_by_columns = ['Climate', 'gauge_num', 'season']
grouped_by_gauge_season_allevents = group_data_calc_means(
    df_long, 'D50_new', group_by_columns)
grouped_by_gauge_season_allevents_changes = find_change_values_in_groups_new(
    grouped_by_gauge_season_allevents, group_by_columns, 'All')

gauge_season_changes_csv = home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_season_changes_allevents.csv"
grouped_by_gauge_season_allevents_changes.to_csv(gauge_season_changes_csv, index=False)

# Create grouped results, for all events (for each duration separately)
### Group by gauge, climate and by season, gauge, climate

In [21]:
# Result accumulators: one entry per duration
each_dur_per_climate_changes = []
each_dur_per_climate_and_season_changes = []

# Grouping keys. Kept as tuples and turned into a fresh list for every call, so
# each helper call receives its own list object.
gauge_keys = ('Climate', 'gauge_num')
gauge_season_keys = ('Climate', 'gauge_num', 'season')

# Process the durations one at a time
for duration in durations:
    # Select the events whose 'dur_for_which_this_is_amax' entry is a list
    # containing this duration, or is equal to the duration string itself
    dur_key = str(duration)
    is_amax_for_duration = df_long['dur_for_which_this_is_amax'].apply(
        lambda entry: (isinstance(entry, list) and dur_key in entry) or entry == dur_key)
    this_dur = df_long[is_amax_for_duration]

    # Per-gauge summary for this duration, split by climate
    summary_per_climate = group_data_calc_means(this_dur, 'D50_new', list(gauge_keys))
    # Per-gauge summary for this duration, split by climate and season
    summary_per_climate_and_season = group_data_calc_means(this_dur, 'D50_new', list(gauge_season_keys))
    # Change values between the climate groups, labelled with the numeric duration
    summary_per_climate_changes = find_change_values_in_groups_new(
        summary_per_climate, list(gauge_keys), float(duration))
    # Same, keeping the seasonal split
    summary_per_climate_season_changes = find_change_values_in_groups_new(
        summary_per_climate_and_season, list(gauge_season_keys), float(duration))

    # Store this duration's results
    each_dur_per_climate_changes.append(summary_per_climate_changes)
    each_dur_per_climate_and_season_changes.append(summary_per_climate_season_changes)

In [25]:
# Stack the per-duration results collected by the loop above into two tables:
# the gauge-level changes and the gauge-by-season changes.
total = pd.concat(each_dur_per_climate_changes)
total_season = pd.concat(each_dur_per_climate_and_season_changes)

In [26]:
# Save the two per-duration change tables as CSV files, without the index.
for frame, out_path in (
    (total, home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_changes_bydur.csv"),
    (total_season, home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/grouped_by_gauge_season_changes_bydur.csv"),
):
    frame.to_csv(out_path, index=False)

### Save original data
Don't do this higher up, because the json.dumps thing messes up the formatting for later stages of the code

In [24]:
# Export both event tables with the list column 'dur_for_which_this_is_amax' JSON-encoded.
# The encoding is applied to throw-away frames created by .assign(), never to df_long or
# df_long_with_short_durations_kept themselves. That keeps this cell safe to re-run (no
# double encoding) and leaves real lists in df_long for the per-duration filtering cell.
df_long.assign(
    dur_for_which_this_is_amax=df_long['dur_for_which_this_is_amax'].apply(json.dumps)
).to_csv(home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/all_events_characteristics.csv", index=False)

# Same export for the snapshot that still contains the short-duration events
df_long_with_short_durations_kept.assign(
    dur_for_which_this_is_amax=df_long_with_short_durations_kept['dur_for_which_this_is_amax'].apply(json.dumps)
).to_csv(home_dir + f"ProcessedData/AMAX_Events/UKCP18_30mins/all_events_characteristics_shortdurationskept.csv", index=False)