In [None]:
def remove_events_with_problems(df, verbose=True):
    """Screen one rainfall event and reject it if it fails a quality check.

    The checks run in this order and the first failure decides the outcome:

    1. the frame has fewer than 2 rows;
    2. any 'time_since_last_minutes' value is greater than 30;
    3. the frame does not contain more than 1 row with
       'precipitation (mm/hr)' > 0;
    4. 'precipitation (mm/hr)' contains a NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Event time series. Must provide the columns
        'time_since_last_minutes' and 'precipitation (mm/hr)'.
    verbose : bool, default True
        When True, print the reason an event is rejected.

    Returns
    -------
    tuple
        ``(df, 0)`` when every check passes (``df`` is the object that was
        passed in, not a copy), otherwise ``(None, 1)``.
    """
    precip_col = 'precipitation (mm/hr)'

    def _reject(reason):
        # Single exit path for every failed check: optional message, then
        # the (no event, one problem) result.
        if verbose:
            print(reason)
        return None, 1

    # Check if the DataFrame is too short to be an event
    if len(df) < 2:
        return _reject("Too short to be an event")

    # Check for more than 30 minute gap between time steps
    if (df['time_since_last_minutes'] > 30).any():
        return _reject("More than 30 minute gap between each time step")

    # Require strictly more than 1 non-zero value, as the message states.
    # (A threshold of 2 here would wrongly discard events with exactly two
    # wet time steps.) NaN compares False against 0, so NaNs are not counted.
    if (df[precip_col] > 0).sum() <= 1:
        return _reject("Doesn't contain more than 1 value which isn't 0")

    # Check for any NaN values in 'precipitation (mm/hr)'
    if df[precip_col].isna().any():
        return _reject("Contains NANs")

    return df, 0

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import pandas as pd
import re
from datetime import datetime
import sys 

# Lookup from the integers 1-5 to short category labels.
# NOTE(review): presumably quintile number -> profile-shape class (F = front-loaded,
# C = centred, B = back-loaded) — confirm; it is not referenced in the cells shown here.
quintile_mapping = {1: 'F2', 2: 'F1', 3: 'C', 4: 'B1', 5: 'B2'}

# Wildcard import of the project's helper module. The helpers called below without
# a local definition (e.g. read_event, get_season, extract_year) presumably come
# from here — verify, since `import *` hides where each name originates.
from Get_Events_Functions import *

# --- Run configuration ---
# Gauge indices iterated by the processing loop (0 to 1293 inclusive).
gauge_nums = range(0,1294)
# Dataset selector: 'nimrod_5mins' and 'nimrod_30mins' choose the NIMROD input paths;
# any other value is inserted into the UKCP18 paths as a path component.
em ='bc005'
# Only used when building the UKCP18 input/output paths.
time_period='Present'

def create_dataframe_row(this_event):
    """Summarise one event as a dict suitable for a row of the results table.

    The event is trimmed with ``remove_leading_and_trailing_zeroes`` and then
    screened with ``remove_events_with_problems``. Season and year are always
    taken from the trimmed event; the remaining fields are ``None`` when the
    event is rejected by the screening step.

    Parameters
    ----------
    this_event : pandas.DataFrame
        Event time series with at least the 'times' and 'precipitation (mm)'
        columns, plus whatever the two helper functions require.

    Returns
    -------
    dict
        Keys, in order: 'precip', 'times', 'season', 'duration', 'year',
        'Volume'.

        * 'precip'   – array of 'precipitation (mm)' values, or None
        * 'times'    – array of 'times' values, or None
        * 'season'   – ``get_season`` of the first 'times' entry
        * 'duration' – number of rows / 2, or None
          (NOTE(review): this is hours only if rows are 30 minutes apart — confirm
          before using with other time steps)
        * 'year'     – ``extract_year`` of the trimmed event
        * 'Volume'   – sum of the 'precipitation (mm)' values, or None
    """
    # Trim the event and remove problematic events
    trimmed_event = remove_leading_and_trailing_zeroes(this_event)
    real_trimmed_event, _ = remove_events_with_problems(trimmed_event, verbose=False)

    # Take the first timestamp by position: a plain [0] is a label lookup and
    # fails when the trimmed frame's index does not contain the label 0.
    season = get_season(trimmed_event['times'].iloc[0])
    year = extract_year(trimmed_event)

    if real_trimmed_event is None:
        # Rejected event: keep season/year so the row can still be counted.
        return {
            'precip': None,
            'times': None,
            "season": season,
            'duration': None,
            "year": year,
            'Volume': None,
        }

    # Return only the relevant data in a dictionary
    precip = real_trimmed_event['precipitation (mm)'].values
    return {
        'precip': precip,
        'times': trimmed_event['times'].values,
        "season": season,
        'duration': len(real_trimmed_event) / 2,
        "year": year,
        'Volume': sum(precip),
    }

# Collect one summary dict per event; the DataFrame is built once at the end.
rows = []

# Gauges left out of processing.
# NOTE(review): the reason for excluding these is not recorded in this notebook — document it.
skipped_gauges = {444, 827, 888}

# Single-element loop: rebinding `em` to itself keeps the body ready for several
# datasets without changing the value of `em` afterwards.
for em in [em]:
    # Input root and output directory depend only on the dataset (and time
    # period), not on the gauge, so resolve them once per dataset.
    if em == 'nimrod_5mins':
        indy_events_root = "/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_5mins/NIMROD_1km_filtered_100/"
        profiles_fp = "/nfs/a319/gy17m2a/PhD/ProcessedData/Profiles/NIMROD_5mins/WholeYear/"
    elif em == 'nimrod_30mins':
        indy_events_root = "/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_30mins/2km_filtered_100/"
        profiles_fp = "/nfs/a319/gy17m2a/PhD/ProcessedData/Profiles/NIMROD_30mins/WholeYear/"
    else:
        indy_events_root = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{time_period}/{em}/"
        profiles_fp = f"/nfs/a319/gy17m2a/PhD/ProcessedData/Profiles/UKCP18_30mins/{time_period}/{em}/"

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(profiles_fp, exist_ok=True)

    for gauge_num in gauge_nums:
        if gauge_num in skipped_gauges:
            continue

        if gauge_num % 100 == 0:
            print(f"Processing gauge {gauge_num}")

        indy_events_fp = f"{indy_events_root}{gauge_num}/WholeYear/"

        # Process the gauge's CSV files in sorted (lexicographic) name order.
        files = sorted(f for f in os.listdir(indy_events_fp) if f.endswith('.csv'))

        for file in files:
            fp = indy_events_fp + f"{file}"
            # Skip anything whose full path mentions '2080'.
            if '2080' in fp:
                continue

            this_event = read_event(gauge_num, fp)

            # Summarise the event as a dict of row fields.
            row_data = create_dataframe_row(this_event)

            # Defensive guard only: create_dataframe_row returns a dict even for
            # rejected events (their data fields are None), so those rows are
            # still appended here.
            if row_data is not None:
                rows.append(row_data)

# Create DataFrame from collected rows
df = pd.DataFrame(rows)

Processing gauge 0
Processing gauge 100
Processing gauge 200
Processing gauge 300
Processing gauge 400
Processing gauge 500
Processing gauge 600
Processing gauge 700
Processing gauge 800
Processing gauge 900
Processing gauge 1000
Processing gauge 1100
Processing gauge 1200


In [8]:
# Display the event currently held in `this_event`; after the processing loop
# has run, that is the last event file it read.
this_event

Unnamed: 0.1,Unnamed: 0,precipitation (mm/hr),times,precipitation (mm),is_dry,Rolling_Sum,consecutive_dry,timestamp,time_since_last_minutes
0,10502,5.111259e-07,2019-08-09 19:15:00,2.555629e-07,True,1.9e-05,1,2019-08-09 19:15:00,0.0
1,10503,1.137132e-06,2019-08-09 19:45:00,5.685662e-07,True,1.9e-05,2,2019-08-09 19:45:00,30.0
2,10504,1.233063,2019-08-09 20:15:00,0.6165314,False,0.616548,0,2019-08-09 20:15:00,30.0
3,10505,0.5491539,2019-08-09 20:45:00,0.274577,False,0.891124,0,2019-08-09 20:45:00,30.0
4,10506,6.924081e-05,2019-08-09 21:15:00,3.462041e-05,True,0.891158,1,2019-08-09 21:15:00,30.0
5,10507,0.01804837,2019-08-09 21:45:00,0.009024186,True,0.900182,2,2019-08-09 21:45:00,30.0
6,10508,1.070105,2019-08-09 22:15:00,0.5350525,False,1.435234,0,2019-08-09 22:15:00,30.0
7,10509,0.8394496,2019-08-09 22:45:00,0.4197248,False,1.854955,0,2019-08-09 22:45:00,30.0
8,10510,15.02937,2019-08-09 23:15:00,7.514682,False,9.369636,0,2019-08-09 23:15:00,30.0
9,10511,12.025,2019-08-09 23:45:00,6.0125,False,15.382135,0,2019-08-09 23:45:00,30.0


In [3]:
# Run the three `add_duration_cats_*` helpers in sequence, each one receiving
# the frame returned by the previous step.
duration_cat_steps = (
    add_duration_cats_based_on_data,
    add_duration_cats_predetermined,
    add_duration_cats_based_on_all_ems,
)
for add_cats in duration_cat_steps:
    df = add_cats(df)

# Pickle the resulting frame as "df.pkl" inside the profiles output directory.
pickle_path = profiles_fp + "df.pkl"
with open(pickle_path, 'wb') as file:
    pickle.dump(df, file)