### Identify events which appear twice and delete them

In [1]:
import os
import hashlib
import pandas as pd
import numpy as np
import shutil

In [2]:
def create_copy_of_files(current_directory, gauge_num):
    """Back up the regular files of ``current_directory`` into ``EventSet``.

    An ``EventSet`` sub-folder is created inside ``current_directory`` (if it
    is not there already) and every regular file found directly in
    ``current_directory`` is copied into it. Sub-directories are skipped, and
    files already present in ``EventSet`` under the same name are overwritten.

    Args:
        current_directory: Folder whose files should be backed up.
        gauge_num: Not used by this function; kept for the existing callers.
    """
    backup_dir = current_directory + "/EventSet"
    os.makedirs(backup_dir, exist_ok=True)

    for entry in os.listdir(current_directory):
        src = os.path.join(current_directory, entry)
        # Only plain files are copied; this also skips the EventSet folder itself.
        if not os.path.isfile(src):
            continue
        shutil.copy(src, os.path.join(backup_dir, entry))

def load_csv_files(directory):
    """Read the 'precipitation (mm)' column of every CSV file in ``directory``.

    Args:
        directory: Folder to scan; only entries whose name ends in ``.csv``
            are read.

    Returns:
        dict mapping each CSV file name to its 'precipitation (mm)' column.
        CSV files without that column are left out.
    """
    series_by_file = {}

    for entry in os.listdir(directory):
        if not entry.endswith('.csv'):
            continue
        frame = pd.read_csv(os.path.join(directory, entry))
        if 'precipitation (mm)' not in frame.columns:
            continue
        series_by_file[entry] = frame['precipitation (mm)']

    return series_by_file

def compare_and_delete_duplicates(dataframes, directory=None):
    """Delete the files whose data duplicates that of an earlier file.

    Every pair of entries in ``dataframes`` is compared with ``.equals``. For
    each matching pair the later key (in dict order) is treated as the
    duplicate and the file of that name is removed from disk.

    Args:
        dataframes: dict mapping file name -> pandas object to compare (in
            this file it is built by ``load_csv_files``, which stores the
            'precipitation (mm)' column of each CSV).
        directory: Folder that contains the files. When omitted, the
            module-level ``directory_path`` variable is used, which is how
            the function located the files before this parameter existed.

    Raises:
        OSError: if a duplicate file cannot be removed.
    """
    filenames = list(dataframes)
    duplicates = set()

    for idx, file1 in enumerate(filenames):
        if file1 in duplicates:
            # file1 equals an earlier file, so anything equal to file1 was
            # already flagged when that earlier file was processed.
            continue
        for file2 in filenames[idx + 1:]:
            if file2 in duplicates:
                continue
            if dataframes[file1].equals(dataframes[file2]):
                duplicates.add(file2)  # keep the first file, drop the later one

    if not duplicates:
        return

    if directory is None:
        # Legacy behaviour: rely on the global set by the calling cell.
        directory = directory_path

    # Delete the duplicate files
    for file in duplicates:
        os.remove(os.path.join(directory, file))

def compare_columns(dataframes):
    """Print every pair of entries in ``dataframes`` whose data is identical.

    This is the report-only counterpart of the pairwise comparison: nothing
    is deleted. Each pair is compared once with ``.equals``, the earlier key
    (in dict order) being named first.

    Args:
        dataframes: dict mapping file name -> pandas object to compare.

    NOTE(review): the printed message names the 'rolling_sum' column, while
    ``load_csv_files`` in this file stores the 'precipitation (mm)' column;
    confirm which column is meant.
    """
    names = list(dataframes)

    for pos, file1 in enumerate(names):
        for file2 in names[pos + 1:]:
            if not dataframes[file1].equals(dataframes[file2]):
                continue
            print(f"The 'rolling_sum' column in {file1} is the same as in {file2}")

### UKCP18 (done for bc005, started for bc009 but not many profiles actually made yet)

In [3]:
# Settings for the UKCP18 cells below; both values are substituted into the
# data directory paths (.../UKCP18_30mins/{timeperiod}/{em}/...).
em='bc005'  # presumably the ensemble-member ID — TODO confirm
timeperiod='Present'  # name of the time-period sub-folder

#### Check if there are enough files for each gauge

In [5]:
# Sanity check: for every gauge, count the files in its EventSet folder whose
# name contains 'part0' and report the gauges where the count is not 133.
for gauge_num in range(0,1294):
    # Gauges 444, 827 and 888 are excluded from the check.
    if gauge_num in (444, 827, 888):
        continue
    directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/EventSet/"
    file_count = sum(
        1 for name in os.listdir(directory_path)
        if 'part0' in name and os.path.isfile(os.path.join(directory_path, name)))
    if file_count != 133:
        print(f"Gauge number {gauge_num}: Not as expected, {file_count}")

#### Check whether the deletion has already been done

In [9]:
# for gauge_num in range(0,1294):
#     if gauge_num not in [444, 827, 888]:
#         directory_path_eventset = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/Present/{em}/{gauge_num}/WholeYear/EventSet/"
#         directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/Present/{em}/{gauge_num}/WholeYear/"
#         file_count_eventset = len([
#             name for name in os.listdir(directory_path_eventset)
#             if os.path.isfile(os.path.join(directory_path_eventset, name)) and 'part0' in name])
#         file_count = len([
#             name for name in os.listdir(directory_path)
#             if os.path.isfile(os.path.join(directory_path, name)) and 'part0' in name])
#         if file_count == file_count_eventset:
#             print(f"Gauge {gauge_num}: files not already deleted")

#### Do the deleting

In [7]:
# For each selected gauge: back up its event files, delete the files whose
# 'precipitation (mm)' column duplicates another file's, and report the
# number of CSV files before and after.
for gauge_num in range(1,2):
    if gauge_num not in [444, 827, 888]:
        print(gauge_num)
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/"
        # Copy the files to an extra directory (EventSet) first, in case the
        # deleting part is wrong
        create_copy_of_files(directory_path, gauge_num)
        dataframes = load_csv_files(directory_path)
        print(f"gauge {gauge_num}, pre-deletion {len(dataframes)} files exist")
        # Compare the 'precipitation (mm)' columns and delete the duplicates;
        # the function finds the files through the directory_path set above
        compare_and_delete_duplicates(dataframes)
        # Reload so the count reflects what is left on disk
        dataframes = load_csv_files(directory_path)
        print(f"gauge {gauge_num}, post-deletion {len(dataframes)} files exist")

1
gauge 1, pre-deletion 104 files exist
gauge 1, pre-deletion 104 files exist


In [9]:
# Display the 12hrs_2013_v2_part0 event file for manual inspection.
# gauge_num is not set in this cell; it keeps the value left by an earlier cell.
pd.read_csv(f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/12hrs_2013_v2_part0.csv")

Unnamed: 0.1,Unnamed: 0,precipitation (mm/hr),times,precipitation (mm),is_dry,Rolling_Sum,consecutive_dry
0,6936,3.649756e-07,2013-05-25 12:15:00,1.824878e-07,True,4.7e-05,1.0
1,6937,0.6059926,2013-05-25 12:45:00,0.3029963,False,0.303039,0.0
2,6938,11.24636,2013-05-25 13:15:00,5.623178,False,5.926216,0.0
3,6939,22.95285,2013-05-25 13:45:00,11.47643,False,17.402638,0.0
4,6940,35.63773,2013-05-25 14:15:00,17.81886,False,35.221502,0.0
5,6941,5.149635,2013-05-25 14:45:00,2.574818,False,37.796314,0.0
6,6942,0.04212647,2013-05-25 15:15:00,0.02106323,True,37.817375,1.0
7,6943,0.08342051,2013-05-25 15:45:00,0.04171025,True,37.859085,2.0
8,6944,4.783451e-07,2013-05-25 16:15:00,2.391725e-07,True,37.859081,3.0
9,6945,1.24795e-05,2013-05-25 16:45:00,6.239751e-06,True,37.859083,4.0


In [8]:
# Display the 0.5hrs_2013_v2_part0 event file for manual inspection.
# gauge_num is not set in this cell; it keeps the value left by an earlier cell.
pd.read_csv(f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/0.5hrs_2013_v2_part0.csv")

Unnamed: 0.1,Unnamed: 0,precipitation (mm/hr),times,precipitation (mm),is_dry,Rolling_Sum,consecutive_dry
0,6937,0.6059926,2013-05-25 12:45:00,0.3029963,False,0.3029963,
1,6938,11.24636,2013-05-25 13:15:00,5.623178,False,5.623178,
2,6939,22.95285,2013-05-25 13:45:00,11.47643,False,11.47643,0.0
3,6940,35.63773,2013-05-25 14:15:00,17.81886,False,17.81886,
4,6941,5.149635,2013-05-25 14:45:00,2.574818,False,2.574818,
5,6942,0.04212647,2013-05-25 15:15:00,0.02106323,True,0.02106323,
6,6943,0.08342051,2013-05-25 15:45:00,0.04171025,True,0.04171025,
7,6944,4.783451e-07,2013-05-25 16:15:00,2.391725e-07,True,2.391726e-07,
8,6945,1.24795e-05,2013-05-25 16:45:00,6.239751e-06,True,6.239751e-06,
9,6946,1.731245e-05,2013-05-25 17:15:00,8.656223e-06,True,8.656223e-06,


### 30 minute NIMROD (Done)

In [8]:
# for dataset_name in ['NIMROD_2.2km_filtered_100']:
#     for gauge_num in range(1200,1293):
#         if gauge_num not in [444, 827, 888]:
#             print(f"Gauge number {gauge_num}")
#             directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_30mins/{dataset_name}/{gauge_num}/WholeYear"
#             # Copy files to an extra directory in case the deleting part is wrong
#             create_copy_of_files(directory_path, gauge_num)
#             dataframes = load_csv_files(directory_path)
#             print(len(dataframes))
#             # Compare the 'rolling_sum' columns
#             compare_and_delete_duplicates(dataframes)
#             dataframes = load_csv_files(directory_path)
#             print(len(dataframes))

### 5 minute NIMROD

- filtered_100 = 0-200, 200-350, 350-1200, 1200-1294  
- filtered_300 = 0-200,  200-350, 350-1294
- unfiltered = 0-200, 200-350, 350-1294

In [17]:
# For each 5-minute NIMROD dataset and gauge: back up the event files, delete
# the files whose 'precipitation (mm)' column duplicates another file's, and
# print the number of CSV files before and after.
for dataset_name in ['NIMROD_1km_filtered_100']:
    for gauge_num in range(1200,1294):
        # Gauges 444, 827 and 888 are skipped.
        if gauge_num in (444, 827, 888):
            continue
        print(f"Gauge number {gauge_num}")
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_5mins/{dataset_name}/{gauge_num}/WholeYear"
        # Copy the files to an extra directory (EventSet) first, in case the
        # deleting part is wrong
        create_copy_of_files(directory_path, gauge_num)
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))
        # Compare the loaded columns and delete the duplicate files; the
        # function finds the files through the directory_path set above
        compare_and_delete_duplicates(dataframes)
        # Reload so the second count reflects what is left on disk
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))