### Identify events which appear twice and delete them

In [5]:
import os
import hashlib
import pandas as pd
import numpy as np
import shutil

In [6]:
def create_copy_of_files(current_directory, gauge_num):
    """Back up the regular files of a directory into its ``EventSet`` sub-folder.

    Args:
        current_directory: Directory whose files are backed up.
        gauge_num: Not used by the function body; kept so existing calls
            keep working.

    The ``EventSet`` folder is created when missing. Sub-directories of
    ``current_directory`` are not copied, and a file already present in
    ``EventSet`` under the same name is overwritten.
    """
    backup_dir = f"{current_directory}/EventSet"

    # Make sure the backup folder exists before anything is copied into it.
    os.makedirs(backup_dir, exist_ok=True)

    for entry in os.listdir(current_directory):
        original = os.path.join(current_directory, entry)
        # Only regular files are backed up; directories (including the
        # backup folder itself) are skipped.
        if not os.path.isfile(original):
            continue
        shutil.copy(original, os.path.join(backup_dir, entry))

def load_csv_files(directory):
    """Read the 'precipitation (mm)' column of every CSV file in a directory.

    Args:
        directory: Folder to scan (not searched recursively).

    Returns:
        dict mapping each ``.csv`` file name to that file's
        'precipitation (mm)' column as a pandas Series. CSV files without
        that column are left out.
    """
    column = 'precipitation (mm)'
    series_by_file = {}

    for entry in os.listdir(directory):
        if not entry.endswith('.csv'):
            continue
        table = pd.read_csv(os.path.join(directory, entry))
        if column in table.columns:
            series_by_file[entry] = table[column]

    return series_by_file

def compare_and_delete_duplicates(dataframes, directory=None):
    """Delete every file whose series duplicates that of an earlier file.

    Files are compared pairwise with ``Series.equals``. Within each group of
    identical series the first file (in the dict's iteration order) is kept
    and all later ones are removed from disk.

    Args:
        dataframes: dict mapping file name -> pandas Series, e.g. the result
            of ``load_csv_files``.
        directory: Folder the files live in. When omitted, the module-level
            ``directory_path`` variable is used, which is what this function
            always relied on before the parameter existed.

    Raises:
        NameError: if duplicates were found, ``directory`` was not given and
            no global ``directory_path`` is defined.
        OSError: if a duplicate file cannot be removed.
    """
    filenames = list(dataframes)
    duplicates = set()

    for i, file1 in enumerate(filenames):
        # A file already flagged as a duplicate equals some earlier file, and
        # that earlier pass has already flagged everything equal to it.
        if file1 in duplicates:
            continue
        for file2 in filenames[i + 1:]:
            if file2 in duplicates:
                continue
            if dataframes[file1].equals(dataframes[file2]):
                duplicates.add(file2)  # keep the first file, drop the later one
                # print(f"The series in {file1} is the same as in {file2}. Deleting {file2}.")

    if not duplicates:
        return

    if directory is None:
        # Backward compatibility: older calls pass only `dataframes` and
        # expect the notebook-level `directory_path` to be used.
        directory = directory_path

    # Delete the duplicate files
    for file in duplicates:
        os.remove(os.path.join(directory, file))
        # print(f"Deleted file: {file}")

def compare_columns(dataframes):
    """Print one line for every pair of files whose series are identical.

    Args:
        dataframes: dict mapping file name -> pandas Series.

    Nothing is deleted or returned; this is a read-only report. Pairs are
    visited in the dict's iteration order, each pair once.
    """
    names = list(dataframes)

    for position, first in enumerate(names):
        # Only look at files after `first`, so no pair is reported twice.
        for second in names[position + 1:]:
            if not dataframes[first].equals(dataframes[second]):
                continue
            print(f"The 'rolling_sum' column in {first} is the same as in {second}")

### UKCP18 (done for bc005, started for bc009 but not many profiles actually made yet)

In [7]:
# Both values are substituted into the UKCP18 directory paths built in the
# cells below.
em = "bb216"  # presumably the ensemble member id - confirm
timeperiod = "Future"  # a commented-out cell below hard-codes "Present" in the same path position

#### Check if there are enough files for each gauge

In [12]:
# Report every gauge whose EventSet backup folder does not hold exactly 133
# regular files with 'part0' in their name.
for gauge_num in range(0,1294):
    # These three gauge numbers are excluded from the check.
    if gauge_num in [444, 827, 888]:
        continue
    directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/EventSet/"
    file_count = sum(
        1 for name in os.listdir(directory_path)
        if 'part0' in name and os.path.isfile(os.path.join(directory_path, name)))
    if file_count != 133:
        print(f"Gauge number {gauge_num}: Not as expected, {file_count}")

#### Check whether the deleting has already been done

In [None]:
# for gauge_num in range(423,424):
#     if gauge_num not in [444, 827, 888]:
#         directory_path_eventset = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/Present/{em}/{gauge_num}/WholeYear/EventSet/"
#         directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/Present/{em}/{gauge_num}/WholeYear/"
#         file_count_eventset = len([
#             name for name in os.listdir(directory_path_eventset)
#             if os.path.isfile(os.path.join(directory_path_eventset, name)) and 'part0' in name])
#         file_count = len([
#             name for name in os.listdir(directory_path)
#             if os.path.isfile(os.path.join(directory_path, name)) and 'part0' in name])
#         if file_count == file_count_eventset:
#             print(f"Gauge {gauge_num}: files not already deleted")

#### Do the deleting

In [11]:
# For each gauge: back up the event files, then delete the files whose
# 'precipitation (mm)' column duplicates that of another file, reporting the
# number of CSV files before and after.
for gauge_num in range(0,945):
    if gauge_num not in [444, 827, 888]:
        print(gauge_num)
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/"
        # Copy the files to an extra directory (EventSet) first, in case the
        # deleting part is wrong
        create_copy_of_files(directory_path, gauge_num)
        dataframes = load_csv_files(directory_path)
        print(f"gauge {gauge_num}, pre-deletion {len(dataframes)} files exist")
        # Compare the 'precipitation (mm)' columns and delete the duplicates.
        # Called with one argument, compare_and_delete_duplicates removes the
        # files from the module-level directory_path assigned above.
        compare_and_delete_duplicates(dataframes)
        # Reload so the second count reflects what is left on disk
        dataframes = load_csv_files(directory_path)
        print(f"gauge {gauge_num}, post-deletion {len(dataframes)} files exist")

0
gauge 0, pre-deletion 96 files exist
gauge 0, pre-deletion 96 files exist
1
gauge 1, pre-deletion 92 files exist
gauge 1, pre-deletion 92 files exist
2
gauge 2, pre-deletion 91 files exist
gauge 2, pre-deletion 91 files exist
3
gauge 3, pre-deletion 83 files exist
gauge 3, pre-deletion 83 files exist
4
gauge 4, pre-deletion 95 files exist
gauge 4, pre-deletion 95 files exist
5
gauge 5, pre-deletion 86 files exist
gauge 5, pre-deletion 86 files exist
6
gauge 6, pre-deletion 104 files exist
gauge 6, pre-deletion 104 files exist
7
gauge 7, pre-deletion 94 files exist
gauge 7, pre-deletion 94 files exist
8
gauge 8, pre-deletion 90 files exist
gauge 8, pre-deletion 90 files exist
9
gauge 9, pre-deletion 105 files exist
gauge 9, pre-deletion 105 files exist
10
gauge 10, pre-deletion 102 files exist
gauge 10, pre-deletion 102 files exist
11
gauge 11, pre-deletion 104 files exist
gauge 11, pre-deletion 104 files exist
12
gauge 12, pre-deletion 94 files exist
gauge 12, pre-deletion 94 files e

gauge 103, pre-deletion 100 files exist
104
gauge 104, pre-deletion 91 files exist
gauge 104, pre-deletion 91 files exist
105
gauge 105, pre-deletion 84 files exist
gauge 105, pre-deletion 84 files exist
106
gauge 106, pre-deletion 101 files exist
gauge 106, pre-deletion 101 files exist
107
gauge 107, pre-deletion 84 files exist
gauge 107, pre-deletion 84 files exist
108
gauge 108, pre-deletion 89 files exist
gauge 108, pre-deletion 89 files exist
109
gauge 109, pre-deletion 90 files exist
gauge 109, pre-deletion 90 files exist
110
gauge 110, pre-deletion 90 files exist
gauge 110, pre-deletion 90 files exist
111
gauge 111, pre-deletion 93 files exist
gauge 111, pre-deletion 93 files exist
112
gauge 112, pre-deletion 91 files exist
gauge 112, pre-deletion 91 files exist
113
gauge 113, pre-deletion 94 files exist
gauge 113, pre-deletion 94 files exist
114
gauge 114, pre-deletion 91 files exist
gauge 114, pre-deletion 91 files exist
115
gauge 115, pre-deletion 92 files exist
gauge 115, pr

gauge 203, pre-deletion 97 files exist
204
gauge 204, pre-deletion 93 files exist
gauge 204, pre-deletion 93 files exist
205
gauge 205, pre-deletion 89 files exist
gauge 205, pre-deletion 89 files exist
206
gauge 206, pre-deletion 99 files exist
gauge 206, pre-deletion 99 files exist
207
gauge 207, pre-deletion 103 files exist
gauge 207, pre-deletion 103 files exist
208
gauge 208, pre-deletion 92 files exist
gauge 208, pre-deletion 92 files exist
209
gauge 209, pre-deletion 97 files exist
gauge 209, pre-deletion 97 files exist
210
gauge 210, pre-deletion 82 files exist
gauge 210, pre-deletion 82 files exist
211
gauge 211, pre-deletion 87 files exist
gauge 211, pre-deletion 87 files exist
212
gauge 212, pre-deletion 84 files exist
gauge 212, pre-deletion 84 files exist
213
gauge 213, pre-deletion 88 files exist
gauge 213, pre-deletion 88 files exist
214
gauge 214, pre-deletion 84 files exist
gauge 214, pre-deletion 84 files exist
215
gauge 215, pre-deletion 91 files exist
gauge 215, pre

gauge 303, pre-deletion 91 files exist
304
gauge 304, pre-deletion 92 files exist
gauge 304, pre-deletion 92 files exist
305
gauge 305, pre-deletion 84 files exist
gauge 305, pre-deletion 84 files exist
306
gauge 306, pre-deletion 87 files exist
gauge 306, pre-deletion 87 files exist
307
gauge 307, pre-deletion 90 files exist
gauge 307, pre-deletion 90 files exist
308
gauge 308, pre-deletion 81 files exist
gauge 308, pre-deletion 81 files exist
309
gauge 309, pre-deletion 77 files exist
gauge 309, pre-deletion 77 files exist
310
gauge 310, pre-deletion 81 files exist
gauge 310, pre-deletion 81 files exist
311
gauge 311, pre-deletion 88 files exist
gauge 311, pre-deletion 88 files exist
312
gauge 312, pre-deletion 83 files exist
gauge 312, pre-deletion 83 files exist
313
gauge 313, pre-deletion 86 files exist
gauge 313, pre-deletion 86 files exist
314
gauge 314, pre-deletion 93 files exist
gauge 314, pre-deletion 93 files exist
315
gauge 315, pre-deletion 82 files exist
gauge 315, pre-d

gauge 403, pre-deletion 86 files exist
404
gauge 404, pre-deletion 83 files exist
gauge 404, pre-deletion 83 files exist
405
gauge 405, pre-deletion 87 files exist
gauge 405, pre-deletion 87 files exist
406
gauge 406, pre-deletion 85 files exist
gauge 406, pre-deletion 85 files exist
407
gauge 407, pre-deletion 87 files exist
gauge 407, pre-deletion 87 files exist
408
gauge 408, pre-deletion 86 files exist
gauge 408, pre-deletion 86 files exist
409
gauge 409, pre-deletion 93 files exist
gauge 409, pre-deletion 93 files exist
410
gauge 410, pre-deletion 89 files exist
gauge 410, pre-deletion 89 files exist
411
gauge 411, pre-deletion 88 files exist
gauge 411, pre-deletion 88 files exist
412
gauge 412, pre-deletion 93 files exist
gauge 412, pre-deletion 93 files exist
413
gauge 413, pre-deletion 86 files exist
gauge 413, pre-deletion 86 files exist
414
gauge 414, pre-deletion 87 files exist
gauge 414, pre-deletion 87 files exist
415
gauge 415, pre-deletion 76 files exist
gauge 415, pre-d

gauge 504, pre-deletion 95 files exist
505
gauge 505, pre-deletion 85 files exist
gauge 505, pre-deletion 85 files exist
506
gauge 506, pre-deletion 86 files exist
gauge 506, pre-deletion 86 files exist
507
gauge 507, pre-deletion 93 files exist
gauge 507, pre-deletion 93 files exist
508
gauge 508, pre-deletion 78 files exist
gauge 508, pre-deletion 78 files exist
509
gauge 509, pre-deletion 97 files exist
gauge 509, pre-deletion 97 files exist
510
gauge 510, pre-deletion 100 files exist
gauge 510, pre-deletion 100 files exist
511
gauge 511, pre-deletion 89 files exist
gauge 511, pre-deletion 89 files exist
512
gauge 512, pre-deletion 96 files exist
gauge 512, pre-deletion 96 files exist
513
gauge 513, pre-deletion 90 files exist
gauge 513, pre-deletion 90 files exist
514
gauge 514, pre-deletion 91 files exist
gauge 514, pre-deletion 91 files exist
515
gauge 515, pre-deletion 101 files exist
gauge 515, pre-deletion 101 files exist
516
gauge 516, pre-deletion 95 files exist
gauge 516, p

gauge 604, pre-deletion 88 files exist
605
gauge 605, pre-deletion 83 files exist
gauge 605, pre-deletion 83 files exist
606
gauge 606, pre-deletion 88 files exist
gauge 606, pre-deletion 88 files exist
607
gauge 607, pre-deletion 96 files exist
gauge 607, pre-deletion 96 files exist
608
gauge 608, pre-deletion 85 files exist
gauge 608, pre-deletion 85 files exist
609
gauge 609, pre-deletion 87 files exist
gauge 609, pre-deletion 87 files exist
610
gauge 610, pre-deletion 89 files exist
gauge 610, pre-deletion 89 files exist
611
gauge 611, pre-deletion 82 files exist
gauge 611, pre-deletion 82 files exist
612
gauge 612, pre-deletion 75 files exist
gauge 612, pre-deletion 75 files exist
613
gauge 613, pre-deletion 88 files exist
gauge 613, pre-deletion 88 files exist
614
gauge 614, pre-deletion 83 files exist
gauge 614, pre-deletion 83 files exist
615
gauge 615, pre-deletion 93 files exist
gauge 615, pre-deletion 93 files exist
616
gauge 616, pre-deletion 87 files exist
gauge 616, pre-d

gauge 704, pre-deletion 93 files exist
705
gauge 705, pre-deletion 85 files exist
gauge 705, pre-deletion 85 files exist
706
gauge 706, pre-deletion 94 files exist
gauge 706, pre-deletion 94 files exist
707
gauge 707, pre-deletion 80 files exist
gauge 707, pre-deletion 80 files exist
708
gauge 708, pre-deletion 82 files exist
gauge 708, pre-deletion 82 files exist
709
gauge 709, pre-deletion 90 files exist
gauge 709, pre-deletion 90 files exist
710
gauge 710, pre-deletion 83 files exist
gauge 710, pre-deletion 83 files exist
711
gauge 711, pre-deletion 94 files exist
gauge 711, pre-deletion 94 files exist
712
gauge 712, pre-deletion 80 files exist
gauge 712, pre-deletion 80 files exist
713
gauge 713, pre-deletion 82 files exist
gauge 713, pre-deletion 82 files exist
714
gauge 714, pre-deletion 78 files exist
gauge 714, pre-deletion 78 files exist
715
gauge 715, pre-deletion 90 files exist
gauge 715, pre-deletion 90 files exist
716
gauge 716, pre-deletion 93 files exist
gauge 716, pre-d

gauge 804, pre-deletion 99 files exist
805
gauge 805, pre-deletion 103 files exist
gauge 805, pre-deletion 103 files exist
806
gauge 806, pre-deletion 100 files exist
gauge 806, pre-deletion 100 files exist
807
gauge 807, pre-deletion 99 files exist
gauge 807, pre-deletion 99 files exist
808
gauge 808, pre-deletion 94 files exist
gauge 808, pre-deletion 94 files exist
809
gauge 809, pre-deletion 90 files exist
gauge 809, pre-deletion 90 files exist
810
gauge 810, pre-deletion 100 files exist
gauge 810, pre-deletion 100 files exist
811
gauge 811, pre-deletion 91 files exist
gauge 811, pre-deletion 91 files exist
812
gauge 812, pre-deletion 91 files exist
gauge 812, pre-deletion 91 files exist
813
gauge 813, pre-deletion 96 files exist
gauge 813, pre-deletion 96 files exist
814
gauge 814, pre-deletion 90 files exist
gauge 814, pre-deletion 90 files exist
815
gauge 815, pre-deletion 96 files exist
gauge 815, pre-deletion 96 files exist
816
gauge 816, pre-deletion 94 files exist
gauge 816,

gauge 906, pre-deletion 86 files exist
907
gauge 907, pre-deletion 84 files exist
gauge 907, pre-deletion 84 files exist
908
gauge 908, pre-deletion 98 files exist
gauge 908, pre-deletion 98 files exist
909
gauge 909, pre-deletion 89 files exist
gauge 909, pre-deletion 89 files exist
910
gauge 910, pre-deletion 87 files exist
gauge 910, pre-deletion 87 files exist
911
gauge 911, pre-deletion 87 files exist
gauge 911, pre-deletion 87 files exist
912
gauge 912, pre-deletion 84 files exist
gauge 912, pre-deletion 84 files exist
913
gauge 913, pre-deletion 78 files exist
gauge 913, pre-deletion 78 files exist
914
gauge 914, pre-deletion 82 files exist
gauge 914, pre-deletion 82 files exist
915
gauge 915, pre-deletion 97 files exist
gauge 915, pre-deletion 97 files exist
916
gauge 916, pre-deletion 88 files exist
gauge 916, pre-deletion 88 files exist
917
gauge 917, pre-deletion 78 files exist
gauge 917, pre-deletion 78 files exist
918
gauge 918, pre-deletion 85 files exist
gauge 918, pre-d

### 30 minute NIMROD (Done)

In [8]:
# for dataset_name in ['NIMROD_2.2km_filtered_100']:
#     for gauge_num in range(1200,1293):
#         if gauge_num not in [444, 827, 888]:
#             print(f"Gauge number {gauge_num}")
#             directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_30mins/{dataset_name}/{gauge_num}/WholeYear"
#             # Move files to extra directoy incase deleting part is wrong
#             create_copy_of_files(directory_path, gauge_num)
#             dataframes = load_csv_files(directory_path)
#             print(len(dataframes))
#             # Compare the 'rolling_sum' columns
#             compare_and_delete_duplicates(dataframes)
#             dataframes = load_csv_files(directory_path)
#             print(len(dataframes))

### 5 minute NIMROD

- filtered_100 = 0-200, 200-350, 350-1200, 1200-1294  
- filtered_300 = 0-200,  200-350, 350-1294
- unfiltered = 0-200, 200-350, 350-1294

In [17]:
# Same back-up-then-deduplicate procedure as above, applied to the 5 minute
# NIMROD event files.
for dataset_name in ['NIMROD_1km_filtered_100']:
    for gauge_num in range(1200,1294):
        if gauge_num in [444, 827, 888]:
            continue
        print(f"Gauge number {gauge_num}")
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_5mins/{dataset_name}/{gauge_num}/WholeYear"
        # Copy the files to an extra directory first, in case the deleting
        # part is wrong
        create_copy_of_files(directory_path, gauge_num)
        # Number of CSV files before deletion
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))
        # Compare the 'precipitation (mm)' columns and delete the duplicates
        compare_and_delete_duplicates(dataframes)
        # Number of CSV files after deletion
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))