### Identify events which appear twice and delete them

In [2]:
import os
import hashlib
import pandas as pd
import numpy as np
import shutil

In [17]:
def copy_files(em, gauge_num, option_num):
    # path to source directory
    src_dir = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{em}/{gauge_num}/Option{option_num}/" 
    # path to destination directory
    dest_dir = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{em}/{gauge_num}/Option{option_num}/EventSet/" 

    # getting all the files in the source directory
    files = os.listdir(src_dir)
    
    if not os.path.isdir(dest_dir):
        shutil.copytree(src_dir, dest_dir)
        
        
def copy_files_nimrod(gauge_num, filtering_option):
    # path to source directory
    src_dir = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD/{filtering_option}/{gauge_num}/" 
    # path to destination directory
    dest_dir = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD/{filtering_option}/{gauge_num}/EventSet/" 

    # getting all the files in the source directory
    files = os.listdir(src_dir)
    
    if not os.path.isdir(dest_dir):
        shutil.copytree(src_dir, dest_dir)        

def hash_dataframe(filepath, columns_to_exclude=None):
    """
    Generate a hash for a DataFrame loaded from a CSV file,
    excluding a specified column if provided.
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(filepath)
    
    # Drop the specified column if it exists
    for column_to_exclude in columns_to_exclude:
        if column_to_exclude and column_to_exclude in df.columns:
            df = df.drop(columns=[column_to_exclude])
    
    # Convert DataFrame to a binary format using 'to_records' which includes data types in the hash
    data = df.to_records(index=False)
    return hashlib.sha256(data.tobytes()).hexdigest()

def find_and_remove_duplicates(directory, column_to_exclude=None):
    """
    Find and remove duplicate CSV files in a given directory,
    excluding comparison on a specified column.
    """
    hashes = {}
    files = [f for f in os.listdir(directory) if f.endswith('.csv')]
    files = np.sort(files)
    print(len(files))
    
    for filename in files:
        filepath = os.path.join(directory, filename)
        file_hash = hash_dataframe(filepath, column_to_exclude)
        
        if file_hash in hashes:
            # If hash is already in the dictionary, delete this file
            os.remove(filepath)
            print(f"Deleted duplicate file: {filepath}")
        else:
            # Otherwise, add the hash and file path to the dictionary
            hashes[file_hash] = filepath
            
    files = [f for f in os.listdir(directory) if f.endswith('.csv')]
    files = np.sort(files)
    print(len(files))            

### NIMROD regridded data

In [18]:
for dir_name in ["NIMROD_2.2km_unfiltered"]:
    for gauge_num in range(12, 20):
        if gauge_num not in [423, 444, 827, 888]:
            print(gauge_num)
            copy_files_nimrod(gauge_num, filtering_option = dir_name)
            directory2 = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD/{dir_name}/{gauge_num}/" 
            column_to_exclude = 'Rolling_Sum'  # Change this to the column you want to exclude
            find_and_remove_duplicates(directory2, column_to_exclude)

12
110
110
13
108
108
14
107
107
15
112
112
16
113
113
17
112
112
18
117
117
19
113
113


### NIMROD data

In [84]:
# gauge_nums = list(range(0,25))
# gauge_nums.extend(list(range(50,54)))
# gauge_nums

# for dir_name in ["NIMROD_1km_filtered_100", "NIMROD_1km_filtered_300", "NIMROD_1km_unfiltered"]:
#     for gauge_num in gauge_nums:
#         print(gauge_num)
#         copy_files(em, gauge_num, option_num =2)
#         directory2 = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD/{dir_name}/{gauge_num}/" 
#         column_to_exclude = 'Rolling_Sum'  # Change this to the column you want to exclude
#         find_and_remove_duplicates(directory2, column_to_exclude)

In [16]:
def is_sequential(numbers):
    return all(numbers[i] - numbers[i - 1] == 1 for i in range(1, len(numbers)))

# Loop through gauges
for gauge_num in [37]:
    
    for em in ['bc005']:
        directory = f"../../ProcessedData/IndependentEvents/UKCP18_30mins/{em}/{gauge_num}/Option2"
        files = [f for f in os.listdir(directory) if f.endswith('.csv')]
        files = np.sort(files)
        for file in files:
            test = pd.read_csv(directory+ '/'+ file)
            if test['Unnamed: 0'].is_monotonic_increasing ==False:
                print(f"monotonic: {file}")
            if not is_sequential(test['Unnamed: 0']):
                print(f"not sequential {file}")


In [40]:
# Usage
for gauge_num in range(1000,1200):
    print(gauge_num)
    em = 'bc005'
    copy_files(em, gauge_num, option_num =2)
#     directory2 = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{em}/{gauge_num}/Option2/EventSet/" 
#     column_to_exclude = 'Rolling_Sum'  # Change this to the column you want to exclude
#     find_and_remove_duplicates(directory2, column_to_exclude)

1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161


FileNotFoundError: [Errno 2] No such file or directory: '/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/bc005/1161/Option2/'

In [89]:
def copy_files(src_dir):
    # path to destination directory
    dest_dir =directory_path + 'EventSet/'
    # getting all the files in the source directory
    files = os.listdir(src_dir)
    
    if not os.path.isdir(dest_dir):
        shutil.copytree(src_dir, dest_dir)

def load_csv_files(directory):
    csv_files = [f for f in os.listdir(directory) if f.endswith('.csv')]
    dataframes = {}
    
    for csv_file in csv_files:
        file_path = os.path.join(directory, csv_file)
        df = pd.read_csv(file_path)
        if 'precipitation (mm)' in df.columns:
            dataframes[csv_file] = df['precipitation (mm)']
    
    return dataframes

def compare_and_delete_duplicates(dataframes):
    filenames = list(dataframes.keys())
    num_files = len(filenames)
    duplicates = set()

    for i in range(num_files):
        for j in range(i + 1, num_files):
            file1, file2 = filenames[i], filenames[j]
            if dataframes[file1].equals(dataframes[file2]):
                duplicates.add(file2)  # Add the second file to the set of duplicates
                # print(f"The 'rolling_sum' column in {file1} is the same as in {file2}. Deleting {file2}.")

    # Delete the duplicate files
    for file in duplicates:
        os.remove(os.path.join(directory_path, file))
        # print(f"Deleted file: {file}")

def compare_columns(dataframes):
    filenames = list(dataframes.keys())
    num_files = len(filenames)
    
    for i in range(num_files):
        for j in range(i + 1, num_files):
            file1, file2 = filenames[i], filenames[j]
            if dataframes[file1].equals(dataframes[file2]):
                print(f"The 'rolling_sum' column in {file1} is the same as in {file2}")       

In [90]:
for gauge_num in range(141,1300):
    print(f"Gauge number {gauge_num}")
    directory_path = f"/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD/NIMROD_1km_filtered_100/{gauge_num}/"
    # Move files to extra directoy incase deleting part is wrong
    copy_files(directory_path)
    dataframes = load_csv_files(directory_path)
    print(len(dataframes))
    # Compare the 'rolling_sum' columns
    compare_and_delete_duplicates(dataframes)
    dataframes = load_csv_files(directory_path)
    print(len(dataframes))

Gauge number 141
68
68
Gauge number 142
108
67
Gauge number 143
108
78
Gauge number 144
110
84
Gauge number 145
109
76
Gauge number 146
108
88
Gauge number 147
111
76
Gauge number 148
111
74
Gauge number 149
107
72
Gauge number 150
107
76
Gauge number 151
108
65
Gauge number 152
105
87
Gauge number 153
107
79
Gauge number 154
108
73
Gauge number 155
113
87
Gauge number 156
113
88
Gauge number 157
107
78
Gauge number 158
108
80
Gauge number 159
109
80
Gauge number 160
113
81
Gauge number 161
110
77
Gauge number 162
111
83
Gauge number 163
109
79
Gauge number 164
108
77
Gauge number 165
108
76
Gauge number 166
108
75
Gauge number 167
111
76
Gauge number 168
109
78
Gauge number 169
109
80
Gauge number 170
106
67
Gauge number 171
115
92
Gauge number 172
107
75
Gauge number 173
109
79
Gauge number 174
107
83
Gauge number 175
108
81
Gauge number 176
109
83
Gauge number 177
107
74
Gauge number 178
108
79
Gauge number 179
109
84
Gauge number 180
111
83
Gauge number 181
107
70
Gauge number 182


50
39
Gauge number 491
52
36
Gauge number 492
51
35
Gauge number 493
51
36
Gauge number 494
49
33
Gauge number 495
50
35
Gauge number 496
50
33
Gauge number 497
49
27
Gauge number 498
50
34
Gauge number 499
50
37
Gauge number 500
53
36
Gauge number 501
51
34
Gauge number 502
49
38
Gauge number 503
50
34
Gauge number 504
49
37
Gauge number 505
49
37
Gauge number 506
51
34
Gauge number 507
54
36
Gauge number 508
52
35
Gauge number 509
52
39
Gauge number 510
53
41
Gauge number 511
52
37
Gauge number 512
54
35
Gauge number 513
52
34
Gauge number 514
53
37
Gauge number 515
52
42
Gauge number 516
52
35
Gauge number 517
52
36
Gauge number 518
50
37
Gauge number 519
55
37
Gauge number 520
49
36
Gauge number 521
52
35
Gauge number 522
54
41
Gauge number 523
51
30
Gauge number 524
53
34
Gauge number 525
55
39
Gauge number 526
53
38
Gauge number 527
53
37
Gauge number 528
50
39
Gauge number 529
49
35
Gauge number 530
49
39
Gauge number 531
53
33
Gauge number 532
52
42
Gauge number 533
53
38
Gauge

FileNotFoundError: [Errno 2] No such file or directory: '/nfs/a319/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD/NIMROD_1km_filtered_100/800/'