### Identify events which appear twice and delete them

In [2]:
import os
import hashlib
import pandas as pd
import numpy as np
import shutil

In [3]:
def create_copy_of_files(current_directory, gauge_num):
    """Back up the regular files of ``current_directory`` into ``EventSet``.

    An ``EventSet`` subdirectory is created inside ``current_directory`` (if it
    is not already there) and every regular file found directly in
    ``current_directory`` is copied into it under the same name. Existing
    copies with the same name are overwritten by ``shutil.copy``.
    Subdirectories (including ``EventSet`` itself) are not copied.

    Parameters
    ----------
    current_directory : str
        Directory whose files should be backed up.
    gauge_num : int
        Accepted for the callers' convenience; not used by this function.
    """
    backup_directory = current_directory + "/EventSet"

    # exist_ok=True makes repeated runs on the same directory harmless.
    os.makedirs(backup_directory, exist_ok=True)

    for entry in os.listdir(current_directory):
        origin = os.path.join(current_directory, entry)
        if not os.path.isfile(origin):
            # Skip subdirectories such as the EventSet folder itself.
            continue
        shutil.copy(origin, os.path.join(backup_directory, entry))

def load_csv_files(directory):
    """Read the 'precipitation (mm)' column of every CSV file in ``directory``.

    Parameters
    ----------
    directory : str
        Directory to scan (non-recursively) for names ending in ``.csv``.

    Returns
    -------
    dict
        Maps each CSV file name to the pandas Series holding its
        'precipitation (mm)' column. CSV files without that column are
        left out of the result.
    """
    precipitation_by_file = {}

    for name in os.listdir(directory):
        if not name.endswith('.csv'):
            continue
        table = pd.read_csv(os.path.join(directory, name))
        if 'precipitation (mm)' in table.columns:
            precipitation_by_file[name] = table['precipitation (mm)']

    return precipitation_by_file

def compare_and_delete_duplicates(dataframes, directory=None):
    """Delete files whose series is identical to that of an earlier file.

    The values in ``dataframes`` are compared pairwise with ``.equals``. For
    each group of identical series the first file (in dictionary order) is
    kept and every later one is removed from disk.

    Parameters
    ----------
    dataframes : dict
        Maps file name to the pandas object to compare (as returned by
        ``load_csv_files``).
    directory : str, optional
        Directory that contains the files named in ``dataframes``. When
        omitted, the module-level ``directory_path`` variable is used, which
        is what this function always relied on before the parameter existed.

    Raises
    ------
    NameError
        If duplicates are found, ``directory`` is omitted and no global
        ``directory_path`` has been defined.
    OSError
        If a duplicate file cannot be removed.
    """
    filenames = list(dataframes)
    duplicates = set()

    for i, file1 in enumerate(filenames):
        if file1 in duplicates:
            # Anything equal to file1 was already matched against the earlier
            # file that file1 duplicates, so comparing again is redundant.
            continue
        for file2 in filenames[i + 1:]:
            if file2 in duplicates:
                continue
            if dataframes[file1].equals(dataframes[file2]):
                duplicates.add(file2)  # keep the first file, drop the later one

    if not duplicates:
        return

    if directory is None:
        # Backward compatibility: fall back to the notebook-level global that
        # existing one-argument callers set before calling this function.
        directory = directory_path

    # Delete the duplicate files
    for file in duplicates:
        os.remove(os.path.join(directory, file))

def compare_columns(dataframes):
    """Print every pair of files whose stored series are identical.

    Each unordered pair of entries in ``dataframes`` is compared once with
    ``.equals``; nothing is deleted or returned.

    Parameters
    ----------
    dataframes : dict
        Maps file name to the pandas object to compare.

    NOTE(review): the printed message names the 'rolling_sum' column, while
    the comparison is on whatever series the caller stored in ``dataframes``
    -- confirm the wording matches the loaded column.
    """
    names = list(dataframes)

    for position, first in enumerate(names):
        for second in names[position + 1:]:
            if not dataframes[first].equals(dataframes[second]):
                continue
            print(f"The 'rolling_sum' column in {first} is the same as in {second}")

### UKCP18 (done for bc005, started for bc009 but not many profiles actually made yet)

In [4]:
# Path components substituted into the UKCP18 directory templates used by the
# cells below (presumably the ensemble member id and the time slice -- confirm).
em, timeperiod = 'bb189', 'Future'

#### Check if there are enough files for each gauge

In [5]:
# for gauge_num in range(0,1294):
#     if gauge_num not in [444, 827, 888]:
#         directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/"
#         file_count = len([
#             name for name in os.listdir(directory_path)
#             if os.path.isfile(os.path.join(directory_path, name)) and 'part0' in name])
#         if file_count == 133:
#             pass
#         else:
#             print(f"Gauge number {gauge_num}: Not as expected, {file_count}")

#### Check if already did the deleting

In [6]:
# for gauge_num in range(0,1294):
#     if gauge_num not in [444, 827, 888]:
#         directory_path_eventset = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/Present/{em}/{gauge_num}/WholeYear/EventSet/"
#         directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/Present/{em}/{gauge_num}/WholeYear/"
#         file_count_eventset = len([
#             name for name in os.listdir(directory_path_eventset)
#             if os.path.isfile(os.path.join(directory_path_eventset, name)) and 'part0' in name])
#         file_count = len([
#             name for name in os.listdir(directory_path)
#             if os.path.isfile(os.path.join(directory_path, name)) and 'part0' in name])
#         if file_count == file_count_eventset:
#             print(f"Gauge {gauge_num}: files not already deleted")

#### Do the deleting

In [10]:
# For each gauge: back up its event files, then delete files whose
# precipitation series duplicates that of another file.
for gauge_num in range(1008,1010):
    # Gauges 444, 827 and 888 are skipped in every cell of this notebook.
    if gauge_num in (444, 827, 888):
        continue
    print(gauge_num)
    directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{timeperiod}/{em}/{gauge_num}/WholeYear/"
    # Copy the files to an EventSet subdirectory first, in case the deletion step is wrong
    create_copy_of_files(directory_path, gauge_num)
    dataframes = load_csv_files(directory_path)
    print(f"gauge {gauge_num}, pre-deletion {len(dataframes)} files exist")
    # Delete files with identical precipitation series.
    # NOTE: compare_and_delete_duplicates reads the global directory_path set above.
    compare_and_delete_duplicates(dataframes)
    # Reload to count what is left on disk after the deletion.
    dataframes = load_csv_files(directory_path)
    print(f"gauge {gauge_num}, post-deletion {len(dataframes)} files exist")

1008
gauge 1008, pre-deletion 94 files exist
gauge 1008, pre-deletion 94 files exist
1009
gauge 1009, pre-deletion 83 files exist
gauge 1009, pre-deletion 83 files exist


### 30 minute NIMROD (Done)

In [12]:
# 30-minute NIMROD: back up each gauge's event files, then remove files whose
# precipitation series duplicates that of another file. The file count is
# printed before and after the deletion.
for dataset_name in ['NIMROD_2.2km_filtered_100']:
    for gauge_num in range(1007,1010):
        # Gauges 444, 827 and 888 are skipped in every cell of this notebook.
        if gauge_num in (444, 827, 888):
            continue
        print(f"Gauge number {gauge_num}")
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_30mins/{dataset_name}/{gauge_num}/WholeYear"

        # Copy the files to an EventSet subdirectory in case the deletion step is wrong
        create_copy_of_files(directory_path, gauge_num)

        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

        # NOTE: compare_and_delete_duplicates reads the global directory_path set above.
        compare_and_delete_duplicates(dataframes)

        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

Gauge number 1007
107
71
Gauge number 1008
61
61
Gauge number 1009
59
59


### 5 minute NIMROD

- filtered_100 = 0-200, 200-350, 350-1200, 1200-1294  
- filtered_300 = 0-200,  200-350, 350-1294
- unfiltered = 0-200, 200-350, 350-1294 

In [17]:
# 5-minute NIMROD: back up each gauge's event files, then remove files whose
# precipitation series duplicates that of another file. The file count is
# printed before and after the deletion.
for dataset_name in ['NIMROD_1km_filtered_100']:
    for gauge_num in range(1200,1294):
        # Gauges 444, 827 and 888 are skipped in every cell of this notebook.
        if gauge_num in (444, 827, 888):
            continue
        print(f"Gauge number {gauge_num}")
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_5mins/{dataset_name}/{gauge_num}/WholeYear"

        # Copy the files to an EventSet subdirectory in case the deletion step is wrong
        create_copy_of_files(directory_path, gauge_num)

        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

        # NOTE: compare_and_delete_duplicates reads the global directory_path set above.
        compare_and_delete_duplicates(dataframes)

        dataframes = load_csv_files(directory_path)
        print(len(dataframes))