### Identify events which appear twice and delete them

In [6]:
import os
import hashlib
import pandas as pd
import numpy as np
import shutil

In [7]:
def create_copy_of_files(current_directory, gauge_num):
    """Back up every regular file in ``current_directory`` into an ``EventSet`` subfolder.

    The backup folder is ``current_directory + "/EventSet"`` and is created if
    it does not already exist. Only entries for which ``os.path.isfile`` is true
    are copied, so sub-directories (including ``EventSet`` itself) are skipped.
    A confirmation message is printed once all copies have been made.

    Parameters
    ----------
    current_directory : str
        Directory whose files should be backed up.
    gauge_num : int
        Not used by this function; kept so existing call sites keep working.
    """
    backup_dir = current_directory + "/EventSet"

    # exist_ok=True makes it safe to re-run when the backup folder already exists
    os.makedirs(backup_dir, exist_ok=True)

    for entry in os.listdir(current_directory):
        src = os.path.join(current_directory, entry)
        # Skip anything that is not a regular file (e.g. the EventSet folder)
        if not os.path.isfile(src):
            continue
        shutil.copy(src, os.path.join(backup_dir, entry))

    print(f"All files have been copied to {backup_dir}.")

def load_csv_files(directory):
    """Load the 'precipitation (mm)' column from every ``.csv`` file in ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan. Only names ending in ``.csv`` are read.

    Returns
    -------
    dict
        Maps each CSV filename to the pandas Series held in its
        'precipitation (mm)' column. Files without that column are omitted.
        Keys are inserted in sorted filename order.
    """
    # os.listdir returns names in arbitrary order. Sorting makes the dict's
    # insertion order reproducible, which matters because
    # compare_and_delete_duplicates keeps the first file of each duplicate
    # group and deletes the later ones.
    csv_files = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))
    dataframes = {}

    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(directory, csv_file))
        if 'precipitation (mm)' in df.columns:
            dataframes[csv_file] = df['precipitation (mm)']

    return dataframes

def compare_and_delete_duplicates(dataframes, directory=None):
    """Delete files whose series is identical to that of an earlier file.

    Files are compared pairwise with ``Series.equals`` in the iteration order
    of ``dataframes``. For each group of identical series the first file is
    kept and every later one is removed from disk.

    Parameters
    ----------
    dataframes : dict
        Maps filename to the pandas Series to compare (as returned by
        ``load_csv_files``).
    directory : str, optional
        Folder containing the files to delete. When omitted, the module-level
        name ``directory_path`` is used, which is how existing callers pass
        the location.

    Returns
    -------
    list
        Filenames that were deleted, in the order they appear in ``dataframes``.

    Raises
    ------
    NameError
        If duplicates were found, ``directory`` was not given and no global
        ``directory_path`` exists.
    """
    filenames = list(dataframes.keys())
    duplicates = set()

    for i, file1 in enumerate(filenames):
        # A file already marked as a duplicate equals some earlier file, and
        # everything equal to it has already been matched against that file.
        if file1 in duplicates:
            continue
        for file2 in filenames[i + 1:]:
            if file2 not in duplicates and dataframes[file1].equals(dataframes[file2]):
                duplicates.add(file2)  # keep file1, drop the later copy

    deleted = [name for name in filenames if name in duplicates]

    if deleted and directory is None:
        # Legacy fallback: callers that pass only `dataframes` rely on the
        # notebook-level variable set by the driver loop.
        directory = directory_path

    for file in deleted:
        os.remove(os.path.join(directory, file))

    return deleted

def compare_columns(dataframes):
    """Report every pair of files whose series are identical, without deleting anything.

    Each unordered pair of keys in ``dataframes`` is compared once with
    ``Series.equals``; a line is printed for every matching pair.

    Note: the printed message always says 'rolling_sum', whatever column the
    supplied series actually came from.

    Parameters
    ----------
    dataframes : dict
        Maps filename to the pandas Series to compare.
    """
    names = list(dataframes.keys())

    for position, first in enumerate(names):
        # Only look at later entries so each pair is checked exactly once
        for second in names[position + 1:]:
            if not dataframes[first].equals(dataframes[second]):
                continue
            print(f"The 'rolling_sum' column in {first} is the same as in {second}")

### UKCP18 (done for bc005; started for bc009, but not many profiles have actually been made yet)

In [7]:
em = 'bb189'
for gauge_num in range(1293):
    # Gauges 444, 827 and 888 are excluded from processing
    if gauge_num in (444, 827, 888):
        continue

    print(f"Gauge number {gauge_num}")
    # Must be called `directory_path`: compare_and_delete_duplicates reads this
    # module-level name to locate the files it deletes.
    directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/UKCP18_30mins/{em}/{gauge_num}/WholeYear/"

    # Copy the files into an EventSet subdirectory first, in case the deletion step is wrong
    create_copy_of_files(directory_path, gauge_num)

    # Number of CSV files with a 'precipitation (mm)' column before de-duplication
    dataframes = load_csv_files(directory_path)
    print(len(dataframes))

    # Delete files whose 'precipitation (mm)' series matches that of another file
    compare_and_delete_duplicates(dataframes)

    # Reload to report how many files remain
    dataframes = load_csv_files(directory_path)
    print(len(dataframes))

### 30 minute NIMROD (Done)

In [14]:
for dataset_name in ['NIMROD_2.2km_filtered_100']:
    for gauge_num in range(1200, 1293):
        # Gauges 444, 827 and 888 are excluded from processing
        if gauge_num in (444, 827, 888):
            continue

        print(f"Gauge number {gauge_num}")
        # Must be called `directory_path`: compare_and_delete_duplicates reads
        # this module-level name to locate the files it deletes.
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_30mins/{dataset_name}/{gauge_num}/WholeYear"

        # Copy the files into an EventSet subdirectory first, in case the deletion step is wrong
        create_copy_of_files(directory_path, gauge_num)

        # Number of CSV files with a 'precipitation (mm)' column before de-duplication
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

        # Delete files whose 'precipitation (mm)' series matches that of another file
        compare_and_delete_duplicates(dataframes)

        # Reload to report how many files remain
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

### 5 minute NIMROD
Ran this for gauges 0-200 for all 3 datasets; now running for gauges 350-1200 for just filtered_100.

In [9]:
for dataset_name in ['NIMROD_1km_filtered_100']:
    for gauge_num in range(350, 1201):
        # Gauges 444, 827 and 888 are excluded from processing
        if gauge_num in (444, 827, 888):
            continue

        print(f"Gauge number {gauge_num}")
        # Must be called `directory_path`: compare_and_delete_duplicates reads
        # this module-level name to locate the files it deletes.
        directory_path = f"/nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_5mins/{dataset_name}/{gauge_num}/WholeYear"

        # Copy the files into an EventSet subdirectory first, in case the deletion step is wrong
        create_copy_of_files(directory_path, gauge_num)

        # Number of CSV files with a 'precipitation (mm)' column before de-duplication
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

        # Delete files whose 'precipitation (mm)' series matches that of another file
        compare_and_delete_duplicates(dataframes)

        # Reload to report how many files remain
        dataframes = load_csv_files(directory_path)
        print(len(dataframes))

Gauge number 1200
All files have been copied to /nfs/a161/gy17m2a/PhD/ProcessedData/IndependentEvents/NIMROD_5mins/NIMROD_1km_filtered_100/1200/WholeYear/EventSet.
106
60
