# Split data to train, validate and test subsets

Random split: 60% for training, 20% for validation, 20% for testing

Temporal split: 2019-2020 for training, 2021-2023 for testing

Input: 
- `../data/preprocessed_data/Atlantic_2019_03/normalized`

Output: 
- `../data/randomsplit/train`
- `../data/randomsplit/val`
- `../data/randomsplit/test`

In [22]:
import pandas as pd
import numpy as np
import os
import csv
from datetime import datetime 

In [23]:
# Directory holding the normalized input CSV files.
input_dir = '../data/preprocessed_data/Atlantic_2019_03/normalized/'

# Destination directories for the three random-split subsets.
random_train_dir = "../data/randomsplit/train"
random_val_dir = "../data/randomsplit/val"
random_test_dir = "../data/randomsplit/test"

# Full path of every CSV file found directly inside the input directory.
file_paths = [
    os.path.join(input_dir, entry)
    for entry in os.listdir(input_dir)
    if entry.endswith('.csv')
]

# Create the output directories; already-existing ones are not an error.
os.makedirs(random_train_dir, exist_ok=True)
os.makedirs(random_val_dir, exist_ok=True)
os.makedirs(random_test_dir, exist_ok=True)


In [24]:
# Split the data and make sure the proportion in train and test set are equal
def getRandomSplit(data, size):
    """
    Randomly split a DataFrame in two, stratified on its last column.

    Args:
        data (pd.DataFrame): The input DataFrame; its last column is used as
            the stratification target.
        size (float): Passed to the splitter as ``test_size``.

    Returns:
        pd.DataFrame, pd.DataFrame: The training rows and the test rows.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    # Fixed random_state so repeated runs produce the same partition.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=size, random_state=42)
    strat_target = data.iloc[:, -1]

    # n_splits is 1, so the splitter yields exactly one (train, test) index pair.
    train_idx, test_idx = next(splitter.split(data, strat_target))
    return data.iloc[train_idx, :], data.iloc[test_idx, :]

In [33]:
def getTemporalSplit(df, split_date): 
    """
    Split a DataFrame into a training set and a test set based on a split date.

    The input DataFrame is left untouched: the 'Date' column is parsed on a
    new frame, so the caller's column keeps its original dtype.

    Args:
        df (pd.DataFrame): The input DataFrame. Must contain a 'Date' column
            that ``pd.to_datetime`` can parse.
        split_date (str): The date used to split the DataFrame.

    Returns:
        pd.DataFrame, pd.DataFrame: The training set (rows dated strictly
        before ``split_date``) and the test set (rows dated on or after it).
        In both, 'Date' holds parsed datetimes. Rows whose date is missing
        satisfy neither comparison and appear in neither set.
    """
    # Parse 'Date' on a new frame instead of overwriting the caller's column.
    dated = df.assign(Date=pd.to_datetime(df['Date']))

    # Convert split_date to a Timestamp so it compares against the parsed column.
    cutoff = pd.Timestamp(split_date)

    train_set = dated[dated['Date'] < cutoff]
    test_set = dated[dated['Date'] >= cutoff]

    return train_set, test_set
    

In [34]:
#split the last 10% data 

# window
def sliding_window(data, window_size,anomaly_ratio):
    """
    Scan backwards for a window of rows whose anomaly percentage is near a target.

    The first window ends at the last row; each following candidate ends 10
    rows earlier. A window is accepted when the percentage of its rows with
    ``Label == 1`` lies within 80%-120% of ``anomaly_ratio``.

    Args:
        data (pd.DataFrame): Rows to scan; must contain a 'Label' column.
        window_size (int): Number of rows in each candidate window.
        anomaly_ratio (float): Target anomaly percentage (0-100 scale).

    Returns:
        pd.DataFrame or None: The first accepted window, or None when no
        candidate window qualifies.
    """
    # Position of the last row of the current candidate window.
    last_pos = len(data) - 1
    while last_pos > window_size - 2:
        first_pos = last_pos - window_size + 1
        window = data[first_pos:last_pos + 1]
        anomaly_count = (window["Label"] == 1).sum()
        ratio = anomaly_count / window_size*100
        if anomaly_ratio*0.8 <= ratio <= anomaly_ratio*1.2:
            return window
        # Slide the window 10 rows towards the start of the data.
        last_pos -= 10
    return None


def getLastSplit(data, size,anomaly_ratio):
    """
    Split chronologically: a late window of rows becomes the test set.

    The rows are ordered by 'Datetime', then ``sliding_window`` looks (from
    the end backwards) for a window of ``int(size * len(data))`` rows whose
    anomaly percentage is close to ``anomaly_ratio``. That window is the test
    set; every row whose 'Datetime' is strictly earlier than the window's
    first row is the training set.

    The input DataFrame is not modified; sorting happens on a new frame.

    Args:
        data (pd.DataFrame): Input rows; must contain 'Datetime' and 'Label'.
        size (float): Fraction of the rows to use as the test window.
        anomaly_ratio (float): Target anomaly percentage for the test window.

    Returns:
        pd.DataFrame, pd.DataFrame: The training set and the test set.

    Raises:
        ValueError: If no window with a suitable anomaly percentage exists.
    """
    # Sort into a new frame so the caller's DataFrame keeps its row order.
    data = data.sort_values("Datetime", ascending=True)
    window_size = int(size*len(data))
    test_set = sliding_window(data, window_size,anomaly_ratio)
    if test_set is None:
        print("no test")
        # Previously execution fell through and failed with an opaque
        # AttributeError on None; fail with an explicit message instead.
        raise ValueError(
            "no test window of %d rows has an anomaly ratio within 20%% of %s"
            % (window_size, anomaly_ratio)
        )
    # Read the boundary from the 'Datetime' column by name rather than
    # assuming it is the first column of the frame.
    first_test_time = test_set["Datetime"].iloc[0]
    train_set = data[data['Datetime'] < first_test_time]
    print(len(train_set),len(test_set))
    return train_set,test_set
# windows keep the same rate of anomaly

In [35]:
# Compute data error rate
def comp_error_ratio(dataset):
    """
    Compute the percentage of rows labelled as errors (``Label == 1``).

    Args:
        dataset (pd.DataFrame): Rows to inspect; must contain a 'Label' column.

    Returns:
        float: The percentage (0-100) rounded to two decimals. An empty
        dataset yields 0.0 instead of raising ZeroDivisionError.
    """
    total = len(dataset)
    if total == 0:
        # An empty subset has no errors to report.
        return 0.0
    flagged = int((dataset['Label'] == 1).sum())
    return round(flagged / total * 100, 2)

### Get random split

60-20-20 Random split:
- Training Set: 60% of the data
- Validation Set: 20% of the data
- Test Set: 20% of the data


In [40]:
# Only CSV files are inputs. os.listdir returns every entry in the directory,
# and anything that is not a CSV would make pd.read_csv fail.
files = [f for f in os.listdir(input_dir) if f.endswith('.csv')]


for file_name in files:
    print(f'-------- {file_name} ---------')
    file_path = os.path.join(input_dir, file_name)
    data = pd.read_csv(file_path)

    # ------ Random split
    print('Random split:')
    # Hold out 40% of the rows, then halve the held-out part: 60 / 20 / 20.
    train_set1, temp = getRandomSplit(data, 0.4)
    val_set1, test_set1 = getRandomSplit(temp, 0.5)

    # Each rate is computed from the subset it is reported for
    # (train and test were previously swapped).
    train_rate1 = comp_error_ratio(train_set1)
    val_rate1 = comp_error_ratio(val_set1)
    test_rate1 = comp_error_ratio(test_set1)

    print(f"Train error rate: {train_rate1}%")
    print(f"Validate error rate: {val_rate1}%")
    print(f"Test error rate: {test_rate1}%")

    print("Train set size: ",len(train_set1))
    print("Validate set size: ",len(val_set1))
    print("Test set size: ",len(test_set1))
    print("\n")

    # Each subset is written under the same file name in its own directory.
    train_set1.to_csv(os.path.join(random_train_dir, file_name), index=False)
    val_set1.to_csv(os.path.join(random_val_dir, file_name), index=False)
    test_set1.to_csv(os.path.join(random_test_dir, file_name), index=False)


-------- PR_PF_4903220.csv ---------
Random split:
Train error rate: 0.16%
Validate error rate: 0.16%
Test error rate: 0.16%
Train set size:  181009
Validate set size:  60337
Test set size:  60337


Temporal split:
Train error rate: 0.04%
Test error rate: 0.33%
Train set size:  125733
Test set size:  175950


-------- PR_PF_4903217.csv ---------
Random split:
Train error rate: 33.72%
Validate error rate: 33.72%
Test error rate: 33.72%
Train set size:  179539
Validate set size:  59847
Test set size:  59847


Temporal split:
Train error rate: 57.65%
Test error rate: 0.03%
Train set size:  124294
Test set size:  174939


-------- PR_PF_4903058.csv ---------
Random split:
Train error rate: 42.02%
Validate error rate: 42.02%
Test error rate: 42.02%
Train set size:  184254
Validate set size:  61418
Test set size:  61418


Temporal split:
Train error rate: 71.46%
Test error rate: 0.73%
Train set size:  127822
Test set size:  179268


-------- PR_PF_4903218.csv ---------
Random split:
Train er

In [37]:
# NOTE(review): train_set2 is not assigned in any cell visible here; it is
# presumably the training half of a temporal split - confirm it is defined
# before this cell so the notebook survives Restart & Run All.
train_set2

Unnamed: 0,ID,Date,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
0,1,2019-03-07,-1.704009,-2.107283,-2.423796,-0.893563,1.008039,0.978781,0
1,2,2019-03-07,-1.704009,-2.107283,-2.423796,-0.892623,1.031669,0.979516,0
2,3,2019-03-07,-1.704009,-2.107283,-2.423796,-0.891120,1.044469,0.979516,0
3,4,2019-03-07,-1.704009,-2.107283,-2.423796,-0.889554,1.034623,0.979149,0
4,5,2019-03-07,-1.704009,-2.107283,-2.423796,-0.887988,1.033639,0.979271,0
...,...,...,...,...,...,...,...,...,...
125728,125729,2020-12-24,-0.303387,0.447158,0.377426,-0.738406,1.198069,0.930368,0
125729,125730,2020-12-24,-0.303387,0.447158,0.377426,-0.738218,1.198069,0.930368,0
125730,125731,2020-12-24,-0.303387,0.447158,0.377426,-0.738030,1.197085,0.930491,0
125731,125732,2020-12-24,-0.303387,0.447158,0.377426,-0.737842,1.198069,0.930368,0


In [38]:
# NOTE(review): test_set2 is not assigned in any cell visible here; it is
# presumably the test half of a temporal split - confirm it is defined
# before this cell so the notebook survives Restart & Run All.
test_set2

Unnamed: 0,ID,Date,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
125733,125734,2021-01-03,-0.282101,0.254899,0.455253,-0.892686,1.075977,0.845063,0
125734,125735,2021-01-03,-0.282101,0.254899,0.455253,-0.891120,1.075977,0.845063,0
125735,125736,2021-01-03,-0.282101,0.254899,0.455253,-0.889554,1.075977,0.845063,0
125736,125737,2021-01-03,-0.282101,0.254899,0.455253,-0.887988,1.075977,0.844573,0
125737,125738,2021-01-03,-0.282101,0.254899,0.455253,-0.886422,1.075977,0.844941,0
...,...,...,...,...,...,...,...,...,...
301678,301679,2023-08-01,1.718787,0.776962,0.512307,-0.738531,0.594500,0.619544,0
301679,301680,2023-08-01,1.718787,0.776962,0.512307,-0.738281,0.596470,0.618686,0
301680,301681,2023-08-01,1.718787,0.776962,0.512307,-0.738156,0.596470,0.616971,0
301681,301682,2023-08-01,1.718787,0.776962,0.512307,-0.737968,0.592531,0.616235,0
