In [17]:
import os
import glob
import pandas as pd
import numpy as np
##rename to Temporal Split

# Base directory
datadir = "/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove"
sites = ['cheeca_flkeys','lbcaye_bbr', 'sanagustin_mexico','northpoint_lizard']
#sample_sites= for each site 1 timestamp picked at random

# Variables to store file paths and their labels
filepaths = []

# Collect filepaths from every image that we have
for site in sites:    
    # Get all bleached and healthy files for the current site
    filepaths_site = glob.glob(os.path.join(datadir, site, '**/*.tif'), recursive=True)
    # Add filepaths
    filepaths.extend(filepaths_site)

# Check if files were collected
assert filepaths, "No files found. Please check your directory structure and file paths."

# Create a DataFrame
df = pd.DataFrame({
    'filepath': filepaths,
})

# Extract components from the file path
df['site'] = df['filepath'].apply(lambda x: x.split('/')[6])  # Extract site
df['label'] = df['filepath'].apply(lambda x: x.split('/')[7])  # Extract label (bleached/healthy)
df['date'] = df['filepath'].apply(lambda x: x.split('/')[9])  # Extract date
df['filename'] = df['filepath'].apply(lambda x: os.path.basename(x))  # Extract filename

# Rearrange the columns
df = df[['site', 'label', 'date', 'filename', 'filepath']]

# Debugging: Check the DataFrame
print(df.head())

# Save all filepaths to CSV
output_csv_path = "/home/Mariam/codes/ct_classifier/runs/resnet18/split_temporal/all.csv"
df.to_csv(output_csv_path, index=False)
print(f"CSV file saved to: {output_csv_path}")

# Start creating the data splits


            site    label      date    filename  \
0  cheeca_flkeys  healthy  20230504  loc005.tif   
1  cheeca_flkeys  healthy  20230504  loc002.tif   
2  cheeca_flkeys  healthy  20230504  loc003.tif   
3  cheeca_flkeys  healthy  20230504  loc004.tif   
4  cheeca_flkeys  healthy  20230504  loc006.tif   

                                            filepath  
0  /mnt/class_data/group3_remotesensing/mariamaya...  
1  /mnt/class_data/group3_remotesensing/mariamaya...  
2  /mnt/class_data/group3_remotesensing/mariamaya...  
3  /mnt/class_data/group3_remotesensing/mariamaya...  
4  /mnt/class_data/group3_remotesensing/mariamaya...  
CSV file saved to: /home/Mariam/codes/ct_classifier/runs/resnet18/split_temporal/all.csv


In [72]:
# Initialize list of dictionaries to store filepaths 
#  of healthy and bleached images
val_filepaths = {
    'bleached': [],
    'healthy': []
}
test_filepaths = {
    'bleached': [],
    'healthy': []
}
train_filepaths = {
    'bleached': [],
    'healthy': []
}

excluded_filepaths = []

for split_filepaths in [val_filepaths, test_filepaths]:
    # Exclude all filepaths that have already been used for other splits
    if excluded_filepaths:
        df_sub = df[~(df.filepath.isin(excluded_filepaths))]
    else:
        df_sub = df
    
    # Select one date at random from every site
    for label in ['healthy', 'bleached']:
        for site in sites:
            # Within all healthy or bleached images, get all dates for the selected site
            dates_at_site = df_sub[(df_sub.site==site) & (df_sub.label==label)].date.unique()
            # Select one random date within those
            random_date = np.random.choice(dates_at_site)
            # Get filepaths of all tile locations within selected site and date
            filepaths_at_site_date = df_sub[(df_sub.site==site) & (df_sub.date==random_date)].filepath
            # Add filepaths to list
            split_filepaths[label].extend(filepaths_at_site_date)
        
        # Add filepaths that we just selected for the test or val split to the list
        # of excluded files
        excluded_filepaths.extend(split_filepaths[label])

# Move all else into train split
df_train = df[~(df.filepath.isin(excluded_filepaths))]
for label in ['healthy', 'bleached']:
    train_filepaths[label] = df_train[df_train.label==label].filepath

In [73]:
# todo: debug case in which there s a site that has less than 3 dates and\or theres an unequal number of bleached\\ healhty images
print(len(excluded_filepaths))
print(len(test_filepaths['bleached']), len(test_filepaths['healthy']), len(val_filepaths['bleached']), len(val_filepaths['healthy']))

print(len(train_filepaths['bleached']))
print(len(train_filepaths['healthy']))
print(len(df) )
assert (len(df) == len(test_filepaths['bleached']) + 
    len(test_filepaths['healthy']) +len(val_filepaths['bleached']) + 
    len(val_filepaths['healthy']) + len(train_filepaths['bleached']) + 
    len(train_filepaths['healthy'])), 'For some unknown reason the '\
    'total number images in each split does not equal the number of file that we have'

44
11 11 11 11
70
68
182


In [39]:
val_filepaths[label]

['/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/cheeca_flkeys/bleached/tiled_360m/20230817/loc005.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/cheeca_flkeys/bleached/tiled_360m/20230817/loc002.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/cheeca_flkeys/bleached/tiled_360m/20230817/loc003.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/cheeca_flkeys/bleached/tiled_360m/20230817/loc004.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/cheeca_flkeys/bleached/tiled_360m/20230817/loc006.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/cheeca_flkeys/bleached/tiled_360m/20230817/loc001.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/lbcaye_bbr/bleached/tiled_360m/20240929/loc002.tif',
 '/mnt/class_data/group3_remotesensing/mariamayad/planet_superdove/lbcaye_bbr/bleached/tiled_360m/20240929/loc001.tif',
 '/mnt/class_data/grou