In [1]:
cd '/scratch/nf33/rd7475/ohw24_proj/ohw24_proj_micronekton_img_pipeline_au'

/scratch/nf33/rd7475/ohw24_proj/ohw24_proj_micronekton_img_pipeline_au


### Source info DataFrame

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import time

def extract_data(pattern, file_paths):
    """Extract unique numeric data from file paths based on regex pattern."""
    data = []
    seen = set()
    for path in file_paths:
        path = str(path)
        match = re.search(pattern, path)
        if match:
            num = int(match.group(1))  # Change to float(match.group(1)) if needed
            if num not in seen:
                seen.add(num)
                data.append(num)
    return sorted(data)

# # Define source path
# source_path = Path('/Volumes/csiro_data/IN2020_V08/PLAOS/Data/')

# # Get relevant file paths
# relevent_paths = list(source_path.rglob('Station_*/**/OBL*.JPG'))

import pickle
with open('relevent_paths.pkl', 'rb') as file:
    relevent_paths = pickle.load(file)

relevent_paths_str = []
for p in relevent_paths:
    relevent_paths_str.append(str(p))

# Extract station numbers
station_numbers = extract_data(pattern=r'Station_(\d+)', file_paths=relevent_paths)

# CAP NUMBER OF STATIONS FOR THE MOMENT!
station_numbers = [f'{num:02}' for num in np.arange(1, 3)]

# Prepare lists to store data
img_input_path = []
S = []
I = []

# Process each station
for station in station_numbers:

    station_paths = []
    for path in relevent_paths_str:
        # Format the station number as a two-digit string
        station_str = f'{station:02}'
        # Construct the station path with the formatted station number
        station_path = f'Station_{station_str}'
        # Check if the formatted station path is in the path
        if station_path in path:
            station_paths.append(path)

    # Extract image numbers for the current station
    img_numbers = extract_data(pattern=r'OBL(\d+)', file_paths=station_paths)

    # Process each image number
    for img_number in img_numbers:

        img_number = f'{img_number:05}'

        img_path = [s for s in station_paths if 'OBL'+img_number in s]

        img_input_path.append(str(img_path[0]))
        S.append(station)
        I.append('OBL'+img_number)

M = img_input_path

# Create a DataFrame
df = pd.DataFrame({
    'Station': S,
    'Photo': I,
    'Source': M
})

source_info = df
source_info

Elapsed time: 1.3283 seconds


Unnamed: 0,Station,Photo,Source
0,01,OBL00001,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
1,01,OBL00002,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
2,01,OBL00003,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
3,01,OBL00004,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
4,01,OBL00005,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
...,...,...,...
5447,02,OBL09995,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
5448,02,OBL09996,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
5449,02,OBL09997,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
5450,02,OBL09998,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...


### Exif info DataFrame

In [3]:
for station in station_numbers:

    exif_info_station = pd.read_csv('IN2020_V08_PLAOS_station_'+str(int(station))+'_oblique_exif.txt', header=None, names=['Photo', 'Date', 'Time', 'AMPM', 'Photo_number'])
    exif_info_station = exif_info_station.iloc[1:].reset_index(drop=True)
    exif_info_station.insert(0, 'Station', station)

    if station == station_numbers[0]:
        exif_info = exif_info_station

    exif_info = pd.concat([exif_info, exif_info_station], ignore_index=True)

exif_info


Unnamed: 0,Station,Photo,Date,Time,AMPM,Photo_number
0,01,OBL09618,5/12/2020,06:54:03,AM,1
1,01,OBL09619,5/12/2020,06:54:05,AM,2
2,01,OBL09620,5/12/2020,06:54:07,AM,3
3,01,OBL09621,5/12/2020,06:54:09,AM,4
4,01,OBL09622,5/12/2020,06:54:11,AM,5
...,...,...,...,...,...,...
5818,02,OBL01531,5/12/2020,13:09:10,PM,1917
5819,02,OBL01532,5/12/2020,13:09:12,PM,1918
5820,02,OBL01533,5/12/2020,13:09:14,PM,1919
5821,02,OBL01534,5/12/2020,13:09:16,PM,1920


### Combine relevant info for each image

In [4]:
import_df = pd.DataFrame()

for index, img_info in exif_info.iterrows():
    station = img_info.Station
    photo = img_info.Photo
    
    # Filter the source_info DataFrame to match the current row
    img_source_info = source_info[(source_info.Station == station) & (source_info.Photo == photo)]
    
    if not img_source_info.empty:
        # Convert both rows to dictionaries
        img_info_dict = img_info.to_dict()
        img_source_info_dict = img_source_info.iloc[0].to_dict()
        
        # Combine dictionaries, img_source_info_dict will overwrite img_info_dict if there are conflicts
        combined_dict = {**img_info_dict, **img_source_info_dict}
        
        # Convert the combined dictionary to a DataFrame
        new_row_df = pd.DataFrame([combined_dict])
        
        # Append the new row DataFrame to import_df
        import_df = pd.concat([import_df, new_row_df], ignore_index=True)

import_df


Unnamed: 0,Station,Photo,Date,Time,AMPM,Photo_number,Source
0,01,OBL09618,5/12/2020,06:54:03,AM,1,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
1,01,OBL09619,5/12/2020,06:54:05,AM,2,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
2,01,OBL09620,5/12/2020,06:54:07,AM,3,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
3,01,OBL09621,5/12/2020,06:54:09,AM,4,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
4,01,OBL09622,5/12/2020,06:54:11,AM,5,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
...,...,...,...,...,...,...,...
5818,02,OBL01531,5/12/2020,13:09:10,PM,1917,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
5819,02,OBL01532,5/12/2020,13:09:12,PM,1918,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
5820,02,OBL01533,5/12/2020,13:09:14,PM,1919,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...
5821,02,OBL01534,5/12/2020,13:09:16,PM,1920,/Volumes/csiro_data/IN2020_V08/PLAOS/Data/Stat...


### Find Depth for each Image

In [5]:
depth_info = pd.read_csv('IN2020_V08_date_time_depth_20201129-20201209-op-test-03.txt', header=None, names=['XXX', 'Date', 'Time', 'Depth'])
depth_info = depth_info.iloc[:, 1:]
depth_info = depth_info.iloc[1:].reset_index(drop=True)
depth_info

Unnamed: 0,Date,Time,Depth
0,20201129,232028.95,4.7
1,20201129,232030.00,3.6
2,20201129,232030.96,3.6
3,20201129,232031.96,3.2
4,20201129,232033.02,3.2
...,...,...,...
61634,20201209,103742.57,4.3
61635,20201209,103743.53,3.2
61636,20201209,103744.49,3.2
61637,20201209,103745.54,1.4


In [6]:
import_df_cpy = import_df

import_df_cpy.insert(len(import_df_cpy.columns) - 1, 'Depth', np.nan)

In [None]:
for index, img_info in import_df.iterrows():
    date = img_info.Date
    time = img_info.Time

    dates = depth_info.Date
    times = depth_info.Time

    # Converting to datetime
    date_time_str = date + ' ' + time
    date_time1 = pd.to_datetime(date_time_str, format='%m/%d/%Y %H:%M:%S')

    # Convert dates to datetime
    dates = pd.to_datetime(dates, format='%Y%m%d')
    
    # Convert times to datetime
    # Convert times to strings, then to timedelta and add to a date
    times_str = times.astype(int).astype(str).str.zfill(6)
    times_formatted = times_str.str[:2] + ':' + times_str.str[2:4] + ':' + times_str.str[4:]
    times = pd.to_datetime(times_formatted, format='%H:%M:%S').dt.time
    
    # Combine dates and times
    date_time2 = pd.to_datetime(dates.astype(str) + ' ' + times_formatted, format='%Y-%m-%d %H:%M:%S')
    

    # Find closest match
    def find_closest_time(dt1, dt2):
        # Calculate the absolute difference between dt1 and each entry in dt2
        differences = [abs(dt1 - time2) for time2 in dt2]
        # Find the index of the smallest difference
        closest_index = differences.index(min(differences))
        return closest_index
    
    # Get the closest index
    closest_index = find_closest_time(date_time1, date_time2)

    import_df.at[index, 'Depth'] = depth_info.Depth.iloc[closest_index]

    

    