In [1]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Initialize 
import glob
import os
import pandas as pd
from PIL import Image
import numpy as np
import multiprocessing
import cv2

pd.set_option('display.max_colwidth', None)
output_base = 'AGI_OA_3RDF_P1_threshold'

# ch1 - DAPI, ch2 - IBA1, ch3 - Brightfield, ch4 - BODIPY, ch5 - EEA1 
base_path = 'AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1' 
full_path = glob.glob(os.path.join(base_path, '**/r*c*f*p*-ch*sk*fk*fl*/*.tiff'), recursive=True)

full_path_pd = pd.DataFrame(full_path, columns=['path'])
full_path_pd = full_path_pd[full_path_pd['path'].str.endswith('.tiff')].reset_index(drop=True)
print('In total:', full_path_pd.shape[0])

# print sample path
if full_path_pd.shape[0] > 0:
    print('Sample path:', full_path_pd.loc[0, 'path'])

In total: 292905
Sample path: AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1/test/r04c05f23p01-ch2sk1fk1fl1/r04c05f23p01-ch2sk1fk1fl1_Cell_6.tiff


In [3]:
import pandas as pd
import numpy as np
import re

# Extract row ID (e.g., 'r02') from the file path
def extract_row(path):
    match = re.search(r'r\d{2}', path)
    return match.group() if match else None

# Extract column ID (e.g., 'c05') from the file path
def extract_column(path):
    match = re.search(r'c\d{2}', path)
    return match.group() if match else None

# Add 'row' and 'column' columns to the DataFrame
full_path_pd['row'] = full_path_pd['path'].apply(extract_row)
full_path_pd['column'] = full_path_pd['path'].apply(extract_column)

# Drop any rows with missing row/column info
full_path_pd = full_path_pd.dropna(subset=['row', 'column']).reset_index(drop=True)

# Create composite keys (e.g., 'c05r02')
full_path_pd['col_row'] = full_path_pd['column'] + full_path_pd['row']

# Use the full dataset
final_sample_df = full_path_pd.copy()

# Output basic info about the data
print(f"Total samples: {final_sample_df.shape[0]}")
print(f"Included rows: {sorted(final_sample_df['row'].unique())}")
print(f"Included columns: {sorted(final_sample_df['column'].unique())}")

Total samples: 292905
Included rows: ['r02', 'r03', 'r04', 'r05', 'r06', 'r07']
Included columns: ['c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09', 'c10', 'c11']


In [4]:
# basic data cleaning
# Convert 1080x1080 TIFF images to 540x540 PNGs using percentile normalization
for i in range(final_sample_df.shape[0]):
    path = final_sample_df.loc[i, 'path']
    
    # Load the image in original depth (16-bit grayscale)
    im = cv2.imread(path, -1)
    
    # Resize to 540x540
    im = cv2.resize(im, (540, 540))

    # Percentile-based intensity clipping (to remove outliers)
    low, high = np.percentile(im, (0, 99.6))
    im = np.clip(im, low, high)
    
    # Normalize pixel values to the 16-bit range
    norm = (im - im.min()) * 65535.0 / (im.max() - im.min() + 1e-8)

    # Build relative path to preserve folder structure
    rel_path = os.path.relpath(path, base_path)  # e.g., r02c02.../filename.tiff
    rel_path = rel_path.replace('.tiff', '.png')
    output_path = os.path.join(output_base, rel_path)

    # Create the output directory if it doesn't exist
    output_dir = os.path.dirname(output_path)
    os.makedirs(output_dir, exist_ok=True)

    # Save the PNG as 16-bit image
    cv2.imwrite(output_path, norm.astype(np.uint16))

    # Print progress every 1000 files
    if i % 1000 == 0:
        print(f'Progress: {i}/{final_sample_df.shape[0]}')

print('Processing complete.')

Progress: 0/292905
Progress: 1000/292905
Progress: 2000/292905
Progress: 3000/292905
Progress: 4000/292905
Progress: 5000/292905
Progress: 6000/292905
Progress: 7000/292905
Progress: 8000/292905
Progress: 9000/292905
Progress: 10000/292905
Progress: 11000/292905
Progress: 12000/292905
Progress: 13000/292905
Progress: 14000/292905
Progress: 15000/292905
Progress: 16000/292905
Progress: 17000/292905
Progress: 18000/292905
Progress: 19000/292905
Progress: 20000/292905
Progress: 21000/292905
Progress: 22000/292905
Progress: 23000/292905
Progress: 24000/292905
Progress: 25000/292905
Progress: 26000/292905
Progress: 27000/292905
Progress: 28000/292905
Progress: 29000/292905
Progress: 30000/292905
Progress: 31000/292905
Progress: 32000/292905
Progress: 33000/292905
Progress: 34000/292905
Progress: 35000/292905
Progress: 36000/292905
Progress: 37000/292905
Progress: 38000/292905
Progress: 39000/292905
Progress: 40000/292905
Progress: 41000/292905
Progress: 42000/292905
Progress: 43000/292905
P

In [5]:
import os
import pandas as pd

# Make sure final_sample_df already exists and includes a 'path' column
final_sample_df = final_sample_df.reset_index(drop=True)

# Extract the base filename (remove suffixes like '_Cell_1')
final_sample_df['filename'] = final_sample_df['path'].apply(lambda x: os.path.basename(x).split('_')[0])

# Extract row, column, field, plane, and channel information
final_sample_df['row'] = final_sample_df['filename'].str[0:3]        # e.g., r07
final_sample_df['column'] = final_sample_df['filename'].str[3:6]     # e.g., c08
final_sample_df['field'] = final_sample_df['filename'].str[6:9]      # e.g., f11
final_sample_df['plane'] = final_sample_df['filename'].str[9:12]     # e.g., p01
final_sample_df['channel'] = final_sample_df['filename'].str[13:16]  # e.g., ch1

# Construct compound identifiers like rcf and rc to support group analysis
final_sample_df['rcf'] = final_sample_df['row'] + final_sample_df['column'] + final_sample_df['field']
final_sample_df['rc'] = final_sample_df['row'] + final_sample_df['column']
final_sample_df['front'] = final_sample_df['filename'].str[:-7]

# Keep only images with plane == 'p03'
final_sample_df = final_sample_df[final_sample_df['plane'] == 'p03'].reset_index(drop=True)

# Output basic info of the filtered dataset
print(f"Total number of filtered images: {final_sample_df.shape[0]}")
print(f"Included rows: {sorted(final_sample_df['row'].unique())}")
print(f"Included columns: {sorted(final_sample_df['column'].unique())}")
print(f"Included fields: {sorted(final_sample_df['field'].unique())}")
print(f"Included channels: {sorted(final_sample_df['channel'].unique())}")

# Sort by rcf, plane, and channel
final_sample_df = final_sample_df.sort_values(by=['rcf', 'plane', 'channel']).reset_index(drop=True)

# Show the first few rows
final_sample_df.head()

Total number of filtered images: 104405
Included rows: ['r02', 'r03', 'r04', 'r05', 'r06', 'r07']
Included columns: ['c02', 'c03', 'c04', 'c05', 'c06', 'c07', 'c08', 'c09', 'c10', 'c11']
Included fields: ['f01', 'f02', 'f03', 'f04', 'f05', 'f06', 'f07', 'f08', 'f09', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34']
Included channels: ['ch1', 'ch2', 'ch3', 'ch4', 'ch5']


Unnamed: 0,path,row,column,col_row,filename,field,plane,channel,rcf,rc,front
0,AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1/test/r02c02f01p03-ch1sk1fk1fl1/r02c02f01p03-ch1sk1fk1fl1_Cell_13.tiff,r02,c02,c02r02,r02c02f01p03-ch1sk1fk1fl1,f01,p03,ch1,r02c02f01,r02c02,r02c02f01p03-ch1sk
1,AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1/test/r02c02f01p03-ch1sk1fk1fl1/r02c02f01p03-ch1sk1fk1fl1_Cell_9.tiff,r02,c02,c02r02,r02c02f01p03-ch1sk1fk1fl1,f01,p03,ch1,r02c02f01,r02c02,r02c02f01p03-ch1sk
2,AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1/test/r02c02f01p03-ch1sk1fk1fl1/r02c02f01p03-ch1sk1fk1fl1_Cell_29.tiff,r02,c02,c02r02,r02c02f01p03-ch1sk1fk1fl1,f01,p03,ch1,r02c02f01,r02c02,r02c02f01p03-ch1sk
3,AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1/test/r02c02f01p03-ch1sk1fk1fl1/r02c02f01p03-ch1sk1fk1fl1_Cell_5.tiff,r02,c02,c02r02,r02c02f01p03-ch1sk1fk1fl1,f01,p03,ch1,r02c02f01,r02c02,r02c02f01p03-ch1sk
4,AGI_OA_3RDF_P1__2025-03-26T11_37_33-Measurement_1/test/r02c02f01p03-ch1sk1fk1fl1/r02c02f01p03-ch1sk1fk1fl1_Cell_33.tiff,r02,c02,c02r02,r02c02f01p03-ch1sk1fk1fl1,f01,p03,ch1,r02c02f01,r02c02,r02c02f01p03-ch1sk


In [6]:
import os
import numpy as np
import cv2
from pathlib import Path

# Updated Helen's thresholds for all 5 channels
thres_dict = {
    'ch1': [520, 3200],
    'ch2': [1000, 5800],
    'ch3': [2200, 3400],
    'ch4': [500, 6000],
    'ch5': [450, 6500],
}

# Output base folder
output_base = 'AGI_OA_3RDF_P1_threshold'

# Process each channel separately
for i_ch in np.sort(final_sample_df['channel'].unique()):
    if i_ch not in thres_dict:
        print(f"Skipping unknown channel: {i_ch}")
        continue

    temp_img_path = final_sample_df[final_sample_df['channel'] == i_ch].reset_index(drop=True)

    for i in range(temp_img_path.shape[0]):
        path = temp_img_path.loc[i, 'path']
        im = cv2.imread(path, -1)
        im = cv2.resize(im, (540, 540))

        # Apply Helen's channel-specific threshold
        low, high = thres_dict[i_ch]
        im[im > high] = im.min()
        im = np.clip(im, low, high)

        # Further percentile clipping to remove extremes
        low_p, high_p = np.percentile(im, (0, 99.6))
        if low_p != high_p:
            im = np.clip(im, low_p, high_p)

        # Normalize to 16-bit range
        norm = (im - im.min()) * 65535.0 / (im.max() - im.min() + 1e-8)

        # Construct relative output path
        rel_path = os.path.relpath(path, base_path).replace('.tiff', '.png')
        output_path = os.path.join(output_base, rel_path)

        # Make sure the output directory exists
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save as 16-bit PNG
        cv2.imwrite(output_path, norm.astype(np.uint16))

        if i % 1000 == 0:
            print(f'Progress ({i_ch}): {i}/{temp_img_path.shape[0]}')

print('All channels processed.')

Progress (ch1): 0/20881
Progress (ch1): 1000/20881
Progress (ch1): 2000/20881
Progress (ch1): 3000/20881
Progress (ch1): 4000/20881
Progress (ch1): 5000/20881
Progress (ch1): 6000/20881
Progress (ch1): 7000/20881
Progress (ch1): 8000/20881
Progress (ch1): 9000/20881
Progress (ch1): 10000/20881
Progress (ch1): 11000/20881
Progress (ch1): 12000/20881
Progress (ch1): 13000/20881
Progress (ch1): 14000/20881
Progress (ch1): 15000/20881
Progress (ch1): 16000/20881
Progress (ch1): 17000/20881
Progress (ch1): 18000/20881
Progress (ch1): 19000/20881
Progress (ch1): 20000/20881
Progress (ch2): 0/20881
Progress (ch2): 1000/20881
Progress (ch2): 2000/20881
Progress (ch2): 3000/20881
Progress (ch2): 4000/20881
Progress (ch2): 5000/20881
Progress (ch2): 6000/20881
Progress (ch2): 7000/20881
Progress (ch2): 8000/20881
Progress (ch2): 9000/20881
Progress (ch2): 10000/20881
Progress (ch2): 11000/20881
Progress (ch2): 12000/20881
Progress (ch2): 13000/20881
Progress (ch2): 14000/20881
Progress (ch2): 15

In [7]:
import os
import shutil

# Set the base directory where your folders are stored
base_dir = 'AGI_OA_3RDF_P1_threshold/test'

# Get all folder names
all_folders = [f for f in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, f))]

# Loop through and delete folders not containing 'p03' in the correct position
for folder in all_folders:
    # Check if 'p03' appears in the standard position (should be around 11th–14th char)
    if 'p03' not in folder:
        folder_path = os.path.join(base_dir, folder)
        print(f"Deleting: {folder_path}")
        shutil.rmtree(folder_path)

print("Only p03 folders remain.")

Deleting: AGI_OA_3RDF_P1_threshold/test/r04c05f23p01-ch2sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r06c10f07p01-ch4sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r06c04f34p01-ch1sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r04c06f26p02-ch3sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r07c05f34p02-ch2sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r02c08f11p01-ch5sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r05c10f10p02-ch4sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r03c08f14p01-ch2sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r05c06f23p02-ch4sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r04c10f15p02-ch3sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r05c05f26p01-ch5sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r03c11f29p02-ch4sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r04c09f28p01-ch5sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r02c09f14p02-ch1sk1fk1fl1
Deleting: AGI_OA_3RDF_P1_threshold/test/r07c07f04p02-ch2sk1fk1fl1
Deleting: 