# Feature Aggregation

This notebook joins features from 4 of the 7 exported files (bodipy Spots, Cells Selected, Cells, and EEA1 Spots). These files hold per-object measurements in a similar format, so they can be aggregated per field (`rcfid` = row + column + field) and combined for statistical analysis. The combined features are then used to train the classifier and run the downstream similarity analysis.

In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation notices; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set display options for clean output
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 15)

# File path
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - bodipy Spots.txt'

# Load and parse Bodipy Spots data from Phenix2 format
with open(filepath, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Find [Data] section: the table starts on the line after the marker
data_start = None
for i, line in enumerate(lines):
    if line.strip() == '[Data]':
        data_start = i + 1
        break

if data_start is None:
    raise ValueError("No [Data] section found")

# Parse data section
data_lines = lines[data_start:]
if not data_lines:
    # Without this guard the header lookup below fails with a bare IndexError.
    raise ValueError("[Data] section has no header row")
header = data_lines[0].strip().split('\t')
data_rows = []

for line in data_lines[1:]:
    if line.strip():
        row = line.strip().split('\t')
        # Pad short rows and truncate long ones so every row matches the header width.
        row.extend([''] * (len(header) - len(row)))
        data_rows.append(row[:len(header)])

df = pd.DataFrame(data_rows, columns=header)

# Convert numeric columns (unparseable values become NaN)
for col in df.columns:
    if col not in ['Compound', 'Cell Type', 'Bounding Box']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Define columns to remove
columns_to_remove = [
    'Timepoint',
    'X',
    'Y',
    'Bounding Box',
    'Position X [µm]',
    'Position Y [µm]',
    'Compound',
    'Concentration',
    'Cell Type',
    'Cell Count'
]

# Remove columns that exist
existing_columns_to_remove = [col for col in columns_to_remove if col in df.columns]
df_cleaned = df.drop(columns=existing_columns_to_remove)

# Row/Column/Field must be whole numbers before they are formatted into the key.
# A single NaN turns the column into floats, and str(2.0) would give keys such as
# 'r2.0c2.0f4.0' that never match the other tables on merge.
id_cols = ['Row', 'Column', 'Field']
missing_ids = df_cleaned[id_cols].isna().any(axis=1)
if missing_ids.any():
    print(f"Warning: dropping {int(missing_ids.sum())} rows with missing Row/Column/Field")
    df_cleaned = df_cleaned.loc[~missing_ids].copy()
df_cleaned[id_cols] = df_cleaned[id_cols].astype(int)

# Create rcfid (Row + Column + Field), each part zero-padded to two digits
df_cleaned['rcfid'] = (
    'r' + df_cleaned['Row'].astype(str).str.zfill(2)
    + 'c' + df_cleaned['Column'].astype(str).str.zfill(2)
    + 'f' + df_cleaned['Field'].astype(str).str.zfill(2)
)

# Define label mapping based on column values
label_mapping = {
    2: 'PARENT',
    3: 'TREM2_KO',
    4: 'R47H',
    5: 'H157Y',
    6: 'PLCG2_KO',
    7: 'P522R',
    8: 'P522R_HET',
    9: 'SHIP1_KO',
    10: 'ABI3_KO',
    11: 'S209F'
}

# Add label column based on Column value (unmapped values become NaN)
df_cleaned['label'] = df_cleaned['Column'].map(label_mapping)

# Check for any unmapped labels (optional - for debugging)
unmapped_columns = df_cleaned.loc[df_cleaned['label'].isna(), 'Column'].unique()
if len(unmapped_columns) > 0:
    print(f"Warning: Found unmapped column values: {unmapped_columns}")

# Display DataFrame shape
print(f"DataFrame shape: {df_cleaned.shape}")
print()

# Display first 5 rows with rcfid and label
print("Sample data with labels:")
print(df_cleaned[['Row', 'Column', 'Field', 'Object No', 'rcfid', 'label']].head())

# Show unique rcfid count and label distribution
print(f"\nTotal unique rcfids: {df_cleaned['rcfid'].nunique()}")
print(f"\nLabel distribution:")
print(df_cleaned['label'].value_counts().sort_index())

# Show unique labels; NaN is dropped first because sorted() cannot compare
# NaN (float) with str and would raise exactly when the warning above fires.
print(f"\nUnique labels: {sorted(df_cleaned['label'].dropna().unique())}")

DataFrame shape: (1679195, 20)

Sample data with labels:
   Row  Column  Field  Object No      rcfid   label
0    2       2      4          1  r02c02f04  PARENT
1    2       2      4          2  r02c02f04  PARENT
2    2       2      4          3  r02c02f04  PARENT
3    2       2      4          4  r02c02f04  PARENT
4    2       2      4          5  r02c02f04  PARENT

Total unique rcfids: 970

Label distribution:
label
ABI3_KO       15205
H157Y        152836
P522R        275559
P522R_HET    202894
PARENT       207714
PLCG2_KO     201140
R47H         155645
S209F        187924
SHIP1_KO     263463
TREM2_KO      16815
Name: count, dtype: int64

Unique labels: ['ABI3_KO', 'H157Y', 'P522R', 'P522R_HET', 'PARENT', 'PLCG2_KO', 'R47H', 'S209F', 'SHIP1_KO', 'TREM2_KO']


In [3]:
# Seed the per-field feature table with one row per distinct (rcfid, label) pair
data_pd = (
    df_cleaned.loc[:, ['rcfid', 'label']]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"\nInitial data_pd shape: {data_pd.shape}")
print("Initial data_pd:")
data_pd.head()


Initial data_pd shape: (970, 2)
Initial data_pd:


Unnamed: 0,rcfid,label
0,r02c02f04,PARENT
1,r02c02f17,PARENT
2,r02c02f02,PARENT
3,r02c02f03,PARENT
4,r02c02f01,PARENT


In [5]:
# Defines a function to aggregate features by rcfid with common statistics.
def get_agg(txt):
    """Summarise every feature column of ``txt`` per ``rcfid``.

    Parameters
    ----------
    txt : pandas.DataFrame
        Table with an ``rcfid`` column and one column per feature. Every
        column other than ``rcfid`` is aggregated, wherever ``rcfid`` sits.

    Returns
    -------
    pandas.DataFrame
        One row per ``rcfid``. Besides ``rcfid`` there are six columns per
        feature, named ``'<feature> (<stat>)'`` for mean, sum, std, min, max
        and median, in that order.

    Raises
    ------
    KeyError
        If ``txt`` has no ``rcfid`` column.
    """
    if 'rcfid' not in txt.columns:
        raise KeyError("get_agg expects an 'rcfid' column to group by")

    # Missing values count as 0 in every statistic (they are not skipped).
    temp_txt = txt.fillna(0)

    # Pick features by name instead of position, so the result does not depend
    # on 'rcfid' being the last column.
    col_agg = [col for col in temp_txt.columns if col != 'rcfid']

    temp_agg = (
        temp_txt.groupby(['rcfid'])[col_agg]
        .agg(['mean', 'sum', 'std', 'min', 'max', 'median'])
        .reset_index()
    )

    # Flatten the (feature, stat) column pairs; the key column has an empty
    # stat level and keeps its plain name.
    temp_agg.columns = [
        name if stat == '' else f'{name} ({stat})'
        for name, stat in temp_agg.columns.to_flat_index()
    ]
    return temp_agg

print('Done')

Done


In [7]:
# Start from a copy so the cleaned BODIPY table itself is left untouched
temp_txt = df_cleaned.copy()

# Keep only the measurement columns whose name mentions 'bodipy Spots',
# followed by the rcfid key as the last column.
bodipy_feature_cols = [name for name in temp_txt.columns if 'bodipy Spots' in name]
temp_txt = temp_txt.loc[:, [*bodipy_feature_cols, 'rcfid']]

# Positional rename: the list must have exactly one entry per selected column.
temp_txt.columns = [
    'BODIPY - Relative Spot Intensity',
    'BODIPY - Corrected Spot Intensity',
    'BODIPY - Uncorrected Spot Peak Intensity',
    'BODIPY - Spot Contrast',
    'BODIPY - Spot Background Intensity',
    'BODIPY - Spot Area [px²]',
    'BODIPY - Region Intensity',
    'BODIPY - Spot to Region Intensity',
    'BODIPY - Object No in Cells Selected',
    'BODIPY - Spot Area [µm²]',
    'BODIPY - Spot Roundness',
    'BODIPY - Spot Width [µm]',
    'BODIPY - Spot Length [µm]',
    'BODIPY - Spot Ratio Width to Length',
    'rcfid',
]
print(temp_txt.shape)
print(temp_txt.head())

(1679195, 15)
   BODIPY - Relative Spot Intensity  BODIPY - Corrected Spot Intensity  \
0        0.076845                           22.9596                      
1        0.129085                           39.0183                      
2        0.085792                           27.6905                      
3        0.035985                           18.8706                      
4        0.040078                           20.8533                      

   BODIPY - Uncorrected Spot Peak Intensity  BODIPY - Spot Contrast  \
0             357                                  0.128286           
1             404                                  0.210940           
2             431                                  0.187211           
3             614                                  0.096889           
4             608                                  0.098001           

   BODIPY - Spot Background Intensity  BODIPY - Spot Area [px²]  \
0         275.818                              

In [9]:
# Rich display of the first five rows of the renamed BODIPY table
temp_txt.head(n=5)

Unnamed: 0,BODIPY - Relative Spot Intensity,BODIPY - Corrected Spot Intensity,BODIPY - Uncorrected Spot Peak Intensity,BODIPY - Spot Contrast,BODIPY - Spot Background Intensity,BODIPY - Spot Area [px²],BODIPY - Region Intensity,BODIPY - Spot to Region Intensity,BODIPY - Object No in Cells Selected,BODIPY - Spot Area [µm²],BODIPY - Spot Roundness,BODIPY - Spot Width [µm],BODIPY - Spot Length [µm],BODIPY - Spot Ratio Width to Length,rcfid
0,0.076845,22.9596,357,0.128286,275.818,18,165.594,1.80428,1,1.57427,1.03908,1.18294,1.32257,0.894427,r02c02f04
1,0.129085,39.0183,404,0.21094,263.25,41,165.594,1.82535,1,3.58584,0.886608,1.67293,2.72655,0.613572,r02c02f04
2,0.085792,27.6905,431,0.187211,295.071,21,165.594,1.94911,1,1.83665,0.847174,0.836466,1.72442,0.485071,r02c02f04
3,0.035985,18.8706,614,0.096889,505.529,25,304.205,1.72384,2,2.18648,0.746812,0.836466,1.98385,0.421637,r02c02f04
4,0.040078,20.8533,608,0.098001,499.467,25,304.205,1.71043,2,2.18648,0.888374,1.18294,1.50796,0.784465,r02c02f04


In [11]:
# Process temp_txt to generate aggregated and count features
# (six statistics per feature plus the number of rows per rcfid)
temp_agg = get_agg(temp_txt)
temp_count = temp_txt['rcfid'].value_counts().reset_index()
temp_count.columns = ['rcfid', 'BODIPY - Number of Spots (sum)']

# Make the cell safe to re-run: if these feature columns are already in
# data_pd from an earlier execution, drop them first. Otherwise the merge
# below would keep both copies with '_x' / '_y' suffixes.
incoming_cols = [col for col in [*temp_agg.columns, *temp_count.columns] if col != 'rcfid']
data_pd = data_pd.drop(columns=[col for col in incoming_cols if col in data_pd.columns])

# Merge them into data_pd (left join keeps every rcfid already in data_pd)
data_pd = data_pd.merge(temp_agg, how='left', on='rcfid')
data_pd = data_pd.merge(temp_count, how='left', on='rcfid')
print('data_pd.shape =', data_pd.shape)

data_pd.shape = (970, 87)


In [13]:
# File path
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
filepath = "/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - Cells Selected.txt"

# Load and parse Cells Selected data from Phenix2 format
with open(filepath, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Find [Data] section: the table starts on the line after the marker
data_start = None
for i, line in enumerate(lines):
    if line.strip() == '[Data]':
        data_start = i + 1
        break

if data_start is None:
    raise ValueError("No [Data] section found")

# Parse data section
data_lines = lines[data_start:]
if not data_lines:
    # Without this guard the header lookup below fails with a bare IndexError.
    raise ValueError("[Data] section has no header row")
header = data_lines[0].strip().split('\t')
data_rows = []
for line in data_lines[1:]:
    if line.strip():
        row = line.strip().split('\t')
        # Pad short rows and truncate long ones so every row matches the header width.
        row.extend([''] * (len(header) - len(row)))
        data_rows.append(row[:len(header)])

df = pd.DataFrame(data_rows, columns=header)

# Convert numeric columns (unparseable values become NaN)
for col in df.columns:
    if col not in ['Compound', 'Cell Type', 'Bounding Box']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Define columns to remove
columns_to_remove = [
    'Timepoint', 'X', 'Y', 'Bounding Box', 'Position X [µm]', 'Position Y [µm]',
    'Compound', 'Concentration', 'Cell Type', 'Cell Count'
]

# Remove columns that exist
existing_columns_to_remove = [col for col in columns_to_remove if col in df.columns]
df_cleaned = df.drop(columns=existing_columns_to_remove)

# Row/Column/Field must be whole numbers before they are formatted into the key.
# A single NaN turns the column into floats, and str(2.0) would give keys such as
# 'r2.0c2.0f4.0' that never match data_pd on merge.
id_cols = ['Row', 'Column', 'Field']
missing_ids = df_cleaned[id_cols].isna().any(axis=1)
if missing_ids.any():
    print(f"Warning: dropping {int(missing_ids.sum())} rows with missing Row/Column/Field")
    df_cleaned = df_cleaned.loc[~missing_ids].copy()
df_cleaned[id_cols] = df_cleaned[id_cols].astype(int)

# Create rcfid (Row + Column + Field), each part zero-padded to two digits
df_cleaned['rcfid'] = (
    'r' + df_cleaned['Row'].astype(str).str.zfill(2)
    + 'c' + df_cleaned['Column'].astype(str).str.zfill(2)
    + 'f' + df_cleaned['Field'].astype(str).str.zfill(2)
)

temp_txt = df_cleaned.copy()
# Select Cells Selected feature columns (names containing 'Cells Selected'),
# with rcfid kept as the last column
cells_feature_cols = [col for col in temp_txt.columns if 'Cells Selected' in col]
temp_txt = temp_txt[cells_feature_cols + ['rcfid']]
# Positional rename: the list must have exactly one entry per selected column.
temp_txt.columns = ['Cells Selected - ROI No',
       'Cells Selected - Cell Area [µm²]',
       'Cells Selected - Cell Roundness',
       'Cells Selected - Cell Ratio Width to Length',
       'Cells Selected - Object No in Cells',
       'Cells Selected - Intensity Cell Alexa 568 Mean',
       'Cells Selected - Total Spot Area',
       'Cells Selected - Relative Spot Intensity',
       'Cells Selected - Number of Spots',
       'Cells Selected - Number of Spots per Area of Cell',
       'Cells Selected - Total Spot Area (2)',
       'Cells Selected - Relative Spot Intensity (2)',
       'Cells Selected - Number of Spots (2)',
       'Cells Selected - Number of Spots per Area of Cell (2)', 'rcfid']

# Aggregate per rcfid and count the rows per rcfid
temp_agg = get_agg(temp_txt)
temp_count = temp_txt['rcfid'].value_counts().reset_index()
temp_count.columns = ['rcfid', 'Cells Selected - Number of Cells (sum)']

# Make the cell safe to re-run: if these feature columns are already in
# data_pd from an earlier execution, drop them first. Otherwise the merge
# below would keep both copies with '_x' / '_y' suffixes.
incoming_cols = [col for col in [*temp_agg.columns, *temp_count.columns] if col != 'rcfid']
data_pd = data_pd.drop(columns=[col for col in incoming_cols if col in data_pd.columns])

# Merge with the existing data_pd (left join keeps every rcfid already in data_pd)
data_pd = data_pd.merge(temp_agg, on='rcfid', how='left')
data_pd = data_pd.merge(temp_count, on='rcfid', how='left')

print('data_pd.shape =', data_pd.shape)

data_pd.shape = (970, 172)


In [16]:
# File path
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - Cells.txt'

# Load and parse Cells data from Phenix2 format
with open(filepath, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Find [Data] section: the table starts on the line after the marker
data_start = None
for i, line in enumerate(lines):
    if line.strip() == '[Data]':
        data_start = i + 1
        break

if data_start is None:
    raise ValueError("No [Data] section found")

# Parse data section
data_lines = lines[data_start:]
if not data_lines:
    # Without this guard the header lookup below fails with a bare IndexError.
    raise ValueError("[Data] section has no header row")
header = data_lines[0].strip().split('\t')
data_rows = []
for line in data_lines[1:]:
    if line.strip():
        row = line.strip().split('\t')
        # Pad short rows and truncate long ones so every row matches the header width.
        row.extend([''] * (len(header) - len(row)))
        data_rows.append(row[:len(header)])

df = pd.DataFrame(data_rows, columns=header)

# Convert numeric columns (unparseable values become NaN)
for col in df.columns:
    if col not in ['Compound', 'Cell Type', 'Bounding Box']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Define columns to remove
columns_to_remove = [
    'Timepoint', 'X', 'Y', 'Bounding Box', 'Position X [µm]', 'Position Y [µm]',
    'Compound', 'Concentration', 'Cell Type', 'Cell Count'
]

# Remove columns that exist
existing_columns_to_remove = [col for col in columns_to_remove if col in df.columns]
df_cleaned = df.drop(columns=existing_columns_to_remove)

# Row/Column/Field must be whole numbers before they are formatted into the key.
# A single NaN turns the column into floats, and str(2.0) would give keys such as
# 'r2.0c2.0f4.0' that never match data_pd on merge.
id_cols = ['Row', 'Column', 'Field']
missing_ids = df_cleaned[id_cols].isna().any(axis=1)
if missing_ids.any():
    print(f"Warning: dropping {int(missing_ids.sum())} rows with missing Row/Column/Field")
    df_cleaned = df_cleaned.loc[~missing_ids].copy()
df_cleaned[id_cols] = df_cleaned[id_cols].astype(int)

# Create rcfid (Row + Column + Field), each part zero-padded to two digits
df_cleaned['rcfid'] = (
    'r' + df_cleaned['Row'].astype(str).str.zfill(2)
    + 'c' + df_cleaned['Column'].astype(str).str.zfill(2)
    + 'f' + df_cleaned['Field'].astype(str).str.zfill(2)
)

temp_txt = df_cleaned.copy()
# Select Cells feature columns (names containing 'Cells -'),
# with rcfid kept as the last column
cells_feature_cols = [col for col in temp_txt.columns if 'Cells -' in col]
temp_txt = temp_txt[cells_feature_cols + ['rcfid']]
# Positional rename: the list must have exactly one entry per selected column.
temp_txt.columns = ['Cells - ROI No',
       'Cells - Cell Area [µm²]',
       'Cells - Cell Roundness',
       'Cells - Cell Ratio Width to Length',
       'Cells - Cells Selected', 'rcfid']

# Aggregate per rcfid and count the rows per rcfid
temp_agg = get_agg(temp_txt)
temp_count = temp_txt['rcfid'].value_counts().reset_index()
temp_count.columns = ['rcfid', 'Cells - Number of Cells (sum)']

# Make the cell safe to re-run: if these feature columns are already in
# data_pd from an earlier execution, drop them first. Otherwise the merge
# below would keep both copies with '_x' / '_y' suffixes.
incoming_cols = [col for col in [*temp_agg.columns, *temp_count.columns] if col != 'rcfid']
data_pd = data_pd.drop(columns=[col for col in incoming_cols if col in data_pd.columns])

# Merge with the existing data_pd (left join keeps every rcfid already in data_pd)
data_pd = data_pd.merge(temp_agg, on='rcfid', how='left')
data_pd = data_pd.merge(temp_count, on='rcfid', how='left')

print('data_pd.shape =', data_pd.shape)

data_pd.shape = (970, 203)


In [19]:
# File path
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - EEA1 Spots.txt'

# Load and parse EEA1 Spots data from Phenix2 format
with open(filepath, 'r', encoding='utf-8') as f:
    lines = f.readlines()

# Find [Data] section: the table starts on the line after the marker
data_start = None
for i, line in enumerate(lines):
    if line.strip() == '[Data]':
        data_start = i + 1
        break

if data_start is None:
    raise ValueError("No [Data] section found")

# Parse data section
data_lines = lines[data_start:]
if not data_lines:
    # Without this guard the header lookup below fails with a bare IndexError.
    raise ValueError("[Data] section has no header row")
header = data_lines[0].strip().split('\t')
data_rows = []
for line in data_lines[1:]:
    if line.strip():
        row = line.strip().split('\t')
        # Pad short rows and truncate long ones so every row matches the header width.
        row.extend([''] * (len(header) - len(row)))
        data_rows.append(row[:len(header)])

df = pd.DataFrame(data_rows, columns=header)

# Convert numeric columns (unparseable values become NaN)
for col in df.columns:
    if col not in ['Compound', 'Cell Type', 'Bounding Box']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Define columns to remove
columns_to_remove = [
    'Timepoint', 'X', 'Y', 'Bounding Box', 'Position X [µm]', 'Position Y [µm]',
    'Compound', 'Concentration', 'Cell Type', 'Cell Count'
]

# Remove columns that exist
existing_columns_to_remove = [col for col in columns_to_remove if col in df.columns]
df_cleaned = df.drop(columns=existing_columns_to_remove)

# Row/Column/Field must be whole numbers before they are formatted into the key.
# A single NaN turns the column into floats, and str(2.0) would give keys such as
# 'r2.0c2.0f4.0' that never match data_pd on merge.
id_cols = ['Row', 'Column', 'Field']
missing_ids = df_cleaned[id_cols].isna().any(axis=1)
if missing_ids.any():
    print(f"Warning: dropping {int(missing_ids.sum())} rows with missing Row/Column/Field")
    df_cleaned = df_cleaned.loc[~missing_ids].copy()
df_cleaned[id_cols] = df_cleaned[id_cols].astype(int)

# Create rcfid (Row + Column + Field), each part zero-padded to two digits
df_cleaned['rcfid'] = (
    'r' + df_cleaned['Row'].astype(str).str.zfill(2)
    + 'c' + df_cleaned['Column'].astype(str).str.zfill(2)
    + 'f' + df_cleaned['Field'].astype(str).str.zfill(2)
)

temp_txt = df_cleaned.copy()

# Select EEA1 Spots feature columns (names containing 'EEA1 Spots'),
# with rcfid kept as the last column
eea1_feature_cols = [col for col in temp_txt.columns if 'EEA1 Spots' in col]
temp_txt = temp_txt[eea1_feature_cols + ['rcfid']]
# Positional rename: the list must have exactly one entry per selected column.
temp_txt.columns = ['EEA1 Spots - Relative Spot Intensity',
       'EEA1 Spots - Corrected Spot Intensity',
       'EEA1 Spots - Uncorrected Spot Peak Intensity',
       'EEA1 Spots - Spot Contrast',
       'EEA1 Spots - Spot Background Intensity',
       'EEA1 Spots - Spot Area [px²]',
       'EEA1 Spots - Region Intensity',
       'EEA1 Spots - Spot To Region Intensity',
       'EEA1 Spots - Object No in Cells Selected',
       'EEA1 Spots - EEA1 Spot Area [µm²]',
       'EEA1 Spots - EEA1 Spot Roundness',
       'EEA1 Spots - EEA1 Spot Width [µm]',
       'EEA1 Spots - EEA1 Spot Length [µm]',
       'EEA1 Spots - EEA1 Spot Ratio Width to Length', 'rcfid']

# Aggregate per rcfid and count the rows per rcfid
temp_agg = get_agg(temp_txt)
temp_count = temp_txt['rcfid'].value_counts().reset_index()
temp_count.columns = ['rcfid', 'EEA1 Spots - Number of Spots (sum)']

# Make the cell safe to re-run: if these feature columns are already in
# data_pd from an earlier execution, drop them first. Otherwise the merge
# below would keep both copies with '_x' / '_y' suffixes.
incoming_cols = [col for col in [*temp_agg.columns, *temp_count.columns] if col != 'rcfid']
data_pd = data_pd.drop(columns=[col for col in incoming_cols if col in data_pd.columns])

# Merge with the existing data_pd (left join keeps every rcfid already in data_pd)
data_pd = data_pd.merge(temp_agg, on='rcfid', how='left')
data_pd = data_pd.merge(temp_count, on='rcfid', how='left')

print('data_pd.shape =', data_pd.shape)

data_pd.shape = (970, 288)


In [None]:
# Write the combined per-rcfid feature table to CSV, without the row index
output_csv = 'feature_agg_F1_P1.csv'
data_pd.to_csv(output_csv, index=False)