# EDA for Extracted Features

This script performs basic exploratory data analysis on the extracted features dataset. It provides an overview of the metadata, general dataset information, column and row index details, descriptive statistics, missing values, unique values in key columns, and a summary of spot measurements.

**For each plate, we got seven files:**

- *Objects_Population – Bodipy Spots*  
- *Objects_Population – Cells Selected*  
- *Objects_Population – Cells*  
- *Objects_Population – EEA1 Spots*  
- *Objects_Population – Nuclei (2)*  
- *PlateResults*  
- *indexfile*  

### Objects_Population - bodipy Spots

In [6]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the Bodipy Spots Data 
def load_bodipy_data(filepath):
    """Load and parse Bodipy Spots data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        raise ValueError("No [Data] section found")
    
    # Parse data section
    data_lines = lines[data_start:]
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - bodipy Spots.txt' 
df, metadata = load_bodipy_data(filepath)

print("=== BODIPY SPOTS DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION
print("\n METADATA:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

# 2. BASIC DATASET INFORMATION
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique())}")

# 8. SPOT MEASUREMENTS OVERVIEW
print(f"\n BODIPY SPOTS MEASUREMENTS:")
bodipy_cols = [col for col in df.columns if 'bodipy' in col.lower()]
print(f"  Number of bodipy-specific columns: {len(bodipy_cols)}")
for col in bodipy_cols:
    print(f"    • {col}")

# Preview first 5 row of df
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())


=== BODIPY SPOTS DATASET EDA ===

 METADATA:
  Database Name: Phenix2
  Database Link: http://129.67.91.135/ODA/OdaService.asmx
  Evaluation Signature: 9a21bf12-06a9-475a-b54d-ebac2dcfb7c5
  Plate Name: AGI_F1_P1_baseline
  Measurement: Measurement 1
  Evaluation: Evaluation3
  Population: Population - bodipy Spots

 DATASET OVERVIEW:
  Shape: (1679195, 28)
  Total Objects: 1,679,195
  Total Columns: 28
  Memory Usage: 582.85 MB

 COLUMN NAMES (28 columns):
   1. Row
   2. Column
   3. Timepoint
   4. Field
   5. Object No
   6. X
   7. Y
   8. Bounding Box
   9. Position X [µm]
  10. Position Y [µm]
  11. Compound
  12. Concentration
  13. Cell Type
  14. Cell Count
  15. bodipy Spots - Relative Spot  Intensity
  16. bodipy Spots - Corrected Spot  Intensity
  17. bodipy Spots - Uncorrected Spot  Peak Intensity
  18. bodipy Spots - Spot Contrast
  19. bodipy Spots - Spot Background Intensity
  20. bodipy Spots - Spot Area [px²]
  21. bodipy Spots - Region Intensity
  22. bodipy Spots -

Unnamed: 0,Row,Column,Timepoint,Field,Object No,X,Y,Bounding Box,Position X [µm],Position Y [µm],Compound,Concentration,Cell Type,Cell Count,bodipy Spots - Relative Spot Intensity,bodipy Spots - Corrected Spot Intensity,bodipy Spots - Uncorrected Spot Peak Intensity,bodipy Spots - Spot Contrast,bodipy Spots - Spot Background Intensity,bodipy Spots - Spot Area [px²],bodipy Spots - Region Intensity,bodipy Spots - Spot To Region Intensity,bodipy Spots - Object No in Cells Selected,bodipy Spots - bodipy Spot Area [µm²],bodipy Spots - bodipy Spot Roundness,bodipy Spots - bodipy Spot Width [µm],bodipy Spots - bodipy Spot Length [µm],bodipy Spots - bodipy Spot Ratio Width to Length
0,2,2,0,4,1,620,42,"[618,40,622,44]",23.77,1438.77,,,,,0.076845,22.9596,357,0.128286,275.818,18,165.594,1.80428,1,1.57427,1.03908,1.18294,1.32257,0.894427
1,2,2,0,4,2,614,44,"[610,41,617,48]",21.89,1438.01,,,,,0.129085,39.0183,404,0.21094,263.25,41,165.594,1.82535,1,3.58584,0.886608,1.67293,2.72655,0.613572
2,2,2,0,4,3,607,50,"[605,49,609,54]",20.06,1435.95,,,,,0.085792,27.6905,431,0.187211,295.071,21,165.594,1.94911,1,1.83665,0.847174,0.836466,1.72442,0.485071
3,2,2,0,4,4,691,2,"[690,1,696,4]",45.47,1450.49,,,,,0.035985,18.8706,614,0.096889,505.529,25,304.205,1.72384,2,2.18648,0.746812,0.836466,1.98385,0.421637
4,2,2,0,4,5,703,3,"[701,1,706,5]",48.48,1450.35,,,,,0.040078,20.8533,608,0.098001,499.467,25,304.205,1.71043,2,2.18648,0.888374,1.18294,1.50796,0.784465


### Objects_Population - Cells Selected

In [10]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the Cells Selected Data 
def load_cells_selected_data(filepath):
    """Load and parse Cells Selected data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        raise ValueError("No [Data] section found")
    
    # Parse data section
    data_lines = lines[data_start:]
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - Cells Selected.txt'
df, metadata = load_cells_selected_data(filepath)

print("=== CELLS SELECTED DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION
print("\n METADATA:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

# 2. BASIC DATASET INFORMATION
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique())}")

# 8. CELLS SELECTED MEASUREMENTS OVERVIEW
print(f"\n CELLS SELECTED MEASUREMENTS:")
cells_selected_cols = [col for col in df.columns if 'cells selected' in col.lower()]
print(f"  Number of cells selected-specific columns: {len(cells_selected_cols)}")
for col in cells_selected_cols:
    print(f"    • {col}")

# 9. FIRST 5 ROWS PREVIEW
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())

=== CELLS SELECTED DATASET EDA ===

 METADATA:
  Database Name: Phenix2
  Database Link: http://129.67.91.135/ODA/OdaService.asmx
  Evaluation Signature: 9a21bf12-06a9-475a-b54d-ebac2dcfb7c5
  Plate Name: AGI_F1_P1_baseline
  Measurement: Measurement 1
  Evaluation: Evaluation3
  Population: Population - Cells Selected

 DATASET OVERVIEW:
  Shape: (25863, 28)
  Total Objects: 25,863
  Total Columns: 28
  Memory Usage: 8.97 MB

 COLUMN NAMES (28 columns):
   1. Row
   2. Column
   3. Timepoint
   4. Field
   5. Object No
   6. X
   7. Y
   8. Bounding Box
   9. Position X [µm]
  10. Position Y [µm]
  11. Compound
  12. Concentration
  13. Cell Type
  14. Cell Count
  15. Cells Selected - ROI No
  16. Cells Selected - Cell Area [µm²]
  17. Cells Selected - Cell Roundness
  18. Cells Selected - Cell Ratio Width to Length
  19. Cells Selected - Object No in Cells
  20. Cells Selected - Intensity Cell Alexa 568 Mean
  21. Cells Selected - Total Spot Area
  22. Cells Selected - Relative Spot

Unnamed: 0,Row,Column,Timepoint,Field,Object No,X,Y,Bounding Box,Position X [µm],Position Y [µm],Compound,Concentration,Cell Type,Cell Count,Cells Selected - ROI No,Cells Selected - Cell Area [µm²],Cells Selected - Cell Roundness,Cells Selected - Cell Ratio Width to Length,Cells Selected - Object No in Cells,Cells Selected - Intensity Cell Alexa 568 Mean,Cells Selected - Total Spot Area,Cells Selected - Relative Spot Intensity,Cells Selected - Number of Spots,Cells Selected - Number of Spots per Area of Cell,Cells Selected - Total Spot Area (2),Cells Selected - Relative Spot Intensity (2),Cells Selected - Number of Spots (2),Cells Selected - Number of Spots per Area of Cell (2)
0,2,2,0,4,1,599,25,"[558,1,641,53]",17.7,1445.36,,,,,3,206.929,0.442996,0.33895,1,821.768,80,0.006622,3,0.001268,70,0.011829,4,0.001691
1,2,2,0,4,2,651,41,"[554,1,726,115]",28.3,1435.34,,,,,3,713.406,0.391785,0.252081,2,1296.34,2381,0.030461,84,0.010298,3001,0.054275,161,0.019738
2,2,2,0,4,3,927,20,"[872,1,1008,40]",119.09,1446.71,,,,,1,300.161,0.504446,0.264856,3,1213.46,520,0.091029,23,0.006702,1385,0.05891,76,0.022145
3,2,2,0,4,4,839,39,"[797,2,963,99]",99.88,1437.43,,,,,2,740.781,0.487663,0.360514,4,1508.27,2499,0.123819,82,0.009681,3615,0.076034,178,0.021015
4,2,2,0,4,5,771,57,"[711,13,882,120]",75.22,1430.53,,,,,4,725.301,0.475919,0.329012,5,1777.62,1919,0.034224,66,0.007959,3859,0.085783,206,0.02484


### Objects_Population - Cells

In [12]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the Cells Data 
def load_cells_data(filepath):
    """Load and parse Cells data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        raise ValueError("No [Data] section found")
    
    # Parse data section
    data_lines = lines[data_start:]
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - Cells.txt'
df, metadata = load_cells_data(filepath)

print("=== CELLS DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION
print("\n METADATA:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

# 2. BASIC DATASET INFORMATION
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS 
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES 
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique())}")

# 8. CELLS MEASUREMENTS OVERVIEW 
print(f"\n CELLS MEASUREMENTS:")
cells_cols = [col for col in df.columns if 'cells' in col.lower()]
print(f"  Number of cells-specific columns: {len(cells_cols)}")
for col in cells_cols:
    print(f"    • {col}")

# 9. FIRST 5 ROWS PREVIEW 
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())

=== CELLS DATASET EDA ===

 METADATA:
  Database Name: Phenix2
  Database Link: http://129.67.91.135/ODA/OdaService.asmx
  Evaluation Signature: 9a21bf12-06a9-475a-b54d-ebac2dcfb7c5
  Plate Name: AGI_F1_P1_baseline
  Measurement: Measurement 1
  Evaluation: Evaluation3
  Population: Population - Cells

 DATASET OVERVIEW:
  Shape: (26814, 19)
  Total Objects: 26,814
  Total Columns: 19
  Memory Usage: 7.46 MB

 COLUMN NAMES (19 columns):
   1. Row
   2. Column
   3. Timepoint
   4. Field
   5. Object No
   6. X
   7. Y
   8. Bounding Box
   9. Position X [µm]
  10. Position Y [µm]
  11. Compound
  12. Concentration
  13. Cell Type
  14. Cell Count
  15. Cells - ROI No
  16. Cells - Cell Area [µm²]
  17. Cells - Cell Roundness
  18. Cells - Cell Ratio Width to Length
  19. Cells - Cells Selected

 DATA TYPES:
int64      9
float64    7
object     3
Name: count, dtype: int64

 DETAILED COLUMN INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26814 entries, 0 to 26813
Data columns (to

Unnamed: 0,Row,Column,Timepoint,Field,Object No,X,Y,Bounding Box,Position X [µm],Position Y [µm],Compound,Concentration,Cell Type,Cell Count,Cells - ROI No,Cells - Cell Area [µm²],Cells - Cell Roundness,Cells - Cell Ratio Width to Length,Cells - Cells Selected
0,2,2,0,4,1,599,25,"[558,1,641,53]",17.7,1445.36,,,,,3,206.929,0.442996,0.33895,1
1,2,2,0,4,2,651,41,"[554,1,726,115]",28.3,1435.34,,,,,3,713.406,0.391785,0.252081,1
2,2,2,0,4,3,927,20,"[872,1,1008,40]",119.09,1446.71,,,,,1,300.161,0.504446,0.264856,1
3,2,2,0,4,4,839,39,"[797,2,963,99]",99.88,1437.43,,,,,2,740.781,0.487663,0.360514,1
4,2,2,0,4,5,771,57,"[711,13,882,120]",75.22,1430.53,,,,,4,725.301,0.475919,0.329012,1


### Objects_Population - EEA1 Spots

In [14]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the EEA1 Spots Data 
def load_eea1_spots_data(filepath):
    """Load and parse EEA1 Spots data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        raise ValueError("No [Data] section found")
    
    # Parse data section
    data_lines = lines[data_start:]
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - EEA1 Spots.txt'
df, metadata = load_eea1_spots_data(filepath)

print("=== EEA1 SPOTS DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION 
print("\n METADATA:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

# 2. BASIC DATASET INFORMATION 
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION 
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION 
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS 
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique())}")

# 8. EEA1 SPOTS MEASUREMENTS OVERVIEW 
print(f"\n EEA1 SPOTS MEASUREMENTS:")
eea1_cols = [col for col in df.columns if 'eea1' in col.lower()]
print(f"  Number of eea1-specific columns: {len(eea1_cols)}")
for col in eea1_cols:
    print(f"    • {col}")

# 9. FIRST 5 ROWS PREVIEW
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())

=== EEA1 SPOTS DATASET EDA ===

 METADATA:
  Database Name: Phenix2
  Database Link: http://129.67.91.135/ODA/OdaService.asmx
  Evaluation Signature: 9a21bf12-06a9-475a-b54d-ebac2dcfb7c5
  Plate Name: AGI_F1_P1_baseline
  Measurement: Measurement 1
  Evaluation: Evaluation3
  Population: Population - EEA1 Spots

 DATASET OVERVIEW:
  Shape: (3260711, 28)
  Total Objects: 3,260,711
  Total Columns: 28
  Memory Usage: 1131.71 MB

 COLUMN NAMES (28 columns):
   1. Row
   2. Column
   3. Timepoint
   4. Field
   5. Object No
   6. X
   7. Y
   8. Bounding Box
   9. Position X [µm]
  10. Position Y [µm]
  11. Compound
  12. Concentration
  13. Cell Type
  14. Cell Count
  15. EEA1 Spots - Relative Spot  Intensity
  16. EEA1 Spots - Corrected Spot  Intensity
  17. EEA1 Spots - Uncorrected Spot  Peak Intensity
  18. EEA1 Spots - Spot Contrast
  19. EEA1 Spots - Spot Background Intensity
  20. EEA1 Spots - Spot Area [px²]
  21. EEA1 Spots - Region Intensity
  22. EEA1 Spots - Spot To Region Int

Unnamed: 0,Row,Column,Timepoint,Field,Object No,X,Y,Bounding Box,Position X [µm],Position Y [µm],Compound,Concentration,Cell Type,Cell Count,EEA1 Spots - Relative Spot Intensity,EEA1 Spots - Corrected Spot Intensity,EEA1 Spots - Uncorrected Spot Peak Intensity,EEA1 Spots - Spot Contrast,EEA1 Spots - Spot Background Intensity,EEA1 Spots - Spot Area [px²],EEA1 Spots - Region Intensity,EEA1 Spots - Spot To Region Intensity,EEA1 Spots - Object No in Cells Selected,EEA1 Spots - EEA1 Spot Area [µm²],EEA1 Spots - EEA1 Spot Roundness,EEA1 Spots - EEA1 Spot Width [µm],EEA1 Spots - EEA1 Spot Length [µm],EEA1 Spots - EEA1 Spot Ratio Width to Length
0,2,2,0,4,1,617,34,"[616,33,620,36]",23.12,1441.07,,,,,0.10588,42.7292,698,0.318432,360.833,16,156.062,2.58591,1,1.39935,0.833926,0.836466,1.47868,0.565685
1,2,2,0,4,2,616,39,"[614,37,618,41]",22.59,1439.74,,,,,0.237928,116.611,946,0.433876,373.5,18,156.062,3.14049,1,1.57427,0.923065,1.18294,1.47868,0.8
2,2,2,0,4,3,610,44,"[607,43,612,47]",20.55,1437.96,,,,,0.105363,38.1875,555,0.26244,324.25,16,156.062,2.32239,1,1.39935,0.833926,0.836466,1.59258,0.525226
3,2,2,0,4,4,611,47,"[609,45,614,50]",21.23,1437.17,,,,,0.116744,48.7115,560,0.206197,368.538,20,156.062,2.67361,1,1.74919,0.901653,1.18294,1.50796,0.784465
4,2,2,0,4,5,689,3,"[686,1,691,5]",44.13,1450.4,,,,,0.141619,86.9341,922,0.272669,526.923,21,391.894,1.56638,2,1.83665,0.938088,1.18294,1.50796,0.784465


### Objects_Population - Nuclei (2)

In [16]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the Nuclei Data 
def load_nuclei_data(filepath):
    """Load and parse Nuclei data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        raise ValueError("No [Data] section found")
    
    # Parse data section
    data_lines = lines[data_start:]
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/Objects_Population - Nuclei (2).txt'
df, metadata = load_nuclei_data(filepath)

print("=== NUCLEI DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION
print("\n METADATA:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

# 2. BASIC DATASET INFORMATION
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique())}")

# 8. NUCLEI MEASUREMENTS OVERVIEW
print(f"\n NUCLEI MEASUREMENTS:")
nuclei_cols = [col for col in df.columns if 'nuclei' in col.lower()]
print(f"  Number of nuclei-specific columns: {len(nuclei_cols)}")
for col in nuclei_cols:
    print(f"    • {col}")

# 9. FIRST 5 ROWS PREVIEW
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())

=== NUCLEI DATASET EDA ===

 METADATA:
  Database Name: Phenix2
  Database Link: http://129.67.91.135/ODA/OdaService.asmx
  Evaluation Signature: 9a21bf12-06a9-475a-b54d-ebac2dcfb7c5
  Plate Name: AGI_F1_P1_baseline
  Measurement: Measurement 1
  Evaluation: Evaluation3
  Population: Population - Nuclei (2)

 DATASET OVERVIEW:
  Shape: (21126, 14)
  Total Objects: 21,126
  Total Columns: 14
  Memory Usage: 5.07 MB

 COLUMN NAMES (14 columns):
   1. Row
   2. Column
   3. Timepoint
   4. Field
   5. Object No
   6. X
   7. Y
   8. Bounding Box
   9. Position X [µm]
  10. Position Y [µm]
  11. Compound
  12. Concentration
  13. Cell Type
  14. Cell Count

 DATA TYPES:
int64      7
float64    4
object     3
Name: count, dtype: int64

 DETAILED COLUMN INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21126 entries, 0 to 21125
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Row              21126 non-nul

Unnamed: 0,Row,Column,Timepoint,Field,Object No,X,Y,Bounding Box,Position X [µm],Position Y [µm],Compound,Concentration,Cell Type,Cell Count
0,2,2,0,4,1,933,24,"[872,1,1008,87]",118.9,1442.85,,,,
1,2,2,0,4,2,841,34,"[799,2,948,99]",96.25,1438.18,,,,
2,2,2,0,4,3,628,37,"[554,1,726,115]",25.91,1437.6,,,,
3,2,2,0,4,4,777,57,"[711,13,882,120]",75.36,1430.69,,,,
4,2,2,0,4,5,826,262,"[683,136,938,355]",79.93,1379.5,,,,


### PlateResults

In [18]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the PlateResults Data 
def load_plate_results_data(filepath):
    """Load and parse PlateResults data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        raise ValueError("No [Data] section found")
    
    # Parse data section
    data_lines = lines[data_start:]
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/Evaluation3/PlateResults.txt'
df, metadata = load_plate_results_data(filepath)

print("=== PLATE RESULTS DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION
print("\n METADATA:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

# 2. BASIC DATASET INFORMATION
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique())}")

# 8. PLATE RESULTS MEASUREMENTS OVERVIEW
print(f"\n PLATE RESULTS MEASUREMENTS:")
# Count different measurement types
cells_cols = [col for col in df.columns if 'cells' in col.lower()]
bodipy_cols = [col for col in df.columns if 'bodipy' in col.lower()]
eea1_cols = [col for col in df.columns if 'eea1' in col.lower()]
nuclei_cols = [col for col in df.columns if 'nuclei' in col.lower()]

print(f"  Number of cells-related columns: {len(cells_cols)}")
print(f"  Number of bodipy-related columns: {len(bodipy_cols)}")
print(f"  Number of eea1-related columns: {len(eea1_cols)}")
print(f"  Number of nuclei-related columns: {len(nuclei_cols)}")

print(f"\n  Well-level aggregated measurements:")
for col in df.columns:
    if 'mean per well' in col.lower() or 'number of objects' in col.lower():
        print(f"    • {col}")

# 9. FIRST 5 ROWS PREVIEW
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())

=== PLATE RESULTS DATASET EDA ===

 METADATA:
  Database Name: Phenix2
  Database Location: http://129.67.91.135/ODA/OdaService.asmx
  Evaluation Signature: 9a21bf12-06a9-475a-b54d-ebac2dcfb7c5
  Plate Name: AGI_F1_P1_baseline
  Measurement: Measurement 1
  Evaluation: Evaluation3

 DATASET OVERVIEW:
  Shape: (60, 58)
  Total Objects: 60
  Total Columns: 58
  Memory Usage: 0.03 MB

 COLUMN NAMES (58 columns):
   1. Row
   2. Column
   3. Timepoint
   4. Cells - Number of Objects
   5. Cells - Cell Area [µm²] - Mean per Well
   6. Cells - Cell Roundness - Mean per Well
   7. Cells - Cell Ratio Width to Length - Mean per Well
   8. Cells - Cells Selected - Mean per Well
   9. Nuclei (2) - Number of Objects
  10. bodipy Spots - Number of Objects
  11. bodipy Spots - Relative Spot  Intensity - Mean per Well
  12. bodipy Spots - Corrected Spot  Intensity - Mean per Well
  13. bodipy Spots - Uncorrected Spot  Peak Intensity - Mean per Well
  14. bodipy Spots - Spot Contrast - Mean per Well
 

Unnamed: 0,Row,Column,Timepoint,Cells - Number of Objects,Cells - Cell Area [µm²] - Mean per Well,Cells - Cell Roundness - Mean per Well,Cells - Cell Ratio Width to Length - Mean per Well,Cells - Cells Selected - Mean per Well,Nuclei (2) - Number of Objects,bodipy Spots - Number of Objects,bodipy Spots - Relative Spot Intensity - Mean per Well,bodipy Spots - Corrected Spot Intensity - Mean per Well,bodipy Spots - Uncorrected Spot Peak Intensity - Mean per Well,bodipy Spots - Spot Contrast - Mean per Well,bodipy Spots - Spot Background Intensity - Mean per Well,bodipy Spots - Spot Area [px²] - Mean per Well,bodipy Spots - Region Intensity - Mean per Well,bodipy Spots - Spot To Region Intensity - Mean per Well,bodipy Spots - bodipy Spot Area [µm²] - Mean per Well,bodipy Spots - bodipy Spot Roundness - Mean per Well,bodipy Spots - bodipy Spot Width [µm] - Mean per Well,bodipy Spots - bodipy Spot Length [µm] - Mean per Well,bodipy Spots - bodipy Spot Ratio Width to Length - Mean per Well,Cells Selected - Number of Objects,Cells Selected - Cell Area [µm²] - Mean per Well,Cells Selected - Cell Roundness - Mean per Well,Cells Selected - Cell Ratio Width to Length - Mean per Well,Cells Selected - Intensity Cell Alexa 568 Mean - Mean per Well,Cells Selected - Total Spot Area - Mean per Well,Cells Selected - Relative Spot Intensity - Mean per Well,Cells Selected - Number of Spots - Mean per Well,Cells Selected - Number of Spots per Area of Cell - Mean per Well,Cells Selected - Total Spot Area (2) - Mean per Well,Cells Selected - Relative Spot Intensity (2) - Mean per Well,Cells Selected - Number of Spots (2) - Mean per Well,Cells Selected - Number of Spots per Area of Cell (2) - Mean per Well,EEA1 Spots - Number of Objects,EEA1 Spots - Relative Spot Intensity - Mean per Well,EEA1 Spots - Corrected Spot Intensity - Mean per Well,EEA1 Spots - Uncorrected Spot Peak Intensity - Mean per Well,EEA1 Spots - Spot Contrast - Mean per Well,EEA1 Spots - Spot Background Intensity - Mean per Well,EEA1 Spots - Spot Area [px²] - Mean per Well,EEA1 Spots - Region Intensity - Mean per Well,EEA1 Spots - Spot To Region Intensity - Mean per Well,EEA1 Spots - EEA1 Spot Area [µm²] - Mean per Well,EEA1 Spots - EEA1 Spot Roundness - Mean per Well,EEA1 Spots - EEA1 Spot Width [µm] - Mean per Well,EEA1 Spots - EEA1 Spot Length [µm] - Mean per Well,EEA1 Spots - EEA1 Spot Ratio Width to Length - Mean per Well,Cells - Cell Area [µm²] - CV % per Well,Cells - Cell Roundness - CV % per Well,Number of Analyzed Fields,Time [s],Compound,Concentration,Cell Type,Cell Count
0,2,2,0,533,582.026863,0.440864,0.328407,0.969981,467,30413,0.083764,104.202258,1191.465656,0.174452,750.194039,28.782757,571.257901,1.491055,2.517323,0.891551,1.165925,1.95996,0.620092,517,539.582526,0.442366,0.326109,2133.576812,1693.172147,0.051935,58.825919,0.009416,2516.299807,0.065904,129.321083,0.020774,66859,0.095846,107.118056,1488.62919,0.217156,926.651312,19.457769,793.199065,1.315846,1.701765,0.918014,0.975921,1.541324,0.665181,63.278368,29.768467,17,0,,,,
1,2,3,0,121,662.01993,0.431524,0.335883,0.933884,112,7147,0.098671,107.143233,1252.025605,0.20654,761.699715,25.780607,565.320237,1.578402,2.254756,0.90714,1.117219,1.813681,0.642028,113,568.201257,0.443282,0.338669,3121.113133,1630.566372,0.055913,63.247788,0.009719,2377.424779,0.074887,119.150442,0.018093,13464,0.107664,174.608689,2110.07546,0.235218,1218.824739,19.953134,989.754601,1.444869,1.745089,0.918527,0.988419,1.561639,0.663249,71.14969,36.906947,17,0,,,,
2,2,4,0,396,578.041682,0.453058,0.315926,0.982323,373,23716,0.092445,102.967322,1191.614185,0.192123,735.09621,27.314218,526.319295,1.598775,2.388885,0.89996,1.144636,1.879431,0.634887,389,546.413037,0.454272,0.313865,3369.101623,1665.254499,0.055351,60.966581,0.009714,2697.897172,0.066975,130.526992,0.020612,50775,0.094715,152.621664,2124.976681,0.212012,1339.401414,20.669266,1168.126448,1.285888,1.807722,0.912817,1.000486,1.602414,0.655684,62.717572,31.150841,17,0,,,,
3,2,5,0,416,581.098735,0.423925,0.331768,0.96875,389,19536,0.081822,82.88936,1057.186937,0.173247,683.750483,28.727836,486.7096,1.573015,2.512519,0.893034,1.167395,1.954996,0.622956,403,514.82616,0.425498,0.33053,2789.923092,1392.622829,0.042272,48.476427,0.008315,2281.558313,0.061403,107.183623,0.018144,43195,0.090043,179.935161,2610.692997,0.203378,1682.141092,21.286445,1398.533837,1.357918,1.8617,0.908862,1.010471,1.634719,0.648733,85.239877,33.299688,17,0,,,,
4,2,6,0,273,689.776246,0.433958,0.309916,0.934066,181,16491,0.083715,59.587104,807.231399,0.181757,520.731277,26.101631,396.150079,1.475501,2.282833,0.898242,1.118497,1.845361,0.630124,255,575.053767,0.439035,0.306433,2861.021835,1688.007843,0.039659,64.670588,0.009652,2839.247059,0.066876,139.639216,0.020718,35608,0.093353,128.689164,1814.496939,0.210945,1142.906826,20.332734,992.715985,1.292735,1.778289,0.912838,0.990838,1.588063,0.655366,80.32103,34.438496,17,0,,,,


### indexfile

In [20]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the IndexFile Data 
def load_indexfile_data(filepath):
    """Load and parse IndexFile data from Phenix2 format"""
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Find [Data] section
    data_start = None
    metadata = {}
    
    for i, line in enumerate(lines):
        if line.strip() == '[Data]':
            data_start = i + 1
            break
        elif '\t' in line and not line.startswith('['):
            parts = line.strip().split('\t', 1)
            if len(parts) == 2:
                metadata[parts[0]] = parts[1]
    
    if data_start is None:
        # If no [Data] section found, assume the whole file is data with header
        data_start = 0
    
    # Parse data section
    data_lines = lines[data_start:]
    if not data_lines:
        return pd.DataFrame(), metadata
        
    header = data_lines[0].strip().split('\t')
    
    data_rows = []
    for line in data_lines[1:]:
        if line.strip():
            row = line.strip().split('\t')
            while len(row) < len(header):
                row.append('')
            data_rows.append(row[:len(header)])
    
    df = pd.DataFrame(data_rows, columns=header)
    
    # Convert numeric columns
    for col in df.columns:
        if col not in ['Compound', 'Cell Type', 'Bounding Box', 'URL', 'Channel Name', 'Channel Type', 'Time Stamp']:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    
    return df, metadata

# Load the data
filepath = '/Users/zhuangzhuang/Desktop/Data Science Project/Task2/Extracted Features/AGI_F1_P1_baseline__2025-02-07T12_52_48-Measurement 1/indexfile.txt'
df, metadata = load_indexfile_data(filepath)

print("=== INDEXFILE DATASET EDA ===")
print("=" * 50)

# 1. METADATA INFORMATION
print("\n METADATA:")
if metadata:
    for key, value in metadata.items():
        print(f"  {key}: {value}")
else:
    print("  No metadata found (file contains only data)")

# 2. BASIC DATASET INFORMATION 
print(f"\n DATASET OVERVIEW:")
print(f"  Shape: {df.shape}")
print(f"  Total Objects: {len(df):,}")
print(f"  Total Columns: {len(df.columns)}")
print(f"  Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 3. COLUMN INFORMATION 
print(f"\n COLUMN NAMES ({len(df.columns)} columns):")
for i, col in enumerate(df.columns, 1):
    print(f"  {i:2d}. {col}")

print(f"\n DATA TYPES:")
print(df.dtypes.value_counts())

print(f"\n DETAILED COLUMN INFO:")
df.info()

# 4. ROW INDEX INFORMATION
print(f"\n ROW INDEX:")
print(f"  Index type: {type(df.index)}")
print(f"  Index range: {df.index.min()} to {df.index.max()}")
print(f"  Index values (first 10): {list(df.index[:10])}")

# 5. BASIC STATISTICS 
print(f"\n NUMERIC COLUMNS SUMMARY:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"  Number of numeric columns: {len(numeric_cols)}")

# Show descriptive statistics for key columns
key_columns = [col for col in numeric_cols if any(word in col.lower() 
               for word in ['intensity', 'area', 'roundness', 'contrast', 'resolution', 'size', 'position'])]

if key_columns:
    print(f"\n KEY MEASUREMENTS STATISTICS:")
    print(df[key_columns[:8]].describe().round(4))

# 6. MISSING VALUES 
print(f"\n MISSING VALUES:")
missing_counts = df.isnull().sum()
missing_percent = (missing_counts / len(df) * 100).round(2)
missing_df = pd.DataFrame({
    'Column': missing_counts.index,
    'Missing_Count': missing_counts.values,
    'Missing_Percent': missing_percent.values
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df.to_string(index=False))
else:
    print("  No missing values found!")

# 7. UNIQUE VALUES IN KEY COLUMNS
print(f"\n UNIQUE VALUES IN KEY COLUMNS:")
key_categorical = ['Row', 'Column', 'Field', 'Timepoint', 'Plane', 'Channel ID', 'Channel Name', 'Channel Type']
for col in key_categorical:
    if col in df.columns:
        unique_vals = df[col].nunique()
        print(f"  {col}: {unique_vals} unique values {sorted(df[col].unique()) if unique_vals <= 20 else str(df[col].nunique()) + ' values'}")

# 8. IMAGE INDEX MEASUREMENTS OVERVIEW 
print(f"\n IMAGE INDEX MEASUREMENTS:")
image_cols = [col for col in df.columns if any(word in col.lower() for word in ['image', 'channel', 'resolution', 'size', 'position', 'url', 'time'])]
print(f"  Number of image-related columns: {len(image_cols)}")
for col in image_cols:
    print(f"    • {col}")

# Image analysis specific insights
if 'Channel Name' in df.columns:
    print(f"\n  Channel Information:")
    channel_counts = df['Channel Name'].value_counts()
    for channel, count in channel_counts.items():
        print(f"    • {channel}: {count} images")

if 'Field' in df.columns and 'Row' in df.columns and 'Column' in df.columns:
    total_wells = df.groupby(['Row', 'Column']).ngroups
    total_fields = df['Field'].nunique()
    total_images = len(df)
    print(f"\n  Imaging Summary:")
    print(f"    • Total wells: {total_wells}")
    print(f"    • Fields per well: {total_fields}")
    print(f"    • Total images: {total_images}")
    print(f"    • Images per well: {total_images // total_wells if total_wells > 0 else 'N/A'}")

# 9. FIRST 5 ROWS PREVIEW
print(f"\n FIRST 5 ROWS PREVIEW:")
display(df.head())

=== INDEXFILE DATASET EDA ===

 METADATA:
  Row: Column	Plane	Timepoint	Field	Channel ID	Channel Name	Channel Type	URL	ImageResolutionX [m]	ImageResolutionY [m]	ImageSizeX	ImageSizeY	PositionX [m]	PositionY [m]	Time Stamp
  2: 11	3	0	17	5	Alexa 647	Fluorescence	http://129.67.91.135/ODA/Images/C/ac4b010d-d0dd-4c82-a957-c8e9e59e28fc/28-101-56-14-815-1014-409-211.tiff	2.98988040478381E-07	2.98988040478381E-07	1080	1080	0.000645814	-0.001937443	2025-02-07T13:53:23.520+00:00
  3: 11	3	0	17	5	Alexa 647	Fluorescence	http://129.67.91.135/ODA/Images/C/ac4b010d-d0dd-4c82-a957-c8e9e59e28fc/28-101-56-14-815-1015-409-211.tiff	2.98988040478381E-07	2.98988040478381E-07	1080	1080	0.000645814	-0.001937443	2025-02-07T13:52:23.117+00:00
  4: 11	3	0	17	5	Alexa 647	Fluorescence	http://129.67.91.135/ODA/Images/C/ac4b010d-d0dd-4c82-a957-c8e9e59e28fc/28-101-56-14-815-1008-409-211.tiff	2.98988040478381E-07	2.98988040478381E-07	1080	1080	0.000645814	-0.001937443	2025-02-07T13:51:23.040+00:00
  5: 11	3	0	17	5	Al

Unnamed: 0,Row,Column,Plane,Timepoint,Field,Channel ID,Channel Name,Channel Type,URL,ImageResolutionX [m],ImageResolutionY [m],ImageSizeX,ImageSizeY,PositionX [m],PositionY [m],Time Stamp
0,2,2,1,0,1,1,DAPI,Fluorescence,http://129.67.91.135/ODA/Images/C/ac4b010d-d0d...,2.98988e-07,2.98988e-07,1080,1080,0.000646,0.0,2025-02-07T12:52:59.070+00:00
1,2,2,1,0,1,2,Alexa 568,Fluorescence,http://129.67.91.135/ODA/Images/C/ac4b010d-d0d...,2.98988e-07,2.98988e-07,1080,1080,0.000646,0.0,2025-02-07T12:52:59.070+00:00
2,2,2,1,0,1,3,Brightfield,Brightfield,http://129.67.91.135/ODA/Images/C/ac4b010d-d0d...,2.98988e-07,2.98988e-07,1080,1080,0.000646,0.0,2025-02-07T12:52:59.087+00:00
3,2,2,1,0,1,4,Alexa 488,Fluorescence,http://129.67.91.135/ODA/Images/C/ac4b010d-d0d...,2.98988e-07,2.98988e-07,1080,1080,0.000646,0.0,2025-02-07T12:52:59.447+00:00
4,2,2,1,0,1,5,Alexa 647,Fluorescence,http://129.67.91.135/ODA/Images/C/ac4b010d-d0d...,2.98988e-07,2.98988e-07,1080,1080,0.000646,0.0,2025-02-07T12:52:59.460+00:00
