### Imports and variables

In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from pathlib import Path
import plotly.graph_objects as go
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
# Set random seeds for reproducibility
np.random.seed(42)

# data paths
csv_dir = Path("/Users/felipecolombelli/phd/liver-plastic/datasets")
hep_path = csv_dir / "hepg2"
f1 = hep_path / "df_SingleCell_AO_HEPG2_102912.csv"
f2 = hep_path / "df_SingleCell_AO_HEPG2_110341.csv"
f3 = hep_path / "df_SingleCell_AO_HEPG2_231222.csv"

### Load and preprocess data

In [12]:
# Preprocessing function
def preprocess_dataframe(df, nan_threshold=0.5):
    feature_columns = [col for col in df.columns if not col.startswith(('Metadata_', 'Image_', 'Cells_ObjectNumber'))]
    print("Selected feature columns:", feature_columns)
    print("Number of feature columns:", len(feature_columns))
    
    X = df[feature_columns]
    # print("Initial number of rows in X:", X.shape[0])
    # print("NaN count per column:\n", X.isna().sum())
    # print("Inf count per column:\n", np.isinf(X).sum())
    
    threshold = X.shape[0] * nan_threshold
    valid_columns = [col for col in X.columns if X[col].isna().sum() < threshold and np.isinf(X[col]).sum() < threshold]
    print("Valid columns after filtering (>50% valid data):", valid_columns)
    print("Number of valid columns:", len(valid_columns))
    
    if not valid_columns:
        raise ValueError("No valid columns remain after filtering.")
    
    X = X[valid_columns]
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())
    
    nan_count_after_fill = X.isna().sum().sum()
    print("NaN count after filling with median:", nan_count_after_fill)
    if nan_count_after_fill > 0:
        print("Warning: Some NaN values remain. Filling with zero.")
        X = X.fillna(0)
    
    if X.shape[0] == 0 or X.shape[1] == 0:
        raise ValueError("No rows/columns remain after preprocessing.")
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled, valid_columns


df1 = pd.read_csv(f1, sep=",", header=0)
X_scaled, valid_columns = preprocess_dataframe(df1)
labels = labels = df1["Metadata_concentration_perliter"].astype(str)

  df1 = pd.read_csv(f1, sep=",", header=0)


Selected feature columns: ['Cells_AreaShape_Area', 'Cells_AreaShape_BoundingBoxArea', 'Cells_AreaShape_BoundingBoxMaximum_X', 'Cells_AreaShape_BoundingBoxMaximum_Y', 'Cells_AreaShape_BoundingBoxMinimum_X', 'Cells_AreaShape_BoundingBoxMinimum_Y', 'Cells_AreaShape_Center_X', 'Cells_AreaShape_Center_Y', 'Cells_AreaShape_CentralMoment_0_0', 'Cells_AreaShape_CentralMoment_0_1', 'Cells_AreaShape_CentralMoment_0_2', 'Cells_AreaShape_CentralMoment_0_3', 'Cells_AreaShape_CentralMoment_1_0', 'Cells_AreaShape_CentralMoment_1_1', 'Cells_AreaShape_CentralMoment_1_2', 'Cells_AreaShape_CentralMoment_1_3', 'Cells_AreaShape_CentralMoment_2_0', 'Cells_AreaShape_CentralMoment_2_1', 'Cells_AreaShape_CentralMoment_2_2', 'Cells_AreaShape_CentralMoment_2_3', 'Cells_AreaShape_Compactness', 'Cells_AreaShape_ConvexArea', 'Cells_AreaShape_Eccentricity', 'Cells_AreaShape_EquivalentDiameter', 'Cells_AreaShape_EulerNumber', 'Cells_AreaShape_Extent', 'Cells_AreaShape_FormFactor', 'Cells_AreaShape_HuMoment_0', 'Cells

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, labels, test_size=0.2)

# Train the Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
accuracy_before = rf.score(X_test, y_test)
print(f'Accuracy before feature selection: {accuracy_before:.2f}')

Accuracy before feature selection: 0.34


In [58]:
# Extract feature importances
importances = rf.feature_importances_
feature_names = valid_columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Rank features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
display(feature_importance_df)

# Select top N features
top_features = feature_importance_df['Feature'][:17].index
X_train_selected = X_train[:, top_features]
X_test_selected = X_test[:, top_features]

Unnamed: 0,Feature,Importance
632,Nuclei_Correlation_RWC_AOGFP_AOPI,0.002780
633,Nuclei_Correlation_RWC_AOPI_AOGFP,0.002764
1299,Cytoplasm_Mean_Vesicles_Number_Object_Number,0.002710
627,Nuclei_Correlation_K_AOGFP_AOPI,0.002172
628,Nuclei_Correlation_K_AOPI_AOGFP,0.001543
...,...,...
161,Cells_Location_CenterMassIntensity_Z_AOPI,0.000000
164,Cells_Location_Center_Z,0.000000
169,Cells_Location_MaxIntensity_Z_AOGFP,0.000000
170,Cells_Location_MaxIntensity_Z_AOPI,0.000000


In [59]:
(feature_importance_df["Importance"] > 0).sum()

np.int64(1592)

In [60]:
feature_importance_df[:17]

Unnamed: 0,Feature,Importance
632,Nuclei_Correlation_RWC_AOGFP_AOPI,0.00278
633,Nuclei_Correlation_RWC_AOPI_AOGFP,0.002764
1299,Cytoplasm_Mean_Vesicles_Number_Object_Number,0.00271
627,Nuclei_Correlation_K_AOGFP_AOPI,0.002172
628,Nuclei_Correlation_K_AOPI_AOGFP,0.001543
1265,Cytoplasm_Intensity_MeanIntensityEdge_AOPI,0.001521
131,Cells_Intensity_LowerQuartileIntensity_AOPI,0.001456
430,Cells_Texture_SumAverage_AOGFP_5_01_256,0.001395
655,Nuclei_Intensity_IntegratedIntensityEdge_AOPI,0.001391
424,Cells_Texture_SumAverage_AOGFP_10_03_256,0.001357


In [61]:
# Train the Random Forest model with selected features
rf_selected = RandomForestClassifier(n_estimators=100)
rf_selected.fit(X_train_selected, y_train)

# Evaluate the model
accuracy_after = rf_selected.score(X_test_selected, y_test)
print(f'Accuracy after feature selection: {accuracy_after:.2f}')

Accuracy after feature selection: 0.34


In [64]:
pd.Series(rf_selected.predict(X_test_selected)).value_counts()

0        5345
0.5g     1000
1g        731
0.01g     506
1ug       467
1ng       385
1mg       356
0.1g      330
Name: count, dtype: int64

In [67]:
feature_importance_df[:17]['Feature'].values

array(['Nuclei_Correlation_RWC_AOGFP_AOPI',
       'Nuclei_Correlation_RWC_AOPI_AOGFP',
       'Cytoplasm_Mean_Vesicles_Number_Object_Number',
       'Nuclei_Correlation_K_AOGFP_AOPI',
       'Nuclei_Correlation_K_AOPI_AOGFP',
       'Cytoplasm_Intensity_MeanIntensityEdge_AOPI',
       'Cells_Intensity_LowerQuartileIntensity_AOPI',
       'Cells_Texture_SumAverage_AOGFP_5_01_256',
       'Nuclei_Intensity_IntegratedIntensityEdge_AOPI',
       'Cells_Texture_SumAverage_AOGFP_10_03_256',
       'Cells_Intensity_MeanIntensityEdge_AOPI',
       'Nuclei_Intensity_MeanIntensity_AOGFP',
       'Cells_Texture_SumAverage_AOGFP_5_00_256',
       'Cytoplasm_Intensity_UpperQuartileIntensity_AOGFP',
       'Cytoplasm_Texture_SumAverage_AOGFP_10_02_256',
       'Cells_Texture_SumAverage_AOGFP_5_03_256',
       'Nuclei_Granularity_10_AOPI'], dtype=object)