In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [12]:
# Load data
df = pd.read_csv("/Users/mariahloehr/IICD/IICD/Data/cell_cycle_tidied.csv")

# Combine phase M and G2 into one class
df['phase'] = df['phase'].replace({'M': 'G2'})

le = LabelEncoder()
df['phase'] = le.fit_transform(df['phase']) 

# Separate features and target
X = df.drop(columns=['phase', 'age', 'PHATE_1', 'PHATE_2'])
y = df['phase']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=949)

In [13]:
# Initialize XGBoost Regressor
xgb_model = XGBClassifier(
    n_estimators=500,
    eta = 0.3,
    max_depth=50,
    tree_method = "hist",
    random_state=949
)

In [14]:
## hypertuning

param_grid = {
    'n_estimators': [500, 600, 700],              
    #'max_depth': [40, 50, 60],
    'eta': [0.05, 0.1, 0.3]          
}

# Set up GridSearch with 10-fold cross-validation optimizing for accuracy
grid_search = GridSearchCV(
    estimator= xgb_model,
    param_grid=param_grid,
    cv=10,
    scoring= 'accuracy',
    n_jobs=-1
)

# Fit the model to your data
grid_search.fit(X_train, y_train)

#Output best settings and best accuracy
print("Best parameters:", grid_search.best_params_)
#print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters: {'eta': 0.1, 'n_estimators': 600}


In [15]:
# retrain model
xgb_model = XGBClassifier(
    n_estimators=600,
    eta = 0.1,
    max_depth=50,
    tree_method = "hist",
    random_state=949
)

In [17]:
xgb_model.fit(X_train, y_train)

# Predict
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

In [19]:
y_train_labels = le.inverse_transform(y_train)
y_test_labels = le.inverse_transform(y_test)
y_train_pred_labels = le.inverse_transform(y_train_pred)
y_test_pred_labels = le.inverse_transform(y_test_pred)

# Output results
print("=== Training Set ===")
print("Overall Accuracy:", accuracy_score(y_train_labels, y_train_pred_labels))

print("\n=== Test Set ===")
print("Overall Accuracy:", accuracy_score(y_test_labels, y_test_pred_labels))

# Accuracy per phase (Train)
df_train = pd.DataFrame({'true': y_train_labels, 'pred': y_train_pred_labels})
accuracy_per_phase_train = df_train.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))

print("Accuracy per phase (Train):")
print(accuracy_per_phase_train)

# Accuracy per phase (Test)
df_test = pd.DataFrame({'true': y_test_labels, 'pred': y_test_pred_labels})
accuracy_per_phase_test = df_test.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))

print("\nAccuracy per phase (Test):")
print(accuracy_per_phase_test)

# Confusion Matrix
print("\nTest Confusion Matrix")
print(confusion_matrix(y_test_labels, y_test_pred_labels, labels=["G0", "G1", "G2", "S"]))

=== Training Set ===
Overall Accuracy: 1.0

=== Test Set ===
Overall Accuracy: 0.9796610169491525
Accuracy per phase (Train):
true
G0    1.0
G1    1.0
G2    1.0
S     1.0
dtype: float64

Accuracy per phase (Test):
true
G0    1.000000
G1    0.974603
G2    0.971660
S     0.974903
dtype: float64

Test Confusion Matrix
[[375   0   0   0]
 [  0 614   2  14]
 [  0   1 240   6]
 [  0   7   6 505]]


  accuracy_per_phase_train = df_train.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))
  accuracy_per_phase_test = df_test.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))
