In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
# Read the tidied cell-cycle table from disk
df = pd.read_csv("/Users/mariahloehr/IICD/IICD/Data/cell_cycle_tidied.csv")

# Relabel 'M' as 'G2' so the two are treated as a single class
df['phase'] = df['phase'].replace({'M': 'G2'})

# Target is the phase label; features are every column except the label,
# 'age', and the two PHATE coordinates
y = df['phase']
non_feature_cols = ['phase', 'age', 'PHATE_1', 'PHATE_2']
X = df.drop(columns=non_feature_cols)

# Stratified 80:20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=949,
)

In [3]:
# Support-vector classifier with a linear kernel; probability estimates are
# left switched off
svm = SVC(
    kernel='linear',
    probability=False,
    random_state=949,
)

In [4]:
# Candidate values for the regularisation strength C
param_grid = dict(C=[0.001, 0.01, 0.1])

# 10-fold cross-validated search over the grid, scored on accuracy and
# run on all available cores
grid_search = GridSearchCV(
    svm,
    param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the selected setting and its cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters: {'C': 0.1}
Best cross-validation accuracy: 0.9549435028248588


In [5]:
# Take the tuned model from the search. grid_search was built without a
# `refit` argument, so scikit-learn's default (refit=True) applies and
# best_estimator_ has already been fitted on the whole training split with
# the best C; calling .fit() on it again would only repeat that work.
best_svm = grid_search.best_estimator_

# Predict labels for both splits
y_train_pred = best_svm.predict(X_train)
y_test_pred = best_svm.predict(X_test)

In [7]:
# Save results: record this model's test accuracies in the shared results table.
# The same CSV is read, updated, and written back in place.
results_path = "/Users/mariahloehr/IICD/IICD/Bar Plot/classification_results.csv"

# === Load existing results ===
results_df = pd.read_csv(results_path, index_col=0)

# === Compute accuracy ===
# accuracy_score comes from the import cell at the top of the notebook.
overall_acc = accuracy_score(y_test, y_test_pred)

# Per-phase accuracy: among the test rows of each true phase, the fraction
# predicted correctly. Taking the mean of the boolean "hit" column per group
# gives the same numbers as accuracy_score applied to each group, without
# going through groupby.apply on the grouping column (which pandas warns is
# deprecated).
df_test = pd.DataFrame({'true': y_test, 'pred': y_test_pred})
acc_per_phase = (df_test['true'] == df_test['pred']).groupby(df_test['true']).mean()

# === Insert values ===
model_name = "SVM-lin (full)"
results_df.loc[model_name, 'Overall'] = overall_acc

# Set per-phase accuracies (phases absent from the test split are skipped)
for phase in ['G0', 'G1', 'G2', 'S']:
    if phase in acc_per_phase.index:
        results_df.loc[model_name, phase] = acc_per_phase[phase]

# === Save updated file ===
results_df.to_csv(results_path)

  acc_per_phase = df_test.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))


In [9]:
def _accuracy_per_phase(frame):
    """Return, for each true phase, the fraction of rows predicted correctly.

    `frame` must have a 'true' and a 'pred' column. The result is a Series
    indexed by the distinct values of 'true'. It is computed as the per-group
    mean of the boolean "prediction matches label" column, which avoids
    groupby.apply on the grouping column (deprecated in pandas).
    """
    hits = frame['true'] == frame['pred']
    return hits.groupby(frame['true']).mean()


# Output results
print("\n=== Training Set ===")
print("Overall Accuracy:", accuracy_score(y_train, y_train_pred))

print("\n=== Test Set ===")
print("Overall Accuracy:", accuracy_score(y_test, y_test_pred))

# Accuracy per class
df_train = pd.DataFrame({'true': y_train, 'pred': y_train_pred})
accuracy_per_phase_train = _accuracy_per_phase(df_train)
print("Accuracy per phase (Train):")
print(accuracy_per_phase_train)

df_test = pd.DataFrame({'true': y_test, 'pred': y_test_pred})
accuracy_per_phase_test = _accuracy_per_phase(df_test)
print("Accuracy per phase (Test):")
print(accuracy_per_phase_test)

# Confusion Matrix (rows = true phase, columns = predicted phase, in the
# label order given below)
print("\nTest Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred, labels=["G0", "G1", "G2", "S"]))


=== Training Set ===
Overall Accuracy: 0.9840395480225989

=== Test Set ===
Overall Accuracy: 0.9559322033898305
Accuracy per phase (Train):
true
G0    0.985944
G1    0.983814
G2    0.977941
S     0.985721
dtype: float64
Accuracy per phase (Test):
true
G0    0.970509
G1    0.947950
G2    0.945378
S     0.960000
dtype: float64

Test Confusion Matrix
[[362   9   1   1]
 [  7 601   6  20]
 [  1   1 225  11]
 [  1   9  11 504]]


  accuracy_per_phase_train = df_train.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))
  accuracy_per_phase_test = df_test.groupby('true').apply(lambda x: accuracy_score(x['true'], x['pred']))
