In [77]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

# Load your data (assume already loaded as df)
df = pd.read_csv('patient_level_comprehensive_features.csv') 

df = df.drop(columns=['patient_id'])


#data pre-prcessing: need to convert clinical data using one-hot numeric arrays? 
columns_by_index = ['age_at_mri', 'sex', 'overall_survival_days', 'mgmt_status', 'mgmt_index', '1p19q', 'idh_status', 'eor']

for column in columns_by_index:
    unique_values = df[column].unique()
    #print(f"Unique values in column '{column}': {unique_values}")

#Notes-what to do with unknown data in categorical data and in 'overall_survival_days' 'mgmt_index'?

#all categorical data being encoded, unknowns included
df_encoded = pd.get_dummies(df, columns=['sex', 'mgmt_status', '1p19q', 'idh_status', 'eor'], drop_first=False)

print("\nOne-Hot Encoded DataFrame (Pandas):")
print(df_encoded)
    


#testing data w/out clinical features 
#indices_to_drop = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
#columns_by_index = df.columns[indices_to_drop].tolist()
#df_dropped = df.drop(columns=columns_by_index)


#df = df.drop(columns=[indices_to_drop])
#print(df_dropped)








One-Hot Encoded DataFrame (Pandas):
     who_grade  age_at_mri  overall_survival_days mgmt_index   fa_mean  \
0            4          66                 1303.0          0  0.253438   
1            4          80                  274.0    unknown  0.225728   
2            4          70                  417.0    unknown  0.245633   
3            4          70                  185.0          0  0.270115   
4            4          68                  389.0          0  0.200340   
..         ...         ...                    ...        ...       ...   
495          4          58                  267.0         16  0.256128   
496          4          64                  334.0         16  0.253504   
497          4          41                  287.0          0  0.263215   
498          4          72                  291.0         12  0.244807   
499          2          70                  215.0         17  0.242826   

       fa_std  fa_min    fa_max  fa_median     fa_p5  ...  \
0    0.163584

0       True
1      False
2       True
3       True
4      False
       ...  
495    False
496     True
497    False
498     True
499    False
Name: sex_M, Length: 500, dtype: bool


In [92]:
#replacing NAs and unknowns with 0s, change before submitting 

#note that survival days has missing value-what to do with that? 

missing_counts = df_encoded.isna().sum()



#print(missing_counts)
df_filled = df_encoded.fillna(0)

#confirm no NAs 
#print(sum(df_filled.isna().sum()>0))

#replacing unknowns in clinical data with 0, change before final 
df_clean = df_filled.replace('unknown', '0')
print(df_clean)





     who_grade  age_at_mri  overall_survival_days mgmt_index   fa_mean  \
0            4          66                 1303.0          0  0.253438   
1            4          80                  274.0          0  0.225728   
2            4          70                  417.0          0  0.245633   
3            4          70                  185.0          0  0.270115   
4            4          68                  389.0          0  0.200340   
..         ...         ...                    ...        ...       ...   
495          4          58                  267.0         16  0.256128   
496          4          64                  334.0         16  0.253504   
497          4          41                  287.0          0  0.263215   
498          4          72                  291.0         12  0.244807   
499          2          70                  215.0         17  0.242826   

       fa_std  fa_min    fa_max  fa_median     fa_p5  ...  \
0    0.163584     0.0  1.192218   0.212847  0.0591

In [94]:
X = df_clean.drop(columns=['who_grade']) #replace to df 
print(X)
y = df_clean['who_grade']
print(y)


     age_at_mri  overall_survival_days mgmt_index   fa_mean    fa_std  fa_min  \
0            66                 1303.0          0  0.253438  0.163584     0.0   
1            80                  274.0          0  0.225728  0.150805     0.0   
2            70                  417.0          0  0.245633  0.146553     0.0   
3            70                  185.0          0  0.270115  0.155586     0.0   
4            68                  389.0          0  0.200340  0.136351     0.0   
..          ...                    ...        ...       ...       ...     ...   
495          58                  267.0         16  0.256128  0.145090     0.0   
496          64                  334.0         16  0.253504  0.164926     0.0   
497          41                  287.0          0  0.263215  0.164676     0.0   
498          72                  291.0         12  0.244807  0.163983     0.0   
499          70                  215.0         17  0.242826  0.170698     0.0   

       fa_max  fa_median   

In [95]:
# Split into train/test with stratification to maintain class distribution
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=42
)
print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)






X_train (400, 75)
X_test (100, 75)
y_train (400,)
y_test (100,)


In [96]:
# Compute class weights to address imbalance
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, weights))



In [97]:
# Define the pipeline (scaling + random forest)
pipeline = Pipeline([
('scaler', StandardScaler()), # optional depending on feature types
('rf', RandomForestClassifier(
n_estimators=100,    #if do more, could stabilize??
class_weight=class_weight_dict,
random_state=42,
n_jobs=-1
))
])



In [98]:
# Repeated stratified K-Fold cross-validation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)



In [99]:
# Cross-validation on training data
scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print(f'Cross-validated Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}')



Cross-validated Accuracy: 0.8755 ± 0.0206


In [100]:
# Fit on full training set
pipeline.fit(X_train, y_train)

# Evaluate on held-out test set
y_pred = pipeline.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           2       0.73      0.73      0.73        11
           3       0.00      0.00      0.00         9
           4       0.90      0.97      0.93        80

    accuracy                           0.86       100
   macro avg       0.54      0.57      0.55       100
weighted avg       0.80      0.86      0.83       100

Confusion Matrix:
[[ 8  1  2]
 [ 2  0  7]
 [ 1  1 78]]
