In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import (
    train_test_split,
    RepeatedStratifiedKFold,
    cross_val_score,
    cross_validate,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

# Read the patient-level feature table from disk and report its size.
df = pd.read_csv('patient_level_comprehensive_features_fixed.csv')
print(df.shape)

# Drop the patient identifier plus every per-shell summary column for
# shells 0-3 (fa_mean / fa_std / fa_median / voxel_count / gradient_mean /
# gradient_std).
# Author's note: shell 0 data is also stored in boundary_shell_0.
shell_stats = (
    'fa_mean', 'fa_std', 'fa_median', 'voxel_count',
    'gradient_mean', 'gradient_std',
)
columns_to_drop = ['patient_id'] + [
    f'shell_{shell}_{stat}' for shell in range(4) for stat in shell_stats
]
df = df.drop(columns=columns_to_drop)
print(df.shape)

(500, 91)
(500, 66)


In [18]:
# Data pre-processing: categorical columns must be converted to numbers,
# done here with one-hot (dummy) indicator columns.

# Columns inspected below (name kept as in the original cell).
columns_by_index = ['age_at_mri', 'sex', 'overall_survival_days', 'mgmt_status',
                    'mgmt_index', '1p19q', 'idh_status', 'eor']

# Show the distinct values of each inspected column to spot unknown/NaN entries.
for name in columns_by_index:
    print(f"Unique values in column '{name}': {df[name].unique()}")

# TODO: decide what to do with unknown/NaN data in the categorical columns
# and in 'overall_survival_days' / 'mgmt_index'.

# Every category level is encoded, 'unknown' levels included
# (drop_first=False keeps one indicator column per level).
categorical_cols = ['sex', 'mgmt_status', '1p19q', 'idh_status', 'eor']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

print("\nOne-Hot Encoded DataFrame (Pandas):")
print(df_encoded)

# Experiment idea (not run): test the data without clinical features by
# dropping the first ten columns (positions 0-9) of df, e.g.
#   df_dropped = df.drop(columns=df.columns[list(range(10))].tolist())



Unique values in column 'age_at_mri': [66 80 70 68 39 74 65 64 50 57 54 51 49 41 61 67 34 83 78 40 37 63 69 52
 53 58 38 32 48 55 79 71 46 56 76 84 77 60 62 72 25 45 29 17 47 36 81 27
 33 42 26 75 59 35 43 73 86 89 22 31 30 24 44 28 23 82 19 94 21 85]
Unique values in column 'sex': ['M' 'F']
Unique values in column 'overall_survival_days': [1303.  274.  417.  185.  389.   17.  133.   15.  322.  313.   54.   14.
  241. 1449.  719.  551. 1759.  533. 1161.  653.  465.  236.  298.   75.
  824.  105. 1489.   52.  132. 1673.  460.  152.  134.  716.  128.  635.
 1116.  930. 1718.  178.  385.  721.  554.  575. 1328.  207.  224. 1540.
  223. 1907.  287.  187.  167. 1913.  299.  336.  245.  678.  315.  184.
 1617.  583.  421.  108.  142.  354.  560.  253.    6.  260.  902.  232.
  266.  406. 1797. 1785.  749.  463.  567.  994.  161.  777.  591.  110.
  472.  705.  798. 1694.   70.  667.  351.  175. 1715.  611.   39.  301.
 1351.   92.  430. 1611.  162. 1650.  879. 1024.  485. 1299.  919. 1296.
 

In [22]:
# Placeholder imputation: every NaN is replaced with 0.
# TODO: change before submitting (0 is not a meaningful value for every column).

# Per-column NaN counts on the encoded frame.
missing_counts = df_encoded.isna().sum()
# Global display option; lets long per-column summaries print in full.
pd.options.display.max_rows = 400
#print(missing_counts,'NAs')
# Author's note: the survival-days column has a missing value - decide how to handle it.
print(missing_counts.sum(), 'NA total')

df_filled = df_encoded.fillna(0)

# Count remaining NaN *cells*. The earlier expression
# sum(df_filled.isna().sum() > 0) counted columns containing a NaN, which
# did not match the printed label.
remaining_nas = int(df_filled.isna().sum().sum())
print('Number of NAs is now :', remaining_nas)
assert remaining_nas == 0, 'fillna(0) should leave no missing values'







1 NA total
Number of NAs is now : 0


In [24]:
# Author's note: the 'unknown' tokens in the dataset all come from mgmt_index.

# Count cells equal to the string 'unknown' across the whole frame.
# Element-wise comparison avoids materialising a stacked copy of the frame.
count_unknowns = int((df_filled == 'unknown').sum().sum())
print(f"Count of value unknown in the entire DataFrame: {count_unknowns}")

print(df_filled['mgmt_index'].value_counts())

# Placeholder: 'unknown' is replaced with 0. TODO: change before final - this
# makes unknown entries indistinguishable from a real mgmt_index of 0.
df_clean = df_filled.replace('unknown', '0')

# After the replacement mgmt_index is still an object column of digit strings.
# Convert it explicitly so downstream code sees a numeric dtype rather than
# relying on implicit string-to-float coercion; any other stray non-numeric
# token raises here instead of surfacing later.
df_clean['mgmt_index'] = pd.to_numeric(df_clean['mgmt_index'])

# Confirm that no 'unknown' token is left.
count_unknowns = int((df_clean == 'unknown').sum().sum())
print(f"Count of value unknown in the entire DataFrame: {count_unknowns}")
#print(df_clean)


Count of value unknown in the entire DataFrame: 88
mgmt_index
0          112
unknown     88
1           45
16          39
17          38
2           27
3           22
14          22
13          16
12          15
15          14
11          11
4           11
7            9
9            9
5            8
8            6
10           5
6            3
Name: count, dtype: int64
Count of value unknown in the entire DataFrame: 0


In [27]:
# Target: the WHO grade column; features: every other column of df_clean.
target_column = 'who_grade'
y = df_clean[target_column]
X = df_clean.drop(columns=[target_column])
# NOTE(review): X still contains 'overall_survival_days', which is an outcome
# rather than a baseline measurement - confirm it is intended as a predictor.


     age_at_mri  overall_survival_days mgmt_index   fa_mean    fa_std  fa_min  \
0            66                 1303.0          0  0.253438  0.163584     0.0   
1            80                  274.0          0  0.225728  0.150805     0.0   
2            70                  417.0          0  0.245633  0.146553     0.0   
3            70                  185.0          0  0.270115  0.155586     0.0   
4            68                  389.0          0  0.200340  0.136351     0.0   
..          ...                    ...        ...       ...       ...     ...   
495          58                  267.0         16  0.256128  0.145090     0.0   
496          64                  334.0         16  0.253504  0.164926     0.0   
497          41                  287.0          0  0.263215  0.164676     0.0   
498          72                  291.0         12  0.244807  0.163983     0.0   
499          70                  215.0         17  0.242826  0.170698     0.0   

       fa_max  fa_median   

In [28]:
# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the shape of each part.
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(label, part.shape)






X_train (400, 82)
X_test (100, 82)
y_train (400,)
y_test (100,)


In [30]:
# Class imbalance: derive 'balanced' per-class weights from the training
# labels and store them as a {class label: weight} mapping.
classes = np.unique(y_train)
weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=y_train,
)
class_weight_dict = {label: weight for label, weight in zip(classes, weights)}



In [31]:
# Model: feature scaling followed by a random forest.
# The forest uses the class weights computed from the training labels and a
# fixed seed. Open question from the author: would more than 100 trees
# stabilise the results?
random_forest = RandomForestClassifier(
    n_estimators=100,
    class_weight=class_weight_dict,
    random_state=42,
    n_jobs=-1,
)
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),  # optional depending on feature types
        ('rf', random_forest),
    ]
)



In [32]:
# Cross-validation scheme: stratified 5-fold, repeated 10 times (50 fits in
# total), with a fixed seed so the folds are reproducible.
cv = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=10,
    random_state=42,
)



In [33]:
# Cross-validation on the training data only.
# Plain accuracy is kept for continuity, but on an imbalanced target it is
# dominated by the majority class, so balanced accuracy and macro-F1 (which
# weigh every class equally) are computed from the same fits.
cv_metrics = ['accuracy', 'balanced_accuracy', 'f1_macro']
cv_results = cross_validate(
    pipeline, X_train, y_train, cv=cv, scoring=cv_metrics, n_jobs=-1
)

# `scores` keeps its original meaning: the per-fold accuracy values.
scores = cv_results['test_accuracy']
print(f'Cross-validated Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}')

for metric in ('balanced_accuracy', 'f1_macro'):
    fold_values = cv_results[f'test_{metric}']
    print(f'Cross-validated {metric}: {np.mean(fold_values):.4f} ± {np.std(fold_values):.4f}')



Cross-validated Accuracy: 0.8662 ± 0.0203


In [35]:
# Train the pipeline on the complete training split.
pipeline.fit(X_train, y_train)

# Score the held-out test split: per-class precision/recall/F1 plus the
# confusion matrix.
y_pred = pipeline.predict(X_test)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("Classification Report on Test Set:", test_report, sep="\n")
print("Confusion Matrix:", test_confusion, sep="\n")

Classification Report on Test Set:
              precision    recall  f1-score   support

           2       0.82      0.82      0.82        11
           3       1.00      0.11      0.20         9
           4       0.90      0.99      0.94        80

    accuracy                           0.89       100
   macro avg       0.91      0.64      0.65       100
weighted avg       0.90      0.89      0.86       100

Confusion Matrix:
[[ 9  0  2]
 [ 1  1  7]
 [ 1  0 79]]
