In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

# Load your data (assume already loaded as df)

#neuroimaging data 
df = pd.read_csv('patient_level_comprehensive_features_fixed.csv')
print(df.shape)

#shell data is stored in boundary_shell_, deleting shell_ and all clinical data 

df = df.drop(columns=['who_grade', 'age_at_mri', 'sex', 'overall_survival_days', 'mgmt_status', 
                      'mgmt_index', '1p19q', 'idh_status', 'eor',
                      'shell_0_fa_mean', 'shell_0_fa_std', 'shell_0_fa_median', 'shell_0_voxel_count',
                      'shell_1_fa_mean', 'shell_1_fa_std','shell_1_fa_median','shell_1_voxel_count', 
                      'shell_2_fa_mean','shell_2_fa_std','shell_2_fa_median','shell_2_voxel_count',
                      'shell_3_fa_mean','shell_3_fa_std','shell_3_fa_median','shell_3_voxel_count', 
                      'shell_0_gradient_mean', 'shell_0_gradient_std','shell_1_gradient_mean','shell_1_gradient_std',
                      'shell_2_gradient_mean','shell_2_gradient_std','shell_3_gradient_mean','shell_3_gradient_std'])


df.rename(columns={'patient_id': 'ID'}, inplace=True)

#cleaned clinical data upload


df_clinical = pd.read_csv('PDGM/UCSF-PDGM-metadata_v5_cleaned.csv')
#clinical analysis will include sex, MGMT status, IDH status, 1p/19q
 
print(df_clinical.columns)
#dropping other columns
df_clinical = df_clinical.drop(columns=[
       'Final pathologic diagnosis (WHO 2021)', 'MGMT index',
       '1-dead 0-alive', 'OS', 'EOR',
       'Biopsy prior to imaging', 'BraTS21 ID', 'BraTS21 Segmentation Cohort',
       'BraTS21 MGMT Cohort'])

print('updated clinical column list', df_clinical.columns)


                               

    



(500, 91)
Index(['ID', 'Sex', 'Age at MRI', 'WHO CNS Grade',
       'Final pathologic diagnosis (WHO 2021)', 'MGMT status', 'MGMT index',
       '1p/19q', 'IDH', '1-dead 0-alive', 'OS', 'EOR',
       'Biopsy prior to imaging', 'BraTS21 ID', 'BraTS21 Segmentation Cohort',
       'BraTS21 MGMT Cohort'],
      dtype='object')
updated clinical column list Index(['ID', 'Sex', 'Age at MRI', 'WHO CNS Grade', 'MGMT status', '1p/19q',
       'IDH'],
      dtype='object')


In [4]:
#merge dfs by ID and classify grade 2 and 3 as the same

df_merged = pd.merge(df, df_clinical, on ='ID', how='inner')


df_merged['WHO CNS Grade'] = df_merged['WHO CNS Grade'].replace(3, 2)
print('Merged df shape is ', df_merged.shape)

column_grade = ['WHO CNS Grade']

for column in column_grade:
    unique_values = df_merged[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Merged df shape is  (500, 64)
Unique values in column 'WHO CNS Grade': [4 2]


In [5]:
#data pre-prcessing: need to convert categorical data to numbers using one-hot numeric arrays
columns_by_index = ['Sex', 'MGMT status', '1p/19q','IDH']

for column in columns_by_index:
    unique_values = df_merged[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

#all categorical data being encoded, unknowns included
df_encoded = pd.get_dummies(df_merged, columns=['Sex', 'MGMT status', '1p/19q','IDH'], drop_first=True)

print("\nOne-Hot Encoded DataFrame (Pandas):")
print(df_encoded)
    


#testing data w/out clinical features 
#indices_to_drop = ['Sex', 'MGMT status', '1p/19q','IDH']
#columns_by_index = df.columns[indices_to_drop].tolist()
#df_dropped = df.drop(columns=columns_by_index)


#df = df.drop(columns=[indices_to_drop])
#print(df_dropped)



Unique values in column 'Sex': ['M' 'F']
Unique values in column 'MGMT status': ['negative' 'unknown' 'positive']
Unique values in column '1p/19q': ['unknown' 'intact' 'co-deletion']
Unique values in column 'IDH': ['wildtype' 'mutated']

One-Hot Encoded DataFrame (Pandas):
                ID   fa_mean    fa_std  fa_min    fa_max  fa_median     fa_p5  \
0    UCSF-PDGM-004  0.253438  0.163584     0.0  1.192218   0.212847  0.059179   
1    UCSF-PDGM-005  0.225728  0.150805     0.0  1.173550   0.180397  0.058431   
2    UCSF-PDGM-007  0.245633  0.146553     0.0  1.214064   0.209004  0.073694   
3    UCSF-PDGM-008  0.270115  0.155586     0.0  1.138740   0.228412  0.086311   
4    UCSF-PDGM-009  0.200340  0.136351     0.0  1.124572   0.158248  0.054843   
..             ...       ...       ...     ...       ...        ...       ...   
495  UCSF-PDGM-536  0.256128  0.145090     0.0  1.187934   0.221200  0.087824   
496  UCSF-PDGM-537  0.253504  0.164926     0.0  1.188299   0.207506  0.065204 

In [6]:
#replacing NAs and unknowns with 0s, change before submitting


missing_counts = df_encoded.isna().sum()
pd.options.display.max_rows = 400
#print(missing_counts,'NAs')
print(sum(missing_counts), 'NA total') #note that survival days col has missing value-what to do with that? 


df_filled = df_encoded.fillna(0)
print('Number of NAs is now :', sum(df_filled.isna().sum()>0)) #delete the one person 
#delete survival data

#confirm no NAs 
#print(sum(df_filled.isna().sum()>0))







0 NA total
Number of NAs is now : 0


In [7]:
#Check unknowns in dataset


# Flatten the DataFrame into a single Series
all_values = df_filled.stack()
count_unknowns = (all_values == 'unknown').sum()
print(f"Count of value unknown in the entire DataFrame: {count_unknowns}")




#print(df_clean)


Count of value unknown in the entire DataFrame: 0


                ID   fa_mean    fa_std  fa_min    fa_max  fa_median     fa_p5  \
0    UCSF-PDGM-004  0.253438  0.163584     0.0  1.192218   0.212847  0.059179   
1    UCSF-PDGM-005  0.225728  0.150805     0.0  1.173550   0.180397  0.058431   
2    UCSF-PDGM-007  0.245633  0.146553     0.0  1.214064   0.209004  0.073694   
3    UCSF-PDGM-008  0.270115  0.155586     0.0  1.138740   0.228412  0.086311   
4    UCSF-PDGM-009  0.200340  0.136351     0.0  1.124572   0.158248  0.054843   
..             ...       ...       ...     ...       ...        ...       ...   
495  UCSF-PDGM-536  0.256128  0.145090     0.0  1.187934   0.221200  0.087824   
496  UCSF-PDGM-537  0.253504  0.164926     0.0  1.188299   0.207506  0.065204   
497  UCSF-PDGM-538  0.263215  0.164676     0.0  1.207332   0.215951  0.075165   
498  UCSF-PDGM-539  0.244807  0.163983     0.0  1.186552   0.200918  0.057449   
499  UCSF-PDGM-540  0.242826  0.170698     0.0  1.198951   0.187723  0.060611   

       fa_p10    fa_p25    

In [8]:
# Split into train/test with stratification to maintain class distribution


train = pd.read_csv('PDGM/train_metadata.csv')['ID']
print(train.shape, 'in train')



#df_train= train.merge(df_filled,how='inner', )
#print(train)
df_train = df_filled[df_filled['ID'].isin(train.tolist())]
#print(df_train)
print(df_train.shape)






test=  pd.read_csv('PDGM/test_metadata.csv')['ID']
print(test.shape, 'in test')
df_test = df_filled[df_filled['ID'].isin(test.tolist())]




print(df_test.shape)





(400,) in train
(399, 66)
(101,) in test
(101, 66)


In [9]:

X_train = df_train.drop(columns=['WHO CNS Grade','ID'])
#print(X_train)
X_test= df_test.drop(columns=['WHO CNS Grade','ID'])
y_train= df_train['WHO CNS Grade']

y_test= df_test['WHO CNS Grade']
#X_train, X_test, y_train, y_test = train_test_split(
#X, y, test_size=0.2, stratify=y, random_state=42)
print('X_train', X_train.shape)
print('X_test', X_test.shape)
print('y_train', y_train.shape)
print('y_test', y_test.shape)






X_train (399, 64)
X_test (101, 64)
y_train (399,)
y_test (101,)


In [10]:
# Compute class weights to address imbalance
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
print(weights)
class_weight_dict = dict(zip(classes, weights))





[2.52531646 0.6234375 ]
WHO CNS Grade
4    0.802005
2    0.197995
Name: proportion, dtype: float64


In [11]:
# Define the pipeline (scaling + random forest)
pipeline = Pipeline([
('scaler', StandardScaler()), # optional depending on feature types
('rf', RandomForestClassifier(
n_estimators=100,    #if do more, could stabilize??
class_weight=class_weight_dict,
random_state=42,
n_jobs=-1
))
])



In [12]:
# Repeated stratified K-Fold cross-validation
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)



In [13]:
# Cross-validation on training data
scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
print(f'Cross-validated Accuracy: {np.mean(scores):.4f} ± {np.std(scores):.4f}')



Cross-validated Accuracy: 0.9161 ± 0.0235


In [87]:
# Fit on full training set
pipeline.fit(X_train, y_train)

# Evaluate on held-out test set
y_pred = pipeline.predict(X_test)
print("Classification Report on Test Set:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Classification Report on Test Set:
              precision    recall  f1-score   support

           2       0.83      0.50      0.62        20
           4       0.89      0.98      0.93        81

    accuracy                           0.88       101
   macro avg       0.86      0.74      0.78       101
weighted avg       0.88      0.88      0.87       101

Confusion Matrix:
[[10 10]
 [ 2 79]]


In [None]:
rf = RandomForestRegressor(100)

# sample 30 random features
X = features.sample(30, axis=1)
rf.fit(X, y)

pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False).head(10)