In [1]:
import pandas as pd
from plotly import express
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

# Path to the glioma grading CSV inside the Kaggle input directory.
INFO = '/kaggle/input/glioma-grading-clinical-and-mutation-features/TCGA_InfoWithGrade.csv'
df = pd.read_csv(filepath_or_buffer=INFO)
# Human-readable label for the numeric target: 0 -> 'LGG', 1 -> 'GBM'.
# (The label for 1 was previously misspelled 'GMB', which showed up in any
# chart drawn from this column.)
df['grade'] = df['Grade'].map({0: 'LGG', 1: 'GBM'})
# Preview the first rows, including the derived 'grade' column.
df.head()

Unnamed: 0,Grade,Gender,Age_at_diagnosis,Race,IDH1,TP53,ATRX,PTEN,EGFR,CIC,...,RB1,NOTCH1,BCOR,CSMD3,SMARCA4,GRIN2A,IDH2,FAT4,PDGFRA,grade
0,0,0,51.3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,LGG
1,0,0,38.72,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,LGG
2,0,0,35.17,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,LGG
3,0,1,32.78,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,LGG
4,0,0,31.51,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,LGG


Grade is our target variable; everything else is an input variable.

In [2]:
# Class balance: one pie slice per value of the readable 'grade' label,
# coloured by that same label.
express.pie(
    data_frame=df,
    names='grade',
    color='grade',
)

Our classes are moderately unbalanced.

In [3]:
# Age-at-diagnosis distribution, split into one facet column (and one colour)
# per value of the numeric 'Grade' column.
express.histogram(
    data_frame=df,
    x='Age_at_diagnosis',
    facet_col='Grade',
    color='Grade',
)

Obviously our grades have different age distributions.

In [4]:
from umap import UMAP
from plotly import express

# Input features used for the embedding (and reused by later cells): every
# column except the numeric target 'Grade' and the derived 'grade' label.
columns = ['Gender', 'Age_at_diagnosis', 'Race', 'IDH1', 'TP53', 'ATRX','PTEN', 'EGFR', 'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1', 'FUBP1', 'RB1', 'NOTCH1', 'BCOR', 'CSMD3', 'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4','PDGFRA']
# Derive the readable label from the numeric Grade (0 -> 'LGG', 1 -> 'GBM').
df['grade'] = df['Grade'].map({0: 'LGG', 1: 'GBM'})
target = 'grade'

# Fixed seeds and a single job keep the 2-D embedding reproducible across runs.
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=200)
# Build the embedding frame on df's own index: assigning a DataFrame aligns on
# index labels, so a fresh 0..n-1 index would silently yield NaN or misplaced
# coordinates whenever df no longer has the default RangeIndex.
embedding = pd.DataFrame(
    data=reducer.fit_transform(X=df[columns]),
    index=df.index,
    columns=['x', 'y'],
)
df[['x', 'y']] = embedding
# Scatter of the two UMAP coordinates, coloured by grade label.
express.scatter(data_frame=df, x='x', y='y', color=target, ).show()

2024-03-06 16:20:06.706988: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 16:20:06.707178: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 16:20:06.864458: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=200, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Wed Mar  6 16:20:21 2024 Construct fuzzy simplicial set
Wed Mar  6 16:20:22 2024 Finding Nearest Neighbors
Wed Mar  6 16:20:26 2024 Finished Nearest Neighbor Search
Wed Mar  6 16:20:30 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Mar  6 16:20:32 2024 Finished embedding


This looks encouraging: we have lots of cases that dimensionality reduction separates and a minority that it doesn't; let's build a simple classifier and see which features matter.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for evaluation; the seed fixes the split so that
# later cells reusing X_train / X_test see the same partition.
features = df[columns]
labels = df['grade']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=2024,
)

# Plain logistic regression; the very large max_iter is a generous iteration cap.
regression = LogisticRegression(max_iter=100000)
regression.fit(X=X_train, y=y_train)

# Mean accuracy on the held-out rows.
test_accuracy = regression.score(X=X_test, y=y_test)
print('accuracy: {:5.4f} '.format(test_accuracy))

# First row of coef_ (presumably the only row, since the target has two
# classes), drawn as one bar per input feature.
coefficients = regression.coef_.tolist()[0]
express.histogram(x=columns, y=coefficients).show(validate=True)

accuracy: 0.8690 


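Wow. Genetic mutations dominate our regression coefficients. (One caveat: the inputs were not standardized, so the coefficient for Age_at_diagnosis, which is measured in years, is not directly comparable in magnitude to the coefficients of the 0/1 mutation indicators.) Let's build a more complicated model and see if we can improve the accuracy.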

In [6]:
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


# Boosted ensemble of shallow (depth-4) decision trees, seeded for reproducibility.
base_tree = DecisionTreeClassifier(max_depth=4, random_state=2024)
model = AdaBoostClassifier(
    estimator=base_tree,
    n_estimators=71,
    algorithm='SAMME',
    random_state=2024,
)
model.fit(X=X_train, y=y_train)

# Held-out accuracy, followed by the per-class precision / recall breakdown.
print('score: {:5.4f}'.format(model.score(X=X_test, y=y_test)))
test_predictions = model.predict(X=X_test,)
print(classification_report(y_true=y_test, y_pred=test_predictions))


score: 0.8869
              precision    recall  f1-score   support

         GBM       0.88      0.81      0.84        63
         LGG       0.89      0.93      0.91       105

    accuracy                           0.89       168
   macro avg       0.89      0.87      0.88       168
weighted avg       0.89      0.89      0.89       168



Let's look at the feature importances from the AdaBoost model.

In [7]:
# One bar per input feature, with height equal to the fitted model's
# feature importance for that feature.
importances = model.feature_importances_
express.histogram(x=columns, y=importances).show(validate=True)

This is a somewhat stark reminder that feature importances and regression coefficients are different things.