In [51]:
from anndata import read_h5ad
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [25]:
# read the embedded TCGA HNSC dataset from disk and display its summary
h5ad_path = "data/TCGA.HNSC.embedded.h5ad"
adata = read_h5ad(h5ad_path)
adata

AnnData object with n_obs × n_vars = 545 × 20262
    obs: 'type', 'age_at_initial_pathologic_diagnosis', 'gender', 'race', 'ajcc_pathologic_tumor_stage', 'histological_type', 'histological_grade', 'tumor_status', 'vital_status', 'OS', 'OS.time', 'DSS', 'DSS.time', 'DFI', 'DFI.time', 'PFI', 'PFI.time', 'Redaction', 'tissue_source_site'
    var: 'gene', 'n_cells', 'mean', 'std'
    uns: 'log1p'
    obsm: 'mds', 'pca', 'tsne', 'umap'

In [31]:
# keep only samples from the retained tissue source sites;
# every other (minority) tissue_source_site is dropped
site_mask = adata.obs["tissue_source_site"].isin(["CN","CV","CR","CQ","BA"])
adata = adata[site_mask]

In [49]:
def classify_embedding(embedding, labels):
	"""Fit a random forest on an embedding and print train/test accuracy.

	The data is split 80/20 into train and test sets, stratified on
	``labels`` so both sets keep the same class proportions. A
	``RandomForestClassifier`` is fitted on the train part and its accuracy
	is printed for both the train and the test part.

	Parameters
	----------
	embedding : array-like
		Feature matrix with one row per sample (e.g. an entry of
		``adata.obsm``).
	labels : array-like
		Class label for each row of ``embedding``; must have the same
		length as ``embedding``.

	Returns
	-------
	y_pred
		Predicted labels for the held-out test samples.
	"""
	# split data into train and test; stratify on the labels that were passed
	# in (not on a notebook global) so the function works for any label vector
	X_train, X_test, y_train, y_test = train_test_split(
		embedding, labels, test_size=0.2, random_state=0, stratify=labels
	)

	# fit the model; the fixed random_state makes the reported accuracies
	# reproducible across kernel restarts
	clf = RandomForestClassifier(oob_score=True, random_state=0).fit(X_train, y_train)

	# predict on train data
	y_pred = clf.predict(X_train)
	score = accuracy_score(y_train, y_pred)
	print("Train accuracy: {:.2f}".format(score))

	# predict on test data
	y_pred = clf.predict(X_test)
	score = accuracy_score(y_test, y_pred)
	# print rounded to 2 decimal places
	print("Test accuracy: {:.2f}".format(score))

	return y_pred

In [50]:
# evaluate the classifier on each stored embedding in turn,
# always predicting the tissue source site
site_labels = adata.obs["tissue_source_site"]
for emb in ("umap", "pca", "tsne", "mds"):
	print("Classification on {} embedding".format(emb))
	y_pred = classify_embedding(adata.obsm[emb], site_labels)

Classification on umap embedding
Train accuracy:  1.0
Test accuracy:  0.5
Classification on pca embedding
Train accuracy:  1.0
Test accuracy:  0.4594594594594595
Classification on tsne embedding
Train accuracy:  1.0
Test accuracy:  0.43243243243243246
Classification on mds embedding
Train accuracy:  1.0
Test accuracy:  0.43243243243243246
