# Classify metadata labels in low dim embeddings

TODO:
 - try to predict biological labels
    - gender
    - race
	- histological_grade (could try regression here + mean_squared_error)

 - try other classification/regression models
    - random forest
    - support vector machine
    - logistic regression (regularized, maybe try elasticnet)
    - linear regression (regularized, maybe try elasticnet)

In [51]:
from anndata import read_h5ad
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

In [25]:
# load data
adata = read_h5ad("data/TCGA.HNSC.embedded.h5ad")
adata

AnnData object with n_obs × n_vars = 545 × 20262
    obs: 'type', 'age_at_initial_pathologic_diagnosis', 'gender', 'race', 'ajcc_pathologic_tumor_stage', 'histological_type', 'histological_grade', 'tumor_status', 'vital_status', 'OS', 'OS.time', 'DSS', 'DSS.time', 'DFI', 'DFI.time', 'PFI', 'PFI.time', 'Redaction', 'tissue_source_site'
    var: 'gene', 'n_cells', 'mean', 'std'
    uns: 'log1p'
    obsm: 'mds', 'pca', 'tsne', 'umap'

In [56]:
adata.obs.head()

Unnamed: 0_level_0,type,age_at_initial_pathologic_diagnosis,gender,race,ajcc_pathologic_tumor_stage,histological_type,histological_grade,tumor_status,vital_status,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,Redaction,tissue_source_site
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
TCGA-BA-4074-01A-01R-1436-07,HNSC,69.0,MALE,WHITE,Stage IVA,Head & Neck Squamous Cell Carcinoma,G3,WITH TUMOR,Dead,1.0,462.0,1.0,462.0,,,1.0,396.0,,BA
TCGA-BA-4075-01A-01R-1436-07,HNSC,49.0,MALE,BLACK OR AFRICAN AMERICAN,Stage III,Head & Neck Squamous Cell Carcinoma,G2,WITH TUMOR,Dead,1.0,283.0,1.0,283.0,,,1.0,236.0,,BA
TCGA-BA-4076-01A-01R-1436-07,HNSC,39.0,MALE,WHITE,[Not Available],Head & Neck Squamous Cell Carcinoma,G2,WITH TUMOR,Dead,1.0,415.0,1.0,415.0,,,1.0,286.0,,BA
TCGA-BA-4077-01B-01R-1436-07,HNSC,45.0,FEMALE,WHITE,Stage IVA,Head & Neck Squamous Cell Carcinoma,G2,WITH TUMOR,Dead,1.0,1134.0,1.0,1134.0,,,1.0,1134.0,,BA
TCGA-BA-4078-01A-01R-1436-07,HNSC,83.0,MALE,WHITE,[Not Available],Head & Neck Squamous Cell Carcinoma,G2,WITH TUMOR,Dead,1.0,276.0,1.0,276.0,,,1.0,276.0,,BA


In [31]:
# remove samples that belong to minority tissue_source_site
adata = adata[adata.obs["tissue_source_site"].isin(["CN","CV","CR","CQ","BA"])]

In [49]:
def classify_embedding(embedding, labels):
	# split data into train and test
	X_train, X_test, y_train, y_test  = train_test_split(embedding, labels, test_size=0.2, random_state=0, stratify=adata.obs["tissue_source_site"])

	# fit the model
	clf = RandomForestClassifier(oob_score=True).fit(X_train, y_train)

	# predict on train data
	y_pred = clf.predict(X_train)
	score = accuracy_score(y_train, y_pred)
	print("Train accuracy: {:.2f}".format(score))


	# predict on test data
	y_pred = clf.predict(X_test)
	score = accuracy_score(y_test, y_pred)
	# print rounded to 2 decimal places
	print("Test accuracy: {:.2f}".format(score))

	return y_pred

In [50]:
for emb in ["umap", "pca", "tsne", "mds"]:
	print("Classification on {} embedding".format(emb))
	y_pred = classify_embedding(adata.obsm[emb], adata.obs["tissue_source_site"])

Classification on umap embedding
Train accuracy:  1.0
Test accuracy:  0.5
Classification on pca embedding
Train accuracy:  1.0
Test accuracy:  0.4594594594594595
Classification on tsne embedding
Train accuracy:  1.0
Test accuracy:  0.43243243243243246
Classification on mds embedding
Train accuracy:  1.0
Test accuracy:  0.43243243243243246
