In [1]:
import pandas as pd
import numpy as np
import glob
import gzip
import shutil
import os
from tqdm import tqdm
import plotly.express as px
import sklearn

# set infinite display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [2]:
# import the TPM dataframe
TPM = pd.read_csv('../../results/TPM.tsv', sep='\t', index_col=0)
# preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Unnamed: 0_level_0,52aef68b-63de-46b1-bb88-fb11d875c803,98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,5810cc10-f53a-451a-8001-c68765e30565,2042bb7e-ff24-439d-b33c-70c15cc3d201,dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,4aa493c6-6177-4d64-add9-f0c0198e2530,53b68f98-1d15-42d5-b374-b2163590b3a8,1748692d-651e-4bb1-ab9e-8460d1c8b588,eb0750ce-bed8-41a0-b93c-d72d02c65303,cf4aa3e0-2256-42e3-a7cf-9ea08900b622
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENST00000456328.2,0.088142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147231
ENST00000450305.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENST00000488147.1,3.235923,2.721829,5.28927,13.728161,2.106743,6.399453,2.024906,1.171819,0.643958,4.371894
ENST00000619216.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENST00000473358.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# print shape of dataframe
print('Before removing non-performing transcripts: ', TPM.shape)
# find genes with 0 TPM in all samples
TPM = TPM.loc[(TPM != 0).any(axis=1)]
# print shape of dataframe
print('After removing non-performing transcripts: ', TPM.shape)

# # find genes with 0 TPM in at least 99% of samples
# TPM = TPM.loc[(TPM == 0).sum(axis=1) <= 0.99 * TPM.shape[1]]
# # print shape of dataframe
# print('After removing genes with 0 TPM in at least 99% of samples: ', TPM.shape)

Before removing non-performing transcripts:  (252045, 133)
After removing non-performing transcripts:  (234191, 133)


In [4]:
# transpose the dataframe
TPM = TPM.T

In [5]:
# Flatten the DataFrame to a 1D array
flat_values = TPM.values.flatten()

# Calculate mean and standard deviation
mean_value = np.mean(flat_values)
std_value = np.std(flat_values)

# calculate max, min and range
max_value = np.max(flat_values)
min_value = np.min(flat_values)
range_value = max_value - min_value

# print all statistical values
print('Mean TPM: ', mean_value)
print('Standard deviation TPM: ', std_value)
print('Max TPM: ', max_value)
print('Min TPM: ', min_value)
print('Range TPM: ', range_value)

# find the smallest non-zero value in the flattened TPM array
smallest_nonzero = np.min(TPM.values[TPM.values > 0])
# print the smallest non-zero value
print('Smallest non-zero TPM value: ', smallest_nonzero)

Mean TPM:  nan
Standard deviation TPM:  nan
Max TPM:  nan
Min TPM:  nan
Range TPM:  nan
Smallest non-zero TPM value:  1e-06


In [6]:
# introduce pseudocount
TPM = TPM + 0.000000001
# perform log-transformation
TPM = TPM.apply(np.log2)
# preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Name,ENST00000456328.2,ENST00000488147.1,ENST00000473358.1,ENST00000469289.1,ENST00000417324.1,ENST00000461467.1,ENST00000606857.1,ENST00000642116.1,ENST00000466430.5,ENST00000477740.5
52aef68b-63de-46b1-bb88-fb11d875c803,-3.504027,1.694177,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-2.315795,-29.897353
98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,-29.897353,1.444576,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-3.405161,-29.897353
5810cc10-f53a-451a-8001-c68765e30565,-29.897353,2.403069,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-2.343447,-29.897353
2042bb7e-ff24-439d-b33c-70c15cc3d201,-29.897353,3.779066,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-1.387547,-29.897353
dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,-29.897353,1.075014,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-29.897353,-3.771008,-29.897353


In [7]:
# perform PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3, whiten=True)
pca.fit(TPM)

# transform the data
TPM_pca = pca.transform(TPM)

# create a dataframe with the PCA results
TPM_pca_df = pd.DataFrame(data = TPM_pca, columns = ['PC1', 'PC2', 'PC3'], index=TPM.index)

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# preview the dataframe
TPM_pca_df.head()

Unnamed: 0,PC1,PC2,PC3
52aef68b-63de-46b1-bb88-fb11d875c803,-0.646184,0.30754,1.055633
98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,-0.228434,0.686052,-1.465171
5810cc10-f53a-451a-8001-c68765e30565,3.217787,-0.360605,-0.996451
2042bb7e-ff24-439d-b33c-70c15cc3d201,3.087338,-1.13054,-0.300919
dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,-0.198615,-0.553225,0.892547


In [None]:
# use plotly to create a 2d scatterplot
fig = px.scatter(TPM_pca_df, x='PC1', y='PC2', title='PCA of TPM values')
fig.show()

In [None]:
# use plotly to create a 3d scatterplot
fig = px.scatter_3d(TPM_pca_df, x='PC1', y='PC2', z='PC3', title='PCA of TPM values')
fig.show()

In [None]:
# import the metadata
metadata = pd.read_csv('../../results/metadata.tsv', sep='\t', index_col=0)
# preview the dataframe
metadata.head()

Unnamed: 0_level_0,aliquot_id,read_group_id,has_blood_cancer,tissue_type,instrument_model,RIN,includes_spike_ins,library_preparation_kit_name,library_preparation_kit_vendor,library_preparation_kit_version,library_preparation_kit_catalog_number,library_selection,library_strand,library_strategy,size_selection_range,spike_ins_concentration,demographic_id,diagnosis_id,batch_id,index_date,cause_of_death,days_to_birth,days_to_death,ethnicity,gender,race,vital_status,adrenal_hormone,age_at_diagnosis,ajcc_clinical_m,ajcc_clinical_n,ajcc_clinical_stage,ajcc_clinical_t,ajcc_pathologic_m,ajcc_pathologic_n,ajcc_pathologic_stage,ajcc_pathologic_t,ajcc_staging_system_edition,ann_arbor_b_symptoms,ann_arbor_b_symptoms_described,ann_arbor_clinical_stage,ann_arbor_extranodal_involvement,ann_arbor_pathologic_stage,best_overall_response,burkitt_lymphoma_clinical_variant,child_pugh_classification,classification_of_tumor,cog_liver_stage,cog_neuroblastoma_risk_group,cog_renal_stage,cog_rhabdomyosarcoma_risk_group,created_datetime,days_to_best_overall_response,days_to_diagnosis,days_to_last_follow_up,days_to_last_known_disease_status,days_to_recurrence,diagnosis_is_primary_disease,eln_risk_classification,enneking_msts_grade,enneking_msts_metastasis,enneking_msts_stage,enneking_msts_tumor_site,esophageal_columnar_dysplasia_degree,esophageal_columnar_metaplasia_present,figo_stage,figo_staging_edition_year,first_symptom_prior_to_diagnosis,gastric_esophageal_junction_involvement,gleason_grade_group,gleason_grade_tertiary,gleason_patterns_percent,goblet_cells_columnar_mucosa_present,icd_10_code,igcccg_stage,inpc_grade,inpc_histologic_group,inrg_stage,inss_stage,international_prognostic_index,irs_group,irs_stage,ishak_fibrosis_score,iss_stage,last_known_disease_status,laterality,margin_distance,margins_involved_site,masaoka_stage,medulloblastoma_molecular_classification,metastasis_at_diagnosis,metastasis_at_diagnosis_site,method_of_diagnosis,micropapillary_features,mitosis_karyorrhexis_index,mitotic_count,morphology,ovarian_specimen_status,ovarian_surface_involvement,papillary_renal_cell_type,peritoneal_fluid_cytological_status,pregnant_at_diagnosis,primary_diagnosis,primary_disease,primary_gleason_grade,prior_malignancy,prior_treatment,progression_or_recurrence,project_id.1,residual_disease,satellite_nodule_present,secondary_gleason_grade,site_of_resection_or_biopsy,sites_of_involvement,submitter_id,supratentorial_localization,synchronous_malignancy,tissue_or_organ_of_origin,tumor_confined_to_organ_of_origin,tumor_depth,tumor_focality,tumor_grade,tumor_regression_grade,updated_datetime,weiss_assessment_score,who_cns_grade,who_nte_grade,wilms_tumor_histologic_subtype,year_of_diagnosis,disease_type,primary_site,project_id
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1
5705efcc-b48f-435c-8a28-9e0d407ecadd,75ac0619-947a-427b-a53f-71e121a7ec8f,71894d8b-5210-44dc-aadc-a199d3843dd2,False,Tumor,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina,,,rRNA Depletion,First_Stranded,RNA-Seq,308.0,1.0,f1ec7b5b-7f11-41e6-9fb4-a27fb2adc8f1,c7f325cd-98a6-4996-825e-1c23b86e1a22,2.0,Diagnosis,,-22142.0,,not hispanic or latino,female,white,Alive,,22142.0,M0,,,,Unknown,NX,Stage I,T1a,7th,,,,,,,,,,,,,,2018-05-17T23:34:39.302537-05:00,,,823.0,823.0,,True,,,,,,,,Stage I,,,,,,,,,,,,,,,,,,,Tumor free,,,,,,,,,,,,8380/3,,,,,,"Endometrioid adenocarcinoma, NOS",,,,,no,CPTAC-3,R0,,,Corpus uteri,,C3L-00137-DIAG,,,Corpus uteri,,,Unifocal,G1,,2021-09-22T14:36:32.037275-05:00,,,,,2016.0,Adenomas and Adenocarcinomas,"Uterus, NOS",CPTAC-3
5705efcc-b48f-435c-8a28-9e0d407ecadd,948c4d53-3d91-48a6-bec4-0cc96020e572,86774648-bb57-42c3-b835-9fb11b590d8b,False,Tumor,,,,,,,,rRNA Depletion,,RNA-Seq,,,f1ec7b5b-7f11-41e6-9fb4-a27fb2adc8f1,c7f325cd-98a6-4996-825e-1c23b86e1a22,2.0,Diagnosis,,-22142.0,,not hispanic or latino,female,white,Alive,,22142.0,M0,,,,Unknown,NX,Stage I,T1a,7th,,,,,,,,,,,,,,2018-05-17T23:34:39.302537-05:00,,,823.0,823.0,,True,,,,,,,,Stage I,,,,,,,,,,,,,,,,,,,Tumor free,,,,,,,,,,,,8380/3,,,,,,"Endometrioid adenocarcinoma, NOS",,,,,no,CPTAC-3,R0,,,Corpus uteri,,C3L-00137-DIAG,,,Corpus uteri,,,Unifocal,G1,,2021-09-22T14:36:32.037275-05:00,,,,,2016.0,Adenomas and Adenocarcinomas,"Uterus, NOS",CPTAC-3
763e0702-8379-4b5e-95d1-a84f412c51e7,ce810e2e-4929-4bbc-95ff-6da493477391,c2980255-7c57-4b79-82a7-f77098ff164e,False,Tumor,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina,,,rRNA Depletion,First_Stranded,RNA-Seq,342.0,1.0,2cb2a0b4-acd0-4529-b7ed-deaf1aa119c6,5e74687f-b68b-4891-9a20-1a68826ca706,2.0,Diagnosis,,-22179.0,,not hispanic or latino,female,white,Alive,,22179.0,Unknown,,,,Unknown,NX,Stage II,T2b,7th,,,,,,,,,,,,,,2018-05-17T14:05:59.588773-05:00,,,1785.0,1785.0,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,With tumor,,,,,,,,,,,,8312/3,,,,,,"Renal cell carcinoma, NOS",,,,,yes,CPTAC-3,R0,,,"Kidney, NOS",,C3L-00908-DIAG,,,"Kidney, NOS",,,Unifocal,G3,,2023-02-03T18:28:48.308091-06:00,,,,,2016.0,Adenomas and Adenocarcinomas,Kidney,CPTAC-3
763e0702-8379-4b5e-95d1-a84f412c51e7,33c921ea-b743-4d32-9c56-875de6028c71,8062c6e4-d501-4c91-ab02-f36f4e7fd387,False,Tumor,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina,,,rRNA Depletion,First_Stranded,RNA-Seq,325.0,1.0,2cb2a0b4-acd0-4529-b7ed-deaf1aa119c6,5e74687f-b68b-4891-9a20-1a68826ca706,2.0,Diagnosis,,-22179.0,,not hispanic or latino,female,white,Alive,,22179.0,Unknown,,,,Unknown,NX,Stage II,T2b,7th,,,,,,,,,,,,,,2018-05-17T14:05:59.588773-05:00,,,1785.0,1785.0,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,With tumor,,,,,,,,,,,,8312/3,,,,,,"Renal cell carcinoma, NOS",,,,,yes,CPTAC-3,R0,,,"Kidney, NOS",,C3L-00908-DIAG,,,"Kidney, NOS",,,Unifocal,G3,,2023-02-03T18:28:48.308091-06:00,,,,,2016.0,Adenomas and Adenocarcinomas,Kidney,CPTAC-3
763e0702-8379-4b5e-95d1-a84f412c51e7,173c0d6a-bc67-4a72-b6d3-b2a411e24785,39c8b5e7-ac68-4009-ab82-e1ee495bdbd9,False,Normal,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina,,,rRNA Depletion,First_Stranded,RNA-Seq,314.0,1.0,2cb2a0b4-acd0-4529-b7ed-deaf1aa119c6,5e74687f-b68b-4891-9a20-1a68826ca706,2.0,Diagnosis,,-22179.0,,not hispanic or latino,female,white,Alive,,22179.0,Unknown,,,,Unknown,NX,Stage II,T2b,7th,,,,,,,,,,,,,,2018-05-17T14:05:59.588773-05:00,,,1785.0,1785.0,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,With tumor,,,,,,,,,,,,8312/3,,,,,,"Renal cell carcinoma, NOS",,,,,yes,CPTAC-3,R0,,,"Kidney, NOS",,C3L-00908-DIAG,,,"Kidney, NOS",,,Unifocal,G3,,2023-02-03T18:28:48.308091-06:00,,,,,2016.0,Adenomas and Adenocarcinomas,Kidney,CPTAC-3


In [None]:
# give the index column the name 'read_group_id'
TPM_pca_df.index.name = 'read_group_id'
# transform the index header of TPM_pca_df into a column
TPM_pca_df.reset_index(inplace=True)

In [None]:
# preview the dataframe
TPM_pca_df.head()

Unnamed: 0,read_group_id,PC1,PC2,PC3
0,52aef68b-63de-46b1-bb88-fb11d875c803,-0.646184,0.30754,1.055633
1,98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,-0.228434,0.686052,-1.465171
2,5810cc10-f53a-451a-8001-c68765e30565,3.217787,-0.360605,-0.996451
3,2042bb7e-ff24-439d-b33c-70c15cc3d201,3.087338,-1.13054,-0.300919
4,dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,-0.198615,-0.553225,0.892547


In [None]:
# take tissue_type, gender, race, age_at_diagnosis/365, ajcc_pathologic_stage, primary_diagnosis, morphology, tissue_or_organ_of_origin, tumor_focality, disease_type, primary_site from the metadata and merge it with the TPM_pca_df into a new dataframe called TPM_pre_cluster
columns = ['read_group_id', 'tissue_type', 'gender', 'race', 'age_at_diagnosis', 'ajcc_pathologic_stage', 'primary_diagnosis', 'morphology', 'tissue_or_organ_of_origin', 'tumor_focality', 'disease_type', 'primary_site']

# Create a new dataframe with selected columns from metadata
selected_metadata = metadata[columns]

# Merge selected_metadata with TPM_pca_df using the 'read_group_id' column
TPM_pre_cluster = pd.merge(TPM_pca_df, selected_metadata, on='read_group_id')

In [None]:
# divide age_at_diagnosis by 365 to get age_at_diagnosis in years
TPM_pre_cluster['age_at_diagnosis'] = TPM_pre_cluster['age_at_diagnosis'] / 365

In [None]:
# preview the dataframe
TPM_pre_cluster.head()

Unnamed: 0,read_group_id,PC1,PC2,PC3,tissue_type,gender,race,age_at_diagnosis,ajcc_pathologic_stage,primary_diagnosis,morphology,tissue_or_organ_of_origin,tumor_focality,disease_type,primary_site
0,52aef68b-63de-46b1-bb88-fb11d875c803,-0.646184,0.30754,1.055633,Tumor,female,asian,39.980822,Not Reported,Glioblastoma,9440/3,"Brain, NOS",,Gliomas,Brain
1,98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,-0.228434,0.686052,-1.465171,Tumor,male,white,65.978082,Stage IVA,"Squamous cell carcinoma, NOS",8070/3,"Tongue, NOS",Unifocal,Squamous Cell Neoplasms,Other and ill-defined sites
2,5810cc10-f53a-451a-8001-c68765e30565,3.217787,-0.360605,-0.996451,Tumor,male,white,72.452055,Unknown,"Squamous cell carcinoma, NOS",8070/3,"Larynx, NOS",Unifocal,Squamous Cell Neoplasms,Other and ill-defined sites
3,2042bb7e-ff24-439d-b33c-70c15cc3d201,3.087338,-1.13054,-0.300919,Tumor,female,white,42.550685,Stage I,"Endometrioid adenocarcinoma, NOS",8380/3,Endometrium,Unifocal,Adenomas and Adenocarcinomas,"Uterus, NOS"
4,dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,-0.198615,-0.553225,0.892547,Normal,female,white,39.065753,Stage I,"Endometrioid adenocarcinoma, NOS",8380/3,Endometrium,Unifocal,Adenomas and Adenocarcinomas,"Uterus, NOS"


In [None]:
# remove read_group_id column
TPM_pre_cluster.drop('read_group_id', axis=1, inplace=True)

In [None]:
fig = px.scatter_3d(TPM_pre_cluster, x='PC1', y='PC2', z='PC3', color='disease_type', title='PCA of TPM values')
fig.show()


In [None]:
# do k-means clustering on the pre-clustered dataframe
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=0).fit(TPM_pre_cluster.iloc[:, :3])

# add the cluster labels to the pre-clustered dataframe
TPM_pre_cluster['cluster'] = kmeans.labels_

# use plotly to create a 3d scatterplot
fig = px.scatter_3d(TPM_pre_cluster, x='PC1', y='PC2', z='PC3', color='cluster', title='PCA of TPM values')



