# Import packages

In [1]:
%matplotlib inline
import os
import pickle
from tempfile import mkdtemp

import biom
import numpy as np
import pandas as pd
import qiime2
from qiime2.plugins import demux, deblur, quality_filter, \
                            metadata, feature_table, alignment, \
                            phylogeny, diversity, emperor, feature_classifier, \
                            taxa, composition

## Modify settings

In [2]:
# Raise the column display limit to 500 and disable truncation of cell contents.
# `None` is the supported way to remove the column-width limit; the former
# sentinel value -1 was deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# Loading Patient Pickled Metadata

In [3]:
# Load pickle files to demonstrate that the Pandas formatting is not lost in the
# binary conversions.
# The paths are handed to pandas directly so that it opens AND closes the files
# itself; the previous `open(..., "rb")` handles were never closed.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
caseTesting = pd.read_pickle("caseGroupedDict.p")
fileNameTesting = pd.read_pickle("fileGroupedDict.p")

In [4]:
# Per-patient example: look up one entry of the case-grouped pickle.
# The lower-case identifier is upper-cased before it is used as the key.
caseTestingKey = str.upper("7861a5f2-8910-4bcb-9c34-f79c5acd6e21")
caseTesting[caseTestingKey]

Unnamed: 0,ageAtDiagnosis,aliquot_name,case_name,clinical_m_label,clinical_n_label,clinical_t_label,data_format_label,data_type_label,disease_type_label,ethnicity_label,experimental_strategy_label,file_name,file_submitter_id,gdc_file_uuid,gender_label,histological_diagnosis_label,pathologic_m_label,pathologic_n_label,pathologic_stage_label,pathologic_t_label,perf_score_eastern_cancer_oncology_group_label,perf_score_karnofsky_label,primary_site_label,prior_dx,race_label,ref_genome_label,sample_label,sample_type_label,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,analyte_well_number,country_of_sample_procurement,data_submitting_center_label,days_to_last_followup,file_last_modified_date,file_published_date,file_upload_date,freezing_method_label,icd03_histology_label,icd03_histology_site,icd10,new_tumor_event_after_initial_trtmt,new_tumor_event_label,perf_score_timing_label,portion_is_ffpe,portion_number,portion_slide_label,portion_weight,primary_therapy_outcome_success_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,seq_platform_label,spectrophotometer_method_label,tissue_source_site_label,vital_status_label,year_of_diagnosis
42,42,1CCF908A-7620-4AF8-B83F-0E287F3655C5,7861A5F2-8910-4BCB-9C34-F79C5ACD6E21,Not available,Not available,Not available,BAM,Raw sequencing data,Breast Invasive Carcinoma,Not available,RNA-Seq,UNCID_1147830.1ccf908a-7620-4af8-b83f-0e287f3655c5.sorted_genome_alignments.bam,D38DF7DB-3C58-4865-AE5A-E1E0B80201F1,C0D7AB92-46B4-4940-AEFF-881241D7E017,FEMALE,Infiltrating Ductal Carcinoma,Not available,N1a,Stage IIB,T2,Not available,Not available,Breast,No,Not available,HG19,D3263082-946D-4420-8A63-8D67EEEE09C5,Primary Tumor,0.15,1.79,59.37,RNA,8EA8BD47-0E8A-4B6F-B87A-27C1BDEF9B7E,Germany,University of North Carolina,518,2013-05-16T20:56:17.000Z,2012-06-21T22:01:14.000Z,2012-05-15T15:00:21.000Z,Not available,8500/3,C50.9,C50.9,,,Not available,NO,11,3658536C-C029-4685-979F-11E914E53E4E,30,Not available,7D9218C9-489E-4C64-9C92-7EE1AEB9C75E,Primary Tumor Field,External,Illumina HiSeq,UV Spec,Indivumed,Alive,2009
43,42,1CCF908A-7620-4AF8-B83F-0E287F3655C5,7861A5F2-8910-4BCB-9C34-F79C5ACD6E21,Not available,Not available,Not available,BAM,Raw sequencing data,Breast Invasive Carcinoma,Not available,RNA-Seq,UNCID_1147830.1ccf908a-7620-4af8-b83f-0e287f3655c5.sorted_genome_alignments.bam,D38DF7DB-3C58-4865-AE5A-E1E0B80201F1,C0D7AB92-46B4-4940-AEFF-881241D7E017,FEMALE,Infiltrating Ductal Carcinoma,Not available,N1a,Stage IIB,T2,Not available,Not available,Breast,No,Not available,HG19,D3263082-946D-4420-8A63-8D67EEEE09C5,Primary Tumor,0.15,1.79,59.37,RNA,8EA8BD47-0E8A-4B6F-B87A-27C1BDEF9B7E,Germany,University of North Carolina,518,2013-05-16T20:56:17.000Z,2012-06-21T22:01:14.000Z,2012-05-15T15:00:21.000Z,Not available,8500/3,C50.9,C50.9,,,Not available,NO,11,F15749FB-E5FD-4124-AD85-EEC12751816B,30,Not available,7D9218C9-489E-4C64-9C92-7EE1AEB9C75E,Primary Tumor Field,External,Illumina HiSeq,UV Spec,Indivumed,Alive,2009
44,42,6C8E9197-FD16-4FED-BFDA-E349CAB26314,7861A5F2-8910-4BCB-9C34-F79C5ACD6E21,Not available,Not available,Not available,BAM,Raw sequencing data,Breast Invasive Carcinoma,Not available,WGS,0d9ef8b814476b334be08fe7ee6ea88c.bam,8244953D-EBE8-4A88-A547-00185B4CB18A,4D604B9E-E0AE-4C4E-B844-4C1D7924D75B,FEMALE,Infiltrating Ductal Carcinoma,Not available,N1a,Stage IIB,T2,Not available,Not available,Breast,No,Not available,GRCh37-lite,D3263082-946D-4420-8A63-8D67EEEE09C5,Primary Tumor,0.07,2.23,66.73,DNA,604D99A4-3F8E-4267-BB83-C1EF582899C3,Germany,Washington University School of Medicine,518,2014-06-19T01:26:13.000Z,2012-05-25T02:12:55.000Z,2012-05-24T14:30:44.000Z,Not available,8500/3,C50.9,C50.9,,,Not available,NO,11,3658536C-C029-4685-979F-11E914E53E4E,30,Not available,7D9218C9-489E-4C64-9C92-7EE1AEB9C75E,Primary Tumor Field,External,Illumina HiSeq,UV Spec,Indivumed,Alive,2009
45,42,6C8E9197-FD16-4FED-BFDA-E349CAB26314,7861A5F2-8910-4BCB-9C34-F79C5ACD6E21,Not available,Not available,Not available,BAM,Raw sequencing data,Breast Invasive Carcinoma,Not available,WGS,0d9ef8b814476b334be08fe7ee6ea88c.bam,8244953D-EBE8-4A88-A547-00185B4CB18A,4D604B9E-E0AE-4C4E-B844-4C1D7924D75B,FEMALE,Infiltrating Ductal Carcinoma,Not available,N1a,Stage IIB,T2,Not available,Not available,Breast,No,Not available,GRCh37-lite,D3263082-946D-4420-8A63-8D67EEEE09C5,Primary Tumor,0.07,2.23,66.73,DNA,604D99A4-3F8E-4267-BB83-C1EF582899C3,Germany,Washington University School of Medicine,518,2014-06-19T01:26:13.000Z,2012-05-25T02:12:55.000Z,2012-05-24T14:30:44.000Z,Not available,8500/3,C50.9,C50.9,,,Not available,NO,11,F15749FB-E5FD-4124-AD85-EEC12751816B,30,Not available,7D9218C9-489E-4C64-9C92-7EE1AEB9C75E,Primary Tumor Field,External,Illumina HiSeq,UV Spec,Indivumed,Alive,2009
3015,42,6D1294B5-4FBB-4F31-8D45-E8540BE8452D,7861A5F2-8910-4BCB-9C34-F79C5ACD6E21,Not available,Not available,Not available,BAM,Raw sequencing data,Breast Invasive Carcinoma,Not available,WGS,01aa8d222c93eac50081544889046aeb.bam,22A130C7-FA78-4777-ACDE-BAA79C8DCBE7,8D13D716-2DB8-42E8-9C7A-42FCE5C7DFC6,FEMALE,Infiltrating Ductal Carcinoma,Not available,N1a,Stage IIB,T2,Not available,Not available,Breast,No,Not available,GRCh37-lite,D123AD55-3559-4185-87C1-A16235950BFC,Blood Derived Normal,0.08,1.88,9.4,DNA,27C1BE5B-49FB-418B-A9BE-7DCFC371CBB3,Germany,Washington University School of Medicine,518,2014-06-19T00:31:55.000Z,2012-05-24T02:01:02.000Z,2012-05-23T20:12:36.000Z,Not available,8500/3,C50.9,C50.9,,,Not available,NO,1,,200,Not available,7D9218C9-489E-4C64-9C92-7EE1AEB9C75E,Primary Tumor Field,External,Illumina HiSeq,UV Spec,Indivumed,Alive,2009


In [5]:
# Per-sample example: look up one entry of the file-grouped pickle by its file key.
fileTestingKey = "01249B8C-2E9E-4CEA-B39A-485863AB231A"
fileNameTesting[fileTestingKey]

Unnamed: 0,ageAtDiagnosis,aliquot_name,case_name,clinical_m_label,clinical_n_label,clinical_t_label,data_format_label,data_type_label,disease_type_label,ethnicity_label,experimental_strategy_label,file_name,file_submitter_id,gdc_file_uuid,gender_label,histological_diagnosis_label,pathologic_m_label,pathologic_n_label,pathologic_stage_label,pathologic_t_label,perf_score_eastern_cancer_oncology_group_label,perf_score_karnofsky_label,primary_site_label,prior_dx,race_label,ref_genome_label,sample_label,sample_type_label,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,analyte_well_number,country_of_sample_procurement,data_submitting_center_label,days_to_last_followup,file_last_modified_date,file_published_date,file_upload_date,freezing_method_label,icd03_histology_label,icd03_histology_site,icd10,new_tumor_event_after_initial_trtmt,new_tumor_event_label,perf_score_timing_label,portion_is_ffpe,portion_number,portion_slide_label,portion_weight,primary_therapy_outcome_success_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,seq_platform_label,spectrophotometer_method_label,tissue_source_site_label,vital_status_label,year_of_diagnosis
5070,64,9ABEF075-5168-4A66-9826-FD61C94B8F6D,C83D38FC-E011-4F75-A100-96513611F3E9,Not available,Not available,Not available,BAM,Raw sequencing data,Colon Adenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,UNCID_2153922.9abef075-5168-4a66-9826-fd61c94b8f6d.sorted_genome_alignments.bam,39C04C87-8662-4704-9E72-53F94D852914,01249B8C-2E9E-4CEA-B39A-485863AB231A,FEMALE,Colon Adenocarcinoma,Not available,N1a,Stage IIIC,T4b,Not available,Not available,Colorectal,No,BLACK OR AFRICAN AMERICAN,HG19,69EF3E70-5D7A-4E30-ABDE-0F9F76EA3141,Primary Tumor,0.17,2.23,24.87,RNA,02451886-80DE-42DE-885F-5A8BBF98016D,United States,University of North Carolina,821,2013-08-28T22:31:06.000Z,2013-08-28T22:31:06.000Z,2013-08-28T22:15:17.000Z,Not available,8140/3,C18.7,C18.7,NO,,Not available,NO,11,B81997A0-3519-4647-9CBB-A8E62A31E839,30,Complete Remission/Response,,,,Illumina HiSeq,UV Spec,BLN - Baylor,Alive,2011


# Loading Cancer Microbiome Data

In [6]:
# Directory that holds the BIOM tables loaded below.
# Set the TCGA_WORKDIR environment variable to run the notebook on another
# machine; when it is unset (or empty) the original location is used, so the
# existing behaviour is unchanged.
# os.path.join(..., '') guarantees the trailing separator that the
# `workdir + filename` concatenations in later cells rely on.
workdir = os.path.join(
    os.environ.get('TCGA_WORKDIR')
    or '/Users/gdpoore/Google Drive/AAA_UCSD_Research/AAA_Knight_Lab/AAA_Projects/AAA_TCGA/All_Tumor_Analysis_GDP082118/',
    ''
)
# %cd $workdir

In [7]:
# Read the bacterial and viral BIOM tables from the working directory
# (bacterial first, then viral).
bactDataBarn, virDataBarn = (
    biom.load_table(workdir + biomFile)
    for biomFile in ('Bacterial_all_cancer_types.biom',
                     'Viral_all_cancer_types.biom')
)

# Wrap each BIOM table in a QIIME 2 artifact of type FeatureTable[Frequency]
bactDataBarnQ2, virDataBarnQ2 = (
    qiime2.Artifact.import_data('FeatureTable[Frequency]', biomTable)
    for biomTable in (bactDataBarn, virDataBarn)
)

In [8]:
# View each QIIME 2 artifact as a pandas DataFrame (bacterial first, then viral).
bactDataBarnDF, virDataBarnDF = (
    artifact.view(pd.DataFrame)
    for artifact in (bactDataBarnQ2, virDataBarnQ2)
)

In [9]:
# Print the shape of each table, then preview the first rows of both.
# Only the final expression of a cell is rendered automatically, so the bacterial
# preview is passed to display() explicitly; as a bare statement in the middle of
# the cell its result was silently discarded and only the viral table was shown.
# (display() is provided by the IPython kernel; no import is required.)
print(bactDataBarnDF.shape)
print(virDataBarnDF.shape)
display(bactDataBarnDF.head())
virDataBarnDF.head()

(18124, 1626)
(18120, 367)


Unnamed: 0,k__Viruses;f__Phycodnaviridae;g__Prasinovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Sfi1unalikevirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Simplexvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__C2likevirus,k__Viruses;f__Poxviridae;g__Parapoxvirus,k__Viruses;f__Papillomaviridae;g__Dyolambdapapillomavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Bignuzlikevirus,k__Viruses;o__Mononegavirales;f__Bornaviridae;g__Bornavirus,k__Viruses;f__Bicaudaviridae;g__Bicaudavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Aquamavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Scutavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Mardivirus,k__Viruses;f__Baculoviridae;g__Deltabaculovirus,k__Viruses;f__Papillomaviridae;g__Taupapillomavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__I3likevirus,k__Viruses;f__Virgaviridae;g__Furovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Proboscivirus,k__Viruses;o__Herpesvirales;f__Alloherpesviridae;g__Ictalurivirus,k__Viruses;f__Inoviridae;g__Inovirus,k__Viruses;f__Marseilleviridae;g__Marseillevirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Gammacoronavirus,k__Viruses;f__Anelloviridae;g__Thetatorquevirus,k__Viruses;f__Ascoviridae;g__Ascovirus,k__Viruses;o__Tymovirales;f__Tymoviridae;g__Maculavirus,k__Viruses;g__Negevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Spo1virus,k__Viruses;f__Circoviridae;g__Gyrovirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Novirhabdovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Sicinivirus,k__Viruses;o__Mononegavirales;g__Pneumovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Dicipivirus,k__Viruses;f__Poxviridae;g__Molluscipoxvirus,k__Viruses;o__Tymovirales;f__Tymoviridae;g__Tymovirus,k__Viruses;f__Arenaviridae;g__Mammarenavirus,k__Viruses;f__Totiviridae;g__Victorivirus,k__Viruses;f__Flaviviridae;g__Hepacivirus,k__Viruses;f__Poxviridae;g__Alphaentomopoxvirus,k__Viruses;f__Anelloviridae;g__Gammatorquevirus,k__Viruses;o__Tymovira
les;f__Gammaflexiviridae;g__Mycoflexivirus,k__Viruses;g__Tenuivirus,k__Viruses;f__Hepeviridae;g__Piscihepevirus,k__Viruses;f__Virgaviridae;g__Pomovirus,k__Viruses;f__Caulimoviridae;g__Badnavirus,k__Viruses;f__Polydnaviridae;g__Ichnovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Percavirus,k__Viruses;f__Papillomaviridae;g__Betapapillomavirus,k__Viruses;f__Togaviridae;g__Alphavirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Trichovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Bcep22likevirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Sclerodarnavirus,k__Viruses;f__Phycodnaviridae;g__Coccolithovirus,k__Viruses;o__Ligamenvirales;f__Lipothrixviridae;g__Deltalipothrixvirus,k__Viruses;f__Papillomaviridae;g__Omikronpapillomavirus,k__Viruses;f__Retroviridae;g__Alpharetrovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Jerseylikevirus,k__Viruses;f__Poxviridae;g__Capripoxvirus,k__Viruses;f__Poxviridae;g__Cervidpoxvirus,k__Viruses;g__Higrevirus,k__Viruses;f__Potyviridae;g__Bymovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Phikmvlikevirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Tremovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Torradovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__T5likevirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Bafinivirus,k__Viruses;f__Baculoviridae;g__Alphabaculovirus,k__Viruses;f__Papillomaviridae;g__Mupapillomavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Lambdalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Felixounalikevirus,k__Viruses;f__Papillomaviridae;g__Phipapillomavirus,k__Viruses;f__Papillomaviridae;g__Omegapapillomavirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Betacoronavirus,k__Viruses;f__Luteoviridae;g__Enamovirus,k__Viruses;f__Phycodnaviridae;g__Chlorovirus,k__Viruses;f__Hypoviridae;g__Hypovirus,k__Viruses;f__Anelloviridae;g__Alphatorquevirus,k__Viruses;o__Picornavirales;f__Dicistroviridae;g__Aparavirus,k__Viruses;g__Pithovirus,k__Viruses;f__Phycodnaviridae;g__Phae
ovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Hepatovirus,k__Viruses;f__Virgaviridae;g__Tobamovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Barnyardlikevirus,k__Viruses;f__Baculoviridae;g__Betabaculovirus,k__Viruses;f__Arenaviridae;g__Arenavirus,k__Viruses;o__Picornavirales;f__Marnaviridae;g__Marnavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__P100virus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Spounalikevirus,k__Viruses;f__Partitiviridae;g__Betapartitivirus,k__Viruses;f__Reoviridae;g__Cypovirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Carlavirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Alphacoronavirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Avulavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Iltovirus,k__Viruses;f__Bromoviridae;g__Bromovirus,k__Viruses;f__Bunyaviridae;g__Phlebovirus,k__Viruses;f__Closteroviridae;g__Closterovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Viunalikevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Phicbklikevirus,k__Viruses;f__Fuselloviridae;g__Alphafusellovirus,k__Viruses;f__Bunyaviridae;g__Orthobunyavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Sapelovirus,k__Viruses;f__Bunyaviridae;g__Tospovirus,k__Viruses;f__Mimiviridae;g__Cafeteriavirus,k__Viruses;f__Poxviridae;g__Suipoxvirus,k__Viruses;f__Potyviridae;g__Potyvirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Muromegalovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Salivirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__N4likevirus,k__Viruses;f__Retroviridae;g__Lentivirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Fabavirus,k__Viruses;f__Polydnaviridae;g__Bracovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Rhadinovirus,k__Viruses;f__Luteoviridae;g__Polerovirus,k__Viruses;f__Flaviviridae;g__Pestivirus,k__Viruses;f__Poxviridae;g__Betaentomopoxvirus,k__Viruses;f__Geminiviridae;g__Begomovirus,k__Viruses;o__Herpesvirales;f__Alloherpesviridae;g__Cyprinivirus,k__Viruses;o__Mononegavirales;f__Nyamiv
iridae;g__Nyavirus,k__Viruses;f__Potyviridae;g__Rymovirus,k__Viruses;f__Retroviridae;g__Betaretrovirus,k__Viruses;f__Microviridae;g__Microvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Andromedalikevirus,k__Viruses;f__Iridoviridae;g__Chloriridovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Aphthovirus,k__Viruses;f__Iridoviridae;g__Lymphocystivirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Lolavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Silviavirus,k__Viruses;f__Caliciviridae;g__Vesivirus,k__Viruses;f__Poxviridae;g__Avipoxvirus,k__Viruses;g__Cilevirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Rubulavirus,k__Viruses;f__Chrysoviridae;g__Chrysovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Twortlikevirus,k__Viruses;g__Emaravirus,k__Viruses;f__Astroviridae;g__Avastrovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Cheravirus,k__Viruses;o__Ligamenvirales;f__Rudiviridae;g__Rudivirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Varicellovirus,k__Viruses;f__Polyomaviridae;g__Polyomavirus,k__Viruses;o__Herpesvirales;f__Alloherpesviridae;g__Batrachovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Enterovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Waikavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__T4likevirus,k__Viruses;f__Poxviridae;g__Yatapoxvirus,k__Viruses;f__Adenoviridae;g__Mastadenovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Cytomegalovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Kobuvirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__T7likevirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Roseolovirus,k__Viruses;f__Adenoviridae;g__Aviadenovirus,k__Viruses;o__Ligamenvirales;f__Lipothrixviridae;g__Betalipothrixvirus,k__Viruses;f__Iridoviridae;g__Ranavirus,k__Viruses;f__Closteroviridae;g__Ampelovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Phietalikevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Pgonelikevirus,k__Viruses;f__Orthomyxoviridae;g__Isavirus,k__Viruses;f__Poxvirid
ae;g__Orthopoxvirus,k__Viruses;f__Papillomaviridae;g__Gammapapillomavirus,k__Viruses;f__Phycodnaviridae;g__Prymnesiovirus,k__Viruses;f__Bunyaviridae;g__Hantavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Senecavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__L5likevirus,k__Viruses;o__Picornavirales;f__Dicistroviridae;g__Cripavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Tunalikevirus,k__Viruses;f__Hytrosaviridae;g__Muscavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Macavirus,k__Viruses;f__Bromoviridae;g__Oleavirus,k__Viruses;f__Closteroviridae;g__Crinivirus,k__Viruses;f__Papillomaviridae;g__Lambdapapillomavirus,k__Viruses;f__Papillomaviridae;g__Alphapapillomavirus,k__Viruses;f__Partitiviridae;g__Alphapartitivirus,k__Viruses;f__Mimiviridae;g__Mimivirus,k__Viruses;o__Herpesvirales;f__Malacoherpesviridae;g__Ostreavirus,k__Viruses;o__Mononegavirales;g__Metapneumovirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Potexvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Sfi21dtunalikevirus,k__Viruses;f__Retroviridae;g__Gammaretrovirus,k__Viruses;f__Circoviridae;g__Circovirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Foveavirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Nepovirus,k__Viruses;o__Picornavirales;f__Iflaviridae;g__Iflavirus,k__Viruses;f__Iridoviridae;g__Iridovirus,k__Viruses;f__Flaviviridae;g__Flavivirus,k__Viruses;f__Astroviridae;g__Mamastrovirus,k__Viruses;f__Virgaviridae;g__Pecluvirus,k__Viruses;o__Nidovirales;f__Roniviridae;g__Okavirus,k__Viruses;g__Bacilladnavirus,k__Viruses;f__Bromoviridae;g__Cucumovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Lymphocryptovirus,k__Viruses;f__Nimaviridae;g__Whispovirus,k__Viruses;f__Hytrosaviridae;g__Glossinavirus,k__Viruses;f__Virgaviridae;g__Hordeivirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Skunalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Cp220likevirus,k__Viruses;o__Tymovirales;f__Tymoviridae;g__Marafivirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Ph
ikzlikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Cp8unalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__PhiCD119likevirus,k__Viruses;f__Papillomaviridae;g__Pipapillomavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Megrivirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Kayvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__C5likevirus,k__Viruses;f__Endornaviridae;g__Endornavirus,k__Viruses;f__Reoviridae;g__Orbivirus,k__Viruses;f__Adenoviridae;g__Atadenovirus,k__Viruses;f__Orthomyxoviridae;g__Influenzavirus_C,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Botrexvirus,k__Viruses;f__Poxviridae;g__Leporipoxvirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Phi29likevirus,k__Viruses;f__Parvoviridae;g__Erythroparvovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Schizot4likevirus,k__Viruses;f__Baculoviridae;g__Gammabaculovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Yualikevirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Cosavirus,k__Viruses;f__Ampullaviridae;g__Ampullavirus,k__Viruses;f__Tectiviridae;g__Tectivirus,k__Viruses;f__Caulimoviridae;g__Cavemovirus,k__Viruses;f__Anelloviridae;g__Epsilontorquevirus,k__Viruses;f__Papillomaviridae;g__Deltapapillomavirus,k__Viruses;f__Parvoviridae;g__Iteradensovirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Ephemerovirus,k__Viruses;f__Poxviridae;g__Crocodylidpoxvirus,k__Viruses;f__Anelloviridae;g__Betatorquevirus,k__Viruses;f__Turriviridae;g__Alphaturrivirus,k__Viruses;f__Caulimoviridae;g__Caulimovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__P23likevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Pbunalikevirus,k__Viruses;f__Globuloviridae;g__Globulovirus,k__Viruses;f__Reoviridae;g__Fijivirus,k__Viruses;f__Reoviridae;g__Phytoreovirus,k__Viruses;f__Benyviridae;g__Benyvirus,k__Viruses;f__Parvoviridae;g__Protoparvovirus,k__Viruses;f__Asfarviridae;g__Asfivirus,k__Viruses;f__Luteoviridae;g__Luteovirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Respirovirus,k__Viruses;o__Monone
gavirales;f__Filoviridae;g__Marburgvirus,k__Viruses;f__Fuselloviridae;g__Betafusellovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Pasivirus,k__Viruses;f__Bromoviridae;g__Alfamovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Sp6likevirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Deltacoronavirus,k__Viruses;f__Inoviridae;g__Plectrovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Sadwavirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Morbillivirus,k__Viruses;o__Mononegavirales;g__Orthopneumovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__3alikevirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Mandarivirus,k__Viruses;f__Togaviridae;g__Rubivirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Vitivirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Xp10likevirus,k__Viruses;f__Sphaerolipoviridae;g__Alphasphaerolipovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Hpunalikevirus,k__Viruses;f__Parvoviridae;g__Brevidensovirus,k__Viruses;f__Papillomaviridae;g__Dyoetapapillomavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Sap6likevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Che9clikevirus,k__Viruses;f__Caulimoviridae;g__Rosadnavirus,k__Viruses;f__Caulimoviridae;g__Solendovirus,k__Viruses;f__Sphaerolipoviridae;g__Gammasphaerolipovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Phijlunalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__P2likevirus,k__Viruses;f__Flaviviridae;g__Pegivirus,k__Viruses;f__Adenoviridae;g__Siadenovirus,k__Viruses;f__Parvoviridae;g__Copiparvovirus,k__Viruses;f__Anelloviridae;g__Iotatorquevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Hapunalikevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__N15likevirus,k__Viruses;f__Retroviridae;g__Epsilonretrovirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Henipavirus,k__Viruses;f__Reoviridae;g__Rotavirus,k__Viruses;o__Ligamenvirales;f__Lipothrixviridae;g__Gammalipothrixvirus,k__Viruses;f__Reoviridae;g__Mimoreovirus,k__Viruses;o__Mononegavirales
;f__Rhabdoviridae;g__Vesiculovirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Lyssavirus,k__Viruses;f__Hepadnaviridae;g__Orthohepadnavirus,k__Viruses;f__Parvoviridae;g__Dependoparvovirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Allexivirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Parechovirus,k__Viruses;o__Nidovirales;f__Mesoniviridae;g__Alphamesonivirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Corndoglikevirus,k__Viruses;f__Tombusviridae;g__Carmovirus,k__Viruses;f__Parvoviridae;g__Ambidensovirus,k__Viruses;g__Sobemovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Comovirus,k__Viruses;o__Mononegavirales;f__Filoviridae;g__Ebolavirus,k__Viruses;f__Geminiviridae;g__Mastrevirus,k__Viruses;g__Polemovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Passerivirus,k__Viruses;f__Leviviridae;g__Levivirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Rosavirus,k__Viruses;f__Retroviridae;g__Deltaretrovirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Aquaparamyxovirus,k__Viruses;f__Parvoviridae;g__Hepandensovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Phieco32likevirus,k__Viruses;f__Reoviridae;g__Aquareovirus,k__Viruses;f__Bromoviridae;g__Ilarvirus,k__Viruses;f__Nanoviridae;g__Nanovirus,k__Viruses;f__Totiviridae;g__Trichomonasvirus,k__Viruses;f__Nanoviridae;g__Babuvirus,k__Viruses;f__Papillomaviridae;g__Dyoomikronpapillomavirus,k__Viruses;f__Caulimoviridae;g__Soymovirus,k__Viruses;f__Tombusviridae;g__Tombusvirus,k__Viruses;f__Totiviridae;g__Leishmaniavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Phic3unalikevirus,k__Viruses;f__Parvoviridae;g__Bocaparvovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Hk578likevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Tm4likevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Che8likevirus,k__Viruses;f__Tombusviridae;g__Panicovirus,k__Viruses;f__Papillomaviridae;g__Nupapillomavirus,k__Viruses;f__Orthomyxoviridae;g__Influenzavirus_A,k__Viruses;o__Caudovirales;f__Podoviridae
;g__Ahjdlikevirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Luz24likevirus,k__Viruses;f__Anelloviridae;g__Zetatorquevirus,k__Viruses;f__Barnaviridae;g__Barnavirus,k__Viruses;f__Parvoviridae;g__Tetraparvovirus,k__Viruses;f__Narnaviridae;g__Mitovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Charlielikevirus,k__Viruses;o__Nidovirales;f__Arteriviridae;g__Arterivirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Perhabdovirus,k__Viruses;f__Potyviridae;g__Ipomovirus,k__Viruses;f__Geminiviridae;g__Curtovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Gallivirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Iebhlikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Mulikevirus,k__Viruses;o__Picornavirales;g__Bacillarnavirus,k__Viruses;f__Megabirnaviridae;g__Megabirnavirus,k__Viruses;f__Corticoviridae;g__Corticovirus,k__Viruses;f__Bunyaviridae;g__Nairovirus,k__Viruses;f__Caulimoviridae;g__Tungrovirus,k__Viruses;f__Retroviridae;g__Spumavirus,k__Viruses;f__Reoviridae;g__Mycoreovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Bcep78likevirus,k__Viruses;g__Idaeovirus,k__Viruses;f__Narnaviridae;g__Narnavirus,k__Viruses;f__Reoviridae;g__Coltivirus,k__Viruses;f__Totiviridae;g__Totivirus,k__Viroids;f__Pospiviroidae;g__Cocadviroid,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Chilikevirus,k__Viruses;f__Potyviridae;g__Tritimovirus,k__Viruses;f__Papillomaviridae;g__Kappapapillomavirus,k__Viruses;f__Reoviridae;g__Dinovernavirus,k__Viruses;f__Anelloviridae;g__Kappatorquevirus,k__Viruses;f__Potyviridae;g__Poacevirus,k__Viruses;f__Iridoviridae;g__Megalocytivirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Nucleorhabdovirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Tibrovirus,k__Viruses;f__Closteroviridae;g__Velarivirus,k__Viruses;f__Papillomaviridae;g__Rhopapillomavirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Capillovirus,k__Viruses;g__Varicosavirus,k__Viruses;f__Hepadnaviridae;g__Avihepadnavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Bronli
kevirus,k__Viroids;f__Pospiviroidae;g__Coleviroid,k__Viroids;f__Avsunviroidae;g__Pelamoviroid,k__Viruses;o__Caudovirales;f__Podoviridae;g__Bppunalikevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__D3likevirus,k__Viruses;f__Caliciviridae;g__Norovirus,k__Viruses;f__Leviviridae;g__Allolevivirus,k__Viruses;f__Microviridae;g__Bdellomicrovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Omegalikevirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Sakobuvirus,k__Viruses;f__Plasmaviridae;g__Plasmavirus,k__Viroids;f__Pospiviroidae;g__Apscaviroid,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Pbiunalikevirus,k__Viruses;f__Microviridae;g__Spiromicrovirus,k__Viruses;f__Papillomaviridae;g__Dyopipapillomavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Cjwunalikevirus
s17489,0.0,0.0,124.0,0.0,2.0,0.0,0.0,0.0,0.0,26.0,0.0,24.0,2.0,2.0,0.0,510.0,22.0,0.0,4.0,0.0,128.0,0.0,10.0,0.0,0.0,346.0,0.0,0.0,42.0,0.0,418.0,36.0,28.0,0.0,0.0,102.0,2.0,4.0,0.0,0.0,26.0,6.0,0.0,598.0,16.0,0.0,312.0,0.0,0.0,182.0,0.0,0.0,0.0,42.0,0.0,4.0,2.0,150.0,0.0,14.0,0.0,0.0,0.0,22.0,54.0,0.0,4.0,4.0,0.0,4.0,0.0,0.0,10.0,78.0,0.0,32.0,0.0,0.0,12.0,20.0,0.0,32.0,0.0,0.0,0.0,0.0,698.0,0.0,2676.0,4.0,0.0,0.0,194.0,0.0,2.0,0.0,0.0,0.0,4.0,78.0,58.0,2.0,0.0,90.0,20.0,112.0,0.0,2.0,12.0,576.0,34.0,0.0,26.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,0.0,6.0,268.0,0.0,0.0,14.0,154.0,2.0,8.0,22.0,2186.0,0.0,2.0,46.0,336.0,10.0,4.0,4.0,0.0,4.0,0.0,0.0,0.0,6.0,2.0,0.0,92.0,0.0,38.0,2.0,4.0,2.0,0.0,6.0,2.0,0.0,0.0,8.0,366.0,0.0,0.0,0.0,156.0,0.0,100.0,2.0,0.0,18.0,136.0,6.0,104.0,688.0,0.0,0.0,2.0,2.0,2.0,950.0,28.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17512,4.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,6.0,0.0,0.0,2.0,88.0,20.0,2.0,0.0,0.0,14.0,0.0,2.0,0.0,0.0,110.0,0.0,0.0,2.0,0.0,136.0,14.0,14.0,0.0,0.0,48.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,698.0,10.0,0.0,104.0,0.0,0.0,58.0,6.0,0.0,0.0,56.0,0.0,0.0,2.0,26.0,0.0,0.0,2.0,0.0,0.0,12.0,94.0,4.0,0.0,0.0,0.0,14.0,0.0,0.0,4.0,8.0,0.0,16.0,0.0,4.0,6.0,12.0,0.0,66.0,0.0,0.0,0.0,0.0,268.0,0.0,678.0,2.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,68.0,2.0,0.0,14.0,6.0,16.0,0.0,0.0,0.0,460.0,16.0,0.0,40.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,2.0,54.0,0.0,0.0,14.0,52.0,0.0,0.0,6.0,0.0,0.0,8.0,42.0,72.0,14.0,24.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.0,0.0,18.0,4.0,0.0,2.0,0.0,6.0,0.0,0.0,0.0,50.0,150.0,2.0,0.0,2.0,24.0,0.0,36.0,10.0,0.0,10.0,32.0,0.0,64.0,150.0,0.0,0.0,0.0,0.0,2.0,138.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17498,0.0,0.0,108.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,28.0,0.0,10.0,2.0,86.0,20.0,4.0,2.0,0.0,22.0,0.0,18.0,0.0,0.0,74.0,0.0,0.0,2.0,0.0,60.0,16.0,20.0,0.0,0.0,56.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,458.0,14.0,0.0,70.0,0.0,0.0,30.0,2.0,0.0,0.0,16.0,0.0,6.0,0.0,12.0,0.0,2.0,2.0,0.0,0.0,8.0,76.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,10.0,8.0,0.0,12.0,0.0,2.0,6.0,14.0,0.0,50.0,0.0,0.0,0.0,0.0,224.0,0.0,426.0,6.0,0.0,0.0,52.0,0.0,2.0,0.0,0.0,0.0,10.0,14.0,52.0,2.0,0.0,18.0,14.0,20.0,0.0,0.0,2.0,300.0,36.0,0.0,20.0,6.0,0.0,92.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,4.0,40.0,0.0,0.0,12.0,74.0,0.0,4.0,16.0,0.0,0.0,10.0,54.0,46.0,40.0,14.0,12.0,0.0,2.0,0.0,0.0,0.0,2.0,4.0,0.0,92.0,2.0,12.0,0.0,4.0,0.0,0.0,8.0,0.0,0.0,2.0,56.0,194.0,0.0,4.0,0.0,30.0,0.0,220.0,4.0,0.0,8.0,18.0,0.0,64.0,86.0,0.0,0.0,0.0,0.0,0.0,316.0,22.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17528,2.0,0.0,110.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,2.0,2.0,4.0,40.0,14.0,0.0,0.0,0.0,6.0,0.0,4.0,0.0,0.0,50.0,0.0,0.0,2.0,0.0,52.0,20.0,2.0,2.0,0.0,22.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,778.0,8.0,0.0,62.0,0.0,0.0,28.0,4.0,0.0,0.0,24.0,0.0,2.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,8.0,62.0,0.0,0.0,0.0,0.0,8.0,2.0,0.0,2.0,6.0,2.0,2.0,0.0,2.0,4.0,4.0,0.0,418.0,0.0,0.0,0.0,0.0,86.0,0.0,386.0,2.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,36.0,4.0,24.0,6.0,0.0,14.0,18.0,16.0,0.0,0.0,4.0,126.0,4.0,0.0,20.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,40.0,0.0,0.0,38.0,38.0,2.0,0.0,6.0,68.0,0.0,10.0,30.0,32.0,12.0,14.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,4.0,66.0,0.0,4.0,4.0,0.0,0.0,0.0,8.0,0.0,4.0,2.0,8.0,68.0,0.0,2.0,0.0,14.0,0.0,42.0,6.0,0.0,0.0,14.0,0.0,26.0,72.0,0.0,0.0,0.0,2.0,0.0,122.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17535,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,14.0,0.0,0.0,0.0,134.0,2.0,2.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,104.0,0.0,0.0,16.0,0.0,202.0,16.0,2.0,0.0,0.0,32.0,0.0,2.0,0.0,0.0,2.0,4.0,0.0,218.0,4.0,0.0,70.0,0.0,0.0,44.0,2.0,0.0,0.0,16.0,0.0,0.0,0.0,38.0,0.0,0.0,0.0,0.0,0.0,10.0,38.0,0.0,0.0,0.0,0.0,14.0,4.0,0.0,6.0,12.0,0.0,10.0,0.0,0.0,8.0,0.0,0.0,102.0,0.0,0.0,0.0,0.0,278.0,0.0,872.0,0.0,0.0,0.0,66.0,0.0,0.0,0.0,0.0,0.0,8.0,28.0,18.0,2.0,0.0,30.0,2.0,36.0,4.0,0.0,0.0,172.0,8.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,4.0,28.0,0.0,2.0,2.0,426.0,0.0,2.0,10.0,118.0,24.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,22.0,0.0,20.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,14.0,134.0,8.0,0.0,0.0,46.0,14.0,430.0,2.0,0.0,6.0,38.0,0.0,24.0,156.0,0.0,0.0,0.0,0.0,0.0,260.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Combine the two tables column-wise, aligning rows on their index labels.
# how='left' keeps every row of virDataBarnDF; any index label absent from
# bactDataBarnDF gets NaN in the columns contributed by bactDataBarnDF.
vbDataBarnDF = pd.merge(virDataBarnDF, bactDataBarnDF, how='left', left_index=True, right_index=True)

# The merged table is expected to be 18120 x 1993. Check it explicitly rather
# than relying on a reader comparing the printed shape against a comment, so a
# bad merge (e.g. duplicated index labels multiplying rows, or a changed
# upstream table) stops the notebook here instead of propagating downstream.
expectedVbShape = (18120, 1993)
print(vbDataBarnDF.shape)
assert vbDataBarnDF.shape == expectedVbShape, \
    "Unexpected merged shape {}; expected {}".format(vbDataBarnDF.shape, expectedVbShape)
vbDataBarnDF.head()

(18120, 1993)


Unnamed: 0,k__Viruses;f__Phycodnaviridae;g__Prasinovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Sfi1unalikevirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Simplexvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__C2likevirus,k__Viruses;f__Poxviridae;g__Parapoxvirus,k__Viruses;f__Papillomaviridae;g__Dyolambdapapillomavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Bignuzlikevirus,k__Viruses;o__Mononegavirales;f__Bornaviridae;g__Bornavirus,k__Viruses;f__Bicaudaviridae;g__Bicaudavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Aquamavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Scutavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Mardivirus,k__Viruses;f__Baculoviridae;g__Deltabaculovirus,k__Viruses;f__Papillomaviridae;g__Taupapillomavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__I3likevirus,k__Viruses;f__Virgaviridae;g__Furovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Proboscivirus,k__Viruses;o__Herpesvirales;f__Alloherpesviridae;g__Ictalurivirus,k__Viruses;f__Inoviridae;g__Inovirus,k__Viruses;f__Marseilleviridae;g__Marseillevirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Gammacoronavirus,k__Viruses;f__Anelloviridae;g__Thetatorquevirus,k__Viruses;f__Ascoviridae;g__Ascovirus,k__Viruses;o__Tymovirales;f__Tymoviridae;g__Maculavirus,k__Viruses;g__Negevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Spo1virus,k__Viruses;f__Circoviridae;g__Gyrovirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Novirhabdovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Sicinivirus,k__Viruses;o__Mononegavirales;g__Pneumovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Dicipivirus,k__Viruses;f__Poxviridae;g__Molluscipoxvirus,k__Viruses;o__Tymovirales;f__Tymoviridae;g__Tymovirus,k__Viruses;f__Arenaviridae;g__Mammarenavirus,k__Viruses;f__Totiviridae;g__Victorivirus,k__Viruses;f__Flaviviridae;g__Hepacivirus,k__Viruses;f__Poxviridae;g__Alphaentomopoxvirus,k__Viruses;f__Anelloviridae;g__Gammatorquevirus,k__Viruses;o__Tymovira
les;f__Gammaflexiviridae;g__Mycoflexivirus,k__Viruses;g__Tenuivirus,k__Viruses;f__Hepeviridae;g__Piscihepevirus,k__Viruses;f__Virgaviridae;g__Pomovirus,k__Viruses;f__Caulimoviridae;g__Badnavirus,k__Viruses;f__Polydnaviridae;g__Ichnovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Percavirus,k__Viruses;f__Papillomaviridae;g__Betapapillomavirus,k__Viruses;f__Togaviridae;g__Alphavirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Trichovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Bcep22likevirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Sclerodarnavirus,k__Viruses;f__Phycodnaviridae;g__Coccolithovirus,k__Viruses;o__Ligamenvirales;f__Lipothrixviridae;g__Deltalipothrixvirus,k__Viruses;f__Papillomaviridae;g__Omikronpapillomavirus,k__Viruses;f__Retroviridae;g__Alpharetrovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Jerseylikevirus,k__Viruses;f__Poxviridae;g__Capripoxvirus,k__Viruses;f__Poxviridae;g__Cervidpoxvirus,k__Viruses;g__Higrevirus,k__Viruses;f__Potyviridae;g__Bymovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Phikmvlikevirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Tremovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Torradovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__T5likevirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Bafinivirus,k__Viruses;f__Baculoviridae;g__Alphabaculovirus,k__Viruses;f__Papillomaviridae;g__Mupapillomavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Lambdalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Felixounalikevirus,k__Viruses;f__Papillomaviridae;g__Phipapillomavirus,k__Viruses;f__Papillomaviridae;g__Omegapapillomavirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Betacoronavirus,k__Viruses;f__Luteoviridae;g__Enamovirus,k__Viruses;f__Phycodnaviridae;g__Chlorovirus,k__Viruses;f__Hypoviridae;g__Hypovirus,k__Viruses;f__Anelloviridae;g__Alphatorquevirus,k__Viruses;o__Picornavirales;f__Dicistroviridae;g__Aparavirus,k__Viruses;g__Pithovirus,k__Viruses;f__Phycodnaviridae;g__Phae
ovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Hepatovirus,k__Viruses;f__Virgaviridae;g__Tobamovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Barnyardlikevirus,k__Viruses;f__Baculoviridae;g__Betabaculovirus,k__Viruses;f__Arenaviridae;g__Arenavirus,k__Viruses;o__Picornavirales;f__Marnaviridae;g__Marnavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__P100virus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Spounalikevirus,k__Viruses;f__Partitiviridae;g__Betapartitivirus,k__Viruses;f__Reoviridae;g__Cypovirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Carlavirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Alphacoronavirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Avulavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Iltovirus,k__Viruses;f__Bromoviridae;g__Bromovirus,k__Viruses;f__Bunyaviridae;g__Phlebovirus,k__Viruses;f__Closteroviridae;g__Closterovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Viunalikevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Phicbklikevirus,k__Viruses;f__Fuselloviridae;g__Alphafusellovirus,k__Viruses;f__Bunyaviridae;g__Orthobunyavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Sapelovirus,k__Viruses;f__Bunyaviridae;g__Tospovirus,k__Viruses;f__Mimiviridae;g__Cafeteriavirus,k__Viruses;f__Poxviridae;g__Suipoxvirus,k__Viruses;f__Potyviridae;g__Potyvirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Muromegalovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Salivirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__N4likevirus,k__Viruses;f__Retroviridae;g__Lentivirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Fabavirus,k__Viruses;f__Polydnaviridae;g__Bracovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Rhadinovirus,k__Viruses;f__Luteoviridae;g__Polerovirus,k__Viruses;f__Flaviviridae;g__Pestivirus,k__Viruses;f__Poxviridae;g__Betaentomopoxvirus,k__Viruses;f__Geminiviridae;g__Begomovirus,k__Viruses;o__Herpesvirales;f__Alloherpesviridae;g__Cyprinivirus,k__Viruses;o__Mononegavirales;f__Nyamiv
iridae;g__Nyavirus,k__Viruses;f__Potyviridae;g__Rymovirus,k__Viruses;f__Retroviridae;g__Betaretrovirus,k__Viruses;f__Microviridae;g__Microvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Andromedalikevirus,k__Viruses;f__Iridoviridae;g__Chloriridovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Aphthovirus,k__Viruses;f__Iridoviridae;g__Lymphocystivirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Lolavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Silviavirus,k__Viruses;f__Caliciviridae;g__Vesivirus,k__Viruses;f__Poxviridae;g__Avipoxvirus,k__Viruses;g__Cilevirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Rubulavirus,k__Viruses;f__Chrysoviridae;g__Chrysovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Twortlikevirus,k__Viruses;g__Emaravirus,k__Viruses;f__Astroviridae;g__Avastrovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Cheravirus,k__Viruses;o__Ligamenvirales;f__Rudiviridae;g__Rudivirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Varicellovirus,k__Viruses;f__Polyomaviridae;g__Polyomavirus,k__Viruses;o__Herpesvirales;f__Alloherpesviridae;g__Batrachovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Enterovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Waikavirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__T4likevirus,k__Viruses;f__Poxviridae;g__Yatapoxvirus,k__Viruses;f__Adenoviridae;g__Mastadenovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Cytomegalovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Kobuvirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__T7likevirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Roseolovirus,k__Viruses;f__Adenoviridae;g__Aviadenovirus,k__Viruses;o__Ligamenvirales;f__Lipothrixviridae;g__Betalipothrixvirus,k__Viruses;f__Iridoviridae;g__Ranavirus,k__Viruses;f__Closteroviridae;g__Ampelovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Phietalikevirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Pgonelikevirus,k__Viruses;f__Orthomyxoviridae;g__Isavirus,k__Viruses;f__Poxvirid
ae;g__Orthopoxvirus,k__Viruses;f__Papillomaviridae;g__Gammapapillomavirus,k__Viruses;f__Phycodnaviridae;g__Prymnesiovirus,k__Viruses;f__Bunyaviridae;g__Hantavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Senecavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__L5likevirus,k__Viruses;o__Picornavirales;f__Dicistroviridae;g__Cripavirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Tunalikevirus,k__Viruses;f__Hytrosaviridae;g__Muscavirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Macavirus,k__Viruses;f__Bromoviridae;g__Oleavirus,k__Viruses;f__Closteroviridae;g__Crinivirus,k__Viruses;f__Papillomaviridae;g__Lambdapapillomavirus,k__Viruses;f__Papillomaviridae;g__Alphapapillomavirus,k__Viruses;f__Partitiviridae;g__Alphapartitivirus,k__Viruses;f__Mimiviridae;g__Mimivirus,k__Viruses;o__Herpesvirales;f__Malacoherpesviridae;g__Ostreavirus,k__Viruses;o__Mononegavirales;g__Metapneumovirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Potexvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Sfi21dtunalikevirus,k__Viruses;f__Retroviridae;g__Gammaretrovirus,k__Viruses;f__Circoviridae;g__Circovirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Foveavirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Nepovirus,k__Viruses;o__Picornavirales;f__Iflaviridae;g__Iflavirus,k__Viruses;f__Iridoviridae;g__Iridovirus,k__Viruses;f__Flaviviridae;g__Flavivirus,k__Viruses;f__Astroviridae;g__Mamastrovirus,k__Viruses;f__Virgaviridae;g__Pecluvirus,k__Viruses;o__Nidovirales;f__Roniviridae;g__Okavirus,k__Viruses;g__Bacilladnavirus,k__Viruses;f__Bromoviridae;g__Cucumovirus,k__Viruses;o__Herpesvirales;f__Herpesviridae;g__Lymphocryptovirus,k__Viruses;f__Nimaviridae;g__Whispovirus,k__Viruses;f__Hytrosaviridae;g__Glossinavirus,k__Viruses;f__Virgaviridae;g__Hordeivirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Skunalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Cp220likevirus,k__Viruses;o__Tymovirales;f__Tymoviridae;g__Marafivirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Ph
ikzlikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Cp8unalikevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__PhiCD119likevirus,k__Viruses;f__Papillomaviridae;g__Pipapillomavirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Megrivirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Kayvirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__C5likevirus,k__Viruses;f__Endornaviridae;g__Endornavirus,k__Viruses;f__Reoviridae;g__Orbivirus,k__Viruses;f__Adenoviridae;g__Atadenovirus,k__Viruses;f__Orthomyxoviridae;g__Influenzavirus_C,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Botrexvirus,k__Viruses;f__Poxviridae;g__Leporipoxvirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Phi29likevirus,k__Viruses;f__Parvoviridae;g__Erythroparvovirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Schizot4likevirus,k__Viruses;f__Baculoviridae;g__Gammabaculovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Yualikevirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Cosavirus,k__Viruses;f__Ampullaviridae;g__Ampullavirus,k__Viruses;f__Tectiviridae;g__Tectivirus,k__Viruses;f__Caulimoviridae;g__Cavemovirus,k__Viruses;f__Anelloviridae;g__Epsilontorquevirus,k__Viruses;f__Papillomaviridae;g__Deltapapillomavirus,k__Viruses;f__Parvoviridae;g__Iteradensovirus,k__Viruses;o__Mononegavirales;f__Rhabdoviridae;g__Ephemerovirus,k__Viruses;f__Poxviridae;g__Crocodylidpoxvirus,k__Viruses;f__Anelloviridae;g__Betatorquevirus,k__Viruses;f__Turriviridae;g__Alphaturrivirus,k__Viruses;f__Caulimoviridae;g__Caulimovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__P23likevirus,k__Viruses;o__Caudovirales;f__Myoviridae;g__Pbunalikevirus,k__Viruses;f__Globuloviridae;g__Globulovirus,k__Viruses;f__Reoviridae;g__Fijivirus,k__Viruses;f__Reoviridae;g__Phytoreovirus,k__Viruses;f__Benyviridae;g__Benyvirus,k__Viruses;f__Parvoviridae;g__Protoparvovirus,k__Viruses;f__Asfarviridae;g__Asfivirus,k__Viruses;f__Luteoviridae;g__Luteovirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Respirovirus,k__Viruses;o__Monone
gavirales;f__Filoviridae;g__Marburgvirus,k__Viruses;f__Fuselloviridae;g__Betafusellovirus,k__Viruses;o__Picornavirales;f__Picornaviridae;g__Pasivirus,k__Viruses;f__Bromoviridae;g__Alfamovirus,k__Viruses;o__Caudovirales;f__Podoviridae;g__Sp6likevirus,k__Viruses;o__Nidovirales;f__Coronaviridae;g__Deltacoronavirus,k__Viruses;f__Inoviridae;g__Plectrovirus,k__Viruses;o__Picornavirales;f__Secoviridae;g__Sadwavirus,k__Viruses;o__Mononegavirales;f__Paramyxoviridae;g__Morbillivirus,k__Viruses;o__Mononegavirales;g__Orthopneumovirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__3alikevirus,k__Viruses;o__Tymovirales;f__Alphaflexiviridae;g__Mandarivirus,k__Viruses;f__Togaviridae;g__Rubivirus,k__Viruses;o__Tymovirales;f__Betaflexiviridae;g__Vitivirus,k__Viruses;o__Caudovirales;f__Siphoviridae;g__Xp10likevirus,k__Viruses;f__Sphaerolipoviridae;g__Alphasphaerolipovirus,...,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Legionellales;f__Coxiellaceae;g__Coxiella,k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanomicrobiales;f__Methanomicrobiaceae;g__Methanoplanus,k__Bacteria;p__Actinobacteria;c__Nitriliruptoria;o__Nitriliruptorales;f__Nitriliruptoraceae;g__Nitriliruptor,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__Anaplasmataceae;g__Wolbachia,k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosarcinaceae;g__Methanomethylovorans,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Anaerobacillus,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylocystaceae;g__Methylosinus,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Acidocella,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Geodermatophilales;f__Geodermatophilaceae;g__Modestobacter,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__Azonexus,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Halobacteriales;f__Halobacteriaceae;g__Halalkali
coccus,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Scardovia,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Nitrosomonadales;f__Nitrosomonadaceae;g__Nitrosomonas,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Micrococcaceae;g__Rothia,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Micrococcaceae;g__Arthrobacter,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Halieaceae;g__Congregibacter,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Acetobacter,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__Azospira,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Streptomycetales;f__Streptomycetaceae;g__Kitasatospora,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Pseudorhodobacter,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Oxalobacter,k__Bacteria;p__Proteobacteria;c__Acidithiobacillia;o__Acidithiobacillales;f__Acidithiobacillaceae;g__Acidithiobacillus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Microbulbiferaceae;g__Microbulbifer,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Roseibacterium,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptococcaceae;g__Desulfurispora,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Dactylococcopsis,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Herminiimonas,k__Bacteria;p__Dictyoglomi;c__Dictyoglomia;o__Dictyoglomales;f__Dictyoglomaceae;g__Dictyoglomus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Cellvibrionaceae;g__Simiduia,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;f__Anaplasmataceae;g__Neorickettsia,k__Bacteria;p__Actinobacteria;c__Actino
bacteria;o__Streptosporangiales;f__Thermomonosporaceae;g__Actinomadura,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;g__Candidatus_Nasuia,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Ornithinibacillus,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Olleya,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Lechevalieria,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfuromonadales;f__Geobacteraceae;g__Geobacter,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Edwardsiella,k__Bacteria;p__Deinococcus-Thermus;c__Deinococci;o__Thermales;f__Thermaceae;g__Oceanithermus,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Sulfolobales;f__Sulfolobaceae;g__Metallosphaera,k__Bacteria;p__Bacteroidetes;c__Cytophagia;o__Cytophagales;f__Flammeovirgaceae;g__Marivirga,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Algibacter,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Luteimonas,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Halieaceae;g__Pseudohaliea,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Rhodospirillaceae;g__Tistrella,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__Aeropyrum,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Chromobacteriaceae;g__Leeia,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobacteraceae;g__Desulfotignum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Microbacteriaceae;g__Agreia,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Proteiniclasticum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Lentzea,k__Bacteria;p__Firmicutes;c__Bacilli;o__Ba
cillales;f__Bacillaceae;g__Psychrobacillus,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Propionibacteriales;f__Propionibacteriaceae;g__Propionimicrobium,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Vibrio,k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Planctomycetales;f__Isosphaeraceae;g__Singulisphaera,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Hydrogenibacillus,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Methylophilales;f__Methylophilaceae;g__Candidatus_Methylopumilus,k__Bacteria;p__Aquificae;c__Aquificae;o__Aquificales;f__Hydrogenothermaceae;g__Sulfurihydrogenibium,k__Bacteria;p__Thermotogae;c__Thermotogae;o__Petrotogales;g__Petrotoga,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Azotobacter,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylocystaceae;g__Methylocystis,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;g__Geminicoccus,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Catenulisporales;f__Actinospicaceae;g__Actinospica,k__Bacteria;p__Acidobacteria;c__Acidobacteriia;o__Acidobacteriales;f__Acidobacteriaceae;g__Granulicella,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Thermoactinomycetaceae;g__Desmospora,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Marinilabiliaceae;g__Geofilum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Streptosporangiales;f__Streptosporangiaceae;g__Microtetraspora,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Myxococcales;f__Kofleriaceae;g__Haliangium,k__Archaea;p__Candidatus_Korarchaeota;g__Candidatus_Korarchaeum,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Paraprevotella,k__Bacteria;p__Aquificae;c__Aquificae;o__Aquificales;f__Aquificaceae;g__Hydrogenobaculum,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Pasteurella,k__Bacteria;p__Bacteroide
tes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Tenacibaculum,k__Bacteria;p__Bacteroidetes;c__Chitinophagia;o__Chitinophagales;f__Saprospiraceae;g__Lewinella,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Saccharothrix,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Cellvibrionales;f__Halieaceae;g__Haliea,k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosarcinaceae;g__Methanococcoides,k__Bacteria;p__Bacteroidetes;c__Chitinophagia;o__Chitinophagales;f__Chitinophagaceae;g__Chitinophaga,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Neisseriaceae;g__Neisseria,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Haloferacales;f__Haloferacaceae;g__Haloquadratum,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;g__Halotalea,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Paludibacter,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Pseudonocardiales;f__Pseudonocardiaceae;g__Saccharopolyspora,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Alloprevotella,k__Bacteria;p__Cyanobacteria;o__Nostocales;f__Nostocaceae;g__Trichormus,k__Bacteria;p__Chlorobi;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Pelodictyon,k__Bacteria;p__Synergistetes;c__Synergistia;o__Synergistales;f__Synergistaceae;g__Thermovirga,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Hyphomicrobiaceae;g__Filomicrobium,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Halolactibacillus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Alteromonadaceae;g__Lacimicrobium,k__Bacteria;p__Bacteroidetes;c__Cytophagia;o__Cytophagales;f__Cytophagaceae;g__Dyadobacter,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Zobellia,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rh
odospirillales;f__Rhodospirillaceae;g__Nisaea,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rickettsiales;g__Caedibacter,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Halalkalibacillus,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Listeriaceae;g__Listeria,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Myxococcales;f__Nannocystaceae;g__Plesiocystis,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobacteraceae;g__Desulfatitalea,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;g__Methylibium,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Micrococcaceae;g__Citricoccus,k__Bacteria;p__Synergistetes;c__Synergistia;o__Synergistales;f__Synergistaceae;g__Synergistes,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pasteurellales;f__Pasteurellaceae;g__Mannheimia,k__Bacteria;p__Firmicutes;c__Clostridia;o__Thermoanaerobacterales;f__Thermoanaerobacteraceae;g__Carboxydothermus,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Odoribacteraceae;g__Butyricimonas,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__Methyloversatilis,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Planktomarina,k__Bacteria;p__Chloroflexi;c__Ardenticatenia;o__Ardenticatenales;f__Ardenticatenaceae;g__Ardenticatena,k__Bacteria;p__Firmicutes;c__Limnochordia;o__Limnochordales;f__Limnochordaceae;g__Limnochorda,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Acaryochloris,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Hyphomonadaceae;g__Hyphomonas,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae;g__Herbaspirillum,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Erythrobacteraceae;g__Erythrobacter,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacterace
ae;g__Tanticharoenia,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfobacterales;f__Desulfobacteraceae;g__Desulfococcus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;g__Kushneria,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Dokdonia,k__Bacteria;p__Chlamydiae;c__Chlamydiia;o__Chlamydiales;f__Waddliaceae;g__Waddlia,k__Bacteria;p__Chloroflexi;c__Chloroflexia;o__Chloroflexales;f__Oscillochloridaceae;g__Oscillochloris,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Thalassobius,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;g__Carnimonas,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Xanthomonadaceae;g__Pseudoxanthomonas,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Sulfolobales;f__Sulfolobaceae;g__Acidianus,k__Bacteria;p__Cyanobacteria;o__Oscillatoriales;g__Leptolyngbya,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Thiotrichales;f__Piscirickettsiaceae;g__Hydrogenovibrio,k__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;g__Flexilinea,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Crocinitomicaceae;g__Crocinitomix,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Orbales;f__Orbaceae;g__Candidatus_Schmidhempelia,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Roseburia,k__Bacteria;p__Cyanobacteria;o__Oscillatoriales;g__Planktothrix,k__Bacteria;p__Acidobacteria;c__Solibacteres;o__Solibacterales;f__Solibacteraceae;g__Candidatus_Solibacter,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Natrialbales;f__Natrialbaceae;g__Halostagnicola,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;g__Candidatus_Azobacteroides,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Riemerella,k__Bacteria;p__Actinobacteria;c__C
oriobacteriia;o__Coriobacteriales;f__Atopobiaceae;g__Atopobium,k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Planctomycetales;f__Planctomycetaceae;g__Zavarzinella,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Rubritepida,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Rhodanobacteraceae;g__Dokdonella,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Hallella,k__Bacteria;p__Cyanobacteria;o__Oscillatoriales;g__Oscillatoria,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Blastomonas,k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Candidatus_Brocadiales;f__Candidatus_Brocadiaceae;g__Candidatus_Brocadia,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Helicobacter,k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia;o__Sphingobacteriales;f__Sphingobacteriaceae;g__Mucilaginibacter,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Microcystis,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Stappia,k__Bacteria;p__Firmicutes;c__Clostridia;o__Halanaerobiales;f__Halobacteroidaceae;g__Halonatronum,k__Archaea;p__Euryarchaeota;c__Thermoplasmata;o__Thermoplasmatales;f__Picrophilaceae;g__Picrophilus,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Bergeyella,k__Bacteria;p__Cyanobacteria;o__Oscillatoriales;g__Lyngbya,k__Bacteria;p__Synergistetes;c__Synergistia;o__Synergistales;f__Synergistaceae;g__Pyramidobacter,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Zhouia,k__Bacteria;p__Acidobacteria;c__Holophagae;o__Holophagales;f__Holophagaceae;g__Holophaga,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Natrialbales;f__Natrialbaceae;g__Salinarchaeum,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Microbacteriaceae;g__Yonghaparkia
,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Streptosporangiales;f__Streptosporangiaceae;g__Microbispora,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Eggerthellales;f__Eggerthellaceae;g__Adlercreutzia,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Legionellales;f__Legionellaceae;g__Tatlockia,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Haloferacales;f__Haloferacaceae;g__Halorubrum,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Geminocystis,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Bradyrhizobiaceae;g__Salinarimonas,k__Bacteria;p__Chlamydiae;c__Chlamydiia;o__Chlamydiales;f__Simkaniaceae;g__Simkania,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Candidatus_Regiella,k__Bacteria;p__Thermodesulfobacteria;c__Thermodesulfobacteria;o__Thermodesulfobacteriales;f__Thermodesulfobacteriaceae;g__Thermodesulfobacterium,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Anaerostipes,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Halomonadaceae;g__Halomonas,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Nevskiales;f__Sinobacteraceae;g__Steroidobacter,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Hyphomicrobiaceae;g__Pelagibacterium,k__Bacteria;p__Bacteroidetes;c__Chitinophagia;o__Chitinophagales;f__Chitinophagaceae;g__Niabella,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Chromobacteriaceae;g__Chitinilyticum,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Beijerinckiaceae;g__Beijerinckia,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Cyanothece,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Methylobacteriaceae;g__Microvirga,k__Bacteria;p__Actinobacteria;c__Acidimicrobiia;o__Acidimicrobiales;f__Acidimicrobiaceae;g__Ilumatobacter,k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacter
aceae;g__Flexistipes,k__Bacteria;p__Actinobacteria;c__Coriobacteriia;o__Eggerthellales;f__Eggerthellaceae;g__Denitrobacterium,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales;f__Rhodanobacteraceae;g__Frateuria,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Hydrogenophilales;f__Hydrogenophilaceae;g__Thiobacillus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Chromatiales;f__Chromatiaceae;g__Rheinheimera,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Xanthobacteraceae;g__Azorhizobium,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Oceanospirillales;f__Alcanivoracaceae;g__Kangiella,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Syntrophobacterales;f__Syntrophorhabdaceae;g__Syntrophorhabdus,k__Bacteria;p__Deinococcus-Thermus;c__Deinococci;o__Thermales;f__Thermaceae;g__Thermus,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Aphanocapsa,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Candidatus_Photodesmus,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Oscillospiraceae;g__Oscillibacter,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfohalobiaceae;g__Desulfonatronospira,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Parascardovia,k__Bacteria;p__Bacteroidetes;c__Cytophagia;o__Cytophagales;f__Cyclobacteriaceae;g__Indibacter,k__Bacteria;p__Cyanobacteria;o__Chroococcales;g__Chamaesiphon,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingobium,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Hyphomicrobiaceae;g__Prosthecomicrobium,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Maribacter,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkhold
eriales;f__Alcaligenaceae;g__Brackiella,k__Bacteria;p__Acidobacteria;c__Blastocatellia;g__Chloracidobacterium,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Alcaligenaceae;g__Kerstersia,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Nereida,k__Bacteria;p__Chloroflexi;c__Anaerolineae;o__Anaerolineales;f__Anaerolineaceae;g__Leptolinea,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Microbacteriaceae;g__Leifsonia,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Lelliottia,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Colwelliaceae;g__Colwellia,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Arcobacter,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Microbacteriaceae;g__Agrococcus,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Vibrionales;f__Vibrionaceae;g__Grimontia,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Sediminibacter,k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacteraceae;g__Deferribacter,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;g__Thiomonas,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Rhodocyclales;f__Rhodocyclaceae;g__Thauera,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Varibaculum,k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales;f__Flavobacteriaceae;g__Bizionia,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Marinilabiliaceae;g__Anaerophaga,k__Bacteria;p__Cyanobacteria;c__Gloeobacteria;o__Gloeobacterales;g__Gloeobacter,k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methanosarcinaceae;g__Methanosarcina,k__Bacteria;p__Actinobacteria;c__Thermoleophilia;o__Solirubrobacterales;f__Conexibact
eraceae;g__Conexibacter,k__Bacteria;p__Tenericutes;c__Mollicutes;o__Mycoplasmatales;f__Mycoplasmataceae;g__Ureaplasma,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Microbacteriaceae;g__Microbacterium,k__Archaea;p__Euryarchaeota;c__Halobacteria;o__Haloferacales;f__Haloferacaceae;g__Halolamina,k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Sulfurospirillum,k__Bacteria;p__Spirochaetes;c__Spirochaetia;o__Spirochaetales;f__Spirochaetaceae;g__Spirochaeta,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Microbacteriaceae;g__Cryobacterium,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Corynebacteriales;f__Gordoniaceae;g__Gordonia,k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Planctomycetales;f__Planctomycetaceae;g__Rubinisphaera,k__Bacteria;p__Actinobacteria;c__Actinobacteria;g__Thermobispora,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Glycomycetales;f__Glycomycetaceae;g__Glycomyces,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Ottowia,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Xanthobacteraceae;g__Ancylobacter,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Faecalibacterium,k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae;g__Paucisalibacillus,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Polynucleobacter,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Rhodospirillaceae;g__Caenispirillum,k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanomicrobiales;f__Methanoregulaceae;g__Methanoregula,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Thermoproteales;f__Thermofilaceae;g__Thermofilum,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Sulfuricellales;f__Sulfuricellaceae;g__Sulfuricella,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Bdellovibrionales;f__Bac
teriovoracaceae;g__Bacteriovorax,k__Bacteria;p__Bacteroidetes;c__Chitinophagia;o__Chitinophagales;f__Chitinophagaceae;g__Niastella,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Micrococcales;f__Micrococcaceae;g__Yaniella,k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Citrobacter,k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Myxococcales;f__Polyangiaceae;g__Sorangium,k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanosarcinales;f__Methermicoccaceae;g__Methermicoccus,k__Bacteria;p__Aquificae;c__Aquificae;o__Aquificales;f__Aquificaceae;g__Hydrogenobacter,k__Bacteria;p__Synergistetes;c__Synergistia;o__Synergistales;f__Synergistaceae;g__Cloacibacillus,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__Sellimonas,k__Archaea;p__Thaumarchaeota;g__Candidatus_Nitrosotenuis,k__Bacteria;p__Rhodothermaeota;c__Balneolia;o__Balneolales;f__Balneolaceae;g__Gracilimonas,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Thermoproteales;f__Thermoproteaceae;g__Caldivirga,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiales_Family_XIII._Incertae_Sedis;g__Casaltella,k__Archaea;p__Crenarchaeota;c__Thermoprotei;o__Desulfurococcales;f__Desulfurococcaceae;g__Thermosphaera,k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Neisseriales;f__Chromobacteriaceae;g__Deefgea,k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__Aquamicrobium,k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacteraceae;g__Geovibrio,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Subdoligranulum
s17489,0.0,0.0,124.0,0.0,2.0,0.0,0.0,0.0,0.0,26.0,0.0,24.0,2.0,2.0,0.0,510.0,22.0,0.0,4.0,0.0,128.0,0.0,10.0,0.0,0.0,346.0,0.0,0.0,42.0,0.0,418.0,36.0,28.0,0.0,0.0,102.0,2.0,4.0,0.0,0.0,26.0,6.0,0.0,598.0,16.0,0.0,312.0,0.0,0.0,182.0,0.0,0.0,0.0,42.0,0.0,4.0,2.0,150.0,0.0,14.0,0.0,0.0,0.0,22.0,54.0,0.0,4.0,4.0,0.0,4.0,0.0,0.0,10.0,78.0,0.0,32.0,0.0,0.0,12.0,20.0,0.0,32.0,0.0,0.0,0.0,0.0,698.0,0.0,2676.0,4.0,0.0,0.0,194.0,0.0,2.0,0.0,0.0,0.0,4.0,78.0,58.0,2.0,0.0,90.0,20.0,112.0,0.0,2.0,12.0,576.0,34.0,0.0,26.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,2.0,0.0,0.0,6.0,268.0,0.0,0.0,14.0,154.0,2.0,8.0,22.0,2186.0,0.0,2.0,46.0,336.0,10.0,4.0,4.0,0.0,4.0,0.0,0.0,0.0,6.0,2.0,0.0,92.0,0.0,38.0,2.0,4.0,2.0,0.0,6.0,2.0,0.0,0.0,8.0,366.0,0.0,0.0,0.0,156.0,0.0,100.0,2.0,0.0,18.0,136.0,6.0,104.0,688.0,0.0,0.0,2.0,2.0,2.0,950.0,28.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1926.0,0.0,50.0,2757.0,5.0,0.0,106.0,0.0,16.0,7.0,0.0,2.0,11.0,50.0,458.0,0.0,323.0,0.0,207.0,1.0,21.0,4303.0,5.0,0.0,1.0,0.0,2.0,0.0,0.0,4.0,42.0,4343.0,58.0,0.0,0.0,12.0,11.0,2.0,272.0,0.0,50.0,209.0,2.0,1.0,0.0,0.0,2.0,9.0,2.0,2.0,0.0,0.0,195073.0,2.0,412.0,0.0,14.0,1.0,2.0,3.0,2.0,11.0,0.0,4.0,2.0,6.0,126.0,2.0,6.0,0.0,94.0,31.0,49.0,10.0,2.0,1.0,1.0,21789.0,18.0,3.0,125.0,8.0,106.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0,0.0,0.0,0.0,46784.0,2.0,1.0,7.0,0.0,0.0,97.0,10.0,9.0,3.0,3.0,0.0,1.0,0.0,10.0,20028.0,162.0,2.0,40559.0,2.0,19.0,1647.0,0.0,0.0,0.0,63.0,1.0,11.0,0.0,1.0,0.0,11.0,4.0,1.0,1.0,0.0,2.0,133.0,573.0,0.0,7.0,4.0,0.0,28.0,4.0,0.0,20802.0,1.0,25.0,0.0,0.0,0.0,1.0,7.0,0.0,0.0,54.0,3.0,5.0,7.0,2.0,0.0,11.0,11.0,4.0,0.0,0.0,1327.0,0.0,104.0,0.0,1.0,2.0,0.0,0.0,407.0,16.0,1.0,0.0,0.0,4.0,0.0,8.0,7.0,0.0,0.0,56.0,2.0,0.0,35.0,0.0,1.0,42.0,24.0,589.0,4.0,946.0,1322.0,1434.0,0.0,2.0,1.
0,0.0,26.0,2.0,7823.0,50.0,0.0,3746.0,1.0,1.0,34.0,4.0,22.0,2.0,1.0,7.0,2230.0,3.0,396.0,1906.0,6.0,3.0,16.0,0.0,134.0,0.0,1.0,5.0,9.0,0.0,2.0,0.0,0.0,8.0,0.0,1.0,2.0,1.0,19.0,6.0,7806.0,236.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17512,4.0,0.0,144.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,6.0,0.0,0.0,2.0,88.0,20.0,2.0,0.0,0.0,14.0,0.0,2.0,0.0,0.0,110.0,0.0,0.0,2.0,0.0,136.0,14.0,14.0,0.0,0.0,48.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,698.0,10.0,0.0,104.0,0.0,0.0,58.0,6.0,0.0,0.0,56.0,0.0,0.0,2.0,26.0,0.0,0.0,2.0,0.0,0.0,12.0,94.0,4.0,0.0,0.0,0.0,14.0,0.0,0.0,4.0,8.0,0.0,16.0,0.0,4.0,6.0,12.0,0.0,66.0,0.0,0.0,0.0,0.0,268.0,0.0,678.0,2.0,0.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,68.0,2.0,0.0,14.0,6.0,16.0,0.0,0.0,0.0,460.0,16.0,0.0,40.0,0.0,0.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.0,0.0,0.0,2.0,54.0,0.0,0.0,14.0,52.0,0.0,0.0,6.0,0.0,0.0,8.0,42.0,72.0,14.0,24.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,86.0,0.0,18.0,4.0,0.0,2.0,0.0,6.0,0.0,0.0,0.0,50.0,150.0,2.0,0.0,2.0,24.0,0.0,36.0,10.0,0.0,10.0,32.0,0.0,64.0,150.0,0.0,0.0,0.0,0.0,2.0,138.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,189.0,0.0,1.0,1005.0,7.0,0.0,477.0,1.0,7.0,1.0,0.0,5.0,10.0,16.0,415.0,0.0,275.0,5.0,749.0,0.0,22.0,1069.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,52.0,870.0,6.0,2.0,2.0,7.0,19.0,0.0,69.0,1.0,43.0,168.0,0.0,0.0,0.0,2.0,0.0,5.0,4.0,0.0,1.0,0.0,57247.0,4.0,59.0,0.0,0.0,1.0,18.0,13.0,4.0,18.0,1.0,0.0,0.0,61.0,193.0,0.0,16.0,15.0,53.0,4.0,18.0,28.0,4.0,5.0,0.0,12258.0,41.0,0.0,96.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5417.0,0.0,0.0,9.0,0.0,2.0,170.0,6.0,9.0,3.0,4.0,0.0,0.0,0.0,22.0,6329.0,276.0,1.0,433234.0,0.0,24.0,2672.0,0.0,1.0,0.0,72.0,1.0,8.0,0.0,0.0,2.0,5.0,0.0,2.0,0.0,0.0,0.0,265.0,58.0,2.0,7.0,3.0,2.0,33.0,11.0,1.0,6583.0,2.0,52.0,1.0,0.0,0.0,6.0,6.0,0.0,0.0,20.0,0.0,0.0,12.0,0.0,0.0,38.0,3.0,2.0,0.0,0.0,336.0,0.0,51.0,1.0,0.0,0.0,1.0,5.0,281.0,7.0,0.0,0.0,0.0,6.0,0.0,31.0,2.0,0.0,0.0,85.0,0.0,0.0,11.0,0.0,2.0,23.0,36.0,126.0,0.0,182.0,140.0,1052.0,0.0,1.0,1.0,0.0,4.0,2.0,651.0,14.0,1.0,1186.0,1.0,0.0,
93.0,6.0,1.0,6.0,1.0,8.0,1084.0,2.0,154.0,1285.0,4.0,2.0,23.0,0.0,166.0,0.0,0.0,10.0,2.0,0.0,0.0,11.0,1.0,31.0,0.0,0.0,3.0,6.0,11.0,0.0,2526.0,560.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17498,0.0,0.0,108.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,28.0,0.0,10.0,2.0,86.0,20.0,4.0,2.0,0.0,22.0,0.0,18.0,0.0,0.0,74.0,0.0,0.0,2.0,0.0,60.0,16.0,20.0,0.0,0.0,56.0,0.0,0.0,0.0,2.0,0.0,2.0,2.0,458.0,14.0,0.0,70.0,0.0,0.0,30.0,2.0,0.0,0.0,16.0,0.0,6.0,0.0,12.0,0.0,2.0,2.0,0.0,0.0,8.0,76.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,10.0,8.0,0.0,12.0,0.0,2.0,6.0,14.0,0.0,50.0,0.0,0.0,0.0,0.0,224.0,0.0,426.0,6.0,0.0,0.0,52.0,0.0,2.0,0.0,0.0,0.0,10.0,14.0,52.0,2.0,0.0,18.0,14.0,20.0,0.0,0.0,2.0,300.0,36.0,0.0,20.0,6.0,0.0,92.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,4.0,40.0,0.0,0.0,12.0,74.0,0.0,4.0,16.0,0.0,0.0,10.0,54.0,46.0,40.0,14.0,12.0,0.0,2.0,0.0,0.0,0.0,2.0,4.0,0.0,92.0,2.0,12.0,0.0,4.0,0.0,0.0,8.0,0.0,0.0,2.0,56.0,194.0,0.0,4.0,0.0,30.0,0.0,220.0,4.0,0.0,8.0,18.0,0.0,64.0,86.0,0.0,0.0,0.0,0.0,0.0,316.0,22.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,357.0,0.0,18.0,780.0,20.0,1.0,3854.0,4.0,20.0,0.0,1.0,5.0,7.0,92.0,562.0,0.0,268.0,11.0,797.0,0.0,14.0,545.0,3.0,0.0,2.0,0.0,0.0,0.0,1.0,3.0,72.0,467.0,11.0,0.0,0.0,9.0,27.0,0.0,42.0,1.0,163.0,175.0,1.0,2.0,0.0,1.0,2.0,12.0,2.0,0.0,0.0,0.0,72975.0,0.0,79.0,6.0,0.0,0.0,9.0,16.0,8.0,21.0,2.0,0.0,3.0,75.0,383.0,0.0,12.0,4.0,35.0,7.0,15.0,30.0,2.0,5.0,0.0,6655.0,12.0,1.0,34.0,6.0,7.0,1.0,0.0,0.0,0.0,1.0,0.0,8.0,4.0,0.0,0.0,0.0,5476.0,8.0,0.0,18.0,1.0,2.0,134.0,12.0,4.0,9.0,1.0,2.0,2.0,0.0,91.0,10048.0,2569.0,1.0,1038595.0,0.0,11.0,1666.0,0.0,0.0,0.0,100.0,0.0,10.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,162.0,44.0,4.0,7.0,16.0,0.0,25.0,9.0,2.0,6050.0,3.0,32.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,27.0,0.0,1.0,13.0,0.0,1.0,24.0,4.0,3.0,1.0,0.0,275.0,0.0,82.0,4.0,3.0,2.0,1.0,1.0,336.0,26.0,8.0,0.0,1.0,12.0,2.0,32.0,5.0,0.0,0.0,84.0,1.0,1.0,14.0,1.0,0.0,20.0,20.0,197.0,5.0,229.0,235.0,545.0,0.0,3.0,0.0,0.0,25.0,0.0,575.0,12.0,1.
0,1225.0,0.0,0.0,84.0,16.0,0.0,6.0,0.0,21.0,827.0,4.0,121.0,11745.0,15.0,4.0,21.0,0.0,2848.0,0.0,0.0,32.0,5.0,0.0,0.0,5.0,0.0,27.0,0.0,1.0,1.0,5.0,101.0,0.0,1290.0,1094.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17528,2.0,0.0,110.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,2.0,2.0,4.0,40.0,14.0,0.0,0.0,0.0,6.0,0.0,4.0,0.0,0.0,50.0,0.0,0.0,2.0,0.0,52.0,20.0,2.0,2.0,0.0,22.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,778.0,8.0,0.0,62.0,0.0,0.0,28.0,4.0,0.0,0.0,24.0,0.0,2.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,8.0,62.0,0.0,0.0,0.0,0.0,8.0,2.0,0.0,2.0,6.0,2.0,2.0,0.0,2.0,4.0,4.0,0.0,418.0,0.0,0.0,0.0,0.0,86.0,0.0,386.0,2.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,36.0,4.0,24.0,6.0,0.0,14.0,18.0,16.0,0.0,0.0,4.0,126.0,4.0,0.0,20.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,40.0,0.0,0.0,38.0,38.0,2.0,0.0,6.0,68.0,0.0,10.0,30.0,32.0,12.0,14.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,4.0,4.0,66.0,0.0,4.0,4.0,0.0,0.0,0.0,8.0,0.0,4.0,2.0,8.0,68.0,0.0,2.0,0.0,14.0,0.0,42.0,6.0,0.0,0.0,14.0,0.0,26.0,72.0,0.0,0.0,0.0,2.0,0.0,122.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,534.0,1.0,7.0,493.0,7.0,0.0,12041.0,5.0,56.0,1.0,10.0,9.0,7.0,380.0,1989.0,0.0,451.0,0.0,5215.0,2.0,0.0,632.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,189.0,510.0,9.0,0.0,1.0,19.0,64.0,3.0,25.0,1.0,21.0,833.0,5.0,6.0,0.0,3.0,0.0,16.0,0.0,3.0,0.0,0.0,73952.0,3.0,44.0,1.0,2.0,0.0,19.0,65.0,118.0,76.0,9.0,1.0,2.0,620.0,1719.0,7.0,16.0,4.0,14.0,88.0,28.0,68.0,1.0,1.0,5.0,5670.0,6.0,3.0,47.0,13.0,9.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,1.0,0.0,7746.0,41.0,0.0,81.0,0.0,0.0,105.0,3.0,8.0,3.0,14.0,1.0,6.0,0.0,475.0,13454.0,5975.0,13.0,1729348.0,3.0,10.0,657.0,3.0,12.0,0.0,402.0,1.0,26.0,0.0,0.0,0.0,0.0,3.0,3.0,6.0,0.0,0.0,95.0,63.0,27.0,31.0,76.0,0.0,11.0,85.0,1.0,5096.0,0.0,15.0,29.0,0.0,0.0,2.0,0.0,1.0,0.0,13.0,4.0,10.0,128.0,0.0,0.0,143.0,10.0,22.0,0.0,0.0,159.0,0.0,109.0,2.0,46.0,0.0,1.0,2.0,96.0,28.0,2.0,0.0,0.0,40.0,3.0,23.0,17.0,1.0,1.0,229.0,0.0,0.0,27.0,0.0,0.0,65.0,7.0,797.0,9.0,162.0,113.0,265.0,1.0,0.0,2.0,0.0,37.0,14.0,1151.0,19.0,
6.0,724.0,0.0,0.0,282.0,47.0,1.0,1.0,3.0,165.0,457.0,19.0,65.0,12396.0,13.0,2.0,210.0,8.0,2251.0,0.0,1.0,239.0,18.0,1.0,3.0,0.0,0.0,220.0,1.0,1.0,0.0,0.0,33.0,0.0,1401.0,4432.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
s17535,0.0,0.0,42.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,14.0,0.0,0.0,0.0,134.0,2.0,2.0,0.0,0.0,48.0,0.0,0.0,0.0,0.0,104.0,0.0,0.0,16.0,0.0,202.0,16.0,2.0,0.0,0.0,32.0,0.0,2.0,0.0,0.0,2.0,4.0,0.0,218.0,4.0,0.0,70.0,0.0,0.0,44.0,2.0,0.0,0.0,16.0,0.0,0.0,0.0,38.0,0.0,0.0,0.0,0.0,0.0,10.0,38.0,0.0,0.0,0.0,0.0,14.0,4.0,0.0,6.0,12.0,0.0,10.0,0.0,0.0,8.0,0.0,0.0,102.0,0.0,0.0,0.0,0.0,278.0,0.0,872.0,0.0,0.0,0.0,66.0,0.0,0.0,0.0,0.0,0.0,8.0,28.0,18.0,2.0,0.0,30.0,2.0,36.0,4.0,0.0,0.0,172.0,8.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,0.0,0.0,4.0,28.0,0.0,2.0,2.0,426.0,0.0,2.0,10.0,118.0,24.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,22.0,0.0,20.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,14.0,134.0,8.0,0.0,0.0,46.0,14.0,430.0,2.0,0.0,6.0,38.0,0.0,24.0,156.0,0.0,0.0,0.0,0.0,0.0,260.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,155.0,1.0,0.0,1485.0,2.0,0.0,1602.0,0.0,12.0,0.0,0.0,4.0,3.0,48.0,551.0,0.0,198.0,0.0,829.0,1.0,6.0,1795.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,47.0,1366.0,19.0,0.0,3.0,4.0,8.0,0.0,54.0,0.0,28.0,132.0,26.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,70240.0,38.0,129.0,1.0,2.0,0.0,3.0,16.0,15.0,11.0,0.0,0.0,0.0,67.0,309.0,0.0,1.0,2.0,36.0,4.0,32.0,18.0,2.0,0.0,0.0,7089.0,3.0,6.0,19.0,23.0,4.0,0.0,0.0,1.0,0.0,4.0,0.0,11.0,0.0,0.0,0.0,2.0,39170.0,2.0,0.0,13.0,1.0,0.0,32.0,2.0,1.0,9.0,2.0,0.0,0.0,0.0,28.0,7262.0,761.0,1.0,23031.0,0.0,6.0,568.0,0.0,0.0,0.0,91.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,43.0,26.0,10.0,5.0,8.0,0.0,12.0,14.0,2.0,7042.0,1.0,13.0,10.0,1.0,0.0,21.0,1.0,0.0,1.0,12.0,0.0,10.0,16.0,0.0,0.0,25.0,4.0,5.0,0.0,0.0,385.0,0.0,101.0,4.0,0.0,0.0,1.0,0.0,96.0,5.0,3.0,0.0,0.0,9.0,7.0,8.0,2.0,0.0,1.0,58.0,0.0,0.0,9.0,1.0,0.0,43.0,9.0,453.0,1.0,482.0,772.0,623.0,1.0,7.0,1.0,0.0,11.0,5.0,1698.0,17.0,44.0,807.0,0.0,0.0,
35.0,26.0,1.0,0.0,0.0,14.0,655.0,11.0,168.0,11624.0,10.0,4.0,32.0,0.0,280.0,0.0,1.0,24.0,2.0,1.0,3.0,0.0,0.0,43.0,0.0,0.0,0.0,0.0,19.0,0.0,2667.0,896.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Loading QIIME Metadata File

In [11]:
# Read the QIIME 2 sample metadata (TSV mapping file) and expose it as a pandas DataFrame.
mappingFilePath = workdir + 'tcga_mapping_file_updated082118.tsv'
sample_metadata = qiime2.Metadata.load(mappingFilePath)
dfMeta = sample_metadata.to_dataframe()
# Report (rows, columns) before previewing the first rows.
print(dfMeta.shape)
dfMeta.head()

(18150, 27)


Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,filename,filenames_new_suffix,age_at_diagnosis,aliquot_id,aliquot_uuid,case_id,case_uuid,data_format,data_subtype,data_type,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_id,sample_type,sample_uuid,vital_status,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
s10247,,,G14083.TCGA-21-1082-01A-01D-1521-08.3.bam,G14083.TCGA-21-1082-01A-01D-1521-08.3.fasta,61,TCGA-21-1082-01A-01D-1521-08,1C3DF485-8E75-4378-87F6-C6463A520624,TCGA-21-1082,0e2ee54a-51c9-4868-842d-a2a1c1cfb016,BAM,Aligned reads,Raw sequencing data,,Lung Squamous Cell Carcinoma,NOT HISPANIC OR LATINO,WGS,MALE,TCGA-LUSC,Illumina HiSeq,Lung,WHITE,HG19_Broad_variant,TCGA-21-1082-01A,Primary Tumor,F091C76E-5D01-4F4B-8F6F-463C9F302930,Alive,s361__G14083.TCGA-21-1082-01A-01D-1521-08.3.bam
s10248,,,G13226.TCGA-21-1078-01A-01D-1521-08.1.bam,G13226.TCGA-21-1078-01A-01D-1521-08.1.fasta,77,TCGA-21-1078-01A-01D-1521-08,8CF9B32D-3D6F-4898-8C7A-89511B754021,TCGA-21-1078,0ab8d063-62b4-4d47-82aa-e3351a60029d,BAM,Aligned reads,Raw sequencing data,474.0,Lung Squamous Cell Carcinoma,NOT HISPANIC OR LATINO,WGS,MALE,TCGA-LUSC,Illumina HiSeq,Lung,WHITE,HG19_Broad_variant,TCGA-21-1078-01A,Primary Tumor,EFACAC80-9428-4482-9AEA-6B57FD172D49,Dead,s369__G13226.TCGA-21-1078-01A-01D-1521-08.1.bam
s10249,,,G14084.TCGA-22-1016-11A-01D-1521-08.3.bam,G14084.TCGA-22-1016-11A-01D-1521-08.3.fasta,65,TCGA-22-1016-11A-01D-1521-08,1845BC27-C3D3-4254-909C-2DC91F9E296B,TCGA-22-1016,037c57d1-b4a5-45dc-bda4-0550461d321b,BAM,Aligned reads,Raw sequencing data,822.0,Lung Squamous Cell Carcinoma,NOT HISPANIC OR LATINO,WGS,MALE,TCGA-LUSC,Illumina HiSeq,Lung,WHITE,HG19_Broad_variant,TCGA-22-1016-11A,Solid Tissue Normal,D06BC0A9-6053-454A-BDB9-BBDFDAA7C4E4,Dead,s380__G14084.TCGA-22-1016-11A-01D-1521-08.3.bam
s10250,,,G13232.TCGA-60-2726-01A-01D-1522-08.2.bam,G13232.TCGA-60-2726-01A-01D-1522-08.2.fasta,56,TCGA-60-2726-01A-01D-1522-08,A96EDDFC-3AFB-4BF8-A440-C91778113FBD,TCGA-60-2726,3d1f4059-2220-45b4-a4d2-b14f76cec96a,BAM,Aligned reads,Raw sequencing data,358.0,Lung Squamous Cell Carcinoma,NOT HISPANIC OR LATINO,WGS,MALE,TCGA-LUSC,Illumina HiSeq,Lung,WHITE,HG19_Broad_variant,TCGA-60-2726-01A,Primary Tumor,91A60807-2D0F-474E-B02F-18FEA788B319,Dead,s389__G13232.TCGA-60-2726-01A-01D-1522-08.2.bam
s10251,,,G14083.TCGA-21-1082-10B-01D-1521-08.4.bam,G14083.TCGA-21-1082-10B-01D-1521-08.4.fasta,61,TCGA-21-1082-10B-01D-1521-08,654AE6B0-65DA-4B11-B2FE-E7DD9C44E260,TCGA-21-1082,0e2ee54a-51c9-4868-842d-a2a1c1cfb016,BAM,Aligned reads,Raw sequencing data,,Lung Squamous Cell Carcinoma,NOT HISPANIC OR LATINO,WGS,MALE,TCGA-LUSC,Illumina HiSeq,Lung,WHITE,HG19_Broad_variant,TCGA-21-1082-10B,Blood Derived Normal,34E0475D-75C4-43C1-9715-027C19FA25F8,Alive,s399__G14083.TCGA-21-1082-10B-01D-1521-08.4.bam


In [12]:
# Spot-check the metadata record of a single sample, selected by its #SampleID index label.
dfMeta.loc['s17588', :]

BarcodeSequence          NaN                                   
LinkerPrimerSequence     NaN                                   
filename                 G2185.TCGA-04-1371-11A-01D.7.bam      
filenames_new_suffix     G2185.TCGA-04-1371-11A-01D.7.fasta    
age_at_diagnosis         58                                    
aliquot_id               TCGA-04-1371-11A-01D-0516-08          
aliquot_uuid             81B30BC6-2BC4-48BA-AC99-E10786AFDC3A  
case_id                  TCGA-04-1371                          
case_uuid                bff84539-7862-45b2-b5fc-e77291fcca8b  
data_format              BAM                                   
data_subtype             Aligned reads                         
data_type                Raw sequencing data                   
days_to_death            NA                                    
disease_type             Ovarian Serous Cystadenocarcinoma     
ethnicity                Not available                         
experimental_strategy    WGS            

## Accounting for missing samples

In [13]:
# Sample IDs present in each count table but absent from the metadata.
for countTable in (bactDataBarnDF, virDataBarnDF):
    print(np.setdiff1d(list(countTable.index), list(dfMeta.index)))
print(np.setdiff1d(list(vbDataBarnDF.index), list(dfMeta.index)),'\n')

# Sample IDs present in the bacterial table but absent from the viral table.
print(np.setdiff1d(list(bactDataBarnDF.index), list(virDataBarnDF.index)),'\n')

# Sample IDs present in the metadata but absent from each count table, with their count.
for countTable in (bactDataBarnDF, virDataBarnDF):
    metaOnlyIDs = np.setdiff1d(list(dfMeta.index), list(countTable.index))
    print(metaOnlyIDs, len(metaOnlyIDs))

['s2124' 's7807' 's8738' 's912']
['s2124' 's7807' 's8738' 's912']
['s2124' 's7807' 's8738' 's912'] 

['s17327' 's17330' 's17337' 's17339'] 

['s13044' 's13894' 's13930' 's13951' 's13987' 's13988' 's13990' 's14030'
 's14063' 's14095' 's14096' 's14097' 's14098' 's14099' 's14142' 's14175'
 's14176' 's14177' 's14224' 's14225' 's14228' 's14267' 's14312' 's14313'
 's14374' 's14743' 's17172' 's17553' 's17555' 's18092'] 30
['s13044' 's13894' 's13930' 's13951' 's13987' 's13988' 's13990' 's14030'
 's14063' 's14095' 's14096' 's14097' 's14098' 's14099' 's14142' 's14175'
 's14176' 's14177' 's14224' 's14225' 's14228' 's14267' 's14312' 's14313'
 's14374' 's14743' 's17172' 's17327' 's17330' 's17337' 's17339' 's17553'
 's17555' 's18092'] 34


In [14]:
# Metadata sample IDs that have no row in the combined count table.
missing = np.setdiff1d(dfMeta.index.tolist(), vbDataBarnDF.index.tolist())
print(missing.size)
# The row-count gap equals the number of missing samples only when every
# count-table sample also appears in the metadata.
rowCountGap = dfMeta.shape[0] - vbDataBarnDF.shape[0]
missing.size == rowCountGap

34


False

In [15]:
# List the missing sample IDs in sorted order.
missingSorted = sorted(missing)
print(missingSorted)

['s13044', 's13894', 's13930', 's13951', 's13987', 's13988', 's13990', 's14030', 's14063', 's14095', 's14096', 's14097', 's14098', 's14099', 's14142', 's14175', 's14176', 's14177', 's14224', 's14225', 's14228', 's14267', 's14312', 's14313', 's14374', 's14743', 's17172', 's17327', 's17330', 's17337', 's17339', 's17553', 's17555', 's18092']


In [16]:
# Metadata rows (all columns) for the samples that are missing from the count table.
dfMeta.loc[sorted(missing), :]

Unnamed: 0_level_0,BarcodeSequence,LinkerPrimerSequence,filename,filenames_new_suffix,age_at_diagnosis,aliquot_id,aliquot_uuid,case_id,case_uuid,data_format,data_subtype,data_type,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_id,sample_type,sample_uuid,vital_status,Description
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
s13044,,,TCGA-QG-A5YX-01A-11D-A28G-10_wgs_Illumina.bam,TCGA-QG-A5YX-01A-11D-A28G-10_wgs_Illumina.fasta,61,TCGA-QG-A5YX-01A-11D-A28G-10,10AD692B-4C3D-42DE-9B5E-4968441388B3,TCGA-QG-A5YX,300fff5a-0f61-402d-b845-3938ba7b1294,BAM,Aligned reads,Raw sequencing data,,Colon Adenocarcinoma,NOT HISPANIC OR LATINO,WGS,FEMALE,TCGA-COAD,Illumina HiSeq,Colorectal,BLACK OR AFRICAN AMERICAN,GRCh37-lite,TCGA-QG-A5YX-01A,Primary Tumor,346253C0-8E8B-459B-A6D5-03DA9586578F,Alive,s6238__TCGA-QG-A5YX-01A-11D-A28G-10_wgs_Illumina.bam
s13894,,,G78484.TCGA-26-A7UX-01B-11R-A39C-08.1.bam,G78484.TCGA-26-A7UX-01B-11R-A39C-08.1.fasta,47,TCGA-26-A7UX-01B-11R-A39C-08,B7B97BFF-DC3B-4E90-937A-9F58CD443AC9,TCGA-26-A7UX,5f9baf9b-6d2d-495a-83a1-77f7e4cb9ccd,BAM,Aligned reads,Raw sequencing data,,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,MALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-26-A7UX-01B,Primary Tumor,6FBD4E97-943F-48C5-A869-7CE6B08BF576,Alive,s1383__G78484.TCGA-26-A7UX-01B-11R-A39C-08.1.bam
s13930,,,G67451.TCGA-RR-A6KB-01A-12R-A344-08.2.bam,G67451.TCGA-RR-A6KB-01A-12R-A344-08.2.fasta,51,TCGA-RR-A6KB-01A-12R-A344-08,580AFEC9-68A7-402D-8C83-42BE721F816E,TCGA-RR-A6KB,6f1b1011-d7a4-4a05-b6ff-288183e7254b,BAM,Aligned reads,Raw sequencing data,,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,MALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-RR-A6KB-01A,Primary Tumor,3555B69D-2783-4811-9312-2B77C97500F5,Alive,s1360__G67451.TCGA-RR-A6KB-01A-12R-A344-08.2.bam
s13951,,,G78484.TCGA-4W-AA9R-01A-11R-A39C-08.1.bam,G78484.TCGA-4W-AA9R-01A-11R-A39C-08.1.fasta,61,TCGA-4W-AA9R-01A-11R-A39C-08,451E8218-8C5A-4E3E-96D7-CEB5EB15DA11,TCGA-4W-AA9R,f55a023c-84b8-4f7d-87db-7a9939733c59,BAM,Aligned reads,Raw sequencing data,,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,MALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-4W-AA9R-01A,Primary Tumor,8BFCDBEF-692A-4CA8-BFDC-AFE7AE85ACFF,Alive,s1361__G78484.TCGA-4W-AA9R-01A-11R-A39C-08.1.bam
s13987,,,G78484.TCGA-06-A7TK-01A-21R-A39C-08.1.bam,G78484.TCGA-06-A7TK-01A-21R-A39C-08.1.fasta,64,TCGA-06-A7TK-01A-21R-A39C-08,F50A7996-B135-4925-A65B-6E74CB088E53,TCGA-06-A7TK,41685c5a-a548-483a-8a20-305ad8d61771,BAM,Aligned reads,Raw sequencing data,,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,MALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-06-A7TK-01A,Primary Tumor,5012C42C-3FBB-4E0A-ABBB-78DCC6E03D9A,Alive,s1355__G78484.TCGA-06-A7TK-01A-21R-A39C-08.1.bam
s13988,,,G78484.TCGA-4W-AA9S-01A-11R-A39C-08.1.bam,G78484.TCGA-4W-AA9S-01A-11R-A39C-08.1.fasta,69,TCGA-4W-AA9S-01A-11R-A39C-08,B8F75FB0-8D56-4112-9FF5-5018DFC3C496,TCGA-4W-AA9S,7a8d2e13-315e-4512-85ce-b018d11c3bd3,BAM,Aligned reads,Raw sequencing data,,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,MALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-4W-AA9S-01A,Primary Tumor,D0DE44E5-E959-4AEE-BB04-16A65FD4C0EE,Alive,s1392__G78484.TCGA-4W-AA9S-01A-11R-A39C-08.1.bam
s13990,,,TCGA-06-0137-10A-01D-0513-10_SOLiD.bam,TCGA-06-0137-10A-01D-0513-10_SOLiD.fasta,63,TCGA-06-0137-10A-01D-0513-10,BC2C55F9-A89B-4C45-BE1F-4F1CACB710C3,TCGA-06-0137,d0de6676-6ba1-4d79-a9b0-ec3f1e8a8775,BAM,Aligned reads,Raw sequencing data,812.0,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,WGS,FEMALE,TCGA-GBM,ABI SOLiD,Brain,WHITE,NCBI36_BCM_variant,TCGA-06-0137-10A,Blood Derived Normal,8A7A84E1-C6BA-4E4C-BC48-695E8D79CFBA,Dead,s1526__TCGA-06-0137-10A-01D-0513-10_SOLiD.bam
s14030,,,TCGA-06-0208-01A-01D-0374-10_SOLiD.bam,TCGA-06-0208-01A-01D-0374-10_SOLiD.fasta,52,TCGA-06-0208-01A-01D-0374-10,DAAB51B9-7F92-4D73-830D-47DFE0F6DCBC,TCGA-06-0208,e3711a9b-6d4c-44df-bbab-0a675046a5df,BAM,Aligned reads,Raw sequencing data,256.0,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,WGS,FEMALE,TCGA-GBM,ABI SOLiD,Brain,WHITE,NCBI36_BCM_variant,TCGA-06-0208-01A,Primary Tumor,F76E7120-75E8-4E94-8217-E659C09AD276,Dead,s1508__TCGA-06-0208-01A-01D-0374-10_SOLiD.bam
s14063,,,G78484.TCGA-06-A7TL-01A-11R-A39C-08.1.bam,G78484.TCGA-06-A7TL-01A-11R-A39C-08.1.fasta,30,TCGA-06-A7TL-01A-11R-A39C-08,B60001EC-42D3-4159-A411-2CF3BDE0FC0A,TCGA-06-A7TL,0ca72ebd-ff33-45b8-a97c-3f1435603d71,BAM,Aligned reads,Raw sequencing data,,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-06-A7TL-01A,Primary Tumor,5398667C-8D52-4BA0-80E0-E8376DC8C87D,Alive,s1359__G78484.TCGA-06-A7TL-01A-11R-A39C-08.1.bam
s14095,,,G67451.TCGA-RR-A6KC-01A-31R-A344-08.2.bam,G67451.TCGA-RR-A6KC-01A-31R-A344-08.2.fasta,55,TCGA-RR-A6KC-01A-31R-A344-08,76796BAD-718D-4A25-842A-3CE4D4A728EC,TCGA-RR-A6KC,50d1d4af-7d62-4c48-88c1-890bb2370353,BAM,Aligned reads,Raw sequencing data,625.0,Glioblastoma Multiforme,NOT HISPANIC OR LATINO,RNA-Seq,MALE,TCGA-GBM,Illumina HiSeq,Brain,BLACK OR AFRICAN AMERICAN,HG19_Broad_variant,TCGA-RR-A6KC-01A,Primary Tumor,895B14A9-C8A1-4CFE-A161-1DB32B1EF426,Dead,s1353__G67451.TCGA-RR-A6KC-01A-31R-A344-08.2.bam


In [17]:
# Count-table samples with no metadata record are dropped so both tables cover the same samples.
missingDataVsMeta = np.setdiff1d(vbDataBarnDF.index.tolist(), dfMeta.index.tolist())
vbDataBarnDFReconciled = vbDataBarnDF.drop(missingDataVsMeta, axis=0)
vbDataBarnDFReconciled.shape

(18116, 1993)

In [18]:
# Select the metadata rows for exactly the samples kept in the reconciled count table,
# in the count table's row order.
reconciledSampleIDs = vbDataBarnDFReconciled.index
dfMetaReconciled = dfMeta.loc[reconciledSampleIDs]
dfMetaReconciled.shape

(18116, 27)

# Creating a merged metadata file

Data was taken from the CGC SBGenomics website to match each filename with its gdc_file_uuid, which is the identifier the 3D patient metadata dictionary is set up to be read by.

In [19]:
# Scraped CGC table (filename, size, gdc_file_uuid) and the original QIIME mapping file.
cgcDataminerPath = 'cgcDataminerTotalNoDuplicates.csv'
originalMappingPath = 'cgc_qiime_mapping_file_18154_files.csv'
cgcDataminer = pd.read_csv(cgcDataminerPath)
originalMappingFile = pd.read_csv(originalMappingPath)
cgcDataminer.head(n=2)

Unnamed: 0,filename,size,gdc_file_uuid
0,ff361cc3e22a9619323ddbddf3aac564.bam,127.7 GB,D104749E-66ED-4722-98A5-5D59447E5779
1,ff180684131876b77d5fa19e93aaeb1e.bam,131.5 GB,62A7D6F2-3B71-4449-83FE-DA6BEEA65873


During the web scraping, filenames that were too long to display were truncated with a "..." midway through the text. This caused a significant problem when subsequently merging tables on filename, because 1028 filenames did not match due to the ellipsis. The code below finds the filenames containing the artificial "...", looks up the full name in the main QIIME metadata file, and replaces the truncated name with it.

In [20]:
# Repair filenames that the web scrape truncated with an embedded "...".
# The text before the ellipsis is looked up as a literal substring in the original
# mapping file, and the first full filename containing it is written back.
filenameCol = cgcDataminer.columns.get_loc('filename')
referenceNames = originalMappingFile['filename']

# Positions of the rows whose filename contains the literal ellipsis.
# regex=False: '...' as a regex would match any three characters; na=False skips missing names.
truncatedRows = np.flatnonzero(
    cgcDataminer['filename'].str.contains('...', regex=False, na=False)
)

count = 0
partialString = ''
fullString = ''
unmatched = []
for ii in truncatedRows:
    partialString = cgcDataminer.iloc[ii, filenameCol]
    splitPartialString = partialString.split('...')[0]

    # An empty prefix would match every reference filename, so treat it as unmatched.
    if splitPartialString:
        # regex=False: filenames are full of '.' (and may hold other regex
        # metacharacters), so the prefix must be matched literally.
        candidates = referenceNames[
            referenceNames.str.contains(splitPartialString, regex=False, na=False)
        ]
    else:
        candidates = referenceNames.iloc[0:0]

    if candidates.empty:
        # Leave the truncated name in place; the sanity check that follows will report it.
        unmatched.append(partialString)
        continue

    fullString = candidates.iloc[0]
    # Replace with a single positional write. The chained form
    # `cgcDataminer.iloc[ii]['filename'] = ...` assigns into a temporary row object
    # and is not guaranteed to modify cgcDataminer itself.
    cgcDataminer.iloc[ii, filenameCol] = fullString

    count += 1

print(count)
if unmatched:
    print('No full filename found for {} truncated name(s): {}'.format(len(unmatched), unmatched))

1028


In [21]:
# Sanity check for the filename repair: every scraped filename should now exist in the
# original mapping file, so the number of unmatched names is expected to be 0.
scrapedNames = cgcDataminer['filename'].tolist()
mappingNames = originalMappingFile['filename'].tolist()
missing = np.setdiff1d(scrapedNames, mappingNames)
print(len(missing))

0


In [22]:
# Copy the sample ID out of the index into a regular first column so it is not lost
# in the pandas merge on 'filename'.
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError when the
# column already exists.
if 'sampleID' not in dfMetaReconciled.columns:
    dfMetaReconciled.insert(loc=0, column='sampleID', value=dfMetaReconciled.index)
dfMetaReconciled.head(2)

Unnamed: 0,sampleID,BarcodeSequence,LinkerPrimerSequence,filename,filenames_new_suffix,age_at_diagnosis,aliquot_id,aliquot_uuid,case_id,case_uuid,data_format,data_subtype,data_type,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_id,sample_type,sample_uuid,vital_status,Description
s17489,s17489,,,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.fasta,51,TCGA-09-1659-01B-01R-1564-13,32D5209E-2EF2-4EF0-B902-585542D7D16B,TCGA-09-1659,644a88a7-dc56-468a-af5c-60278aab7642,BAM,Aligned reads,Raw sequencing data,304,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,TCGA-09-1659-01B,Primary Tumor,01A6F7CD-10CC-4F33-AA75-B29ABD6C2EDB,Dead,s1570__TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam
s17512,s17512,,,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.fasta,36,TCGA-24-1105-01A-01R-1565-13,FD57ADC4-669C-4916-A2F5-E5F92CB0ADB6,TCGA-24-1105,7e01f23a-0b0e-46a2-9f30-c3a9d2c779d2,BAM,Aligned reads,Raw sequencing data,1442,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,TCGA-24-1105-01A,Primary Tumor,AE4AA065-8B1D-42A3-98EF-357E19C1F1E1,Dead,s1911__TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam


In [23]:
# Left-join the scraped CGC columns (size, gdc_file_uuid) onto the metadata by filename,
# keeping every metadata row.
dfMetaReconciledUUID = dfMetaReconciled.merge(
    cgcDataminer,
    how='left',
    on='filename',
)
print(dfMetaReconciledUUID.shape)
dfMetaReconciledUUID.head(3)

(18116, 30)


Unnamed: 0,sampleID,BarcodeSequence,LinkerPrimerSequence,filename,filenames_new_suffix,age_at_diagnosis,aliquot_id,aliquot_uuid,case_id,case_uuid,data_format,data_subtype,data_type,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_id,sample_type,sample_uuid,vital_status,Description,size,gdc_file_uuid
0,s17489,,,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.fasta,51,TCGA-09-1659-01B-01R-1564-13,32D5209E-2EF2-4EF0-B902-585542D7D16B,TCGA-09-1659,644a88a7-dc56-468a-af5c-60278aab7642,BAM,Aligned reads,Raw sequencing data,304.0,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,TCGA-09-1659-01B,Primary Tumor,01A6F7CD-10CC-4F33-AA75-B29ABD6C2EDB,Dead,s1570__TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,14.6 GB,C38F96A6-0586-4F5E-B3E8-ADBB0358D9DC
1,s17512,,,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.fasta,36,TCGA-24-1105-01A-01R-1565-13,FD57ADC4-669C-4916-A2F5-E5F92CB0ADB6,TCGA-24-1105,7e01f23a-0b0e-46a2-9f30-c3a9d2c779d2,BAM,Aligned reads,Raw sequencing data,1442.0,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,TCGA-24-1105-01A,Primary Tumor,AE4AA065-8B1D-42A3-98EF-357E19C1F1E1,Dead,s1911__TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,13.1 GB,7B59A22A-5799-4413-BB29-0E79643C540E
2,s17498,,,TCGA-13-1498-01A-01R-1565-13_rnaseq.bam,TCGA-13-1498-01A-01R-1565-13_rnaseq.fasta,73,TCGA-13-1498-01A-01R-1565-13,176D1CA2-EB1D-4F49-9B10-C129C822FA5A,TCGA-13-1498,56a30462-2819-4c18-95be-8e73880a4921,BAM,Aligned reads,Raw sequencing data,,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,NCBI36_BCCAGSC_variant,TCGA-13-1498-01A,Primary Tumor,B38F209A-000B-4CBF-A4BD-0BC6F027FCE3,Alive,s1750__TCGA-13-1498-01A-01R-1565-13_rnaseq.bam,13.1 GB,53B17871-CE7E-4BD1-B4B5-A7D4D2003416


In [24]:
# Sanity check: the non-null counts of the three key columns should all be equal,
# i.e. every merged row received a gdc_file_uuid.
keyColumns = ['filename', 'sampleID', 'gdc_file_uuid']
tmp = dfMetaReconciledUUID[keyColumns]
tmp.count()

filename         18116
sampleID         18116
gdc_file_uuid    18116
dtype: int64

In [25]:
# Re-key the merged metadata by gdc_file_uuid.
# The guard keeps the cell re-runnable: once the column has been moved into the index,
# a second set_index('gdc_file_uuid') would raise KeyError.
if 'gdc_file_uuid' in dfMetaReconciledUUID.columns:
    dfMetaReconciledUUID = dfMetaReconciledUUID.set_index('gdc_file_uuid')
dfMetaReconciledUUID.head(2)

Unnamed: 0_level_0,sampleID,BarcodeSequence,LinkerPrimerSequence,filename,filenames_new_suffix,age_at_diagnosis,aliquot_id,aliquot_uuid,case_id,case_uuid,data_format,data_subtype,data_type,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_id,sample_type,sample_uuid,vital_status,Description,size
gdc_file_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
C38F96A6-0586-4F5E-B3E8-ADBB0358D9DC,s17489,,,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.fasta,51,TCGA-09-1659-01B-01R-1564-13,32D5209E-2EF2-4EF0-B902-585542D7D16B,TCGA-09-1659,644a88a7-dc56-468a-af5c-60278aab7642,BAM,Aligned reads,Raw sequencing data,304,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,TCGA-09-1659-01B,Primary Tumor,01A6F7CD-10CC-4F33-AA75-B29ABD6C2EDB,Dead,s1570__TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,14.6 GB
7B59A22A-5799-4413-BB29-0E79643C540E,s17512,,,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.fasta,36,TCGA-24-1105-01A-01R-1565-13,FD57ADC4-669C-4916-A2F5-E5F92CB0ADB6,TCGA-24-1105,7e01f23a-0b0e-46a2-9f30-c3a9d2c779d2,BAM,Aligned reads,Raw sequencing data,1442,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,TCGA-24-1105-01A,Primary Tumor,AE4AA065-8B1D-42A3-98EF-357E19C1F1E1,Dead,s1911__TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,13.1 GB


## QIIME metadata cleaning

In [26]:
# Columns dropped from the merged QIIME metadata before further use.
cols_to_remove = [
    'size',
    'BarcodeSequence',
    'LinkerPrimerSequence',
    'filenames_new_suffix',
    'aliquot_id',
    'case_id',
    'data_format',
    'data_subtype',
    'data_type',
    'sample_id',
    'Description',
]

keyMetaReconciledUUID = dfMetaReconciledUUID.drop(columns=cols_to_remove)
keyMetaReconciledUUID.head(2)

Unnamed: 0_level_0,sampleID,filename,age_at_diagnosis,aliquot_uuid,case_uuid,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_type,sample_uuid,vital_status
gdc_file_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
C38F96A6-0586-4F5E-B3E8-ADBB0358D9DC,s17489,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,51,32D5209E-2EF2-4EF0-B902-585542D7D16B,644a88a7-dc56-468a-af5c-60278aab7642,304,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,Primary Tumor,01A6F7CD-10CC-4F33-AA75-B29ABD6C2EDB,Dead
7B59A22A-5799-4413-BB29-0E79643C540E,s17512,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,36,FD57ADC4-669C-4916-A2F5-E5F92CB0ADB6,7e01f23a-0b0e-46a2-9f30-c3a9d2c779d2,1442,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,Primary Tumor,AE4AA065-8B1D-42A3-98EF-357E19C1F1E1,Dead


## Extracting static data from Pickle file into keyMetaUUID

In [27]:
# Columns copied out of the per-file pickle entries.
staticMetaPickle = [
    'tissue_source_site_label',
    'data_submitting_center_label',
    'country_of_sample_procurement',
    'histological_diagnosis_label',
    'pathologic_t_label',
    'pathologic_n_label',
    'pathologic_stage_label',
    'icd03_histology_label',
    'icd03_histology_site',
    'icd10',
    'portion_is_ffpe',
    'new_tumor_event_after_initial_trtmt',
    'primary_therapy_outcome_success_label',
    'portion_weight',
    'aliquot_concentration',
    'analyte_A260A280Ratio',
    'analyte_amount',
    'analyte_type_label',
    'radiation_therapy_code_label',
    'radiation_therapy_site_label',
    'radiation_therapy_type_label',
    'year_of_diagnosis',
    'vital_status_label',
]

# File UUIDs in the row order of keyMetaReconciledUUID.
appended_ind = list(keyMetaReconciledUUID.index)

# For each UUID, take the first row of its pickle entry, restricted to the static
# columns, as a one-column frame (fields down the rows).
appended_data = [
    pd.DataFrame(fileNameTesting[file_uuid][staticMetaPickle].iloc[0])
    for file_uuid in appended_ind
]

# Put the one-column frames side by side, then flip so that each file is a row.
extractedStaticPickleDF = pd.concat(appended_data, axis=1).T
extractedStaticPickleDF['gdc_file_uuid'] = appended_ind
extractedStaticPickleDF = extractedStaticPickleDF.set_index('gdc_file_uuid')
print(extractedStaticPickleDF.shape)
extractedStaticPickleDF.head()

(18116, 23)


Unnamed: 0_level_0,tissue_source_site_label,data_submitting_center_label,country_of_sample_procurement,histological_diagnosis_label,pathologic_t_label,pathologic_n_label,pathologic_stage_label,icd03_histology_label,icd03_histology_site,icd10,portion_is_ffpe,new_tumor_event_after_initial_trtmt,primary_therapy_outcome_success_label,portion_weight,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,year_of_diagnosis,vital_status_label
gdc_file_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
C38F96A6-0586-4F5E-B3E8-ADBB0358D9DC,UCSF,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,110,0.16,1.8,269.0,RNA,,,,1997,Dead
7B59A22A-5799-4413-BB29-0E79643C540E,Washington University,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,130,0.15,1.8,236.0,RNA,06461927-0E16-4D01-B765-C5F76574560D,Distant Recurrence,EXTERNAL BEAM,2001,Dead
53B17871-CE7E-4BD1-B4B5-A7D4D2003416,Memorial Sloan Kettering,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,131,0.16,1.8,392.0,RNA,,,,2006,Alive
B98E97B9-6C7A-437E-9724-4D76D5FEFC06,Duke,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,104,0.16,1.8,53.7,RNA,,,,2005,Dead
E9EC871C-163C-4FD8-AE67-02BA37605467,Duke,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,147,0.17,1.8,117.0,RNA,,,,2008,Alive


## Merge extracted data into one main metadata dataframe

In [28]:
# Left-join the extracted pickle columns onto the QIIME metadata; both frames are
# indexed by gdc_file_uuid, so the join is done index-to-index.
tcgaMetadataPickledReconciled = keyMetaReconciledUUID.merge(
    extractedStaticPickleDF,
    how='left',
    left_index=True,
    right_index=True,
)

# Shapes of the two inputs followed by the merged result.
for shapeToShow in (
    keyMetaReconciledUUID.shape,
    extractedStaticPickleDF.shape,
    tcgaMetadataPickledReconciled.shape,
):
    print(shapeToShow)
tcgaMetadataPickledReconciled.head()

(18116, 18)
(18116, 23)
(18116, 41)


Unnamed: 0_level_0,sampleID,filename,age_at_diagnosis,aliquot_uuid,case_uuid,days_to_death,disease_type,ethnicity,experimental_strategy,gender,investigation,platform,primary_site,race,reference_genome,sample_type,sample_uuid,vital_status,tissue_source_site_label,data_submitting_center_label,country_of_sample_procurement,histological_diagnosis_label,pathologic_t_label,pathologic_n_label,pathologic_stage_label,icd03_histology_label,icd03_histology_site,icd10,portion_is_ffpe,new_tumor_event_after_initial_trtmt,primary_therapy_outcome_success_label,portion_weight,aliquot_concentration,analyte_A260A280Ratio,analyte_amount,analyte_type_label,radiation_therapy_code_label,radiation_therapy_site_label,radiation_therapy_type_label,year_of_diagnosis,vital_status_label
gdc_file_uuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
C38F96A6-0586-4F5E-B3E8-ADBB0358D9DC,s17489,TCGA-09-1659-01B-01R-1564-13_GRCh37-lite_rnaseq.bam,51,32D5209E-2EF2-4EF0-B902-585542D7D16B,644a88a7-dc56-468a-af5c-60278aab7642,304.0,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,Primary Tumor,01A6F7CD-10CC-4F33-AA75-B29ABD6C2EDB,Dead,UCSF,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,110,0.16,1.8,269.0,RNA,,,,1997,Dead
7B59A22A-5799-4413-BB29-0E79643C540E,s17512,TCGA-24-1105-01A-01R-1565-13_GRCh37-lite_rnaseq.bam,36,FD57ADC4-669C-4916-A2F5-E5F92CB0ADB6,7e01f23a-0b0e-46a2-9f30-c3a9d2c779d2,1442.0,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,Primary Tumor,AE4AA065-8B1D-42A3-98EF-357E19C1F1E1,Dead,Washington University,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,130,0.15,1.8,236.0,RNA,06461927-0E16-4D01-B765-C5F76574560D,Distant Recurrence,EXTERNAL BEAM,2001,Dead
53B17871-CE7E-4BD1-B4B5-A7D4D2003416,s17498,TCGA-13-1498-01A-01R-1565-13_rnaseq.bam,73,176D1CA2-EB1D-4F49-9B10-C129C822FA5A,56a30462-2819-4c18-95be-8e73880a4921,,Ovarian Serous Cystadenocarcinoma,NOT HISPANIC OR LATINO,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,NCBI36_BCCAGSC_variant,Primary Tumor,B38F209A-000B-4CBF-A4BD-0BC6F027FCE3,Alive,Memorial Sloan Kettering,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,131,0.16,1.8,392.0,RNA,,,,2006,Alive
B98E97B9-6C7A-437E-9724-4D76D5FEFC06,s17528,TCGA-29-1705-02A-01R-1567-13_GRCh37-lite_rnaseq.bam,47,CC3AFD8C-3639-437F-A22B-AADC9F6C5AAA,f9c835db-2ab6-4bf5-826f-48723493c0ec,555.0,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,Recurrent Tumor,B5163587-A94D-4674-9F11-76A5ADC9151D,Dead,Duke,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,104,0.16,1.8,53.7,RNA,,,,2005,Dead
E9EC871C-163C-4FD8-AE67-02BA37605467,s17535,TCGA-29-1777-01A-01R-1567-13_GRCh37-lite_rnaseq.bam,47,CBEBA637-1A32-45F1-83F3-614725665436,5ced5b58-fab4-4bcd-b706-d6e49ce0cfc5,,Ovarian Serous Cystadenocarcinoma,Not available,RNA-Seq,FEMALE,TCGA-OV,Illumina HiSeq,Ovary,WHITE,GRCh37-lite,Primary Tumor,AFB43292-69C2-4D84-84F5-0BDE9A6E9EA5,Alive,Duke,Canada's Michael Smith Genome Sciences Centre,,Serous Cystadenocarcinoma,Not available,Not available,Not available,8441/3,C56.9,C56.9,NO,,Not available,147,0.17,1.8,117.0,RNA,,,,2008,Alive


In [29]:
# Frequency of each radiation therapy code in the merged metadata.
tcgaMetadataPickledReconciled['radiation_therapy_code_label'].value_counts()

4F455D1B-8492-4C47-BC16-01DE2AC1898A    10
CF5E2AC7-1264-4B95-86C2-6C3748C3F63E    9 
B55C8E7E-F2D6-452F-9D89-E51054CFACA4    8 
79AB7E62-6F27-4A0B-BD9D-6604C379142E    7 
0170BB9D-3F24-4932-92EE-9BAA1D3AE3DC    7 
636B9443-3E2C-48BD-AE5B-4830A28CDEE5    7 
C89EC5A7-544E-49E0-9CD1-C0D2C31EA305    7 
0D9A7590-E13D-4EE4-8799-90929FC8818F    7 
E4E00581-06DC-4638-B03A-43CCCDD89A46    7 
F92B3052-5815-4AFE-85DD-80ACEC4563A4    7 
9939E1CA-953B-4E0A-A861-225FEE9920C8    7 
64AF2B41-56BC-4089-8B6F-3DBE74A9273A    7 
EED3A9E7-41B1-468D-A4D4-62275344027C    7 
597CFDF4-47EE-4CFD-8D2B-66BD1D75DE2B    7 
DC4F3B20-A952-41B7-B484-E35D5B0FB589    6 
659B8763-1226-49D0-8ED4-E5BDB14548E6    6 
F747E175-7B33-42B7-A077-B3581BBCF094    6 
1A7DD856-99F0-40D6-90A7-1BA2202515F0    6 
ED646923-A759-4AE3-A882-C6928015194F    6 
F3DA80CD-337F-4782-90F1-F7D94AA9A17C    6 
B1912257-7FD6-4E0B-9609-96C0BFAB93A9    6 
133ADBF7-0275-4A02-8BB4-5EC6F14E1AD9    6 
DF9E861E-D0E8-4C2D-808B-352783005D25    6 
873291FF-F5

# Exporting data to csv for manipulation in R

In [30]:
# Disabled exports:
# vbBarnDF.to_csv('vbBarnDF_all_cancers.csv')
# vbBarnDFFilt.to_csv('vbBarnFilt_4_cancers.csv')

# Write the reconciled data table and the merged metadata table to CSV.
for frameToExport, csvPath in (
    (vbDataBarnDFReconciled, 'vbDataBarnDFReconciled_18116_files.csv'),
    (tcgaMetadataPickledReconciled, 'tcgaMetadataPickledReconciled_18116_files.csv'),
):
    frameToExport.to_csv(csvPath)