# Subsampling 

Subsample the training rows so that the two classes, y=0 (anaerobe) and y=1 (aerobe), are balanced.

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below; consider narrowing the filter when debugging.
warnings.filterwarnings('ignore')

In [2]:
import os
import pickle
from collections import Counter

import pandas as pd
import polars as pl

from sklearn.preprocessing import MaxAbsScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GroupKFold
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# --- Inputs -----------------------------------------------------------------
# Tab-separated training table; loaded with polars in a later cell.
training_file = 'data/all_gene_annotations.added_incompleteness_and_contamination.training.tsv'
# Path of the label table. NOTE: the name `y` is rebound further down the
# notebook (first to a frame, then to the label vector), so it only holds
# this path until the labels have been read.
y = 'data/bacdive_scrape_20230315.json.parsed.anaerobe_vs_aerobe.with_cyanos.csv'

# Name of the label column in the table at `y`.
TARGET_COLUMN = 'oxytolerance'
# Worker count handed to the classifier (as n_jobs) in the Model section.
threads = 64

# --- Outputs ----------------------------------------------------------------
# The earlier, longer directory names were immediately overwritten by these
# short ones, so only the effective values are kept.
model_output_dir = 'data/SS.models'
cross_validation_data_output_dir = 'data/SS.cv_data'

# Ready output directories. exist_ok=True makes the cell safely re-runnable
# and avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(model_output_dir, exist_ok=True)
os.makedirs(cross_validation_data_output_dir, exist_ok=True)

# Lower-case alias used by the cells below.
target_column = TARGET_COLUMN

## Data

In [4]:
# Read GTDB: load the bacterial and archaeal r202 metadata tables and stack them.
gtdb = pl.concat(
    [
        pl.read_csv(metadata_path, separator="\t")
        for metadata_path in (
            'data/bac120_metadata_r202.tsv',
            'data/ar122_metadata_r202.tsv',
        )
    ]
)

# Keep only the rows whose gtdb_representative flag is "t".
gtdb = gtdb.filter(pl.col("gtdb_representative") == "t")
print("Read in {} GTDB reps".format(len(gtdb)))

# gtdb_taxonomy is a ';'-delimited string; elements 1..5 of the split are
# stored as the phylum, class, order, family and genus columns.
gtdb = gtdb.with_columns(
    [
        pl.col("gtdb_taxonomy").str.split(';').list.get(position).alias(rank)
        for position, rank in enumerate(
            ("phylum", "class", "order", "family", "genus"), start=1
        )
    ]
)

gtdb

Read in 47894 GTDB reps


accession,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,coding_density,contig_count,gc_count,gc_percentage,genome_size,gtdb_genome_representative,gtdb_representative,gtdb_taxonomy,gtdb_type_designation,gtdb_type_designation_sources,gtdb_type_species_of_genus,l50_contigs,l50_scaffolds,longest_contig,longest_scaffold,lsu_23s_contig_len,lsu_23s_count,lsu_23s_length,lsu_23s_query_id,lsu_5s_contig_len,lsu_5s_count,lsu_5s_length,lsu_5s_query_id,lsu_silva_23s_blast_align_len,lsu_silva_23s_blast_bitscore,lsu_silva_23s_blast_evalue,lsu_silva_23s_blast_perc_identity,lsu_silva_23s_blast_subject_id,…,ncbi_taxonomy,ncbi_taxonomy_unfiltered,ncbi_total_gap_length,ncbi_total_length,ncbi_translation_table,ncbi_trna_count,ncbi_type_material_designation,ncbi_ungapped_length,ncbi_unspanned_gaps,ncbi_wgs_master,protein_count,scaffold_count,ssu_contig_len,ssu_count,ssu_gg_blast_align_len,ssu_gg_blast_bitscore,ssu_gg_blast_evalue,ssu_gg_blast_perc_identity,ssu_gg_blast_subject_id,ssu_gg_taxonomy,ssu_length,ssu_query_id,ssu_silva_blast_align_len,ssu_silva_blast_bitscore,ssu_silva_blast_evalue,ssu_silva_blast_perc_identity,ssu_silva_blast_subject_id,ssu_silva_taxonomy,total_gap_length,trna_aa_count,trna_count,trna_selenocysteine_count,phylum,class,order,family,genus
str,i64,f64,f64,i64,str,i64,f64,i64,f64,i64,i64,f64,i64,str,str,str,str,str,str,i64,i64,i64,i64,str,i64,str,str,str,i64,str,str,str,str,str,str,str,…,str,str,i64,i64,str,str,str,i64,i64,str,i64,i64,str,i64,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,str,str,str,str,str
"""GB_GCA_000024525.1""",0,100.0,0.89,454,"""o__Cytophagales (UID2936)""",336,25.0,7481814,88.111962,9,4258276,50.148941,8491258,"""GB_GCA_000024525.1""","""t""","""d__Bacteria;p__Bacteroidota;c_…","""type strain of species""","""LPSN; DSMZ""","""t""",1,1,8078757,8078757,"""8078757""",4,"""2807""","""CP001769.1""","""none""",0,"""none""","""none""","""2807""","""5184""","""0""","""100""","""CP001769.6728674.6731505""",…,"""d__Bacteria;p__Bacteroidetes;c…","""d__Bacteria;x__FCB group;x__Ba…",0,8491258,"""11""","""49""","""assembly from type material""",8491258,0,"""none""",7129,9,"""8078757""",4,"""none""","""none""","""none""","""none""","""none""","""none""","""1503""","""CP001769.1""","""1495""","""2761""","""0""","""100""","""CP001769.6726840.6728334""","""Bacteria;Bacteroidota;Bacteroi…",0,20,47,0,"""p__Bacteroidota""","""c__Bacteroidia""","""o__Cytophagales""","""f__Spirosomaceae""","""g__Spirosoma"""
"""GB_GCA_000285855.2""",7,99.37,2.24,278,"""o__Clostridiales (UID1226)""",158,0.0,3538921,68.355703,526,1633519,39.182768,5177214,"""GB_GCA_000285855.2""","""t""","""d__Bacteria;p__Firmicutes_A;c_…","""not type material""","""none""","""f""",18,1,215746,3529697,"""5075""",1,"""2826""","""HE978695.1""","""none""",0,"""none""","""none""","""2826""","""5219""","""0""","""100""","""CAHL01000240.52.2884""",…,"""d__Bacteria;p__Firmicutes;c__C…","""d__Bacteria;x__Terrabacteria g…",1008234,5177214,"""none""","""none""","""none""",4168980,0,"""CAHL00000000.1""",4285,53,"""5075""",1,"""none""","""none""","""none""","""none""","""none""","""none""","""1419""","""HE978695.1""","""1419""","""2569""","""0""","""99.366""","""AB559636.1.1522""","""Bacteria;Firmicutes;Clostridia…",1008234,18,54,0,"""p__Firmicutes_A""","""c__Clostridia""","""o__Lachnospirales""","""f__Lachnospiraceae""","""g__Blautia_A"""
"""GB_GCA_000307935.1""",0,68.88,0.0,451,"""c__Gammaproteobacteria (UID444…",270,0.0,1340520,95.97079,13,505174,36.166523,1396800,"""GB_GCA_000307935.1""","""t""","""d__Bacteria;p__Proteobacteria;…","""not type material""","""none""","""f""",2,2,402564,402564,"""53766""",1,"""2885""","""AMWX01000006.1""","""53766""",1,"""109""","""AMWX01000006.1""","""2885""","""5328""","""0""","""100""","""AMWX01000006.6511.9405""",…,"""d__Bacteria;p__Proteobacteria;…","""d__Bacteria;p__Proteobacteria;…",0,1396800,"""11""","""33""","""none""",1396800,0,"""AMWX00000000.1""",1401,13,"""53766""",1,"""none""","""none""","""none""","""none""","""none""","""none""","""1530""","""AMWX01000006.1""","""1526""","""2819""","""0""","""100""","""AMWX01000006.4726.6251""","""Bacteria;Proteobacteria;Gammap…",0,19,33,0,"""p__Proteobacteria""","""c__Gammaproteobacteria""","""o__SAR86""","""f__D2472""","""g__D2472"""
"""GB_GCA_000307955.1""",0,100.0,0.0,280,"""c__Deltaproteobacteria (UID321…",168,0.0,3850881,87.689063,489,2885526,65.706801,4391518,"""GB_GCA_000307955.1""","""t""","""d__Bacteria;p__Desulfobacterot…","""not type material""","""none""","""f""",92,92,55591,55591,"""2079""",2,"""1737""","""ALAO01000319.1""","""813""",1,"""109""","""ALAO01000446.1""","""1737""","""3142""","""0""","""99.309""","""LP130708.1826778.1829698""",…,"""d__Bacteria;p__Proteobacteria;…","""d__Bacteria;p__Proteobacteria;…",0,4391518,"""11""","""48""","""none""",4391518,0,"""ALAO00000000.1""",3939,489,"""511""",2,"""none""","""none""","""none""","""none""","""none""","""none""","""509""","""ALAO01000288.1""","""503""","""929""","""0""","""100""","""JAGC01000009.588616.590144""","""Bacteria;Desulfobacterota;Desu…",0,20,44,1,"""p__Desulfobacterota""","""c__Desulfovibrionia""","""o__Desulfovibrionales""","""f__Desulfovibrionaceae""","""g__Solidesulfovibrio"""
"""GB_GCA_000372185.1""",11,74.4,0.79,315,"""o__Actinomycetales (UID1696)""",190,0.0,1080418,94.899209,43,542017,47.608871,1138490,"""GB_GCA_000372185.1""","""t""","""d__Bacteria;p__Actinobacteriot…","""not type material""","""none""","""f""",5,5,179113,179113,"""94645""",1,"""3084""","""AQUA01000007.1""","""94645""",1,"""110""","""AQUA01000007.1""","""3084""","""5696""","""0""","""100""","""AQUA01000007.16680.19769""",…,"""d__Bacteria;p__Actinobacteria;…","""d__Bacteria;x__Terrabacteria g…",0,1138490,"""none""","""none""","""none""",1138490,0,"""AQUA00000000.1""",1195,43,"""94645""",1,"""none""","""none""","""none""","""none""","""none""","""none""","""1510""","""AQUA01000007.1""","""1510""","""2778""","""0""","""99.868""","""CP016773.631592.633114""","""Bacteria;Actinobacteriota;Acti…",0,19,39,0,"""p__Actinobacteriota""","""c__Actinomycetia""","""o__Nanopelagicales""","""f__Nanopelagicaceae""","""g__Planktophila"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""RS_GCF_012726115.1""",0,99.93,0.0,395,"""f__Halobacteriaceae (UID85)""",250,0.0,3490522,86.974577,29,2460153,61.308909,4013267,"""RS_GCF_012726115.1""","""t""","""d__Archaea;p__Halobacteriota;c…","""not type material""","""none""","""f""",3,3,935943,1152139,"""237202""",3,"""1244""","""NZ_WOWA01000006.1""","""237202""",3,"""116""","""NZ_WOWA01000006.1""","""1244""","""2276""","""0""","""99.678""","""BCNA01000001.2541322.2544245""",…,"""d__Archaea;p__Euryarchaeota;c_…","""d__Archaea;p__Euryarchaeota;x_…",550,4013267,"""11""","""47""","""none""",4012717,0,"""WOWA00000000.1""",4163,11,"""1152139""",3,"""none""","""none""","""none""","""none""","""none""","""none""","""1479""","""NZ_WOWA01000004.1""","""1478""","""2654""","""0""","""99.12""","""AB663351.1.1472""","""Archaea;Halobacterota;Halobact…",550,18,39,0,"""p__Halobacteriota""","""c__Halobacteria""","""o__Halobacteriales""","""f__Haloarculaceae""","""g__Haloarcula"""
"""RS_GCF_013343295.1""",0,99.4,1.19,217,"""c__Thermoprotei (UID147)""",168,0.0,1843266,84.674011,1,975244,44.799731,2176897,"""RS_GCF_013343295.1""","""t""","""d__Archaea;p__Thermoproteota;c…","""type strain of species""","""LPSN; DSMZ""","""f""",1,1,2176897,2176897,"""2176897""",1,"""3032""","""NZ_CP049074.1""","""none""",0,"""none""","""none""","""3030""","""4809""","""0""","""95.347""","""CP008822.1706923.1709976""",…,"""d__Archaea;p__Crenarchaeota;c_…","""d__Archaea;x__TACK group;p__Cr…",0,2176897,"""11""","""47""","""assembly from type material""",2176897,0,"""none""",2335,1,"""2176897""",1,"""none""","""none""","""none""","""none""","""none""","""none""","""1494""","""NZ_CP049074.1""","""1430""","""2630""","""0""","""99.86""","""KJ735100.1.1430""","""Archaea;Crenarchaeota;Thermopr…",0,17,36,0,"""p__Thermoproteota""","""c__Thermoproteia""","""o__Sulfolobales""","""f__Sulfolobaceae""","""g__Metallosphaera"""
"""RS_GCF_013390375.1""",0,99.76,2.91,145,"""k__Archaea (UID2)""",103,100.0,968349,92.180872,38,379132,36.091379,1050488,"""RS_GCF_013390375.1""","""t""","""d__Archaea;p__Thermoproteota;c…","""not type material""","""none""","""f""",8,8,115526,115526,"""66930""",1,"""2982""","""NZ_JACATA010000004.1""","""115526""",1,"""111""","""NZ_JACATA010000001.1""","""2982""","""5463""","""0""","""99.732""","""EU686642.23401.26415""",…,"""d__Archaea;p__Thaumarchaeota;c…","""d__Archaea;x__TACK group;p__Th…",10,1050488,"""11""","""45""","""none""",1050478,0,"""JACATA000000000.1""",1309,37,"""66930""",1,"""none""","""none""","""none""","""none""","""none""","""none""","""1470""","""NZ_JACATA010000004.1""","""1468""","""2712""","""0""","""100""","""EU686642.21788.23258""","""Archaea;Crenarchaeota;Nitrosos…",10,17,37,0,"""p__Thermoproteota""","""c__Nitrososphaeria""","""o__Nitrososphaerales""","""f__Nitrosopumilaceae""","""g__Nitrosopelagicus"""
"""RS_GCF_900109065.1""",0,99.19,2.76,417,"""f__Halobacteriaceae (UID96)""",263,30.0,2854756,86.913618,78,1931497,58.804813,3284590,"""RS_GCF_900109065.1""","""t""","""d__Archaea;p__Halobacteriota;c…","""type strain of species""","""LPSN; DSMZ""","""f""",11,11,265031,265031,"""5775""",1,"""2910""","""NZ_FNYR01000064.1""","""5775""",1,"""106""","""NZ_FNYR01000064.1""","""2907""","""5369""","""0""","""100""","""CP024845.2095298.2098207""",…,"""d__Archaea;p__Euryarchaeota;c_…","""d__Archaea;p__Euryarchaeota;x_…",0,3284590,"""11""","""48""","""assembly from type material""",3284590,0,"""FNYR00000000.1""",3443,78,"""5775""",1,"""1471""","""2712""","""0""","""99.932""","""4338133""","""k__Archaea;p__Euryarchaeota;c_…","""1472""","""NZ_FNYR01000064.1""","""1471""","""2717""","""0""","""100""","""CP024845.40969.42441""","""Archaea;Halobacterota;Halobact…",0,20,47,0,"""p__Halobacteriota""","""c__Halobacteria""","""o__Halobacteriales""","""f__Haloferacaceae""","""g__Halohasta"""


In [5]:
# Load the tab-separated training table.
d = pl.read_csv(training_file, separator="\t")
print("Read training data:", d.shape)

# Attach the GTDB taxonomy columns to every training row. This is a left
# join, so no training row is dropped; accessions missing from `gtdb` get
# null taxonomy values.
d2 = d.join(
    gtdb.select(['accession', 'phylum', 'class', 'order', 'family', 'genus']),
    on="accession",
    how="left",
)

d2

Read training data: (114192, 2680)


accession,false_negative_rate,false_positive_rate,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,COG0015,COG0016,COG0017,COG0018,COG0019,COG0022,COG0023,COG0025,COG0026,COG0027,COG0028,COG0029,COG0030,COG0033,COG0035,COG0036,COG0038,COG0039,COG0041,COG0042,COG0043,COG0044,COG0045,COG0046,…,COG5520,COG5524,COG5525,COG5531,COG5542,COG5553,COG5554,COG5555,COG5557,COG5565,COG5569,COG5570,COG5571,COG5573,COG5581,COG5590,COG5592,COG5597,COG5598,COG5605,COG5606,COG5610,COG5611,COG5615,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,phylum,class,order,family,genus
str,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str
"""GB_GCA_000010565.1""",0.0,0.0,1,1,0,1,2,1,1,1,1,2,1,1,1,1,1,0,0,0,0,0,2,1,1,0,0,1,0,1,1,1,1,1,1,0,…,0,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum"""
"""GB_GCA_000010565.1""",0.0,0.1,3,1,0,2,2,1,1,1,2,4,1,1,1,2,2,0,0,0,0,0,2,1,1,0,0,1,0,1,2,1,1,1,1,0,…,0,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum"""
"""GB_GCA_000010565.1""",0.0,0.2,1,1,1,2,2,1,1,1,1,2,1,1,1,1,1,0,0,1,0,0,5,2,2,0,1,1,0,2,3,1,1,1,1,0,…,0,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum"""
"""GB_GCA_000010565.1""",0.0,0.3,2,1,0,3,3,1,2,1,2,2,2,5,2,1,2,0,0,0,0,0,3,2,2,1,0,2,0,1,1,2,1,1,1,0,…,0,0,1,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum"""
"""GB_GCA_000010565.1""",0.0,0.4,3,1,1,4,2,1,1,2,2,6,1,1,1,4,3,0,0,0,1,1,3,1,2,0,2,2,1,1,1,2,1,2,3,0,…,1,0,0,0,1,0,0,0,2,0,0,0,0,3,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""RS_GCF_903970205.1""",0.5,0.1,3,0,0,3,0,1,0,0,2,1,0,0,1,1,1,1,0,1,1,0,2,0,1,0,0,0,2,0,0,1,0,0,1,1,…,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113"""
"""RS_GCF_903970205.1""",0.5,0.2,0,2,0,0,0,1,1,1,1,0,0,1,2,3,1,3,0,1,1,0,2,0,2,0,0,0,2,1,0,1,0,2,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113"""
"""RS_GCF_903970205.1""",0.5,0.3,3,1,0,0,0,0,1,0,0,0,2,0,0,1,2,0,1,2,2,0,0,1,0,1,0,1,2,0,0,3,1,3,1,0,…,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113"""
"""RS_GCF_903970205.1""",0.5,0.4,2,2,0,2,1,1,0,2,1,2,1,1,3,2,0,0,1,2,1,1,5,1,2,0,0,1,2,3,3,2,0,2,3,0,…,2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,0,0,1,1,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113"""


In [6]:
# Read y: at this point `y` still holds the path of the label table, which is
# parsed as tab-separated text.
y0 = pl.read_csv(y, separator="\t")
# There are some duplicates in the cyanos, so dedup.
# NOTE(review): unique() only removes fully identical rows; an accession
# listed with two different labels would survive - confirm none exist.
y1 = y0.unique()

print("Read y: ", y1.shape)
# print() does not interpolate "%s" placeholders, so the table is passed as a
# separate argument and placed on its own line.
print(
    "Counts of each class amongst unique accessions:",
    y1.group_by(target_column).agg(pl.len()),
    sep="\n",
)

y1

Read y:  (3161, 2)
Counts of each class amongst unique accessions: %s shape: (2, 2)
┌──────────────┬──────┐
│ oxytolerance ┆ len  │
│ ---          ┆ ---  │
│ str          ┆ u32  │
╞══════════════╪══════╡
│ anaerobe     ┆ 1055 │
│ aerobe       ┆ 2106 │
└──────────────┴──────┘


accession,oxytolerance
str,str
"""RS_GCF_000262405.1""","""aerobe"""
"""GB_GCA_003265685.1""","""aerobe"""
"""RS_GCF_000828125.2""","""aerobe"""
"""RS_GCF_001719615.1""","""aerobe"""
"""RS_GCF_014201655.1""","""anaerobe"""
…,…
"""RS_GCF_014109725.1""","""aerobe"""
"""GB_GCA_014640095.1""","""aerobe"""
"""RS_GCF_000381525.1""","""anaerobe"""
"""GB_GCA_014641075.1""","""aerobe"""


In [7]:
# Inner join because test accessions are in y1 but not in d2; rows of d2
# without a label are dropped as well.
d3 = d2.join(y1, on="accession", how="inner")

# print() does not interpolate "%s" placeholders, so the table is passed as a
# separate argument and placed on its own line.
print(
    "Counts of each class in training/test data:",
    d3.group_by(target_column).agg(pl.len()),
    sep="\n",
)

d3

Counts of each class in training/test data: %s shape: (2, 2)
┌──────────────┬───────┐
│ oxytolerance ┆ len   │
│ ---          ┆ ---   │
│ str          ┆ u32   │
╞══════════════╪═══════╡
│ anaerobe     ┆ 29808 │
│ aerobe       ┆ 58356 │
└──────────────┴───────┘


accession,false_negative_rate,false_positive_rate,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,COG0015,COG0016,COG0017,COG0018,COG0019,COG0022,COG0023,COG0025,COG0026,COG0027,COG0028,COG0029,COG0030,COG0033,COG0035,COG0036,COG0038,COG0039,COG0041,COG0042,COG0043,COG0044,COG0045,COG0046,…,COG5524,COG5525,COG5531,COG5542,COG5553,COG5554,COG5555,COG5557,COG5565,COG5569,COG5570,COG5571,COG5573,COG5581,COG5590,COG5592,COG5597,COG5598,COG5605,COG5606,COG5610,COG5611,COG5615,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,phylum,class,order,family,genus,oxytolerance
str,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str
"""GB_GCA_000010565.1""",0.0,0.0,1,1,0,1,2,1,1,1,1,2,1,1,1,1,1,0,0,0,0,0,2,1,1,0,0,1,0,1,1,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.1,3,1,0,2,2,1,1,1,2,4,1,1,1,2,2,0,0,0,0,0,2,1,1,0,0,1,0,1,2,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.2,1,1,1,2,2,1,1,1,1,2,1,1,1,1,1,0,0,1,0,0,5,2,2,0,1,1,0,2,3,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.3,2,1,0,3,3,1,2,1,2,2,2,5,2,1,2,0,0,0,0,0,3,2,2,1,0,2,0,1,1,2,1,1,1,0,…,0,1,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.4,3,1,1,4,2,1,1,2,2,6,1,1,1,4,3,0,0,0,1,1,3,1,2,0,2,2,1,1,1,2,1,2,3,0,…,0,0,0,1,0,0,0,2,0,0,0,0,3,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""RS_GCF_903970205.1""",0.5,0.1,3,0,0,3,0,1,0,0,2,1,0,0,1,1,1,1,0,1,1,0,2,0,1,0,0,0,2,0,0,1,0,0,1,1,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""
"""RS_GCF_903970205.1""",0.5,0.2,0,2,0,0,0,1,1,1,1,0,0,1,2,3,1,3,0,1,1,0,2,0,2,0,0,0,2,1,0,1,0,2,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""
"""RS_GCF_903970205.1""",0.5,0.3,3,1,0,0,0,0,1,0,0,0,2,0,0,1,2,0,1,2,2,0,0,1,0,1,0,1,2,0,0,3,1,3,1,0,…,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""
"""RS_GCF_903970205.1""",0.5,0.4,2,2,0,2,1,1,0,2,1,2,1,1,3,2,0,0,1,2,1,1,5,1,2,0,0,1,2,3,3,2,0,2,3,0,…,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,0,0,1,1,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""


In [8]:
# Display the joined table again (notebook output only; nothing is modified).
d3

accession,false_negative_rate,false_positive_rate,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,COG0015,COG0016,COG0017,COG0018,COG0019,COG0022,COG0023,COG0025,COG0026,COG0027,COG0028,COG0029,COG0030,COG0033,COG0035,COG0036,COG0038,COG0039,COG0041,COG0042,COG0043,COG0044,COG0045,COG0046,…,COG5524,COG5525,COG5531,COG5542,COG5553,COG5554,COG5555,COG5557,COG5565,COG5569,COG5570,COG5571,COG5573,COG5581,COG5590,COG5592,COG5597,COG5598,COG5605,COG5606,COG5610,COG5611,COG5615,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,phylum,class,order,family,genus,oxytolerance
str,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,str
"""GB_GCA_000010565.1""",0.0,0.0,1,1,0,1,2,1,1,1,1,2,1,1,1,1,1,0,0,0,0,0,2,1,1,0,0,1,0,1,1,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.1,3,1,0,2,2,1,1,1,2,4,1,1,1,2,2,0,0,0,0,0,2,1,1,0,0,1,0,1,2,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.2,1,1,1,2,2,1,1,1,1,2,1,1,1,1,1,0,0,1,0,0,5,2,2,0,1,1,0,2,3,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.3,2,1,0,3,3,1,2,1,2,2,2,5,2,1,2,0,0,0,0,0,3,2,2,1,0,2,0,1,1,2,1,1,1,0,…,0,1,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
"""GB_GCA_000010565.1""",0.0,0.4,3,1,1,4,2,1,1,2,2,6,1,1,1,4,3,0,0,0,1,1,3,1,2,0,2,2,1,1,1,2,1,2,3,0,…,0,0,0,1,0,0,0,2,0,0,0,0,3,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""","""anaerobe"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""RS_GCF_903970205.1""",0.5,0.1,3,0,0,3,0,1,0,0,2,1,0,0,1,1,1,1,0,1,1,0,2,0,1,0,0,0,2,0,0,1,0,0,1,1,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""
"""RS_GCF_903970205.1""",0.5,0.2,0,2,0,0,0,1,1,1,1,0,0,1,2,3,1,3,0,1,1,0,2,0,2,0,0,0,2,1,0,1,0,2,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""
"""RS_GCF_903970205.1""",0.5,0.3,3,1,0,0,0,0,1,0,0,0,2,0,0,1,2,0,1,2,2,0,0,1,0,1,0,1,2,0,0,3,1,3,1,0,…,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""
"""RS_GCF_903970205.1""",0.5,0.4,2,2,0,2,1,1,0,2,1,2,1,1,3,2,0,0,1,2,1,1,5,1,2,0,0,1,2,3,3,2,0,2,3,0,…,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,0,0,1,1,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""","""aerobe"""


In [9]:
# Integer encoding of the two label strings.
classes_map = {
    'anaerobe': 0,
    'aerobe': 1,
}

# Copy of d3 whose label column is integer-encoded. replace_strict raises on
# any label missing from classes_map, so an unexpected class fails here
# instead of being turned into a placeholder string that only breaks in a
# later integer cast. return_dtype keeps the column as Int32.
y = d3.with_columns(
    pl.col(target_column).replace_strict(classes_map, return_dtype=pl.Int32)
)

# print() does not interpolate "%s" placeholders, so the table is passed as a
# separate argument and placed on its own line.
print("Counts of y:", y.group_by(target_column).agg(pl.len()), sep="\n")

y

Counts of y: %s shape: (2, 2)
┌──────────────┬───────┐
│ oxytolerance ┆ len   │
│ ---          ┆ ---   │
│ i32          ┆ u32   │
╞══════════════╪═══════╡
│ 1            ┆ 58356 │
│ 0            ┆ 29808 │
└──────────────┴───────┘


accession,false_negative_rate,false_positive_rate,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,COG0015,COG0016,COG0017,COG0018,COG0019,COG0022,COG0023,COG0025,COG0026,COG0027,COG0028,COG0029,COG0030,COG0033,COG0035,COG0036,COG0038,COG0039,COG0041,COG0042,COG0043,COG0044,COG0045,COG0046,…,COG5524,COG5525,COG5531,COG5542,COG5553,COG5554,COG5555,COG5557,COG5565,COG5569,COG5570,COG5571,COG5573,COG5581,COG5590,COG5592,COG5597,COG5598,COG5605,COG5606,COG5610,COG5611,COG5615,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,phylum,class,order,family,genus,oxytolerance
str,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str,str,str,i32
"""GB_GCA_000010565.1""",0.0,0.0,1,1,0,1,2,1,1,1,1,2,1,1,1,1,1,0,0,0,0,0,2,1,1,0,0,1,0,1,1,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""",0
"""GB_GCA_000010565.1""",0.0,0.1,3,1,0,2,2,1,1,1,2,4,1,1,1,2,2,0,0,0,0,0,2,1,1,0,0,1,0,1,2,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""",0
"""GB_GCA_000010565.1""",0.0,0.2,1,1,1,2,2,1,1,1,1,2,1,1,1,1,1,0,0,1,0,0,5,2,2,0,1,1,0,2,3,1,1,1,1,0,…,0,0,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""",0
"""GB_GCA_000010565.1""",0.0,0.3,2,1,0,3,3,1,2,1,2,2,2,5,2,1,2,0,0,0,0,0,3,2,2,1,0,2,0,1,1,2,1,1,1,0,…,0,1,0,1,0,0,0,1,0,0,0,0,3,2,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""",0
"""GB_GCA_000010565.1""",0.0,0.4,3,1,1,4,2,1,1,2,2,6,1,1,1,4,3,0,0,0,1,1,3,1,2,0,2,2,1,1,1,2,1,2,3,0,…,0,0,0,1,0,0,0,2,0,0,0,0,3,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,0,1,"""p__Firmicutes_B""","""c__Desulfotomaculia""","""o__Desulfotomaculales""","""f__Pelotomaculaceae""","""g__Pelotomaculum""",0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""RS_GCF_903970205.1""",0.5,0.1,3,0,0,3,0,1,0,0,2,1,0,0,1,1,1,1,0,1,1,0,2,0,1,0,0,0,2,0,0,1,0,0,1,1,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""",1
"""RS_GCF_903970205.1""",0.5,0.2,0,2,0,0,0,1,1,1,1,0,0,1,2,3,1,3,0,1,1,0,2,0,2,0,0,0,2,1,0,1,0,2,1,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""",1
"""RS_GCF_903970205.1""",0.5,0.3,3,1,0,0,0,0,1,0,0,0,2,0,0,1,2,0,1,2,2,0,0,1,0,1,0,1,2,0,0,3,1,3,1,0,…,0,0,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""",1
"""RS_GCF_903970205.1""",0.5,0.4,2,2,0,2,1,1,0,2,1,2,1,1,3,2,0,0,1,2,1,1,5,1,2,0,0,1,2,3,3,2,0,2,3,0,…,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1,0,0,0,2,0,0,0,0,0,1,1,"""p__Acidobacteriota""","""c__Acidobacteriae""","""o__Bryobacterales""","""f__Bryobacteraceae""","""g__Bog-113""",1


In [10]:
# Keep only the encoded label column, as a pandas Series. Selecting it by
# name avoids converting the whole wide frame to pandas and does not depend
# on the label happening to be the last column.
y = y.get_column(target_column).to_pandas()

y

0        0
1        0
2        0
3        0
4        0
        ..
88159    1
88160    1
88161    1
88162    1
88163    1
Name: oxytolerance, Length: 88164, dtype: int32

In [11]:
# One family name per row of d3, in row order; stored on X as the 'group'
# column further down.
groups = d3.get_column('family').to_list()

# Number of rows per family.
Counter(groups)

Counter({'f__Flavobacteriaceae': 4464,
         'f__Enterobacteriaceae': 2772,
         'f__Microbacteriaceae': 2304,
         'f__Sphingomonadaceae': 1764,
         'f__Rhizobiaceae': 1656,
         'f__Alteromonadaceae': 1620,
         'f__Micrococcaceae': 1548,
         'f__Paenibacillaceae': 1368,
         'f__Mycobacteriaceae': 1332,
         'f__Acetobacteraceae': 1224,
         'f__Chitinophagaceae': 1188,
         'f__Cyclobacteriaceae': 1188,
         'f__Streptosporangiaceae': 1188,
         'f__Xanthomonadaceae': 1080,
         'f__Spirosomaceae': 972,
         'f__Amphibacillaceae': 900,
         'f__Planococcaceae': 792,
         'f__Haloferacaceae': 792,
         'f__Halomonadaceae': 720,
         'f__Sphingobacteriaceae': 720,
         'f__Desulfovibrionaceae': 720,
         'f__Pseudomonadaceae': 720,
         'f__Rhodanobacteraceae': 684,
         'f__Methanobacteriaceae': 684,
         'f__Cellulomonadaceae': 684,
         'f__Beijerinckiaceae': 648,
         'f__Xant

In [12]:
# pandas copy of the joined polars frame d3 (all of its columns are kept).
d_gtdb = d3.to_pandas()

d_gtdb

Unnamed: 0,accession,false_negative_rate,false_positive_rate,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,...,COG5643,COG5650,COG5652,COG5663,phylum,class,order,family,genus,oxytolerance
0,GB_GCA_000010565.1,0.0,0.0,1,1,0,1,2,1,1,...,0,0,0,1,p__Firmicutes_B,c__Desulfotomaculia,o__Desulfotomaculales,f__Pelotomaculaceae,g__Pelotomaculum,anaerobe
1,GB_GCA_000010565.1,0.0,0.1,3,1,0,2,2,1,1,...,0,0,0,1,p__Firmicutes_B,c__Desulfotomaculia,o__Desulfotomaculales,f__Pelotomaculaceae,g__Pelotomaculum,anaerobe
2,GB_GCA_000010565.1,0.0,0.2,1,1,1,2,2,1,1,...,0,0,1,1,p__Firmicutes_B,c__Desulfotomaculia,o__Desulfotomaculales,f__Pelotomaculaceae,g__Pelotomaculum,anaerobe
3,GB_GCA_000010565.1,0.0,0.3,2,1,0,3,3,1,2,...,0,0,0,1,p__Firmicutes_B,c__Desulfotomaculia,o__Desulfotomaculales,f__Pelotomaculaceae,g__Pelotomaculum,anaerobe
4,GB_GCA_000010565.1,0.0,0.4,3,1,1,4,2,1,1,...,0,0,0,1,p__Firmicutes_B,c__Desulfotomaculia,o__Desulfotomaculales,f__Pelotomaculaceae,g__Pelotomaculum,anaerobe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,RS_GCF_903970205.1,0.5,0.1,3,0,0,3,0,1,0,...,0,0,0,0,p__Acidobacteriota,c__Acidobacteriae,o__Bryobacterales,f__Bryobacteraceae,g__Bog-113,aerobe
88160,RS_GCF_903970205.1,0.5,0.2,0,2,0,0,0,1,1,...,0,0,1,0,p__Acidobacteriota,c__Acidobacteriae,o__Bryobacterales,f__Bryobacteraceae,g__Bog-113,aerobe
88161,RS_GCF_903970205.1,0.5,0.3,3,1,0,0,0,0,1,...,0,0,0,0,p__Acidobacteriota,c__Acidobacteriae,o__Bryobacterales,f__Bryobacteraceae,g__Bog-113,aerobe
88162,RS_GCF_903970205.1,0.5,0.4,2,2,0,2,1,1,0,...,0,0,1,1,p__Acidobacteriota,c__Acidobacteriae,o__Bryobacterales,f__Bryobacteraceae,g__Bog-113,aerobe


In [13]:
# Columns of d3 that are identifiers, taxonomy, rate columns or the target,
# i.e. everything that must not be fed to the model as a feature.
non_feature_columns = [
    'accession',
    target_column,
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'false_negative_rate',
    'false_positive_rate',
]

# Blacklist these as they aren't in the current ancestral file, not sure why
cogs_missing_from_ancestral_file = [
    'COG0411',
    'COG0459',
    'COG0564',
    'COG1344',
    'COG4177',
]

# Feature matrix: every remaining column of d3, minus the blacklisted COGs.
X = (
    d3.select(pl.exclude(non_feature_columns))
    .to_pandas()
    .drop(columns=cogs_missing_from_ancestral_file)
)
X

Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5611,COG5615,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663
0,1,1,0,1,2,1,1,1,1,2,...,0,0,0,0,2,0,0,0,0,1
1,3,1,0,2,2,1,1,1,2,4,...,0,0,0,0,2,0,0,0,0,1
2,1,1,1,2,2,1,1,1,1,2,...,0,0,0,0,2,0,0,0,1,1
3,2,1,0,3,3,1,2,1,2,2,...,0,0,0,0,3,0,0,0,0,1
4,3,1,1,4,2,1,1,2,2,6,...,0,0,0,0,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,0,0
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
88162,2,2,0,2,1,1,0,2,1,2,...,0,0,2,0,0,0,0,0,1,1


## Model

In [14]:
# Number of parallel workers for XGBoost, taken from the notebook-wide setting.
n_jobs = threads

# Build the two steps separately, then chain them. make_pipeline names each
# step after its lower-cased class name, so the classifier remains reachable
# as pipe['xgbclassifier'].
feature_scaler = MaxAbsScaler()
xgb_classifier = XGBClassifier(use_label_encoder=False, n_jobs=n_jobs)

pipe = make_pipeline(feature_scaler, xgb_classifier)

In [15]:
# Show the assembled pipeline (the notebook renders a cell's last expression).
pipe

## Self-made

In [17]:
# Requested number of rows per class (0 / 1) for each sub-sampled data set.
# Ratio keys read "<share of class 0>-<share of class 1>" in percent;
# 'orig_proc' keeps the full class counts, '(500-500)' is a tiny balanced set.
proportions_to_balance = {
    'orig_proc': {0: 29808, 1: 58356},
    '30-70': {0: 25010, 1: 58356},
    '40-60': {0: 29808, 1: 44712},
    '50-50': {0: 29808, 1: 29808},
    '60-40': {0: 29808, 1: 19872},
    '70-30': {0: 29808, 1: 12775},
    '(500-500)': {0: 500, 1: 500},
}

# Attach the cross-validation group of every row to the feature table so the
# group travels with the row through any later row selection.
X['group'] = groups
cur_groups = X['group']

# Registry of data sets: name -> (features, target, groups).
# 'orig_init' is the untouched baseline; the features exclude the helper column.
X_y_dict = {'orig_init': (X.drop(columns='group'), y, cur_groups)}
X_y_dict

{'orig_init': (       COG0001  COG0002  COG0003  COG0004  COG0005  COG0007  COG0008  COG0010  \
  0            1        1        0        1        2        1        1        1   
  1            3        1        0        2        2        1        1        1   
  2            1        1        1        2        2        1        1        1   
  3            2        1        0        3        3        1        2        1   
  4            3        1        1        4        2        1        1        2   
  ...        ...      ...      ...      ...      ...      ...      ...      ...   
  88159        3        0        0        3        0        1        0        0   
  88160        0        2        0        0        0        1        1        1   
  88161        3        1        0        0        0        0        1        0   
  88162        2        2        0        2        1        1        0        2   
  88163        1        1        1        3        0        1        1    

In [18]:
# Build one sub-sampled (features, target, groups) triple per entry of
# proportions_to_balance and register it in X_y_dict under the same key.
#
# The labelled table is a copy created once, outside the loop: X itself is
# never modified, so a 'target' column cannot leak into feature tables that
# are later derived from X.
df = X.assign(target=y)

for key, value in proportions_to_balance.items():
    print(f'\t {key, value}')

    # Take the first value[c] rows of class c, in table order.
    # NOTE(review): head() is deterministic but not a random sample -- rows
    # near the end of the table are never selected for the smaller sets.
    # Confirm this is intended before switching to DataFrame.sample().
    df_0 = df[df['target'] == 0].head(value[0])
    df_1 = df[df['target'] == 1].head(value[1])

    display(df)
    display(df_0)
    display(df_1)

    # Concatenate WITHOUT ignore_index so every sampled row keeps its original
    # row label and can still be traced back to its row in the source table.
    df_sampled = pd.concat([df_0, df_1])

    # Split the sampled table back into target, groups and pure features.
    y_sampled = df_sampled['target']
    cur_groups = df_sampled['group']
    X_sampled = df_sampled.drop(columns=['target', 'group'])

    # f-string instead of print("... %s", obj), which printed a literal "%s".
    print(f"\tCounts of y after balancing: {Counter(y_sampled)}")

    X_y_dict[key] = (X_sampled, y_sampled, cur_groups)

	 ('orig_proc', {0: 29808, 1: 58356})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88051,1,1,1,0,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88052,1,1,1,2,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88053,1,2,1,1,1,1,0,1,2,1,...,0,0,0,0,0,0,1,0,f__Selenomonadaceae,0
88054,2,1,1,0,0,0,1,3,3,0,...,0,0,0,0,0,1,1,0,f__Selenomonadaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


	Counts of y after balancing: %s Counter({1: 58356, 0: 29808})
	 ('30-70', {0: 25010, 1: 58356})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75297,3,0,0,1,0,1,2,2,0,3,...,0,0,0,0,0,0,3,0,f__Peptostreptococcaceae,0
75298,1,1,0,3,0,0,1,0,2,2,...,0,0,0,0,0,0,2,0,f__Peptostreptococcaceae,0
75299,2,0,0,1,2,1,0,1,2,4,...,0,0,0,0,0,0,4,0,f__Peptostreptococcaceae,0
75300,0,0,0,0,0,0,1,0,0,2,...,0,0,0,0,0,0,2,0,f__Peptostreptococcaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


	Counts of y after balancing: %s Counter({1: 58356, 0: 25010})
	 ('40-60', {0: 29808, 1: 44712})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88051,1,1,1,0,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88052,1,1,1,2,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88053,1,2,1,1,1,1,0,1,2,1,...,0,0,0,0,0,0,1,0,f__Selenomonadaceae,0
88054,2,1,1,0,0,0,1,3,3,0,...,0,0,0,0,0,1,1,0,f__Selenomonadaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68359,1,0,0,2,0,0,0,0,1,2,...,1,0,0,0,0,0,0,0,f__Halomonadaceae,1
68360,1,2,1,2,0,0,0,2,1,0,...,1,0,0,0,0,0,0,0,f__Halomonadaceae,1
68361,1,1,1,2,0,0,0,3,1,1,...,1,0,0,0,0,0,0,0,f__Halomonadaceae,1
68362,1,4,1,2,0,0,0,0,1,1,...,0,0,0,0,0,0,0,1,f__Halomonadaceae,1


	Counts of y after balancing: %s Counter({1: 44712, 0: 29808})
	 ('50-50', {0: 29808, 1: 29808})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88051,1,1,1,0,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88052,1,1,1,2,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88053,1,2,1,1,1,1,0,1,2,1,...,0,0,0,0,0,0,1,0,f__Selenomonadaceae,0
88054,2,1,1,0,0,0,1,3,3,0,...,0,0,0,0,0,1,1,0,f__Selenomonadaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49171,0,1,0,2,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,f__Vibrionaceae,1
49172,0,2,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,f__Vibrionaceae,1
49173,3,0,1,1,0,0,0,0,1,1,...,0,0,0,0,0,0,2,0,f__Vibrionaceae,1
49174,0,1,0,4,0,0,0,2,2,2,...,1,0,0,0,0,0,0,0,f__Vibrionaceae,1


	Counts of y after balancing: %s Counter({0: 29808, 1: 29808})
	 ('60-40', {0: 29808, 1: 19872})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88051,1,1,1,0,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88052,1,1,1,2,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88053,1,2,1,1,1,1,0,1,2,1,...,0,0,0,0,0,0,1,0,f__Selenomonadaceae,0
88054,2,1,1,0,0,0,1,3,3,0,...,0,0,0,0,0,1,1,0,f__Selenomonadaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34123,1,0,0,1,0,0,0,1,2,0,...,0,0,0,0,0,0,0,0,f__Flavobacteriaceae,1
34124,0,0,0,2,0,0,0,2,0,1,...,0,0,0,0,0,0,2,0,f__Flavobacteriaceae,1
34125,0,2,1,1,0,1,1,0,2,0,...,0,0,0,0,0,0,2,0,f__Flavobacteriaceae,1
34126,0,3,0,2,0,0,1,0,1,2,...,0,0,0,0,0,0,1,0,f__Flavobacteriaceae,1


	Counts of y after balancing: %s Counter({0: 29808, 1: 19872})
	 ('70-30', {0: 29808, 1: 12775})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88051,1,1,1,0,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88052,1,1,1,2,1,0,1,1,1,1,...,0,0,0,0,0,0,0,0,f__Selenomonadaceae,0
88053,1,2,1,1,1,1,0,1,2,1,...,0,0,0,0,0,0,1,0,f__Selenomonadaceae,0
88054,2,1,1,0,0,0,1,3,3,0,...,0,0,0,0,0,1,1,0,f__Selenomonadaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22598,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,f__Alteromonadaceae,1
22599,1,3,2,3,0,1,0,0,1,4,...,1,0,0,0,0,0,0,0,f__Alteromonadaceae,1
22600,0,2,0,3,2,1,1,4,2,1,...,0,0,0,0,0,0,0,1,f__Alteromonadaceae,1
22601,2,3,0,1,0,2,2,4,2,1,...,0,0,0,0,0,0,0,0,f__Alteromonadaceae,1


	Counts of y after balancing: %s Counter({0: 29808, 1: 12775})
	 ('(500-500)', {0: 500, 1: 500})


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88159,3,0,0,3,0,1,0,0,2,1,...,0,0,0,0,0,0,0,0,f__Bryobacteraceae,1
88160,0,2,0,0,0,1,1,1,1,0,...,0,0,0,0,0,0,1,0,f__Bryobacteraceae,1
88161,3,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,f__Bryobacteraceae,1
88162,2,2,0,2,1,1,0,2,1,2,...,2,0,0,0,0,0,1,1,f__Bryobacteraceae,1


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
0,1,1,0,1,2,1,1,1,1,2,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
1,3,1,0,2,2,1,1,1,2,4,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
2,1,1,1,2,2,1,1,1,1,2,...,0,0,2,0,0,0,1,1,f__Pelotomaculaceae,0
3,2,1,0,3,3,1,2,1,2,2,...,0,0,3,0,0,0,0,1,f__Pelotomaculaceae,0
4,3,1,1,4,2,1,1,2,2,6,...,0,0,2,0,0,0,0,1,f__Pelotomaculaceae,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1791,1,1,1,0,1,0,1,1,2,2,...,0,0,1,0,0,0,2,0,f__Coriobacteriaceae,0
1792,1,2,0,3,0,0,1,1,1,1,...,0,0,1,0,0,0,1,0,f__Coriobacteriaceae,0
1793,0,2,1,3,0,0,1,0,2,1,...,0,0,1,0,0,0,0,0,f__Coriobacteriaceae,0
1794,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,f__Coriobacteriaceae,0


Unnamed: 0,COG0001,COG0002,COG0003,COG0004,COG0005,COG0007,COG0008,COG0010,COG0012,COG0013,...,COG5621,COG5625,COG5632,COG5640,COG5643,COG5650,COG5652,COG5663,group,target
36,1,1,1,3,3,0,1,1,1,2,...,0,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
37,1,1,2,3,3,0,1,1,1,2,...,0,0,2,1,0,0,0,0,f__Herpetosiphonaceae,1
38,1,2,1,3,3,1,2,1,1,2,...,1,0,1,1,0,0,0,0,f__Herpetosiphonaceae,1
39,1,1,4,4,3,0,1,1,3,3,...,0,0,1,1,0,0,1,0,f__Herpetosiphonaceae,1
40,1,1,1,6,3,2,1,1,2,2,...,0,0,2,1,0,0,0,1,f__Herpetosiphonaceae,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891,2,1,3,2,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,f__Geitlerinemaceae,1
892,1,1,4,1,3,1,0,1,1,1,...,0,0,0,0,0,0,1,0,f__Geitlerinemaceae,1
893,1,3,2,2,0,1,0,1,2,1,...,1,0,0,0,0,0,0,0,f__Geitlerinemaceae,1
894,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,f__Geitlerinemaceae,1


	Counts of y after balancing: %s Counter({0: 500, 1: 500})


In [19]:
# Show the pipeline again before training (rendered as the cell's output).
pipe

In [20]:
# For every data set in X_y_dict:
#   1. run grouped 5-fold cross-validation, writing one tab-separated file of
#      per-row predictions per fold into cross_validation_data_output_dir;
#   2. refit the pipeline on the whole data set and pickle the classifier step
#      into model_output_dir.
gkf = GroupKFold(n_splits=5)
model_name = 'xgb'

for key, value in X_y_dict.items():
    print(f'\n>>{key}<<')

    (X, y, groups) = value

    for i, (train, test) in enumerate(gkf.split(X, y, groups=groups)):
        print(test, len(test), len(train))
        print(f"Fold {i}, Training model {model_name} ..")

        pipe.fit(X.iloc[train], y.iloc[train])

        X_test = X.iloc[test]
        y_actual = y.iloc[test]
        y_pred = pipe.predict(X_test)
        pp = pipe.predict_proba(X_test)

        # NOTE(review): assumes classes_map lists the classes in the same
        # order as the columns returned by predict_proba -- confirm.
        df1 = pd.DataFrame(
            pp,
            columns=[f"probability_{classes_map[k]}" for k in classes_map.keys()],
        )
        df1['prediction'] = y_pred

        # `train`/`test` are positions within X, whereas d_gtdb is addressed by
        # row label. Translate positions into X's own row labels before the
        # lookup; using the positions directly is only right when X has an
        # untouched 0..n-1 index. The metadata is correct as long as X still
        # carries the row labels of d_gtdb.
        test_labels = X.index[test]
        df1['accession'] = d_gtdb.loc[test_labels, 'accession'].values
        # to_list() so the values are assigned by position; assigning the
        # Series itself would align on its index, which differs from df1's.
        df1['y_actual'] = y_actual.to_list()
        df1['false_negative_rate'] = d_gtdb.loc[test_labels, 'false_negative_rate'].values
        df1['false_positive_rate'] = d_gtdb.loc[test_labels, 'false_positive_rate'].values
        df1['predictor'] = model_name

        # Tab-separated summary line: fold, model, accuracy.
        print("\t".join([
            str(i),
            model_name,
            str(accuracy_score(y_actual, y_pred)),
        ]))

        df1.to_csv(
            f'{cross_validation_data_output_dir}/prediction_probabilities_{model_name}_{i}_{key}.csv',
            index=False,
            sep="\t",
            header=True,
        )

    # Generate final predictors that include no cross-validation removal of samples
    print("Creating final predictor")

    print(f"Creating non-cross-validation predictor for {model_name}")
    pipe.fit(X, y)

    # NOTE(review): only the classifier step is pickled; the fitted
    # MaxAbsScaler is not saved with it. Confirm that whatever loads this
    # model applies equivalent scaling to its inputs.
    model_filename = os.path.join(model_output_dir, f'{model_name}_{key}.model')
    with open(model_filename, 'wb') as f:
        pickle.dump(pipe['xgbclassifier'], f)

    print("Done")


>>orig_init<<
[    0     1     2 ... 88089 88090 88091] 17640 70524
Fold 0, Training model xgb ..
0	xgb	0.9667233560090703
[   36    37    38 ... 88017 88018 88019] 17640 70524
Fold 1, Training model xgb ..
1	xgb	0.9523242630385488
[  144   145   146 ... 88125 88126 88127] 17640 70524
Fold 2, Training model xgb ..
2	xgb	0.9777210884353742
[  108   109   110 ... 87837 87838 87839] 17640 70524
Fold 3, Training model xgb ..
3	xgb	0.9547619047619048
[   72    73    74 ... 88161 88162 88163] 17604 70560
Fold 4, Training model xgb ..
4	xgb	0.9559191092933425
Creating final predictor
Creating non-cross-validation predictor for xgb
Done

>>orig_proc<<
[    0     1     2 ... 88089 88090 88091] 17640 70524
Fold 0, Training model xgb ..
0	xgb	0.9667233560090703
[  648   649   650 ... 88017 88018 88019] 17640 70524
Fold 1, Training model xgb ..
1	xgb	0.9523242630385488
[  108   109   110 ... 88125 88126 88127] 17640 70524
Fold 2, Training model xgb ..
2	xgb	0.9777210884353742
[   72    73    74 .