In [1]:
import pandas as pd
import os

In [2]:
def get_gtdbtk_failed_id(file_path):
    df = pd.read_csv(file_path, sep='\t', names=["bin_id", "gtdbtk_filtered_comment"])
    return df

df = pd.concat([
    get_gtdbtk_failed_id("lineages_umgs/gtdbtk.bac120.filtered.tsv"),
    get_gtdbtk_failed_id("lineages_kmgs/gtdbtk.bac120.filtered.tsv"),
    get_gtdbtk_failed_id("lineages_kmgs/gtdbtk.ar122.filtered.tsv")
])

In [3]:
stats = pd.read_csv("/ldfssz1/ST_META/share/User/zhujie/database/ehomd/ehomd_fna_seqkit_stats.tsv", sep='\t', skiprows=[1, 2])
stats["bin_id"] = stats.file.apply(lambda x: os.path.basename(x).rpartition(".")[0])

stats_e = pd.read_csv("/ldfssz1/ST_META/share/User/zhujie/database/ehomd/ehomd_seqid_info.tsv",
                      sep='\t', thousands=',')\
            .rename(columns={"SEQF_ID": "bin_id"})
#stats_e.head()

In [4]:
stats_df = stats.merge(stats_e).loc[:, ["bin_id", "Contigs", "Size_bp", "num_seqs", "sum_len"]]
stats_df["diff"] = stats_df["Size_bp"] - stats_df["sum_len"]

In [5]:
##

In [6]:
ehomd_issue_dict = {
    "SEQF1005": "d__Bacteria;p__Campylobacterota;c__Campylobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Campylobacter_B;s__Campylobacter_B gracilis",
    "SEQF1020": "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Alloprevotella;s__Alloprevotella rava",
    "SEQF1058": "d__Bacteria;p__Firmicutes_C;c__Negativicutes;o__Veillonellales;f__Veillonellaceae;g__Veillonella;s__Veillonella parvula_A",
    "SEQF1079": "d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Enterococcus;s__Enterococcus faecalis",
    "SEQF1089": "d__Bacteria;p__Campylobacterota;c__Campylobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Campylobacter_A;s__Campylobacter_A rectus",
    "SEQF1091": "d__Bacteria;p__Campylobacterota;c__Campylobacteria;o__Campylobacterales;f__Campylobacteraceae;g__Campylobacter_A;s__Campylobacter_A showae_C",
    "SEQF1108": "d__Bacteria;p__Firmicutes_A;c__Clostridia;o__Peptostreptococcales;f__Anaerovoracaceae;g__Eubacterium_B;s__Eubacterium_B infirmum",
    "SEQF1152": "d__Bacteria;p__Firmicutes;c__Bacilli;o__Erysipelotrichales;f__Erysipelotrichaceae;g__Solobacterium;s__Solobacterium moorei",
    "SEQF1284": "d__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Rhizobiaceae;g__Ochrobactrum;s__Ochrobactrum anthropi",
    "SEQF1468": "d__Bacteria;p__Patescibacteria;c__Saccharimonadia;o__Saccharimonadales;f__Saccharimonadaceae;g__TM7x;s__TM7x sp000803625",
    "SEQF1474": "d__Bacteria;p__Synergistota;c__Synergistia;o__Synergistales;f__Dethiosulfovibrionaceae;g__Pyramidobacter;s__Pyramidobacter piscolens",
    "SEQF1487": "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Pasteurellaceae;g__Haemophilus;s__Haemophilus influenzae",
    "SEQF1707": "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Pasteurellaceae;g__Aggregatibacter;s__Aggregatibacter actinomycetemcomitans",
    "SEQF2006": "d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__Streptococcus salivarius",
    "SEQF2278": "d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Enterococcaceae;g__Enterococcus;s__Enterococcus faecalis" 
}
ehomd_issue_df = pd.DataFrame(list(ehomd_issue_dict.items())).rename(columns={0: "bin_id", 1: "classification"})

In [7]:
###

In [45]:
LINEAGES = ["superkingdom", "phylum", "class", "order", "family", "genus", "species", "strain"]
LINEAGES_DICT = {
    "strain": LINEAGES,
    "species": LINEAGES[0:7],
    "genus": LINEAGES[0:6],
    "family": LINEAGES[0:5],
    "order": LINEAGES[0:4],
    "class": LINEAGES[0:3],
    "phylum": LINEAGES[0:2],
    "superkingdom": LINEAGES[0:1],
}

LEVEL_DICT = {
    "strain": "t",
    "species": "s",
    "genus": "g",
    "family": "f",
    "order": "o",
    "class": "c",
    "phylum": "p",
    "superkingdom": "k"
}


def set_lineages_to(row, key, level):
    lineages_dict = {
        "k": "k__unclassified_" + row["mgs_id"],
        "p": "p__unclassified_" + row["mgs_id"],
        "c": "c__unclassified_" + row["mgs_id"],
        "o": "o__unclassified_" + row["mgs_id"],
        "f": "f__unclassified_" + row["mgs_id"],
        "g": "g__unclassified_" + row["mgs_id"],
        "s": "s__unclassified_" + row["mgs_id"],
        "t": "t__unclassified_" + row["mgs_id"],
    }
    for line in row[key].split(";"):
        lev, tax = line.split("__")
        if lev == "d":
            lev = "k"
        if tax != "":
            lineages_dict[lev] = lev + "__" + tax
            if lev == "s" or lev == "t":
                lineages_dict[lev] = lineages_dict[lev] + "_" + row["mgs_id"]
                
    lineages = []
    for i in LINEAGES_DICT[level]:
        lineages.append(lineages_dict[LEVEL_DICT[i]])
        
    return "|".join(lineages)

In [9]:
def set_bin_id(x, key, save_suffix=False):
    for suffix in [".fna.gz", ".fa.gz", ".fna", ".fa"]:
        if suffix in x[key]:
            if save_suffix:
                return os.path.basename(x[key]).split(suffix)[0] + suffix
            else:
                return os.path.basename(x[key]).split(suffix)[0]
    return os.path.basename(x[key])

def get_what(x, key, sep):
    lineages = x[key].split(";")
    for tax in lineages:
        if sep in tax:
            tax_ = tax.split(sep)[1]
            if tax_ == "":
                return "unclassified"
            else:
                return tax_
    return "unclassified"

In [10]:
mgs_info = pd.read_csv("oral_mgs_representative.tsv", sep='\t')
mgs_tax_old = pd.read_csv("taxonomy_oral_mgs_representative.tsv", sep='\t').rename(columns={"mgs_id": "mgs_id_old"})

In [11]:
mgs_info = mgs_info.merge(mgs_tax_old.loc[:, ["mgs_id_old", "representative", "rep_path"]])
mgs_info["bin_id"] = mgs_info.apply(lambda x: set_bin_id(x, "rep_path", False), axis=1)

In [12]:
mgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic


In [13]:
##

In [14]:
kmgs_bac_gtdb = pd.read_csv("lineages_kmgs/classify/gtdbtk.bac120.summary.tsv", sep='\t')
kmgs_arr_gtdb = pd.read_csv("lineages_kmgs/classify/gtdbtk.ar122.summary.tsv", sep='\t')
umgs_bac_gtdb = pd.read_csv("lineages_umgs/classify/gtdbtk.bac120.summary.tsv", sep='\t')

kmgs_bac_gtdb_ = kmgs_bac_gtdb.loc[:, ["user_genome", "classification"]].rename(columns={"user_genome": "bin_id"})
kmgs_arr_gtdb_ = kmgs_arr_gtdb.loc[:, ["user_genome", "classification"]].rename(columns={"user_genome": "bin_id"})
umgs_bac_gtdb_ = umgs_bac_gtdb.loc[:, ["user_genome", "classification"]].rename(columns={"user_genome": "bin_id"})

In [15]:
sum([len(kmgs_bac_gtdb_), len(kmgs_arr_gtdb_), len(umgs_bac_gtdb_), len(ehomd_issue_df)])

3589

In [16]:
mgs_tax = pd.concat([kmgs_bac_gtdb_, kmgs_arr_gtdb_, umgs_bac_gtdb_, ehomd_issue_df])

In [17]:
len(mgs_tax)

3589

In [18]:
mgs_tax.head()

Unnamed: 0,bin_id,classification
0,RSZYD18187400_A_saliva.metaspades.bin.51,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...
1,SEQF2771,d__Bacteria;p__Spirochaetota;c__Spirochaetia;o...
2,1769.6.patric,d__Bacteria;p__Actinobacteriota;c__Actinobacte...
3,GCF_000972565.1_ASM97256v1_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Staphy...
4,111015.3.patric,d__Bacteria;p__Actinobacteriota;c__Actinobacte...


In [19]:
mgs_info = mgs_info.merge(mgs_tax)

In [20]:
mgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,classification
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,d__Bacteria;p__Campylobacterota;c__Campylobact...
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,d__Bacteria;p__Actinobacteriota;c__Actinobacte...
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...


In [21]:
mgs_info["lineages_superkingdom"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "superkingdom"), axis=1)
mgs_info["lineages_phylum"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "phylum"), axis=1)
mgs_info["lineages_class"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "class"), axis=1)
mgs_info["lineages_order"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "order"), axis=1)
mgs_info["lineages_family"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "family"), axis=1)
mgs_info["lineages_genus"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "genus"), axis=1)
mgs_info["lineages_species"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "species"), axis=1)
mgs_info["lineages_strain"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification", "strain"), axis=1)

In [22]:
mgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,classification,lineages_superkingdom,lineages_phylum,lineages_class,lineages_order,lineages_family,lineages_genus,lineages_species,lineages_strain
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...


In [24]:
sorted(list(mgs_info["lineages_superkingdom"].unique()))

['k__Archaea', 'k__Bacteria']

In [25]:
sorted(list(mgs_info["lineages_phylum"].unique()))

['k__Archaea|p__Euryarchaeota',
 'k__Archaea|p__Nanohaloarchaeota',
 'k__Bacteria|p__Actinobacteriota',
 'k__Bacteria|p__Bacteroidota',
 'k__Bacteria|p__Campylobacterota',
 'k__Bacteria|p__Chloroflexota',
 'k__Bacteria|p__Cyanobacteria',
 'k__Bacteria|p__Desulfobacterota',
 'k__Bacteria|p__Desulfobacterota_A',
 'k__Bacteria|p__Elusimicrobiota',
 'k__Bacteria|p__Firmicutes',
 'k__Bacteria|p__Firmicutes_A',
 'k__Bacteria|p__Firmicutes_B',
 'k__Bacteria|p__Firmicutes_C',
 'k__Bacteria|p__Firmicutes_I',
 'k__Bacteria|p__Fusobacteriota',
 'k__Bacteria|p__Patescibacteria',
 'k__Bacteria|p__Proteobacteria',
 'k__Bacteria|p__Spirochaetota',
 'k__Bacteria|p__Synergistota',
 'k__Bacteria|p__Verrucomicrobiota',
 'k__Bacteria|p__Verrucomicrobiota_A']

In [30]:
phylum_change = {
    'p__Desulfobacterota_A': 'p__Desulfobacterota',
    'p__Firmicutes_A': 'p__Firmicutes',
    'p__Firmicutes_B': 'p__Firmicutes',
    'p__Firmicutes_C': 'p__Firmicutes',
    'p__Firmicutes_I': 'p__Firmicutes',
    'p__Verrucomicrobiota_A': 'p__Verrucomicrobiota'
}

In [26]:
sorted(list(mgs_info["lineages_class"].unique()))

['k__Archaea|p__Euryarchaeota|c__Methanobacteria',
 'k__Archaea|p__Nanohaloarchaeota|c__Nanosalinia',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria',
 'k__Bacteria|p__Actinobacteriota|c__Coriobacteriia',
 'k__Bacteria|p__Bacteroidota|c__Bacteroidia',
 'k__Bacteria|p__Bacteroidota|c__Chlorobia',
 'k__Bacteria|p__Bacteroidota|c__Ignavibacteria',
 'k__Bacteria|p__Campylobacterota|c__Campylobacteria',
 'k__Bacteria|p__Chloroflexota|c__Anaerolineae',
 'k__Bacteria|p__Cyanobacteria|c__Cyanobacteriia',
 'k__Bacteria|p__Desulfobacterota_A|c__Desulfovibrionia',
 'k__Bacteria|p__Desulfobacterota|c__Desulfobulbia',
 'k__Bacteria|p__Elusimicrobiota|c__Endomicrobia',
 'k__Bacteria|p__Firmicutes_A|c__Clostridia',
 'k__Bacteria|p__Firmicutes_B|c__Peptococcia',
 'k__Bacteria|p__Firmicutes_C|c__Negativicutes',
 'k__Bacteria|p__Firmicutes_I|c__Bacilli_A',
 'k__Bacteria|p__Firmicutes|c__Bacilli',
 'k__Bacteria|p__Fusobacteriota|c__Fusobacteriia',
 'k__Bacteria|p__Patescibacteria|c__Gracilibacteria'

In [31]:
class_change = {
    'c__Bacilli_A': 'c__Bacilli'
}

In [32]:
sorted(list(mgs_info["lineages_order"].unique()))

['k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales',
 'k__Archaea|p__Nanohaloarchaeota|c__Nanosalinia|o__unclassified_mgs_3553',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Mycobacteriales',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Propionibacteriales',
 'k__Bacteria|p__Actinobacteriota|c__Coriobacteriia|o__Coriobacteriales',
 'k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Bacteroidales',
 'k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__Flavobacteriales',
 'k__Bacteria|p__Bacteroidota|c__Chlorobia|o__Chlorobiales',
 'k__Bacteria|p__Bacteroidota|c__Ignavibacteria|o__Ignavibacteriales',
 'k__Bacteria|p__Campylobacterota|c__Campylobacteria|o__Campylobacterales',
 'k__Bacteria|p__Chloroflexota|c__Anaerolineae|o__Anaerolineales',
 'k__Bacteria|p__Chloroflexota|c__Anaerolineae|o__Caldilineales',
 'k__Bacteria|p__Cyanobacteria|c__Cyanobacteriia|o__Cyanobacteriales',
 'k__Bacter

In [33]:
order_change = {
    'o__Bacillales_A': 'o__Bacillales'
}

In [28]:
sorted(list(mgs_info["lineages_family"].unique()))

['k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae',
 'k__Archaea|p__Nanohaloarchaeota|c__Nanosalinia|o__unclassified_mgs_3553|f__unclassified_mgs_3553',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Bifidobacteriaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Cellulomonadaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Dermabacteraceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Dermatophilaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Mycobacteriales|f__Mycobacteriaceae',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Pr

In [35]:
family_change = {
    'f__Bacteroidaceae_A' : 'f__Bacteroidaceae',
    'f__Bacillaceae_D': 'f__Bacillaceae',
    'f__Bacillaceae_G': 'f__Bacillaceae'
}

In [29]:
sorted(list(mgs_info["lineages_genus"].unique()))

['k__Archaea|p__Euryarchaeota|c__Methanobacteria|o__Methanobacteriales|f__Methanobacteriaceae|g__Methanobrevibacter_A',
 'k__Archaea|p__Nanohaloarchaeota|c__Nanosalinia|o__unclassified_mgs_3553|f__unclassified_mgs_3553|g__unclassified_mgs_3553',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Actinomyces',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Arcanobacterium',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__F0332',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Mobiluncus',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Pauljensenia',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Bifidobacteriaceae|g__Alloscardovia',
 'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Bifidobacteriaceae|g__Bifidoba

In [36]:
# https://gtdb.ecogenomic.org/faq#family_and_higher_rank_end_with_alpha_suffix

In [37]:
len(sorted(list(mgs_info["lineages_genus"].unique())))

645

In [38]:
change_name = {
    'p__Desulfobacterota_A': 'p__Desulfobacterota',
    'p__Firmicutes_A': 'p__Firmicutes',
    'p__Firmicutes_B': 'p__Firmicutes',
    'p__Firmicutes_C': 'p__Firmicutes',
    'p__Firmicutes_I': 'p__Firmicutes',
    'p__Verrucomicrobiota_A': 'p__Verrucomicrobiota',
    'c__Bacilli_A': 'c__Bacilli',
    'o__Bacillales_A': 'o__Bacillales',
    'f__Bacteroidaceae_A' : 'f__Bacteroidaceae',
    'f__Bacillaceae_D': 'f__Bacillaceae',
    'f__Bacillaceae_G': 'f__Bacillaceae',
}

In [42]:
def change_classification(row, key):
    lineages = row[key]
    for i in change_name:
        if i in lineages:
            lineages = lineages.replace(i, change_name[i])
    return lineages

In [43]:
mgs_info["classification_new"] = mgs_info.apply(lambda x: change_classification(x, "classification"), axis=1)

In [44]:
mgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,classification,lineages_superkingdom,lineages_phylum,lineages_class,lineages_order,lineages_family,lineages_genus,lineages_species,lineages_strain,classification_new
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,d__Bacteria;p__Campylobacterota;c__Campylobact...
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,d__Bacteria;p__Actinobacteriota;c__Actinobacte...
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...


In [46]:
mgs_info["lineages_superkingdom_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "superkingdom"), axis=1)
mgs_info["lineages_phylum_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "phylum"), axis=1)
mgs_info["lineages_class_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "class"), axis=1)
mgs_info["lineages_order_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "order"), axis=1)
mgs_info["lineages_family_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "family"), axis=1)
mgs_info["lineages_genus_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "genus"), axis=1)
mgs_info["lineages_species_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "species"), axis=1)
mgs_info["lineages_strain_new"] = mgs_info.apply(lambda x: set_lineages_to(x, "classification_new", "strain"), axis=1)

In [47]:
sorted(list(mgs_info["lineages_phylum_new"].unique()))

['k__Archaea|p__Euryarchaeota',
 'k__Archaea|p__Nanohaloarchaeota',
 'k__Bacteria|p__Actinobacteriota',
 'k__Bacteria|p__Bacteroidota',
 'k__Bacteria|p__Campylobacterota',
 'k__Bacteria|p__Chloroflexota',
 'k__Bacteria|p__Cyanobacteria',
 'k__Bacteria|p__Desulfobacterota',
 'k__Bacteria|p__Elusimicrobiota',
 'k__Bacteria|p__Firmicutes',
 'k__Bacteria|p__Fusobacteriota',
 'k__Bacteria|p__Patescibacteria',
 'k__Bacteria|p__Proteobacteria',
 'k__Bacteria|p__Spirochaetota',
 'k__Bacteria|p__Synergistota',
 'k__Bacteria|p__Verrucomicrobiota']

In [48]:
# mgs_info.to_csv("lineages_gtdb/oral_mgs_gtdb_taxonomy.tsv", sep='\t', index=False)

In [1]:
## analysis umgs-isolated ani

In [2]:
mgs_info = pd.read_csv("lineages_gtdb/oral_mgs_gtdb_taxonomy.tsv", sep='\t')
umgs_ani = pd.read_csv("umgs_ani/umgs_fastani.txt", sep='\t',
                       names=["rep_path", "iso_path", "ani", "orthologous_matches", "fragments"])
umgs_info = mgs_info.query('mtype == "uMGS"').merge(umgs_ani)
len(umgs_info)

72278

In [56]:
umgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,...,lineages_class_new,lineages_order_new,lineages_family_new,lineages_genus_new,lineages_species_new,lineages_strain_new,iso_path,ani,orthologous_matches,fragments
0,mgs_959,1,uMGS,SRR8114062.5,1,1,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/pub_...,SRR8114062.5,mgs_d0.05_28756,SRR8114062.metaspades.bin.5,...,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,/zfssz3/ST_META/ST_META/USER/luoguangwen/HUMzj...,94.447,179,185
1,mgs_959,1,uMGS,SRR8114062.5,1,1,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/pub_...,SRR8114062.5,mgs_d0.05_28756,SRR8114062.metaspades.bin.5,...,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,/ldfssz1/ST_META/P17Z10200N0048_PRO_ZYQ/BGD/B0...,94.3882,179,185
2,mgs_959,1,uMGS,SRR8114062.5,1,1,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/pub_...,SRR8114062.5,mgs_d0.05_28756,SRR8114062.metaspades.bin.5,...,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,/ldfssz1/ST_META/P17Z10200N0048_PRO_ZYQ/BGD/B0...,94.3721,172,185
3,mgs_959,1,uMGS,SRR8114062.5,1,1,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/pub_...,SRR8114062.5,mgs_d0.05_28756,SRR8114062.metaspades.bin.5,...,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,/ldfssz1/ST_META/P17Z10200N0048_PRO_ZYQ/BGD/B0...,94.3692,179,185
4,mgs_959,1,uMGS,SRR8114062.5,1,1,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/pub_...,SRR8114062.5,mgs_d0.05_28756,SRR8114062.metaspades.bin.5,...,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,/ldfssz1/ST_META/P17Z10200N0048_PRO_ZYQ/BGD/B0...,94.3149,181,185


In [58]:
umgs_info_known = umgs_info.query('ani > 95.00')
len(umgs_info_known)

1837

In [60]:
[len(umgs_info_known["iso_path"].unique()), len(umgs_info_known["rep_path"].unique())]

[353, 347]

In [73]:
umgs_info_known["lineages_genus_new"].unique()

array(['k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Streptococcus',
       'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Actinomycetaceae|g__Pauljensenia',
       'k__Bacteria|p__Actinobacteriota|c__Coriobacteriia|o__Coriobacteriales|f__Atopobiaceae|g__Lancefieldella',
       'k__Bacteria|p__Actinobacteriota|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Rothia',
       'k__Bacteria|p__Firmicutes|c__Bacilli|o__Staphylococcales|f__Gemellaceae|g__Gemella',
       'k__Bacteria|p__Firmicutes|c__Negativicutes|o__Veillonellales|f__Veillonellaceae|g__Veillonella',
       'k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Aerococcaceae|g__Granulicatella',
       'k__Bacteria|p__Fusobacteriota|c__Fusobacteriia|o__Fusobacteriales|f__Fusobacteriaceae|g__Fusobacterium',
       'k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacterales|f__Enterobacteriaceae|g__Enterobacter',
       'k__Bacteria|p__Prote

In [81]:
##
## Here we show that BPSCSK reduces growth of VRE by secreting a lantibiotic that is similar to the nisin-A produced by 
## Lactococcus lactis. 

## 这个菌可是类似唾液好菌哦
##   Lactococcus lactis

In [3]:
mgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,...,lineages_strain,classification_new,lineages_superkingdom_new,lineages_phylum_new,lineages_class_new,lineages_order_new,lineages_family_new,lineages_genus_new,lineages_species_new,lineages_strain_new
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,...,k__Bacteria|p__Campylobacterota|c__Campylobact...,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...


In [22]:
for i, j in enumerate(mgs_info["lineages_strain_new"]):
    if "Lactococcus" in j:
        print(i, j)

68 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Lactococcus|s__Lactococcus lactis_E_mgs_69|t__unclassified_mgs_69
126 k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactobacillales|f__Streptococcaceae|g__Lactococcus|s__Lactococcus lactis_mgs_127|t__unclassified_mgs_127


In [12]:
mgs_info.iloc[68, ]

mgs_id                                                                  mgs_69
size                                                                        47
mtype                                                                     kMGS
representative                                     GCF_001005395.1_ASM100539v1
MAG                                                                          0
oral_genome                                                                  6
rep_path                     /hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...
genome_lst                   GCF_001005395.1_ASM100539v1,GCF_001005455.1_AS...
mgs_id_old                                                       mgs_d0.05_245
bin_id                                     GCF_001005395.1_ASM100539v1_genomic
classification               d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
lineages_superkingdom                                              k__Bacteria
lineages_phylum                                     

In [18]:
mgs_info.iloc[126, ]

mgs_id                                                                 mgs_127
size                                                                       130
mtype                                                                     kMGS
representative                                   GCF_900248205.1_ASM90024820v2
MAG                                                                          8
oral_genome                                                                 15
rep_path                     /hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...
genome_lst                   GCF_001456685.1_ASM145668v1,GCF_001017475.1_AS...
mgs_id_old                                                       mgs_d0.05_650
bin_id                                   GCF_900248205.1_ASM90024820v2_genomic
classification               d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...
lineages_superkingdom                                              k__Bacteria
lineages_phylum                                     

In [16]:
mgs_info.iloc[68, ]["genome_lst"].split(",")

['GCF_001005395.1_ASM100539v1',
 'GCF_001005455.1_ASM100545v1',
 'GCF_001645365.1_WG2',
 'GCF_001622405.1_ASM162240v1',
 'GCF_001622315.1_ASM162231v1',
 'GCF_000731635.1_LlactisGE214',
 'GCF_002078765.2_ASM207876v2',
 'GCF_002441825.1_ASM244182v1',
 'SEQF2151',
 'SEQF2150',
 'GCF_004354515.1_ASM435451v1',
 'GCF_002078375.2_ASM207837v2',
 'GCF_002441765.1_ASM244176v1',
 'GCF_000447925.1_LLT3.01',
 'GCF_001591705.1_ASM159170v1',
 '1295826.3.patric.fna',
 'GCF_001622205.1_ASM162220v1',
 'GCF_002563595.1_ASM256359v1',
 'GCF_900240855.1_CF101',
 'GCF_002078955.1_ASM207895v1',
 'GCF_000447885.1_LLT1.01',
 'GCF_003966935.1_ASM396693v1',
 'GCF_900240895.1_CF108',
 'GCF_000534815.1_HPAssemblyV1',
 'GCF_001622285.1_ASM162228v1',
 'SEQF2559',
 'GCF_900240905.1_CF103',
 'GCF_001856165.1_IBB477v1',
 'GCF_000969475.1_ASM96947v1',
 'SEQF2570',
 'GCF_001622225.1_ASM162222v1',
 'GCF_003394085.1_ASM339408v1',
 'GCF_900240865.1_CF105',
 'GCF_002078935.1_ASM207893v1',
 'GCF_001622295.1_ASM162229v1',
 'GCF

In [17]:
mgs_info.iloc[126, ]["genome_lst"].split(",")

['GCF_001456685.1_ASM145668v1',
 'GCF_001017475.1_ASM101747v1',
 'RSZAXPI002488-21_2_RAH_saliva.10',
 'GCF_002563455.1_ASM256345v1',
 'GCF_002441785.1_ASM244178v1',
 'SEQF2560',
 'SEQF2561',
 'GCF_001456525.1_ASM145652v1',
 'GCF_002276835.1_ASM227683v1',
 'GCF_002804185.1_ASM280418v1',
 'GCF_001672265.1_Llactis1.0',
 'GCF_002554775.1_ASM255477v1',
 'GCF_003839715.1_ASM383971v1',
 'GCF_002078475.1_ASM207847v1',
 'GCF_000348965.1_YF11_v1',
 'GCF_900240965.1_CF104',
 'GCF_002926075.1_ASM292607v1',
 'GCF_002563535.1_ASM256353v1',
 'GCF_003838315.1_ASM383831v1',
 'GCF_003838305.1_ASM383830v1',
 'GCF_001456505.1_ASM145650v1',
 'GCF_001591725.1_ASM159172v1',
 'GCF_001456795.1_ASM145679v1',
 'GCF_001456675.1_ASM145667v1',
 'GCF_002078435.1_ASM207843v1',
 'SEQF2473',
 'RSZAXPI002471-106_2_RAH_saliva.2',
 'GCF_003839805.1_ASM383980v1',
 'GCF_002078995.2_ASM207899v2',
 'GCF_002078855.1_ASM207885v1',
 'GCF_003841345.1_ASM384134v1',
 'GCF_004354485.1_ASM435448v1',
 'GCF_004022375.1_ASM402237v1',
 '

In [27]:
mgs_info.groupby("lineages_phylum_new").size()

lineages_phylum_new
k__Archaea|p__Euryarchaeota            1
k__Archaea|p__Nanohaloarchaeota        1
k__Bacteria|p__Actinobacteriota      490
k__Bacteria|p__Bacteroidota          368
k__Bacteria|p__Campylobacterota      280
k__Bacteria|p__Chloroflexota           4
k__Bacteria|p__Cyanobacteria           1
k__Bacteria|p__Desulfobacterota        8
k__Bacteria|p__Elusimicrobiota         1
k__Bacteria|p__Firmicutes           1248
k__Bacteria|p__Fusobacteriota        145
k__Bacteria|p__Patescibacteria       596
k__Bacteria|p__Proteobacteria        364
k__Bacteria|p__Spirochaetota          67
k__Bacteria|p__Synergistota           12
k__Bacteria|p__Verrucomicrobiota       3
dtype: int64

In [37]:
mgs_info.set_index("lineages_phylum_new").loc["k__Archaea|p__Euryarchaeota", "oral_genome"]

1

In [38]:
mgs_info.set_index("lineages_phylum_new").loc["k__Archaea|p__Nanohaloarchaeota", "oral_genome"]

2

In [39]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Actinobacteriota", "oral_genome"])

6477

In [40]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Bacteroidota", "oral_genome"])

23409

In [41]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Campylobacterota", "oral_genome"])

1841

In [42]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Chloroflexota", "oral_genome"])

7

In [44]:
mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Cyanobacteria", "oral_genome"]

1

In [45]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Desulfobacterota", "oral_genome"])

13

In [47]:
mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Elusimicrobiota", "oral_genome"]

1

In [49]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Firmicutes", "oral_genome"])

12307

In [50]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Fusobacteriota", "oral_genome"])

1998

In [51]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Patescibacteria", "oral_genome"])

4006

In [52]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Proteobacteria", "oral_genome"])

7570

In [53]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Spirochaetota", "oral_genome"])

900

In [54]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Synergistota", "oral_genome"])

44

In [55]:
sum(mgs_info.set_index("lineages_phylum_new").loc["k__Bacteria|p__Verrucomicrobiota", "oral_genome"])

6

In [31]:
sum(mgs_info["oral_genome"])

58583

In [56]:
mgs_info.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,...,lineages_strain,classification_new,lineages_superkingdom_new,lineages_phylum_new,lineages_class_new,lineages_order_new,lineages_family_new,lineages_genus_new,lineages_species_new,lineages_strain_new
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,...,k__Bacteria|p__Campylobacterota|c__Campylobact...,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...


In [64]:
old_mgs_info = pd.read_csv("/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/igo/assay/taxonomy/lineages_formated/taxonomy_oral_mgs_representative.tsv", sep='\t')\
                 .rename(columns={"mgs_id": "mgs_id_old",
                                  "lineages": "lineages_old"})\
                 .loc[:, ["mgs_id_old", "lineages_old"]]

In [65]:
old_mgs_info.head()

Unnamed: 0,mgs_id_old,lineages_old
0,mgs_d0.05_1,k_Bacteria|p_Proteobacteria|c_Epsilonproteobac...
1,mgs_d0.05_5,k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...
2,mgs_d0.05_9,k_Bacteria|p_Actinobacteria|c_Actinobacteria|o...
3,mgs_d0.05_13,k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...
4,mgs_d0.05_14,k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...


In [66]:
mgs_info2 = mgs_info.merge(old_mgs_info)

In [67]:
mgs_info2.head()

Unnamed: 0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,...,classification_new,lineages_superkingdom_new,lineages_phylum_new,lineages_class_new,lineages_order_new,lineages_family_new,lineages_genus_new,lineages_species_new,lineages_strain_new,lineages_old
0,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,...,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k_Bacteria|p_Proteobacteria|c_Epsilonproteobac...
1,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...
2,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,...,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k_Bacteria|p_Actinobacteria|c_Actinobacteria|o...
3,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...
4,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...


In [68]:
mgs_info2.to_csv("lineages_gtdb/oral_mgs_gtdb_taxonomy_add_old_tax.tsv", sep='\t', index=False)

In [68]:
##

In [69]:
!pwd

/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/igo/assay/taxonomy


In [3]:
mgs_info2 = pd.read_csv("lineages_gtdb/oral_mgs_gtdb_taxonomy_add_old_tax.tsv", sep='\t').set_index("mgs_id", drop=False)

In [4]:
mgs_info2.head()

Unnamed: 0_level_0,mgs_id,size,mtype,representative,MAG,oral_genome,rep_path,genome_lst,mgs_id_old,bin_id,...,classification_new,lineages_superkingdom_new,lineages_phylum_new,lineages_class_new,lineages_order_new,lineages_family_new,lineages_genus_new,lineages_species_new,lineages_strain_new,lineages_old
mgs_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mgs_1,mgs_1,1289,kMGS,GCF_002013135.1_ASM201313v1,0,12,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000349405.1_ASM34940v1,GCF_004013755.1_ASM...",mgs_d0.05_1,GCF_002013135.1_ASM201313v1_genomic,...,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k_Bacteria|p_Proteobacteria|c_Epsilonproteobac...
mgs_2,mgs_2,1576,kMGS,YS000214_saliva.17,1572,1574,/hwfssz1/ST_META/P18Z10200N0127_MA/zhujie/yunn...,"RSZAXPI002486-19_RAH_saliva.49,RDPYD18300072_A...",mgs_d0.05_5,YS000214_saliva.spades.bin.17,...,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bac...
mgs_3,mgs_3,6,kMGS,1150461.5.patric.fna,0,2,/hwfssz1/ST_META/P18Z10200N0127_MA/database/IG...,"SEQF2707,1150461.5.patric.fna,GCF_001005065.1_...",mgs_d0.05_9,1150461.5.patric,...,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k_Bacteria|p_Actinobacteria|c_Actinobacteria|o...
mgs_4,mgs_4,8301,kMGS,GCF_900050145.1_6938_4_11,0,52,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_001141485.1_6925_1_59,GCF_900022795.1_1186...",mgs_d0.05_13,GCF_900050145.1_6938_4_11_genomic,...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...
mgs_5,mgs_5,1590,kMGS,GCF_001544055.1_ASM154405v1,0,1,/hwfssz1/pub/database/ftp.ncbi.nih.gov/genomes...,"GCF_000391785.1_Ente_faec_7230532-1_V1,GCF_002...",mgs_d0.05_14,GCF_001544055.1_ASM154405v1_genomic,...,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactob...,k__Bacteria,k__Bacteria|p__Firmicutes,k__Bacteria|p__Firmicutes|c__Bacilli,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k__Bacteria|p__Firmicutes|c__Bacilli|o__Lactob...,k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacil...


In [5]:
# CRC
# Solobacterium moorei
# Parvimonas micra
# Streptococcus anginosus
# Porphyromonas asaccharolytica

In [27]:
crc_species = ["Solobacterium_moorei",
               "Parvimonas_micra",
               "Streptococcus_anginosus",
               "Porphyromonas_asaccharolytica"]

In [28]:
for i, j in enumerate(mgs_info2["lineages_strain_new"]):
    for s in crc_species:
        if s in j:
            print(i, j)

In [29]:
for i, j in enumerate(mgs_info2["lineages_old"]):
    for s in crc_species:
        if s in j:
            print(i, j)
            print(mgs_info2.iloc[i, ]["size"])

294 k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacillales|f_Streptococcaceae|g_Streptococcus|s_Streptococcus_anginosus|t_strain=AM58-6
54
617 k_Bacteria|p_Firmicutes|c_Erysipelotrichia|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
1
868 k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bacteroidales|f_Porphyromonadaceae|g_Porphyromonas|s_Porphyromonas_asaccharolytica
1
894 k_Bacteria|p_Firmicutes|c_Erysipelotrichia|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
1
931 k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bacteroidales|f_Porphyromonadaceae|g_Porphyromonas|s_Porphyromonas_asaccharolytica
3
1306 k_Bacteria|p_Firmicutes|c_Bacilli|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
1
2965 k_Bacteria|p_Firmicutes|c_Erysipelotrichia|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
125
2986 k_Bacteria|p_Firmicutes|c_Tissierellia|o_Tissierellales|f_Peptoniphilaceae|g

In [30]:
for i, j in enumerate(mgs_info2["lineages_old"]):
    for s in crc_species:
        if s in j:
            print(i, j)
            print(mgs_info2.iloc[i, ]["oral_genome"])

294 k_Bacteria|p_Firmicutes|c_Bacilli|o_Lactobacillales|f_Streptococcaceae|g_Streptococcus|s_Streptococcus_anginosus|t_strain=AM58-6
15
617 k_Bacteria|p_Firmicutes|c_Erysipelotrichia|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
1
868 k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bacteroidales|f_Porphyromonadaceae|g_Porphyromonas|s_Porphyromonas_asaccharolytica
1
894 k_Bacteria|p_Firmicutes|c_Erysipelotrichia|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
1
931 k_Bacteria|p_Bacteroidetes|c_Bacteroidia|o_Bacteroidales|f_Porphyromonadaceae|g_Porphyromonas|s_Porphyromonas_asaccharolytica
3
1306 k_Bacteria|p_Firmicutes|c_Bacilli|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
1
2965 k_Bacteria|p_Firmicutes|c_Erysipelotrichia|o_Erysipelotrichales|f_Erysipelotrichaceae|g_Solobacterium|s_Solobacterium_moorei
124
2986 k_Bacteria|p_Firmicutes|c_Tissierellia|o_Tissierellales|f_Peptoniphilaceae|g

In [None]:
##

In [32]:
mgs_info2.keys()

Index(['mgs_id', 'size', 'mtype', 'representative', 'MAG', 'oral_genome',
       'rep_path', 'genome_lst', 'mgs_id_old', 'bin_id', 'classification',
       'lineages_superkingdom', 'lineages_phylum', 'lineages_class',
       'lineages_order', 'lineages_family', 'lineages_genus',
       'lineages_species', 'lineages_strain', 'classification_new',
       'lineages_superkingdom_new', 'lineages_phylum_new',
       'lineages_class_new', 'lineages_order_new', 'lineages_family_new',
       'lineages_genus_new', 'lineages_species_new', 'lineages_strain_new',
       'lineages_old'],
      dtype='object')

In [33]:
# RDPYD18
# RSZYD18
# YS00
samples_cluster = []

for mgs_id in mgs_info2.index.unique():
    genome_lst = mgs_info2.loc[mgs_id, "genome_lst"].split(",")
    samples_dict = {}
    for bin_id in genome_lst:
        if bin_id.startswith("RDPYD18") or bin_id.startswith("RSZYD18") or bin_id.startswith("YS00"):
            sample_id = bin_id.split(".")[0]
            if sample_id in samples_dict:
                samples_dict[sample_id]["bin_lst"] += [bin_id]
                samples_dict[sample_id]["bin_count"] += 1
            else:
                samples_dict[sample_id] = {}
                samples_dict[sample_id]["sample_id"] = sample_id
                samples_dict[sample_id]["bin_lst"] = [bin_id]
                samples_dict[sample_id]["bin_count"] = 1
                samples_dict[sample_id]["mgs_id"] = mgs_id
                samples_dict[sample_id]["classification"] = mgs_info2.loc[mgs_id, "classification_new"]
                samples_dict[sample_id]["lineages_superkingdom"] = mgs_info2.loc[mgs_id, "lineages_superkingdom_new"]
                samples_dict[sample_id]["lineages_phylum"] = mgs_info2.loc[mgs_id, "lineages_phylum_new"]
                samples_dict[sample_id]["lineages_class"] = mgs_info2.loc[mgs_id, "lineages_class_new"]
                samples_dict[sample_id]["lineages_order"] = mgs_info2.loc[mgs_id, "lineages_class_new"]
                samples_dict[sample_id]["lineages_family"] = mgs_info2.loc[mgs_id, "lineages_family_new"]
                samples_dict[sample_id]["lineages_genus"] = mgs_info2.loc[mgs_id, "lineages_genus_new"]
                samples_dict[sample_id]["lineages_species"] = mgs_info2.loc[mgs_id, "lineages_species_new"]
                samples_dict[sample_id]["lineages_strain"] = mgs_info2.loc[mgs_id, "lineages_strain_new"]
                
    for key in samples_dict:
        samples_dict[key]["bin_lst"] = ",".join(samples_dict[key]["bin_lst"])
        samples_cluster.append(samples_dict[key])

In [34]:
samples_cluster_df = pd.DataFrame(samples_cluster)\
                       .loc[:, ["sample_id", "mgs_id", "bin_count", "bin_lst",
                                "classification",
                                "lineages_superkingdom",
                                "lineages_phylum",
                                "lineages_class",
                                "lineages_order",
                                "lineages_family",
                                "lineages_genus",
                                "lineages_species",
                                "lineages_strain"]]\
                       .sort_values(["sample_id", "mgs_id"])

In [35]:
samples_cluster_df.head()

Unnamed: 0,sample_id,mgs_id,bin_count,bin_lst,classification,lineages_superkingdom,lineages_phylum,lineages_class,lineages_order,lineages_family,lineages_genus,lineages_species,lineages_strain
15494,RDPYD18075162_A_saliva,mgs_2984,1,RDPYD18075162_A_saliva.12,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...
29661,RDPYD18075162_A_saliva,mgs_3255,1,RDPYD18075162_A_saliva.6,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__...,k__Bacteria,k__Bacteria|p__Bacteroidota,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...,k__Bacteria|p__Bacteroidota|c__Bacteroidia|o__...
32212,RDPYD18075162_A_saliva,mgs_3288,1,RDPYD18075162_A_saliva.13,d__Bacteria;p__Campylobacterota;c__Campylobact...,k__Bacteria,k__Bacteria|p__Campylobacterota,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...,k__Bacteria|p__Campylobacterota|c__Campylobact...
37140,RDPYD18075162_A_saliva,mgs_3413,1,RDPYD18075162_A_saliva.11,d__Bacteria;p__Proteobacteria;c__Gammaproteoba...,k__Bacteria,k__Bacteria|p__Proteobacteria,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...,k__Bacteria|p__Proteobacteria|c__Gammaproteoba...
47335,RDPYD18075162_A_saliva,mgs_3584,1,RDPYD18075162_A_saliva.5,d__Bacteria;p__Actinobacteriota;c__Actinobacte...,k__Bacteria,k__Bacteria|p__Actinobacteriota,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacteria,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...,k__Bacteria|p__Actinobacteriota|c__Actinobacte...


In [36]:
len(samples_cluster_df)

47471

In [56]:
samples_cluster_df.to_csv("lineages_gtdb/samples_cluster.tsv", sep='\t', index=False)

In [50]:
# samples_cluster_df_group = pd.DataFrame(samples_cluster_df.groupby("sample_id").size()).reset_index()
# samples_cluster_df_group = samples_cluster_df_group.rename(columns={0: "sgb_count"}).sort_values("sgb_count")
# samples_cluster_df_group.to_csv("lineages_gtdb/samples_cluster_count.tsv", sep='\t', index=False)

In [59]:
samples_cluster_df2 = samples_cluster_df.set_index("sample_id")

In [69]:
samples_cluster_size = []
for sample_id in samples_cluster_df2.index.unique():
    sample_dict = {}
    mgs_id_lst = samples_cluster_df2.loc[[sample_id], "mgs_id"].tolist()
    bin_lst = samples_cluster_df2.loc[[sample_id], "bin_lst"].tolist()
    sample_dict["sample_id"] = sample_id
    sample_dict["sgb_count"] = len(mgs_id_lst)
    sample_dict["mgs_lst"] = ",".join(mgs_id_lst)
    sample_dict["bin_lst"] = ",".join(bin_lst)
    samples_cluster_size.append(sample_dict)
samples_cluster_size_df = pd.DataFrame(samples_cluster_size)

In [73]:
samples_cluster_size_df = samples_cluster_size_df\
                                .loc[:, ["sample_id", "sgb_count", "bin_lst", "mgs_lst"]]\
                                .sort_values(["sgb_count", "sample_id"])

In [74]:
samples_cluster_size_df.head()

Unnamed: 0,sample_id,sgb_count,bin_lst,mgs_lst
18,RDPYD18075182_A_saliva,1,RDPYD18075182_A_saliva.2,mgs_3558
58,RDPYD18088794_A_saliva,1,RDPYD18088794_A_saliva.2,mgs_1463
520,RDPYD18189834_A_saliva,1,RDPYD18189834_A_saliva.5,mgs_986
537,RDPYD18189851_A_saliva,1,RDPYD18189851_A_saliva.2,mgs_3447
617,RDPYD18189935_A_saliva,1,RDPYD18189935_A_saliva.2,mgs_30


In [75]:
samples_cluster_size_df.tail()

Unnamed: 0,sample_id,sgb_count,bin_lst,mgs_lst
699,RDPYD18190860_A_togue,44,"RDPYD18190860_A_togue.32,RDPYD18190860_A_togue...","mgs_1040,mgs_105,mgs_1113,mgs_1243,mgs_134,mgs..."
1923,RSZYD18187184_A_saliva,44,"RSZYD18187184_A_saliva.3,RSZYD18187184_A_saliv...","mgs_105,mgs_106,mgs_1113,mgs_1429,mgs_15,mgs_1..."
2252,RSZYD18187537_A_saliva,44,"RSZYD18187537_A_saliva.85,RSZYD18187537_A_sali...","mgs_103,mgs_104,mgs_1260,mgs_1282,mgs_134,mgs_..."
1358,RSZYD18078405_A_saliva,45,"RSZYD18078405_A_saliva.48,RSZYD18078405_A_sali...","mgs_103,mgs_1078,mgs_121,mgs_134,mgs_16,mgs_17..."
2646,RSZYD18187982_A_saliva,48,"RSZYD18187982_A_saliva.53,RSZYD18187982_A_sali...","mgs_146,mgs_155,mgs_1747,mgs_191,mgs_234,mgs_2..."


In [77]:
samples_cluster_size_df.to_csv("lineages_gtdb/samples_cluster_count_details.tsv", sep='\t', index=False)