In [1]:
import pandas as pd 

from collections import namedtuple

Load list of reference sequences from NCBI

In [2]:
reference_ncbi = pd.read_csv("../data/ncbi_bacteria_references.tsv", sep="\t")
reference_ncbi.shape


(18637, 14)

In [41]:
reference_ncbi.loc[reference_ncbi["Organism Name"].apply(lambda x: x.startswith("Escherichia coli")),:]

Unnamed: 0,Assembly Accession,Assembly Name,Organism Name,Organism Infraspecific Names Breed,Organism Infraspecific Names Strain,Organism Infraspecific Names Cultivar,Organism Infraspecific Names Ecotype,Organism Infraspecific Names Isolate,Organism Infraspecific Names Sex,Annotation Name,Assembly Level,Assembly Release Date,WGS project accession,Assembly Stats Number of Scaffolds
3,GCF_000005845.2,ASM584v2,Escherichia coli str. K-12 substr. MG1655,,K-12 substr. MG1655,,,,,Annotation submitted by NCBI RefSeq,Complete Genome,2013-09-26,,1.0
4,GCF_000008865.2,ASM886v2,Escherichia coli O157:H7 str. Sakai,,Sakai substr. RIMD 0509952,,,,,Annotation submitted by NCBI RefSeq,Complete Genome,2018-06-08,,3.0


In [3]:
labels_ncbi = reference_ncbi[["Assembly Accession","Organism Name","Assembly Release Date"]].to_dict("records")


In [4]:
labels_ncbi[0]

{'Assembly Accession': 'GCF_000006945.2',
 'Organism Name': 'Salmonella enterica subsp. enterica serovar Typhimurium str. LT2',
 'Assembly Release Date': '2016-01-13'}

load list of labels EBI

In [5]:
labels_by_sampleid = pd.read_csv("../data/labels_krakenbracken_by_sampleid.txt", sep="\t")
labels_ebi = labels_by_sampleid.label.unique().tolist()

In [6]:
labels_ebi[:12]

['Lactobacillus hokkaidonensis',
 'Xanthomonas euvesicatoria',
 'Streptococcus pyogenes',
 'Corynebacterium glutamicum',
 'Vibrio parahaemolyticus',
 'Mycobacteroides chelonae',
 'Cronobacter sakazakii',
 'Serratia ficaria',
 'Pandoraea oxalativorans',
 'Staphylococcus epidermidis',
 'Clostridium perfringens',
 'Mycobacterium chimaera']

In [7]:
len(labels_ebi)

2600

In [20]:
def preprocessing(label):
    return "_".join(label.strip().lower().split(" "))

def check_ncbi_matches(label_ebi):
    label_ebi = preprocessing(label_ebi)
    labels = []
    for label_ncbi in labels_ncbi:
        label_p = preprocessing(label_ncbi["Organism Name"])
        if label_p.startswith(label_ebi):
            labels.append(label_ncbi)

    return labels

In [21]:
references_by_ebi = {preprocessing(label_ebi): check_ncbi_matches(label_ebi) for label_ebi in labels_ebi}

In [29]:
references_curated = dict(filter(lambda x: len(x[1])==1, references_by_ebi.items()))

In [45]:
to_check = dict(filter(lambda x: len(x[1])>1, references_by_ebi.items()))
len(to_check)

8

this ones were manually checked to select one reference

In [53]:
checked = {'escherichia_coli': [{'Assembly Accession': 'GCF_000008865.2',
   'Organism Name': 'Escherichia coli O157:H7 str. Sakai',
   'Assembly Release Date': '2018-06-08'}],
 'streptococcus_equi': [{'Assembly Accession': 'GCF_015689395.1',
   'Organism Name': 'Streptococcus equi subsp. zooepidemicus',
   'Assembly Release Date': '2020-11-25'}],
 'pseudomonas_syringae': [{'Assembly Accession': 'GCF_018394375.1',
   'Organism Name': 'Pseudomonas syringae',
   'Assembly Release Date': '2021-05-18'}],
 'pseudomonas_oryzihabitans': [{'Assembly Accession': 'GCF_007665635.1',
   'Organism Name': 'Pseudomonas oryzihabitans',
   'Assembly Release Date': '2019-07-30'}],
 'aerococcus_urinae': [{'Assembly Accession': 'GCF_001543175.1',
   'Organism Name': 'Aerococcus urinae',
   'Assembly Release Date': '2016-02-01'}],
 'sphingomonas_panacis': [{'Assembly Accession': 'GCF_001717955.1',
   'Organism Name': 'Sphingomonas panacis',
   'Assembly Release Date': '2016-09-01'}],
 'exiguobacterium_antarcticum': [{'Assembly Accession': 'GCF_025234655.1',
   'Organism Name': 'Exiguobacterium antarcticum',
   'Assembly Release Date': '2022-09-16'}],
 'coxiella-like_endosymbiont': [{'Assembly Accession': 'GCF_002871095.1',
   'Organism Name': 'Coxiella-like endosymbiont',
   'Assembly Release Date': '2018-01-10'}]}

references_curated.update(checked)

In [68]:
df_reference_sequences = pd.DataFrame.from_dict({k:v[0] for k,v in references_curated.items()}, orient="index")
df_reference_sequences.reset_index(inplace=True)
df_reference_sequences.rename({"index":"label"}, inplace=True, axis=1)
df_reference_sequences.to_csv("../data/ncib_reference_sequences.txt", sep="\t", index=False)
df_reference_sequences.head()

Unnamed: 0,label,Assembly Accession,Organism Name,Assembly Release Date
0,xanthomonas_euvesicatoria,GCF_017724035.1,Xanthomonas euvesicatoria pv. alfalfae,2021-04-03
1,streptococcus_pyogenes,GCF_900475035.1,Streptococcus pyogenes,2018-06-17
2,corynebacterium_glutamicum,GCF_000404185.1,Corynebacterium glutamicum SCgG2,2013-06-03
3,vibrio_parahaemolyticus,GCF_000196095.1,Vibrio parahaemolyticus RIMD 2210633,2004-05-11
4,mycobacteroides_chelonae,GCF_001632805.1,Mycobacteroides chelonae CCUG 47445,2016-04-27
