In [1]:
import pandas as pd
from Bio import Entrez
# Contact address passed to the Entrez helpers below; find_ncbi_genus_lineage
# assigns it to `Entrez.email` before issuing any request.
email = 'matinnu@biosustain.dtu.dk'

In [2]:
def strain_metadata_extractor(description):
    """Split a genome description into genus, species and strain.

    The description is tokenised on whitespace. The first token is taken as
    the genus, the second as the species epithet, and every token after that
    up to (but not including) the first token starting with ``"Annotated"``
    is joined with single spaces to form the strain. When no token starts
    with ``"Annotated"``, all remaining tokens form the strain.

    Parameters
    ----------
    description : str
        Free-text name, e.g.
        ``"Aspergillus acristatulus CBS 119.55 Annotated ..."``.

    Returns
    -------
    tuple of (str, str, str)
        ``(genus, species, strain)``. ``strain`` is ``""`` when nothing sits
        between the species and the ``"Annotated"`` marker.

    Raises
    ------
    IndexError
        If the description has fewer than two whitespace-separated tokens.
    """
    tokens = description.split()

    # Index of the first "Annotated..." token; len(tokens) when there is none.
    marker = next(
        (pos for pos, token in enumerate(tokens) if token.startswith("Annotated")),
        len(tokens),
    )

    genus = tokens[0]
    species = tokens[1]

    # Slice instead of special-casing marker positions 3 and 4: strains made
    # of one or two tokens give the same result as before, and any other
    # length no longer leaves `strain` unassigned (UnboundLocalError).
    strain = " ".join(tokens[2:marker])

    return genus, species, strain

# Session-level memo of parsed efetch results, keyed by the search term.
# Callers look up the same genus once per genome, so this avoids repeating
# two network round-trips for every row.
_GENUS_LINEAGE_CACHE = {}


def find_ncbi_genus_lineage(genus, email):
    """Fetch the NCBI taxonomy record for a genus name via Entrez.

    The function searches the ``taxonomy`` database for ``genus``, requires
    the search to return exactly one id, and then fetches and parses the
    record for that id.

    Parameters
    ----------
    genus : str
        Search term sent to ``Entrez.esearch``.
    email : str
        Assigned to ``Entrez.email`` before any request is made.

    Returns
    -------
    The object produced by ``Entrez.read`` for the efetch response. Repeated
    calls with the same ``genus`` return the same cached object, so callers
    should treat it as read-only.

    Raises
    ------
    AssertionError
        If the search does not return exactly one taxonomy id.
    """
    Entrez.email = email

    if genus in _GENUS_LINEAGE_CACHE:
        return _GENUS_LINEAGE_CACHE[genus]

    # find keywords
    handle = Entrez.esearch(
        db='taxonomy', term=genus, rettype='gb', retmode='text')
    try:
        record = Entrez.read(handle, validate=False)
    finally:
        # Close the handle even when parsing fails.
        handle.close()

    # find lineage
    assert len(record['IdList']) == 1, (
        f"expected exactly one taxonomy id for {genus!r}, "
        f"got {len(record['IdList'])}"
    )

    id_tax = record['IdList'][0]
    handle2 = Entrez.efetch(db='taxonomy', id=id_tax, retmode='xml')
    try:
        record2 = Entrez.read(handle2, validate=False)
    finally:
        handle2.close()

    # Only successful lookups are stored; failures are retried on the next call.
    _GENUS_LINEAGE_CACHE[genus] = record2
    return record2

def get_lineage(species, email):
    """Build a rank-prefixed lineage list for a species name.

    The genus is the first whitespace-separated token of ``species``. Its
    taxonomy record is obtained from ``find_ncbi_genus_lineage`` and the
    ``LineageEx`` entries of the first record are scanned for the ranks
    listed in ``rank_prefixes``. Matches are emitted in the order of that
    mapping as ``"<prefix>__<ScientificName>"``, followed by ``g__<genus>``
    and ``s__<species>``.

    Parameters
    ----------
    species : str
        Full species name; its first token is used as the genus.
    email : str
        Forwarded to ``find_ncbi_genus_lineage``.

    Returns
    -------
    list of str
        The lineage entries. The same entries joined by ``";"`` are also
        printed as a side effect.
    """
    genus = species.split()[0]
    taxon = find_ncbi_genus_lineage(genus, email)[0]
    ancestors = taxon['LineageEx']

    # Rank label in the taxonomy record -> one-letter prefix in the output.
    # A rank absent from the record is skipped silently, so the result can
    # hold fewer than seven entries.
    # NOTE(review): newer NCBI taxonomy releases may label the top rank
    # 'domain' instead of 'superkingdom' -- confirm against a fresh record.
    rank_prefixes = {
        'superkingdom': 'd',
        'phylum': 'p',
        'class': 'c',
        'order': 'o',
        'family': 'f',
    }

    # Outer iteration over the mapping fixes the output order by rank,
    # independent of the order of entries in the record.
    lineage = [
        f"{prefix}__{node['ScientificName']}"
        for rank, prefix in rank_prefixes.items()
        for node in ancestors
        if node['Rank'] == rank
    ]
    lineage += [f"g__{genus}", f"s__{species}"]

    print(";".join(lineage))
    return lineage

In [3]:
# Load the cleaned metadata table; every later cell reads from `df`.
metadata_csv = "../tables/clean_metadata.csv"
df = pd.read_csv(metadata_csv)

# Show only the first record as a quick sanity check of the columns.
df.iloc[:1]

Unnamed: 0,name,resource_id,resource_link,project_id,project_link,folder_name,related_projects,status,release_date,contact_name,contact_mail,masked_assemblies,assembly_id_alias,annotation_id_alias,annotations
0,Aspergillus acristatulus CBS 119.55 Annotated ...,Aspacri1,https://mycocosm.jgi.doe.gov/Aspacri1,1052020,https://genome.jgi.doe.gov/portal/pages/projec...,AspacrStandDraft_FD,"['SP 1052022', 'SP 1052021', 'AP 1052024', 'AP...",Complete,2014-03-25,Scott E. Baker,scott.baker@pnnl.gov,/datadrive/matin_other_projects/jgi_aspergillu...,Aspacri1,Aspacri1,/datadrive/matin_other_projects/jgi_aspergillu...


In [4]:
# Parse each description once, then fan the (genus, species, strain) tuples
# out into three parallel lists in row order.
parsed_names = [strain_metadata_extractor(description) for description in df.name]

genus_all = [fields[0] for fields in parsed_names]
species_all = [fields[1] for fields in parsed_names]
strain_all = [fields[2] for fields in parsed_names]

In [5]:
# generate sample file
# Build the table in a single constructor call rather than growing an empty
# frame through `.loc`. Columns taken from `df` are passed as plain arrays,
# so rows are matched by position with the parsed lists and the result gets
# a fresh 0..n-1 index (the earlier `reset_index(drop=True)` call discarded
# its return value and had no effect).
df_sample = pd.DataFrame(
    {
        "genome_id": df["project_id"].to_numpy(),
        "source": "custom",
        "genus": genus_all,
        "species": species_all,
        "strain": strain_all,
        "closest_placement_reference": "",
        "description": df["name"].to_numpy(),
    }
)
df_sample.to_csv("../tables/samples.csv", index=False)

In [133]:
# generate unit file
# Every occurrence of this absolute prefix is removed from the assembly and
# annotation paths before they are written out.
path_prefix = "/datadrive/matin_other_projects/jgi_aspergillus/"

# Single constructor call instead of growing an empty frame through `.loc`;
# the result has a fresh 0..n-1 index (the earlier `reset_index(drop=True)`
# call discarded its return value and had no effect).
df_units = pd.DataFrame(
    {
        "genome_id": df["project_id"].to_numpy(),
        "unit": 1,
        "assembly": [path.replace(path_prefix, "") for path in df["masked_assemblies"]],
        "annotations": [path.replace(path_prefix, "") for path in df["annotations"]],
    }
)
df_units.to_csv("../tables/units.csv", index=False)

In [134]:
# Map each genome id to its "<genus> <species>" name, one row at a time.
tax_dict = {}
for row_label in df_sample.index:
    name_parts = df_sample.loc[row_label, ["genus", "species"]].to_list()
    tax_dict[df_sample.loc[row_label, "genome_id"]] = " ".join(name_parts)

In [135]:
# One single-key dict {genome_id: lineage} per genome, in tax_dict order.
# get_lineage prints each lineage as it is resolved.
output = [
    {genome_id: get_lineage(species_name, email)}
    for genome_id, species_name in tax_dict.items()
]

d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus acristatulus
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus affinis
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus alabamensis
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus albertensis
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus allahabadii
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus alliaceus
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus ambiguus
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Aspergillus;s__Aspergillus amoenus
d__Eukaryota;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f

In [136]:
# Collect one record per genome and build the frame once, instead of
# enlarging an empty DataFrame cell by cell inside the loop.
taxonomy_records = []
for entry in output:
    # Each entry is a single-key dict {genome_id: lineage}; only the first
    # pair is used.
    genome_id, lineage = next(iter(entry.items()))
    taxonomy_records.append(
        {"user_genome": str(genome_id), "classification": ";".join(lineage)}
    )

# Explicit columns keep the header row in the file even when `output` is empty.
df_taxonomy = pd.DataFrame(taxonomy_records, columns=["user_genome", "classification"])

df_taxonomy.to_csv("../tables/JGI_taxonomy.tsv", sep="\t", index=False)