In [1]:
import numpy as np
import pandas as pd
from Bio import Entrez  
from Bio.Entrez import Parser 
from ete3 import NCBITaxa
from concurrent.futures import ProcessPoolExecutor  



## Substract species info from raw data

In [2]:
wildsi_data_file = '../../data/2024-07-28_ena_sequences.csv'

In [3]:
chunksize = 5000000  # Adjust this size based on your system's memory constraints

# Initialize a Counter to keep track of organism counts
rows = []
accessions = []
countries = []
coordinates = []
species = []
taxids = []
tax_codes = []

# Read the CSV in chunks
nr = 1
for chunk in pd.read_csv(wildsi_data_file, chunksize=chunksize, low_memory=False):
    print(nr)
    #print(chunk.head(2))
    rows.append(len(list(chunk.index)))
    accessions.extend(list(chunk['ACCESSION']))
    countries.extend(list(chunk['COUNTRY']))
    coordinates.extend(list(chunk['LAT_LON']))
    species.extend(list(chunk['ORGANISM']))
    taxids.extend(list(chunk['TAXID']))
    tax_codes.extend(list(chunk['CODE']))
    nr = nr +1

1
2
3
4
5
6
7
8


In [8]:
df_wildsi_species = pd.DataFrame(zip(accessions, countries,
                             taxids, tax_codes, species),
                          columns = ['ACCESSION','COUNTRY', 'TAXID', 'CODE', 'ORGANISM'])

In [10]:
df_wildsi_species.CODE.value_counts()

CODE
VRL    11026881
INV     7710270
ENV     4601217
PLN     4303806
VRT     3437288
MAM     2201084
FUN     1650075
PRO     1425560
ROD      210476
HUM      157054
PHG       28822
MUS       10317
UNC        5420
SYN        1111
TGN          66
Name: count, dtype: int64

In [9]:
df_wildsi_species.to_csv('../../data/processed_data/2024-08-06_df_wildsi_species.csv')

## Collect NCBI Taxonomy data with etetoolkit

In [4]:
df_wildsi_species = pd.read_csv('../../data/processed_data/2024-08-06_df_wildsi_species.csv')

In [5]:
#df_wildsi_species[df_wildsi_species.CODE=='TGN'] #various organisms
df_wildsi_species = df_wildsi_species[df_wildsi_species.TAXID.notna()]
all_species_ncbi_ids = df_wildsi_species.TAXID.astype(int).drop_duplicates().to_list()

### The total number of unique species, unprocessed

In [6]:
len(all_species_ncbi_ids)

1532757

In [9]:
species_names = {}
norank_names = {}
superkingdom_names = {}
taxonomic_lineages = {}

# Initialize NCBITaxa  
ncbi = NCBITaxa()

try:    
    for taxid in all_species_ncbi_ids:

        # Get the species name  
        species_name = ncbi.get_taxid_translator([taxid])[taxid]
        species_names[taxid] = species_name

        # Get the taxonomic lineage  
        taxonomic_lineage = ncbi.get_lineage(taxid)  
        taxonomic_lineages[taxid] = taxonomic_lineage

        norank_name = ncbi.get_taxid_translator([taxonomic_lineage[1]])[taxonomic_lineage[1]]
        norank_names[taxid] = norank_name

        superkingdom_name = ncbi.get_taxid_translator([taxonomic_lineage[2]])[taxonomic_lineage[2]]
        superkingdom_names[taxid] = superkingdom_name
except KeyError:
    pass

In [10]:
unique_values = set(norank_names.values())  
print(unique_values) 

{'other entries', 'unclassified entries', 'cellular organisms', 'Viruses'}


In [11]:
unique_values = set(superkingdom_names.values())  
print(unique_values) 

{'Monodnaviria', 'Naldaviricetes', 'Alphasatellitidae', 'Archaea', 'unclassified sequences', 'other sequences', 'Satellites', 'unclassified bacterial viruses', 'Tolecusatellitidae', 'Bacteria', 'Pospiviroidae', 'Duplodnaviria', 'unclassified viruses', 'Riboviria', 'environmental samples', 'Adnaviria', 'Eukaryota', 'Polydnaviriformidae', 'Avsunviroidae', 'Anelloviridae', 'Varidnaviria', 'Thaspiviridae', 'Ribozyviria'}


In [12]:
wildsi_species_with_ncbi_domains = {}

for taxid in all_species_ncbi_ids:
    try:   
        if norank_names[taxid] == 'cellular organisms':
            wildsi_species_with_ncbi_domains[taxid] = superkingdom_names[taxid]
        elif norank_names[taxid] == 'Viruses':
            wildsi_species_with_ncbi_domains[taxid] = 'Viruses'
        else:
            wildsi_species_with_ncbi_domains[taxid] = superkingdom_names[taxid]
    except KeyError:
        pass
        

In [13]:
unique_values = set(wildsi_species_with_ncbi_domains.values())  
print(unique_values) 

{'unclassified sequences', 'other sequences', 'Viruses', 'Archaea', 'Bacteria', 'Eukaryota'}


In [14]:
df_wildsi_species['NCBI_taxa'] = df_wildsi_species['TAXID'].map(wildsi_species_with_ncbi_domains)

In [15]:
df_wildsi_species['NCBI_species'] = df_wildsi_species['TAXID'].map(species_names)

In [16]:
df_wildsi_species.to_csv('../../data/processed_data/2024-08-06_df_wildsi_species_ncbi_domains.csv')

## Remove duplicates and save updated species list

In [None]:
df_wildsi_species = pd.read_csv('../../data/processed_data/2024-08-06_df_wildsi_species_ncbi_domains.csv')

In [None]:
df_wildsi_species

In [4]:
df_wildsi_species = df_wildsi_species[['ACCESSION', 'TAXID', 'NCBI_taxa', 'NCBI_species']]

In [8]:
#df_wildsi_species = df_wildsi_species.head(1000000)

In [9]:
#df_wildsi_species = df_wildsi_species.sort_values(by='NCBI_species', ascending=False)

In [10]:
#df_wildsi_species[df_wildsi_species.NCBI_taxa=='other sequences'].NCBI_species.drop_duplicates().to_list()

### Pseudocode to remove duplicates

In [11]:
#Archaea
#if first word starts with uncultured
#if second word doesn't start with capital letter:
#change species to "unidentified Archaeon"
#else
#change species name to first plus second word
#else
#if first word starts with Candidatus
#change species name to second plus third word
#else
#change species name to first plus second word

#Bacteria
#if first word starts with uncultured or unidentified
#if second word doesn't start with capital letter:
#change species to "unidentified Bacterium"
#else
#change species name to first plus second word
#if first word doesn't start with capital letter
#change species name to "unidentified Bacterium"
#else
#if first word starts with Candidatus
#change species name to second plus third word
#else
#change species name to first plus second word

#Viruses
#keep as it is

#Eukaryota
#if first word starts with uncultured
#if second word doesn't start with capital letter:
#change species to "unidentified Eukaryon"
#else
#change species name to first word plus 'sp'
#if first word starts with cf.
#change species name to second plus third word
#if first word starts with [
#delete first and last symbols from first word and change species name to first plus second word
#else
#change species name to first plus second word

#unclassified sequences
#change species name to 'Unidentified'

#other sequences
#change species name to 'Unidentified'



In [7]:
def update_species_name(df, chunksize=1000000):  
    chunks = np.array_split(df, len(df) // chunksize + 1)  
    updated_chunks = []  
    for chunk in chunks:  
        chunk['NCBI_species_name'] = chunk.apply(lambda row: update_species(row['NCBI_taxa'], row['NCBI_species']), axis=1)  
        updated_chunks.append(chunk)  
    return pd.concat(updated_chunks, ignore_index=True)  

def update_species(taxa, species):  
    if pd.isnull(species) or not isinstance(species, str):  
        return 'Unidentified'  
    
    words = species.split()  
    
    if taxa == 'Archaea':  
        if words[0].startswith('uncultured'):  
            if not words[1][0].isupper():  
                return 'unidentified Archaeon'  
            else:  
                return ' '.join(words[:2])  
        elif words[0].startswith('Candidatus'):  
            return ' '.join(words[1:3])  
        else:  
            return ' '.join(words[:2])  
            
    elif taxa == 'Bacteria':  
        if words[0].startswith(('uncultured', 'unidentified')):  
            if not words[1][0].isupper():  
                return 'unidentified Bacterium'  
            else:  
                return ' '.join(words[:2])  
        elif not words[0][0].isupper():  
            return 'unidentified Bacterium'  
        elif words[0].startswith('Candidatus'):  
            return ' '.join(words[1:3])  
        else:  
            return ' '.join(words[:2])  
            
    elif taxa == 'Viruses':  
        return species  
            
    elif taxa == 'Eukaryota':  
        if words[0].startswith('uncultured'):  
            if not words[1][0].isupper():  
                return 'unidentified Eukaryon'  
            else:  
                return words[0] + ' sp'  
        elif words[0].startswith('cf.'):  
            return ' '.join(words[1:3])  
        elif words[0].startswith('['):  
            return ' '.join([words[0][1:-1]] + words[1:2])  
        else:  
            return ' '.join(words[:2])  
            
    elif taxa in ['unclassified sequences', 'other sequences']:  
        return 'Unidentified'  
        
    else:  
        return species  
    
def update_domain_name(df):  
    df['NCBI_domain_name'] = df['NCBI_taxa'].apply(lambda x: 'Unidentified' if x in ['unclassified sequences', 'other sequences'] else x)  
    return df  

In [8]:
df_wildsi_species_updated = update_species_name(df_wildsi_species) 

In [9]:
df_wildsi_species = update_domain_name(df_wildsi_species_updated)

### The total number of unique species, filtered

In [10]:
df_wildsi_species = df_wildsi_species[['TAXID', 'NCBI_domain_name', 'NCBI_species_name']]

In [13]:
df_wildsi_species.TAXID = df_wildsi_species.TAXID.astype(int)
df_wildsi_species = df_wildsi_species.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_wildsi_species.TAXID = df_wildsi_species.TAXID.astype(int)


In [14]:
df_wildsi_species.NCBI_domain_name.value_counts()

NCBI_domain_name
Eukaryota       33259
Bacteria         4937
Viruses          4684
Archaea           147
Unidentified       18
Name: count, dtype: int64

In [16]:
df_wildsi_species[df_wildsi_species.NCBI_domain_name.isna()]

Unnamed: 0,TAXID,NCBI_domain_name,NCBI_species_name
267285,1026341,,Unidentified
267292,1812256,,Unidentified
267295,370543,,Unidentified
267302,52163,,Unidentified
267322,1565112,,Unidentified
...,...,...,...
36753961,2925717,,Unidentified
36756677,2925237,,Unidentified
36757813,2893375,,Unidentified
36760498,2905984,,Unidentified


In [17]:
df_wildsi_species

Unnamed: 0,TAXID,NCBI_domain_name,NCBI_species_name
0,2132616,Eukaryota,Delia sp.
1,2697049,Viruses,Severe acute respiratory syndrome coronavirus 2
2,1312858,Viruses,Influenza A virus (A/swine/Ohio/A01349485/2013...
3,1472648,Eukaryota,Aquilaria agallochum
6,36312,Eukaryota,Hynobius retardatus
...,...,...,...
36753961,2925717,,Unidentified
36756677,2925237,,Unidentified
36757813,2893375,,Unidentified
36760498,2905984,,Unidentified


In [18]:
df_wildsi_species.to_csv('../../data/processed_data/2024-08-06_df_wildsi_species_with_domain_no_dups.csv')

## Collect NCBI Taxonomy data with biopython

In [2]:
df_wildsi_species = pd.read_csv('../../data/processed_data/2024-08-06_df_wildsi_species.csv')
#df_wildsi_species[df_wildsi_species.CODE=='TGN'] #various organisms
df_wildsi_species = df_wildsi_species[df_wildsi_species.TAXID.notna()]
all_species_ncbi_ids = df_wildsi_species.TAXID.astype(int).drop_duplicates().to_list()

In [3]:
all_species_ncbi_ids_test = all_species_ncbi_ids[0:1500]
with open('taxids_file_test.csv', 'w') as text_file:  
    for item in all_species_ncbi_ids_test:  
        text_file.write(f"{item}\n")  

In [4]:
with open('taxids_file_full_list.csv', 'w') as text_file:  
    for item in all_species_ncbi_ids:  
        text_file.write(f"{item}\n")  

In [3]:
from Bio import Entrez  
import time  
import random  

# Initialize Entrez  
Entrez.email = "homo.korvin@gmail.com"  # Replace with your email  

def parse_ncbi_taxids(taxid):  
    try:  
        # Get the species name and taxonomic lineage  
        handle = Entrez.efetch(db='taxonomy', id=taxid, retmode='xml')  
        record = Entrez.read(handle)  
        handle.close()  
        species_name = record[0]['ScientificName']  
        
        taxonomic_lineage = []  
        for rank in record[0]['LineageEx']:  
            taxonomic_unit = rank['Rank'] if rank['Rank'] != '' else 'no rank'  
            taxonomic_name = rank['ScientificName']  
            taxonomic_lineage.append((taxonomic_unit, taxonomic_name))  
            
        # Extract the superkingdom  
        species_domain = None  
        for unit, name in taxonomic_lineage:  
            if unit == 'superkingdom':  
                species_domain = name  
                break  
        
        # Return results  
        return species_domain, species_name, taxonomic_lineage

    except Exception as e:  
        print(f"An error occurred: {e}")  
        return None, None, None

def batch_parse_ncbi_taxids(taxid_list):  
    species_names = {}
    domain_names = {}
    taxonomic_lineages = {}
    
    for taxid in taxid_list:  
        result = parse_ncbi_taxids(taxid)  
        
        if result:  
            
            species_names[taxid] = result[0]
            domain_names[taxid] = result[1]
            taxonomic_lineages[taxid] = result[2]
        # Add random sleep to lessen the chance of being banned  
        time.sleep(random.uniform(0.5, 2.0))  
        
    return species_names, domain_names, taxonomic_lineages

In [5]:
all_species_ncbi_ids_test

import time  

start_time = time.process_time()  
results = batch_parse_ncbi_taxids(all_species_ncbi_ids_test)
end_time = time.process_time()  

print(f"Execution time: {end_time - start_time} seconds")  

An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: IncompleteRead(1862 bytes read)
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request
An error occurred: HTTP Error 400: Bad Request


KeyboardInterrupt: 

## Parse NCBI taxonomy data from dmp files

In [3]:
import pandas as pd  

def parse_dmp_file(file_path, columns):  
    """Reads a .dmp file and returns the DataFrame after cleaning."""  
    with open(file_path, 'r') as file:  
        lines = file.readlines()  

    # Process lines to split using the specified separator and remove line terminators  
    processed_lines = [line.strip().split("\t|\t") for line in lines]  

    # Convert processed lines to DataFrame  
    return pd.DataFrame(processed_lines, columns=columns)  

def get_taxonomy_info(nodes_file_path, names_file_path):  
    # Define columns for nodes.dmp and names.dmp  
    nodes_columns = [  
        "tax_id", "parent_tax_id", "rank", "embl_code", "division_id",  
        "inherited_div_flag", "genetic_code_id", "inherited_GC_flag",  
        "mitochondrial_genetic_code_id", "inherited_MGC_flag",  
        "GenBank_hidden_flag", "hidden_subtree_root_flag", "comments"  
    ]  
    names_columns = ["tax_id", "name_txt", "unique_name", "name_class"]  

    # Parse the files  
    nodes_df = parse_dmp_file(nodes_file_path, nodes_columns)  
    names_df = parse_dmp_file(names_file_path, names_columns) 
    
    return nodes_df, names_df

In [4]:
# Example usage:  
#[9606, 10090]  # Human and Mouse  
nodes_file = '../fetch_ncbi_taxonomy/taxdmp/nodes.dmp'  
names_file = '../fetch_ncbi_taxonomy/taxdmp/names.dmp'  

taxonomic_lineage, species_names = get_taxonomy_info(nodes_file, names_file)

In [5]:
all_species_ncbi_ids_str = [str(x) for x in all_species_ncbi_ids]

In [6]:
ncbi_species_ids_and_names = species_names[(species_names.tax_id.isin(all_species_ncbi_ids_str))&(species_names.name_class=='scientific name\t|')]
ncbi_species_ids_and_names = ncbi_species_ids_and_names[['tax_id', 'name_txt']]

In [7]:
ncbi_species_ids_and_names.columns = ['species_id_ncbi', 'species_name_ncbi']
ncbi_species_ids_and_names

Unnamed: 0,species_id_ncbi,species_name_ncbi
20,7,Azorhizobium caulinodans
32,9,Buchnera aphidicola
41,11,Cellulomonas gilvus
50,14,Dictyoglomus thermophilum
68,17,Methylophilus methylotrophus
...,...,...
4085087,3161927,Chthoniidae sp. Biologic-PSEU009
4085090,3161928,Chthoniidae sp. Biologic-PSEU005
4085092,3161929,Chthoniidae sp. Biologic-PSEU008
4085094,3161930,Chthoniidae sp. Biologic-PSEU006


In [8]:
taxonomic_lineage[(taxonomic_lineage['rank'] == 'superkingdom')]

Unnamed: 0,tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments
1,2,131567,superkingdom,,0,0,11,0,0,0,0,0,\t|
1697,2157,131567,superkingdom,,0,0,11,0,0,0,0,0,\t|
2182,2759,131567,superkingdom,,1,0,1,0,1,0,0,0,\t|
8235,10239,1,superkingdom,,9,0,1,0,0,0,0,0,\t|


In [9]:
taxonomic_lineage[taxonomic_lineage.tax_id == '2']

Unnamed: 0,tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments
1,2,131567,superkingdom,,0,0,11,0,0,0,0,0,\t|


In [10]:
supekingdom_ids = [2,2157,2759,10239]
supekingdom_ids = [str(x) for x in supekingdom_ids]
species_names[(species_names['tax_id'].isin(supekingdom_ids))&(species_names.name_class=='scientific name\t|')]

Unnamed: 0,tax_id,name_txt,unique_name,name_class
2,2,Bacteria,Bacteria <bacteria>,scientific name\t|
21122,2157,Archaea,,scientific name\t|
22826,2759,Eukaryota,,scientific name\t|
43160,10239,Viruses,,scientific name\t|


In [15]:
def add_superkingdom_info(lineage_df, names_df, species_df):  
    # Filter names to only include scientific names  
    scientific_names_df = names_df[names_df['name_class'] == 'scientific name\t|']  

    # Create a dictionary to store superkingdom information for each tax_id  
    tax_id_to_superkingdom = {}  

    # Traverse each tax_id to find its superkingdom  
    for idx, row in species_df.iterrows():  
        tax_id = row['species_id_ncbi']  
        current_tax_id = tax_id  
        superkingdom_tax_id = None  
        superkingdom_name = None  
        
        # Traverse up the tree to find the superkingdom  
        while current_tax_id != '1':  
            current_node = lineage_df[lineage_df['tax_id'] == current_tax_id]
            #print(current_node)
    
            if current_tax_id == '2':  
                superkingdom_tax_id = 2
                superkingdom_name = 'Bacteria'
                break
            elif current_tax_id == '2157':  
                superkingdom_tax_id = 2157
                superkingdom_name = 'Archaea' 
                break
            elif current_tax_id == '2759':  
                superkingdom_tax_id = 2759
                superkingdom_name = 'Eukaryota'  
                break
            elif current_tax_id == '10239':  
                superkingdom_tax_id = 10239
                superkingdom_name = 'Viruses'
                break
            elif current_node.empty:  
                break  
                
            current_tax_id = current_node.iloc[0]['parent_tax_id']  
        
        if superkingdom_tax_id:  
            tax_id_to_superkingdom[tax_id] = (superkingdom_tax_id, superkingdom_name)  

    # Get the superkingdom for each species in the species dataframe  
    species_df['tax_id_superkingdom'] = species_df['species_id_ncbi'].apply(lambda x: tax_id_to_superkingdom.get(x, (None, None))[0])  
    species_df['superkingdom_name'] = species_df['species_id_ncbi'].apply(lambda x: tax_id_to_superkingdom.get(x, (None, None))[1])  

    return species_df  

In [17]:
# taxonomic_lineage, species_names
import time  

start_time = time.process_time()  

ncbi_species_ids_and_names_and_domain = add_superkingdom_info(taxonomic_lineage, 
                                                              species_names,
                                                              ncbi_species_ids_and_names.tail(150))
end_time = time.process_time()  

print(f"Execution time: {end_time - start_time} seconds")  

Execution time: 361.604487896 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  species_df['tax_id_superkingdom'] = species_df['species_id_ncbi'].apply(lambda x: tax_id_to_superkingdom.get(x, (None, None))[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  species_df['superkingdom_name'] = species_df['species_id_ncbi'].apply(lambda x: tax_id_to_superkingdom.get(x, (None, None))[1])


In [18]:
ncbi_species_ids_and_names_and_domain

Unnamed: 0,species_id_ncbi,species_name_ncbi,tax_id_superkingdom,superkingdom_name
4083733,3158951,Metrioptera ambigua,2759,Eukaryota
4084227,3159452,Origocandona grommike,2759,Eukaryota
4084236,3159464,Neodiplostomum spathoides,2759,Eukaryota
4084237,3159465,Besnoitia sp. GMore-2024a,2759,Eukaryota
4084238,3159466,Zeugodacus cucurbitae noda-like virus 2,10239,Viruses
...,...,...,...,...
4085087,3161927,Chthoniidae sp. Biologic-PSEU009,2759,Eukaryota
4085090,3161928,Chthoniidae sp. Biologic-PSEU005,2759,Eukaryota
4085092,3161929,Chthoniidae sp. Biologic-PSEU008,2759,Eukaryota
4085094,3161930,Chthoniidae sp. Biologic-PSEU006,2759,Eukaryota


## Parse NCBI taxonomy data from zip files

In [7]:
import os  
import zipfile  
import pandas as pd  

# Directory containing ZIP files  
zip_directory = '/home/erikz/SRC/research/paul/dsi_origins_proj/data/processed_data/ncbi-lineage-zip-files'  

# Create a list to store dataframes  
dataframes = []  

# Iterate over all files in the specified directory  
for file_name in os.listdir(zip_directory):  
    if file_name.endswith('.zip'):  
        # Construct the full path to the zip file  
        file_path = os.path.join(zip_directory, file_name)  
        
        with zipfile.ZipFile(file_path, 'r') as zip_ref:  
            # Extract all contents to a temporary directory  
            zip_ref.extractall('tmp')  

            # Construct the path to the taxonomy_summary.tsv file  
            tsv_path = os.path.join('tmp', 'ncbi_dataset', 'data', 'taxonomy_summary.tsv')  

            # Check if the file exists  
            if os.path.exists(tsv_path):  
                # Read the TSV file into a DataFrame  
                df = pd.read_csv(tsv_path, sep='\t')  
                # Append the DataFrame to the list  
                dataframes.append(df)  

# Concatenate all DataFrames into a single DataFrame  
result_df = pd.concat(dataframes, ignore_index=True)  

# Save the resulting DataFrame to a CSV file in the zip file directory  
result_df.to_csv(os.path.join(zip_directory, 'results_taxonomic_lineage_ncbi.csv'), index=False)  

# Clean up by removing the temporary directory  
import shutil  
shutil.rmtree('tmp')  

### Domain info

In [2]:
df_wildsi_species = pd.read_csv('../../data/processed_data/2024-08-06_df_wildsi_species.csv')

In [3]:
df_wildsi_species = df_wildsi_species[['TAXID', 'CODE', 'ORGANISM']].drop_duplicates()

In [4]:
df_ncbi_lineage = pd.read_csv('../../data/processed_data/ncbi-lineage-zip-files/results_taxonomic_lineage_ncbi.csv')

  df_ncbi_lineage = pd.read_csv('../../data/processed_data/ncbi-lineage-zip-files/results_taxonomic_lineage_ncbi.csv')


In [5]:
df_ncbi_lineage = df_ncbi_lineage[['Taxid', 'Rank', 'Group name', 'Superkingdom name',
                                  'Kingdom name', 'Phylum name', 'Class name', 'Order name',
                                  'Family name', 'Genus name', 'Species name']]

df_ncbi_lineage = df_ncbi_lineage.drop_duplicates(subset='Taxid', keep='first')

In [6]:
df_wildsi_species.columns = ['Taxid', 'CODE', 'ORGANISM']

In [7]:
df_wildsi_species = df_wildsi_species[df_wildsi_species.Taxid.notna()]
df_ncbi_lineage = df_ncbi_lineage[df_ncbi_lineage.Taxid.notna()]
df_wildsi_species.Taxid = df_wildsi_species.Taxid.astype(int)
df_ncbi_lineage.Taxid = df_ncbi_lineage.Taxid.astype(int)

In [8]:
df_wildsi_species = df_wildsi_species.drop_duplicates(subset='Taxid', keep='first')

In [9]:
df_wildsi_species.Taxid.value_counts()

Taxid
2132616    1
1676019    1
1678791    1
1695213    1
1695209    1
          ..
310543     1
1816110    1
103249     1
1537424    1
1838057    1
Name: count, Length: 1532757, dtype: int64

In [10]:
df_wildsi_with_ncbi_lineage = df_wildsi_species.merge(df_ncbi_lineage, how='left', on='Taxid')

In [11]:
df_wildsi_with_ncbi_lineage.Taxid.value_counts()

Taxid
2132616    1
1676019    1
1678791    1
1695213    1
1695209    1
          ..
310543     1
1816110    1
103249     1
1537424    1
1838057    1
Name: count, Length: 1532757, dtype: int64

In [12]:
df_wildsi_with_ncbi_lineage['Superkingdom name'].value_counts()

Superkingdom name
Eukaryota    1068788
Bacteria      249329
Viruses       201677
Archaea         5325
Name: count, dtype: int64

## Save wildsi species with ncbi lineage info

In [14]:
df_wildsi_with_ncbi_lineage.to_csv('../2024-08-28_df_wildsi_with_ncbi_lineage.csv')