# Make New Expected from Old Expected
This notebook was used to regenerate the expected taxonomy files from the previous expected files after the NCBI taxonomy database was updated.

In [None]:
import os
from ncbi.names import convert_expected, generate_names_df
import pandas as pd
from typing import List
import re

In [None]:
def underscore_split(exp_df: pd.DataFrame) -> List[List[str]]:
    """Tokenise the underscore-delimited index labels of ``exp_df``.

    Before splitting, an ``sp`` that sits between two underscores, or that
    follows an underscore at the end of the label, is rewritten as ``sp.``.

    Args:
        exp_df: Frame whose index holds the underscore-separated names.

    Returns:
        One list of tokens per index label, in index order.
    """
    tokenised = []
    for label in exp_df.index.tolist():
        # Interior placeholder: "_sp_" becomes "_sp._".
        label = re.sub("_sp_", "_sp._", label)
        # Trailing placeholder: a label ending in "_sp" gains the dot as well.
        label = re.sub("_sp$", "_sp.", label)
        tokenised.append(label.split("_"))
    return tokenised

def whitespace_split(exp_df: pd.DataFrame) -> List[List[str]]:
    """Tokenise the index labels of ``exp_df`` on runs of whitespace.

    Args:
        exp_df: Frame whose index holds the whitespace-separated names.

    Returns:
        One list of tokens per index label, in index order.
    """
    return [label.split() for label in exp_df.index.tolist()]

# Name-splitting function to use for each data source. drop_id_and_reannotate
# looks an entry up by the second-to-last component of the output path, i.e.
# the name of the directory that directly contains the expected file.
split_dict = {
    "hilo": underscore_split,
    "mixed": underscore_split,
    "bmock12": whitespace_split,
    "camisimGI": whitespace_split,
    "nist": underscore_split,
    "tourlousse": underscore_split
}

# replacement_dict = {
#     "Treponema caldarium": "Gracilinema caldarium",
#     "Pseudomonas stutzeri": "Stutzerimonas stutzeri",
# }

In [None]:
# Locate every file under the target directory whose name ends with
# "expected_species_annotated.csv".
target_dir = "../pipelines_old"


def find_expected(dir: str):
    """Lazily yield the path of each matching CSV found beneath ``dir``.

    The tree rooted at ``dir`` is walked with ``os.walk``; a file matches
    when its name ends with ``expected_species_annotated.csv``.

    Args:
        dir: Root directory to search.

    Yields:
        The walked directory path joined with the matching file name.
    """
    wanted_suffix = "expected_species_annotated.csv"
    for parent, _subdirs, filenames in os.walk(dir):
        yield from (
            os.path.join(parent, filename)
            for filename in filenames
            if filename.endswith(wanted_suffix)
        )

def drop_id_and_reannotate(f):
    """Re-annotate the expected-species file that corresponds to ``f``.

    The output path is derived from ``f`` by replacing every occurrence of
    ``pipelines_old`` with ``expected_pipelines`` and of
    ``expected_species_annotated`` with ``expected_species``. The name of the
    directory that directly contains that path identifies the data source and
    selects the splitting function handed to ``convert_expected``.

    Both the output path and the source name are printed as progress output.

    Args:
        f: Path to an ``*expected_species_annotated.csv`` file.

    Raises:
        KeyError: If the source directory name has no entry in ``split_dict``.
    """
    new_path = f.replace("pipelines_old", "expected_pipelines")
    new_path = new_path.replace("expected_species_annotated", "expected_species")

    print(new_path)

    new_df = pd.read_csv(f, sep=",", index_col=0)

    # Drop the second non-index column (presumably tax_id -- confirm against
    # the CSV layout).
    new_df.drop(new_df.columns[1], axis=1, inplace=True)

    # NOTE(review): the save below is disabled, so the trimmed frame is never
    # written. convert_expected presumably needs new_path to exist already --
    # confirm before running against a fresh output directory.
    # new_df.to_csv(new_path, index_label="Species")

    # Take the parent directory name through os.path rather than splitting on
    # "/": the incoming paths are built with os.path.join, which uses the
    # platform separator.
    source = os.path.basename(os.path.dirname(new_path))
    print(source)

    convert_expected(new_path, split_func=split_dict[source])

def copy_to_expected_dir(f: str):
    """Rewrite the CSV at ``f`` into the ``expected_pipelines`` tree.

    The destination is ``f`` with every occurrence of ``pipelines_old``
    replaced by ``expected_pipelines``. The file is loaded with its first
    column as the index and written back with that index labelled
    ``Species``.

    Args:
        f: Path to the source CSV file.
    """
    destination = f.replace("pipelines_old", "expected_pipelines")
    pd.read_csv(f, sep=",", index_col=0).to_csv(destination, index_label="Species")

# Driver: walk target_dir and process each expected-species file found there.
for f in find_expected(target_dir):
    # Now, we are going to reannotate the files.
    drop_id_and_reannotate(f)