# Notebook for NIST 

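This notebook generates the NIST expected files: for each data column read from `expected.xlsx`, it writes a species-level CSV (`<column>_expected_species.csv`) and a genus-level CSV (`<column>_expected_genus.csv`).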

In [None]:
import pandas as pd
import openpyxl
import sys
import os

sys.path.append("../..")

from utils.ncbi.names import convert_expected, split_jams

In [None]:
# Load the selected columns of expected.xlsx, index the rows by the
# "Organism" column, and expose that index under the name "Species".
df = (
    pd.read_excel('expected.xlsx', engine='openpyxl', usecols=[2, 4, 5, 6, 7, 8])
    .set_index('Organism')
    .rename_axis("Species")
)
display(df)

def convert_species_to_genus(df):
    """Collapse a species-level table into a genus-level table.

    The genus of each row is the text before the first space in its index
    label; rows that share a genus are summed column-wise.

    Args:
        df: DataFrame whose index holds species name strings and which has an
            "RA" column. The name of the index does not matter, and the input
            frame is not modified.

    Returns:
        A new DataFrame whose index is named "Genus" and holds the per-genus
        sums.

    Raises:
        KeyError: if the aggregated frame has no "RA" column.
    """
    # Everything before the first space of the label is used as the genus.
    genus_labels = df.index.str.split(" ", n=1).str[0]

    # Name the resulting index directly. The previous reset_index / rename /
    # set_index round trip only worked when the incoming index happened to be
    # called "Species" and raised KeyError otherwise.
    genus_df = df.groupby(genus_labels).sum().rename_axis("Genus")

    # Print the RA total so it can be checked by eye; nothing is enforced here.
    ra_sum = genus_df["RA"].sum()
    print(ra_sum)

    return genus_df

In [None]:
# Split the dataframe into one per column.
def main():
    """Write species- and genus-level expected CSVs for every column of `df`.

    For each column of the module-level `df`, a one-column frame with the
    header "RA" is saved as "<column>_expected_species.csv", aggregated with
    convert_species_to_genus, and saved again as
    "<column>_expected_genus.csv". Each file is handed to convert_expected
    together with split_jams right after it is written.
    """
    for column in df.columns.tolist():
        # One-column view of the table, relabelled to the shared "RA" header.
        species_df = df[[column]].set_axis(['RA'], axis=1)

        species_csv = f"{column}_expected_species.csv"
        genus_csv = f"{column}_expected_genus.csv"

        # Species level: write the file, then post-process it in place.
        species_df.to_csv(species_csv, index=True)
        # NOTE(review): the original comment says this step adds TAX_ID; the
        # helper lives in utils.ncbi.names and is not visible here.
        convert_expected(species_csv, split_jams)

        # Genus level: aggregate, write the file, then post-process it too.
        genus_level = convert_species_to_genus(species_df)
        genus_level.to_csv(genus_csv, index=True)
        convert_expected(genus_csv, split_jams)

In [None]:
# Entry point: run main() to write the expected species and genus CSV files.
main()