We need one sequence for each species to use as a reference in the computation of ANI.

In [2]:
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter

# Notebook configuration: which experiment's cross-validation results to read.
KMER = 6  # k-mer size; selects the "<KMER>mer" experiments directory
NAME_EXPERIMENT = "07_25_2024-autoencoder"
DIR_EXP = Path(f"../experiments-paper/{KMER}mer")

# Root folder of the selected experiment
path_experiment = DIR_EXP / NAME_EXPERIMENT

In [3]:
# Load the ground-truth label of every test sample from all cross-validation folds.
# Each fold file is expected to contain one "<path_fcgr>\t<label>" pair per line.

dir_cv = path_experiment.joinpath("cross-validation")

# Sorted so the row order of df_labels is deterministic; rows are later matched
# positionally with the saved prediction probabilities.
kfold_test = sorted(dir_cv.glob("test*"))
if not kfold_test:
    # Fail fast instead of silently building an empty table
    raise FileNotFoundError(f"No 'test*' fold files found in {dir_cv}")

paths_fcgr = []
labels = []

for kfold in kfold_test:
    with open(kfold) as fp:
        # iterate lazily instead of loading the whole file with readlines()
        for line in fp:
            line = line.strip()
            if not line:
                # tolerate blank (e.g. trailing) lines
                continue
            path_fcgr, label = line.split("\t")
            paths_fcgr.append(path_fcgr)
            labels.append(label)

df_labels = pd.DataFrame({
    "path_fcgr": paths_fcgr,
    "label": labels
})
# sample id = file name without extension; tarfile = name of the parent folder
df_labels["sample_id"] = df_labels["path_fcgr"].apply(lambda s: Path(s).stem)
df_labels["tarfile"] = df_labels["path_fcgr"].apply(lambda s: Path(s).parent.stem)
df_labels.head()

Unnamed: 0,path_fcgr,label,sample_id,tarfile
0,data/fcgr/6mer/salmonella_enterica__20/SAMN060...,salmonella_enterica,SAMN06034028,salmonella_enterica__20
1,data/fcgr/6mer/escherichia_coli__21/SAMN097384...,escherichia_coli,SAMN09738459,escherichia_coli__21
2,data/fcgr/6mer/salmonella_enterica__46/SAMN103...,salmonella_enterica,SAMN10391382,salmonella_enterica__46
3,data/fcgr/6mer/neisseria_meningitidis__02/SAME...,neisseria_meningitidis,SAMEA2147963,neisseria_meningitidis__02
4,data/fcgr/6mer/salmonella_enterica__18/SAMN053...,salmonella_enterica,SAMN05357778,salmonella_enterica__18


Number of sequences by label

In [4]:
# Tally how many test sequences carry each ground-truth label
count_by_label = Counter(df_labels["label"].tolist())

Mapping from the integer class index (as encoded by the model) to its label.
The indices were generated by collecting all labels and then sorting them.

In [5]:
# Sorted list of distinct labels; a label's position in it is its integer code
unique_labels = sorted(df_labels["label"].unique())
dict_int2label = dict(enumerate(unique_labels))

In [6]:
# Attach the predicted label and its probability to every test sample.
path_cf = path_experiment.joinpath("cross-validation/confident-learning")

pred_probs = np.load(path_cf.joinpath("pred_probs.npy"))

# Rows are matched positionally with df_labels and columns with dict_int2label.
# A shape mismatch would otherwise decode class indices to the wrong labels
# without raising (when the array has fewer columns than there are labels).
expected_shape = (len(df_labels), len(dict_int2label))
if pred_probs.shape != expected_shape:
    raise ValueError(
        f"pred_probs has shape {pred_probs.shape}, expected {expected_shape} "
        "(n_samples, n_labels)"
    )

# Most probable class per sample, decoded to its label, plus that probability
preds = [dict_int2label[int(x)] for x in pred_probs.argmax(axis=1)]
df_labels["pred"] = preds
df_labels["prob"] = pred_probs.max(axis=1)
len(preds)

657775

# Find the most confident correct prediction for each label

In [7]:
# Preview the first rows, now including the `pred` and `prob` columns
df_labels.head(n=5)

Unnamed: 0,path_fcgr,label,sample_id,tarfile,pred,prob
0,data/fcgr/6mer/salmonella_enterica__20/SAMN060...,salmonella_enterica,SAMN06034028,salmonella_enterica__20,salmonella_enterica,0.999959
1,data/fcgr/6mer/escherichia_coli__21/SAMN097384...,escherichia_coli,SAMN09738459,escherichia_coli__21,escherichia_coli,0.999406
2,data/fcgr/6mer/salmonella_enterica__46/SAMN103...,salmonella_enterica,SAMN10391382,salmonella_enterica__46,salmonella_enterica,0.999947
3,data/fcgr/6mer/neisseria_meningitidis__02/SAME...,neisseria_meningitidis,SAMEA2147963,neisseria_meningitidis__02,neisseria_meningitidis,0.999588
4,data/fcgr/6mer/salmonella_enterica__18/SAMN053...,salmonella_enterica,SAMN05357778,salmonella_enterica__18,salmonella_enterica,0.999957


In [8]:
# Flag samples whose predicted label equals the ground-truth label.
# Vectorised column comparison: same boolean result as a row-wise apply,
# without a Python-level call per row.
df_labels["true_positive"] = df_labels["label"] == df_labels["pred"]

In [9]:
# For every label keep the correctly classified sample with the highest
# predicted probability; labels with no correct prediction go to `no_data`.
list_best_classified = []
no_data = []

# Boolean masks instead of string-built `query` expressions, so labels that
# contain quotes (or other special characters) are handled correctly.
is_correct = df_labels["label"] == df_labels["pred"]

for label in unique_labels:
    probs = df_labels.loc[is_correct & (df_labels["label"] == label), "prob"]

    if probs.empty:
        # explicit check replaces the bare `except:` that hid every error
        no_data.append(label)
        continue

    # idxmax gives the index of the top probability without sorting the subset
    best_classified = df_labels.loc[probs.idxmax()].to_dict()
    list_best_classified.append(best_classified)

In [10]:
# One row per label: its selected reference sample
df_best_classified = pd.DataFrame(data=list_best_classified)

In [11]:
# Save the reference table as a tab-separated file, without the index column
df_best_classified.to_csv(
    path_or_buf="../data/reference_list.txt",
    index=False,
    sep="\t",
)

In [14]:
# Write one text file per tarfile listing the reference sequences it contains,
# one "<tarfile>/<sample_id>.fa" entry per line.
dir_lists_by_tar = Path("../data/reference_sequences/lists_by_tar")
# Create the output folder if needed so a fresh checkout does not fail on open()
dir_lists_by_tar.mkdir(parents=True, exist_ok=True)

# A single groupby pass replaces one string-built `query` per tarfile;
# sort=False keeps the tarfiles in order of first appearance.
for tarfile, df_tar in df_best_classified.groupby("tarfile", sort=False):

    with open(dir_lists_by_tar / f"{tarfile}.txt", "w") as fp:

        for sample_id in df_tar["sample_id"].tolist():
            fp.write(f"{tarfile}/{sample_id}.fa\n")