In [3]:
import json
import pathlib
import numpy as np
import pandas as pd
from cleanlab.outlier import OutOfDistribution   
from cleanlab.rank import find_top_issues

In [4]:
# Load the tab-separated krakenbracken table, keeping only the sample ID
# and the columns V2 through V7.
path = "/data/bacteria/File1_full_krakenbracken.txt"
krakenbracken = pd.read_csv(
    path,
    sep="\t",
    usecols=["sample_id", *(f"V{i}" for i in range(2, 8))],
)

In [5]:
# Experiment directory that holds the saved embeddings used in this notebook.
PATH_TRAIN = "/data/bacteria/experiments/autoencoders/6mer/27122023-5"
KMER = 6

# Query embeddings saved under PATH_TRAIN, cast to float32.
xq = np.load(
    pathlib.Path(PATH_TRAIN) / "faiss-embeddings" / "query_embeddings.npy"
).astype(np.float32)

In [6]:
# Embedding matrices saved under PATH_TRAIN, both cast to float32.
train_feature_embeddings = np.load(
    pathlib.Path(PATH_TRAIN) / "faiss-embeddings" / "embeddings.npy"
).astype(np.float32)
test_feature_embeddings = np.load(
    pathlib.Path(PATH_TRAIN) / "faiss-embeddings" / "query_embeddings.npy"
).astype(np.float32)

# Fit the estimator on the train embeddings and score them in the same call.
ood = OutOfDistribution()
ood_train_feature_scores = ood.fit_score(features=train_feature_embeddings)

# Score the test embeddings with the estimator fitted above.
ood_test_feature_scores = ood.score(features=test_feature_embeddings)

Fitting OOD estimator based on provided features ...


___
## Train outliers

In [7]:
# Indices of the 20 training samples that find_top_issues ranks first
# when given the train OOD scores.
top_train_ood_features_idxs = find_top_issues(
    quality_scores=ood_train_feature_scores,
    top=20,
)

In [8]:
# Show the selected training-set indices as the cell output.
top_train_ood_features_idxs

array([ 66932,  57040,   1699,  48858,  47255,    622,  75290,  87763,
        72004,  61354,   3707,  89564,  10104, 106937, 105761,  86574,
        26259,  87881,  61374,  93858])

In [9]:
# Read the JSON file stored next to the train embeddings; its keys are
# converted to integer indices in a later cell.
with pathlib.Path(PATH_TRAIN, "faiss-embeddings", "id_embeddings.json").open("r") as fp:
    id_train = json.load(fp)

In [10]:
# JSON object keys are always strings; convert them to int so the integer
# indices in top_train_ood_features_idxs can be used for lookup.
id_train = {int(k): v for k, v in id_train.items()}

# Entry stored for each selected training index, in the same order as the indices.
list_files_outlier_train = [id_train[idx] for idx in top_train_ood_features_idxs]

# Keep only the stem (final path component without its last suffix) of each entry.
ids_outlier_train = [pathlib.Path(p).stem for p in list_files_outlier_train]

In [21]:
# Rows of krakenbracken whose sample_id is one of the outlier training IDs,
# ordered by V3.
# Filter with isin() instead of formatting the Python list into the query
# string: the selection is the same, but it no longer depends on how the IDs
# are repr()'d (quotes/backslashes) and it is well-defined for an empty list.
# IDs that do not occur in krakenbracken simply produce no row.
krakenbracken[krakenbracken["sample_id"].isin(ids_outlier_train)].sort_values("V3")

Unnamed: 0,sample_id,V2,V3,V4,V5,V6,V7
274931,SAMN02199272,Mannheimia haemolytica,2.328,Staphylococcus aureus,1.927,Pantoea sp. PSNIH1,1.902
285675,SAMN02440684,Mannheimia haemolytica,2.385,Staphylococcus aureus,2.264,Pantoea sp. PSNIH1,1.919
274631,SAMN02194946,Prochlorococcus marinus,4.714,Campylobacter coli,4.339,Staphylococcus aureus,3.68
287764,SAMN02585027,Mycoplasma bovis,7.453,Mycoplasma bovirhinis,6.194,Mycoplasma cynos,5.787
269019,SAMN00792234,Porphyromonas gingivalis,10.559,Tannerella sp. oral taxon HOT-286,6.055,Porphyromonas asaccharolytica,5.47
277874,SAMN02344579,Porphyromonas gingivalis,10.671,Tannerella sp. oral taxon HOT-286,5.202,Porphyromonas asaccharolytica,3.753
342328,SAMN03854411,Porphyromonas gingivalis,11.352,Tannerella sp. oral taxon HOT-286,5.504,Porphyromonas asaccharolytica,4.805
113025,SAMEA2275256,Campylobacter coli,11.854,Campylobacter hyointestinalis,8.873,Campylobacter jejuni,8.074
132747,SAMEA2613127,Campylobacter coli,15.175,Campylobacter hyointestinalis,12.679,Campylobacter iguaniorum,5.85
455482,SAMN06455630,Roseburia hominis,18.801,Faecalibacterium prausnitzii,18.222,Bacillus subtilis,6.716


## Test outliers

In [13]:
# Indices of the 20 test samples that find_top_issues ranks first when
# given the test OOD scores; the trailing expression shows them as output.
top_test_ood_features_idxs = find_top_issues(
    quality_scores=ood_test_feature_scores,
    top=20,
)
top_test_ood_features_idxs

array([ 4239,  5999,  9095,  6169, 11595, 11907,  3615, 11827, 11320,
       12012, 11391,  8642,  2874,  2317,  8993,  9364,  6793,  6326,
       11103,  9336])

In [19]:
# Rank on the negated scores so the ordering is reversed: this selects the
# 20 test samples at the opposite end of the score range from the outliers.
find_top_issues(
    quality_scores=np.negative(ood_test_feature_scores),
    top=20,
)


array([  858,  4698,  5734,  2179,   806, 12324,  7569,  5135,  6411,
       12264,  5917,  5961,   124,  4542,  8554,  5819,  5262,  9121,
        5838,  8063])

In [17]:
# Read the JSON file stored next to the query (test) embeddings.
with open(pathlib.Path(PATH_TRAIN).joinpath("faiss-embeddings/id_query_embeddings.json"), "r") as fp:
    id_test = json.load(fp)

# JSON object keys are always strings; convert them to int so the integer
# indices in top_test_ood_features_idxs can be used for lookup.
id_test = {int(k): v for k, v in id_test.items()}

# Entry stored for each selected test index, in the same order as the indices.
list_files_outlier_test = [id_test[idx] for idx in top_test_ood_features_idxs]

# Keep only the stem (final path component without its last suffix) of each entry.
ids_test_outlier = [pathlib.Path(p).stem for p in list_files_outlier_test]

In [20]:
# Rows of krakenbracken whose sample_id is one of the outlier test IDs,
# ordered by V3.
# Filter with isin() instead of formatting the Python list into the query
# string: the selection is the same, but it no longer depends on how the IDs
# are repr()'d (quotes/backslashes) and it is well-defined for an empty list.
# IDs that do not occur in krakenbracken simply produce no row.
krakenbracken[krakenbracken["sample_id"].isin(ids_test_outlier)].sort_values(by="V3")

Unnamed: 0,sample_id,V2,V3,V4,V5,V6,V7
577500,SAMN09074685,Enterobacter cloacae,1.55,Methylophaga frappieri,1.121,Oceanicoccus sagamiensis,1.118
393572,SAMN05216520,Erysipelothrix rhusiopathiae,2.29,Anaerostipes hadrus,2.271,Staphylococcus aureus,2.108
274864,SAMN02199114,Stenotrophomonas maltophilia,6.392,Stenotrophomonas acidaminiphila,3.661,Pseudoxanthomonas suwonensis,3.651
413494,SAMN05660668,Roseburia hominis,7.1,[Eubacterium] rectale,6.179,Butyrivibrio hungatei,5.388
455471,SAMN06455487,Roseburia hominis,8.614,Lachnoclostridium phocaeense,4.367,Burkholderia cenocepacia,3.986
413432,SAMN05660330,Bacillus thuringiensis,10.031,Burkholderia multivorans,4.477,Pseudomonas putida,2.191
195010,SAMEA3719298,Streptococcus suis,10.747,Streptococcus sp. 'group B',7.651,Streptococcus agalactiae,3.356
294104,SAMN02745516,Stenotrophomonas maltophilia,11.091,Sulfuricaulis limicola,10.19,Syntrophomonas wolfei,8.51
455460,SAMN06455237,Roseburia hominis,11.266,Lachnoclostridium phocaeense,6.904,Mordavella sp. Marseille-P3756,4.733
455428,SAMN06454752,Roseburia hominis,11.834,Fibrobacter succinogenes,7.201,Lachnoclostridium phocaeense,5.707
