In [1]:
import json
import pathlib
import numpy as np
import pandas as pd
from cleanlab.outlier import OutOfDistribution   
from cleanlab.rank import find_top_issues

In [2]:
# How many samples each find_top_issues call below is asked to return (passed as `top=`).
N_OUTLIERS = 400

# Tab-separated table (presumably Kraken/Bracken output, judging by the file name);
# only the sample id and the columns V2..V7 are read.
path = "/data/bacteria/File1_full_krakenbracken.txt"
krakenbracken = pd.read_csv(
    path,
    sep="\t",
    usecols=["sample_id", "V2", "V3", "V4", "V5", "V6", "V7"],
)

In [3]:
# Root directory of the experiment: the embeddings and id mappings are read from it and
# the outlier reports are written back under it by the cells visible in this notebook.
PATH_TRAIN="/data/bacteria/experiments/autoencoders/6mer/27122023-5"
# k-mer size; presumably matches the "6mer" segment of PATH_TRAIN — TODO confirm.
# NOTE(review): not referenced by any cell visible here.
KMER=6
# Query embeddings cast to float32.
# NOTE(review): the same file is loaded again as `test_feature_embeddings` in the OOD cell,
# and `xq` itself is not used by any cell visible here.
xq = np.load(f"{PATH_TRAIN}/faiss-embeddings/query_embeddings.npy").astype("float32")

In [4]:
# Load the train and query (test) embedding matrices as float32.
train_feature_embeddings = np.load(
    f"{PATH_TRAIN}/faiss-embeddings/embeddings.npy"
).astype("float32")
test_feature_embeddings = np.load(
    f"{PATH_TRAIN}/faiss-embeddings/query_embeddings.npy"
).astype("float32")

# Fit the out-of-distribution estimator on the train embeddings and score those
# same rows in a single call ...
ood = OutOfDistribution()
ood_train_feature_scores = ood.fit_score(features=train_feature_embeddings)

# ... then score the test embeddings with the estimator fitted above.
ood_test_feature_scores = ood.score(features=test_feature_embeddings)

Fitting OOD estimator based on provided features ...


___
## Train outliers

In [5]:
# Indexes into `ood_train_feature_scores` of the N_OUTLIERS entries that find_top_issues
# ranks as top issues (per the cleanlab docs: lowest quality score first — TODO confirm
# for the installed version).
top_train_ood_features_idxs = find_top_issues(quality_scores=ood_train_feature_scores, top=N_OUTLIERS)

In [6]:
# Display the selected train indexes; they are used below as keys into the
# index -> file mapping loaded from id_embeddings.json.
top_train_ood_features_idxs

array([ 66932,  57040,   1699,  48858,  47255,    622,  75290,  87763,
        72004,  61354,   3707,  89564,  10104, 106937, 105761,  86574,
        26259,  87881,  61374,  93858,  78503,  11685,  17487,  31313,
        70439, 100305,  21173,  14604,  19959,  64045,  85660,  28385,
        72398,  62344,   4392,  71329,  30391,  71180,   7273,  23200,
        54834,  26741,  92065,  77870,  36498,  89468,   4169,  56625,
        54155,   3265, 108852,  97354,  82620,  62916,  11135,  96536,
         1700,  53586,  34304,  88777,  83859, 109774,  16623,  24646,
        49501,  87823, 103167,  57711,   7733,  52718,  65733,  84296,
        30037,  48026,  62954,  89014,   7765, 106501,    392,  80258,
        33732, 105964,  23498,  46038,     72,  68827,  18229,  27647,
        65116,  60519,  64677,  26449,  30298,  79268,  10424,  57076,
        80704, 105923,  32830,  64470,    494,   4940, 104939,  68571,
        40663,  85740,  67993,  85758,  68717,  88245,  44216,  64819,
      

In [7]:
# Index -> file mapping stored next to the train embeddings. The keys come back from
# JSON as strings; the following cell converts them to int before using them.
with (pathlib.Path(PATH_TRAIN) / "faiss-embeddings/id_embeddings.json").open("r") as fp:
    id_train = json.load(fp)

In [8]:
# JSON object keys are always strings; convert them to int so the mapping can be
# indexed with the integer positions returned by find_top_issues.
id_train = {int(k): v for k, v in id_train.items()}

# Files behind the selected train indexes, in the same order as `top_train_ood_features_idxs`.
list_files_outlier_train = [id_train[idx] for idx in top_train_ood_features_idxs]

# Sample id = file name without directory and extension.
ids_outlier_train = [pathlib.Path(p).stem for p in list_files_outlier_train]

# Persist the raw list of outlier file paths.
with open(pathlib.Path(PATH_TRAIN).joinpath("paths_outliers_train.json"), "w") as fp:
    json.dump(list_files_outlier_train, fp)

# One row per outlier: sample id alongside the path it was derived from.
df_path_npy = pd.DataFrame(
    {"sample_id": ids_outlier_train, "path_npy": list_files_outlier_train}
)

# Select rows with a boolean mask rather than interpolating the id list into a
# `query` expression string: the string form breaks as soon as an id contains a quote
# and forces pandas to parse a very long literal. The join key is spelled out so the
# merge cannot silently change if either frame gains another shared column.
# NOTE(review): rows come out in `krakenbracken` order, not in outlier-rank order.
df_outliers_train = pd.merge(
    krakenbracken[krakenbracken["sample_id"].isin(ids_outlier_train)],
    df_path_npy,
    on="sample_id",
)

# to_csv does not create missing directories, so make sure the target folder exists.
path_outliers_train_csv = pathlib.Path(PATH_TRAIN).joinpath("test/outliers-train.csv")
path_outliers_train_csv.parent.mkdir(parents=True, exist_ok=True)
df_outliers_train.to_csv(path_outliers_train_csv)

## Test outliers

In [9]:
# Same selection as for the train set, applied to the test scores: indexes into
# `ood_test_feature_scores` of the N_OUTLIERS entries find_top_issues ranks as top issues
# (per the cleanlab docs: lowest quality score first — TODO confirm for the installed version).
top_test_ood_features_idxs = find_top_issues(quality_scores=ood_test_feature_scores, top=N_OUTLIERS)
# Display the selected test indexes.
top_test_ood_features_idxs

array([ 4239,  5999,  9095,  6169, 11595, 11907,  3615, 11827, 11320,
       12012, 11391,  8642,  2874,  2317,  8993,  9364,  6793,  6326,
       11103,  9336,  9557,  6094,  7860,  4555,  6784,  7256,  7774,
        3782,  8013,  3233,  9112,  9220,  1757, 10778,  5129,  3506,
        5403,  8886,  6087, 11333,  2096,  9868,  9027, 11936,  3981,
        5704,  9567,  4806,  9758, 11846,  7307,  6382,  8465,  1784,
        5404,  9710, 10735,  4774,  4132, 10130, 12046,  8187,  6732,
       11091,  4348,  9190,   443,  5514,  1009,   656,  4324, 11985,
        2895,  2930,  1446, 11069, 11967,  5586, 11480,  3685,  9102,
        8612,  3872,  4649,   697, 12045, 10978, 11541,  1928, 10294,
        6904,  3810, 10880,  1374,  4101,  9864,  2907,  7520,   644,
        1364,  7502,  6315,  6049,  9662,  8586,  3656,  1202,  9202,
        1551,  1435,  5864,  8284,  4635,  4559,   654,  8413,  8362,
        4773,  6848,  4193,  1892,   751,  6447,  4909,   701,  7679,
        1247,  9840,

In [10]:
# Opposite end of the ranking: the scores are negated before ranking, so (assuming
# find_top_issues picks the lowest quality scores) this returns the indexes of the
# N_OUTLIERS test samples with the HIGHEST original scores, i.e. the least outlier-like.
# The result is only displayed, not stored.
find_top_issues(quality_scores=-ood_test_feature_scores, top=N_OUTLIERS)


array([  858,  4698,  5734,  2179,   806, 12324,  7569,  5135,  6411,
       12264,  5917,  5961,   124,  4542,  8554,  5819,  5262,  9121,
        5838,  8063, 10727,  8105,  2616,  3159, 12074,  4882,  9922,
        2439,  5486,  9700,  4111,  1412, 11858, 10191,  7578, 10796,
        3397, 10526,  4538,  4969,  3286,  9602,  3947,  2332,  4695,
        7394,  9407, 11431,  8375,  4179,  9460, 10954,  3425,  6845,
        9417,  3059,  8538,  6857,  6320,  6916,  4511,  5572,  8033,
        2753,  9020,  6279, 10419, 11711,  1534,  1075, 11082,  4313,
        3863,  6846,  5779,  3550,  2214,  1418,  2515,  6229,  8572,
        8663,  1934,  6739,  3371,  4811,  7042,  1573,  6826,  7081,
          92,  3486,  3840,  6554,  3269, 11434,  9489,  7198,  2732,
        3055,   258, 10935,  1134,   802,  8415,  8418,  2954,  4486,
        4779,  8823,  3131,  1442, 10070,  1846,  3699,  8506, 10696,
        7316,  4360,  4031,  7907,  5183,  7395,  4673,  3404,  3578,
        7322,  8036,

In [11]:
# Index -> file mapping stored next to the query (test) embeddings.
with open(pathlib.Path(PATH_TRAIN).joinpath("faiss-embeddings/id_query_embeddings.json"), "r") as fp:
    id_test = json.load(fp)

# JSON object keys are always strings; convert them to int so the mapping can be
# indexed with the integer positions returned by find_top_issues.
id_test = {int(k): v for k, v in id_test.items()}

# Files behind the selected test indexes, in the same order as `top_test_ood_features_idxs`.
list_files_outlier_test = [id_test[idx] for idx in top_test_ood_features_idxs]

# Sample id = file name without directory and extension.
ids_outlier_test = [pathlib.Path(p).stem for p in list_files_outlier_test]

# Persist the raw list of outlier file paths.
with open(pathlib.Path(PATH_TRAIN).joinpath("paths_outliers_test.json"), "w") as fp:
    json.dump(list_files_outlier_test, fp)

# One row per outlier: sample id alongside the path it was derived from.
# NOTE(review): this rebinds `df_path_npy`, which the train section also assigns.
df_path_npy = pd.DataFrame(
    {"sample_id": ids_outlier_test, "path_npy": list_files_outlier_test}
)

# Select rows with a boolean mask rather than interpolating the id list into a
# `query` expression string: the string form breaks as soon as an id contains a quote
# and forces pandas to parse a very long literal. The join key is spelled out so the
# merge cannot silently change if either frame gains another shared column.
# NOTE(review): rows come out in `krakenbracken` order, not in outlier-rank order.
df_outliers_test = pd.merge(
    krakenbracken[krakenbracken["sample_id"].isin(ids_outlier_test)],
    df_path_npy,
    on="sample_id",
)

# to_csv does not create missing directories, so make sure the target folder exists.
path_outliers_test_csv = pathlib.Path(PATH_TRAIN).joinpath("test/outliers-test.csv")
path_outliers_test_csv.parent.mkdir(parents=True, exist_ok=True)
df_outliers_test.to_csv(path_outliers_test_csv)