In [30]:
from pathlib import Path
import numpy as np
import tensorflow as tf

# Training hyper-parameters shared by the cells below.
BATCH_SIZE, EPOCHS = 256, 30

# Folder (relative to the working directory) that receives checkpoints and
# logs; created up front so later cells can write into it directly.
OUTDIR = Path("output")
OUTDIR.mkdir(exist_ok=True, parents=True)

In [31]:
# Base directory against which the relative input paths below are resolved
# (see the `EXP.joinpath(...)` calls in the loading cells).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
EXP = Path("/home/avila/github/panspace-paper")

# Artifacts of the 5-fold cross-validation run, written relative to EXP.
# The embeddings are loaded with np.load; the label files are read as text and
# the last tab-separated field of each line is used as the label.
path_train_embeddings = "experiments-paper/6mer/07_25_2024-autoencoder/cross-validation/mean_squared_error-relu-relu-5-fold/faiss-embeddings/embeddings.npy"
path_test_embeddings = "experiments-paper/6mer/07_25_2024-autoencoder/cross-validation/mean_squared_error-relu-relu-5-fold/test/embeddings.npy" 
path_train_labels = "experiments-paper/6mer/07_25_2024-autoencoder/cross-validation/train_5-fold.txt" 
path_test_labels = "experiments-paper/6mer/07_25_2024-autoencoder/cross-validation/test_5-fold.txt" 
# Destination for the confident-learning outputs. Note this is a plain string,
# not a Path.
outdir = "experiments-paper/6mer/07_25_2024-autoencoder/cross-validation/mean_squared_error-relu-relu-5-fold/confident-learning"

In [32]:
# Training embeddings, stored as a .npy array under EXP.
X_train = np.load(EXP / path_train_embeddings)

# Training labels: the last tab-separated field of every line of the label file.
with open(EXP / path_train_labels) as fp:
    labels_train = [row.strip().split("\t")[-1] for row in fp]

In [33]:

# Test labels: the last tab-separated field of every line of the label file.
with open(EXP / path_test_labels) as fp:
    labels_test = [row.strip().split("\t")[-1] for row in fp]


In [34]:
# Label vocabulary over train AND test, sorted so the label -> index mapping is
# deterministic across runs.
unique_labels = sorted(set(labels_train) | set(labels_test))
dict_labels = dict(zip(unique_labels, range(len(unique_labels))))

In [35]:
# Integer-encode the training labels with the shared label -> index mapping.
y_train = np.array([dict_labels[name] for name in labels_train])

In [36]:
# Model parameters
num_classes = len(unique_labels)
# Take the feature dimension from the loaded embeddings instead of hard-coding
# it, so the network follows the size of whatever embedding file was loaded.
input_shape = tuple(X_train.shape[1:])

# MLP classifier: two ReLU hidden layers followed by a softmax over every
# label in `unique_labels`.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=input_shape),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(num_classes, activation="softmax"),
    ]
)

# Sparse loss/metric: y_train holds integer class indices, not one-hot rows.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy(name="acc"),
    ],
)

# keep only the model with the lowest validation loss seen so far
checkpoint_dir = Path(OUTDIR) / "checkpoints"
checkpoint_dir.mkdir(exist_ok=True, parents=True)
cb_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath=str(checkpoint_dir / "weights-mlp.keras"),
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1
)

# stop training if val_loss has not improved by at least `min_delta`
# for `patience` consecutive epochs
cb_earlystop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=20,
    verbose=1
)
# save history of training (tab-separated, overwritten on every run)
cb_csvlogger = tf.keras.callbacks.CSVLogger(
    filename=str(Path(OUTDIR) / "training_log.csv"),
    separator='\t',
    append=False
)

# NOTE: a per-epoch timing callback (CSVTimeHistory writing time_log.csv) used
# to be registered here; it is disabled and not defined in the cells shown.
callbacks = [
    cb_checkpoint,
    cb_earlystop,
    cb_csvlogger,
]

# NOTE(review): `validation_split` holds out the LAST 20% of the rows as stored
# (taken before shuffling). If the training file is ordered by label, confirm
# that this tail slice is representative of all classes.
model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,
    callbacks=callbacks,
)


Epoch 1/30
Epoch 1: val_loss improved from inf to 0.50825, saving model to output/checkpoints/weights-mlp.keras
Epoch 2/30
Epoch 2: val_loss improved from 0.50825 to 0.36364, saving model to output/checkpoints/weights-mlp.keras
Epoch 3/30
Epoch 3: val_loss improved from 0.36364 to 0.31380, saving model to output/checkpoints/weights-mlp.keras
Epoch 4/30
Epoch 4: val_loss improved from 0.31380 to 0.28152, saving model to output/checkpoints/weights-mlp.keras
Epoch 5/30
Epoch 5: val_loss improved from 0.28152 to 0.26539, saving model to output/checkpoints/weights-mlp.keras
Epoch 6/30
Epoch 6: val_loss improved from 0.26539 to 0.25250, saving model to output/checkpoints/weights-mlp.keras
Epoch 7/30
Epoch 7: val_loss improved from 0.25250 to 0.24390, saving model to output/checkpoints/weights-mlp.keras
Epoch 8/30
Epoch 8: val_loss improved from 0.24390 to 0.24059, saving model to output/checkpoints/weights-mlp.keras
Epoch 9/30
Epoch 9: val_loss improved from 0.24059 to 0.23772, saving model 

<keras.src.callbacks.History at 0x7aa91991dba0>

### Evaluation on the test set

In [37]:
# path_test_embeddings = ""
# Test embeddings, plus their labels encoded with the same label -> index
# mapping that was used for training.
X_test = np.load(EXP / path_test_embeddings)
y_test = np.array([dict_labels[name] for name in labels_test])

In [38]:
# Softmax output of the classifier for every test embedding: one row per test
# sample, one column per class index in `dict_labels`.
pred_probs = model.predict(X_test)

 125/4112 [..............................] - ETA: 1s  



In [39]:
# Predicted class index = column with the highest probability in each row.
y_pred = np.argmax(pred_probs, axis=1)

In [None]:
# `outdir` is declared as a plain string relative to EXP, so it has no
# `.joinpath`. Resolve it under EXP (the same way the input paths are resolved)
# and make sure the directory exists before writing.
confident_learning_dir = EXP.joinpath(outdir)
confident_learning_dir.mkdir(parents=True, exist_ok=True)

# Persist the predicted probabilities and the integer test labels side by side.
np.save(file=confident_learning_dir.joinpath("pred_probs.npy"), arr=pred_probs)
np.save(file=confident_learning_dir.joinpath("labels.npy"), arr=y_test)

In [47]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 as a nested dict keyed by label name.
# All known labels are passed explicitly, so classes that are absent from the
# test set or never predicted still get a row. Their metrics are undefined;
# `zero_division=0` reports them as 0.0 (the same value the default produces)
# without emitting an UndefinedMetricWarning for each of them.
report = classification_report(
    y_test,
    y_pred,
    output_dict=True,
    labels=np.array(list(dict_labels.values())),
    target_names=np.array(list(dict_labels.keys())),
    zero_division=0,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [49]:
# Show only the report entries backed by at least one test sample.
# The dict form of the report can also contain a scalar "accuracy" entry, which
# has no "support" key; skip anything that is not a per-row metrics dict rather
# than failing with a TypeError on it.
{
    name: scores
    for name, scores in report.items()
    if isinstance(scores, dict) and scores["support"] > 0
}

{'[clostridium]_bolteae': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 11.0},
 '[clostridium]_sphenoides': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 1.0},
 '[clostridium]_ultunense': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 2.0},
 '[enterobacter]_lignolyticus': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 1.0},
 '[eubacterium]_eligens': {'precision': 0.1,
  'recall': 0.14285714285714285,
  'f1-score': 0.11764705882352941,
  'support': 7.0},
 '[eubacterium]_hallii': {'precision': 0.5652173913043478,
  'recall': 0.7222222222222222,
  'f1-score': 0.6341463414634146,
  'support': 18.0},
 '[eubacterium]_rectale': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 5.0},
 '[eubacterium]_sulci': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 2.0},
 '[haemophilus]_ducreyi': {'precision': 1.0,
  'recall': 0.6666666666666666,
  'f1-score': 0.8,
  'support': 3.0}

In [50]:
# Sanity check: dimensions of the test embedding array.
X_test.shape

(131555, 128)

In [57]:
# Location of a previously saved label-issues array (.npy).
# NOTE(review): this path is under the `07_25_2024-metric_learning` experiment,
# whereas the other paths in this notebook are under `07_25_2024-autoencoder` —
# confirm this is the intended file. It is also an absolute, machine-specific path.
path_label_issues = "/home/avila/github/panspace-paper/experiments-paper/6mer/07_25_2024-metric_learning/cross-validation/confident-learning/label_issues.npy"

In [58]:
# Load the saved label-issues array.
# NOTE(review): presumably a boolean / 0-1 mask with one entry per sample — TODO confirm.
li = np.load(path_label_issues)

In [59]:
# Sum of all entries; if `li` is a boolean mask this is the number of flagged
# samples — TODO confirm the dtype.
li.sum()

215699