In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F

from netam import framework, models
from netam.framework import calculate_loss
from epam.torch_common import pick_device, print_parameter_count

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the SHMoof edge table on the local machine.
shmoof_data_path = "/Users/matsen/data/shmoof_edges_11-Jan-2023_NoNode0_iqtree_K80+R_masked.csv"
all_df = pd.read_csv(shmoof_data_path)

# Fraction of rows whose `parent` string is longer than 410 characters.
# The mask is built first so the ratio below reads as "count of long rows / total rows".
parent_is_long = all_df["parent"].str.len() > 410
parent_is_long.sum() / len(all_df)

0.00403216242498992

In [3]:
# Load the training and validation dataframes from the same CSV that was read
# into `all_df`, reusing `shmoof_data_path` so the two cells cannot drift apart.
# NOTE(review): an extra `sample_count=5000` argument was previously left
# commented out on this call; it is presumably a subsampling knob for quick
# runs -- confirm against `framework.load_shmoof_dataframes` before re-enabling.
train_df, val_df = framework.load_shmoof_dataframes(shmoof_data_path, val_nickname="51")


In [4]:
# Settings shared by the training and validation datasets.
kmer_length, max_length = 3, 410
dataset_kwargs = dict(kmer_length=kmer_length, max_length=max_length)

# Build both datasets with identical settings.
train_dataset = framework.SHMoofDataset(train_df, **dataset_kwargs)
val_dataset = framework.SHMoofDataset(val_df, **dataset_kwargs)

# Pick the compute device once and hand it to each dataset.
device = pick_device()
for dataset in (train_dataset, val_dataset):
    dataset.to(device)

print(f"we have {len(train_dataset)} training examples and {len(val_dataset)} validation examples")

Using Metal Performance Shaders
we have 26592 training examples and 22424 validation examples


In [5]:
# Search space handed to `HyperBurrito.optuna_optimize`.
# Categorical parameters: each entry lists the candidate values.
cat_params = {
    "kernel_size": [7, 9, 11, 13, 15],
    "dropout": [0., 0.1, 0.2],
}
# Integer parameters, given as a pair of bounds
# (presumably (low, high) -- confirm against `optuna_optimize`).
int_params = {
    "embedding_dim": (10, 30),
    "num_filters": (5, 15),
}
# No plain-float or log-scale-float parameters are searched in this run.
float_params = {
}
log_float_params = {
}
# Hyperparameters held constant across all trials.
# "dropout" is deliberately NOT listed here: it is searched via `cat_params`
# above, and listing it in both places makes the value actually used depend on
# how `HyperBurrito` merges the two dictionaries.
fixed_hyperparams = {
    "batch_size": 1024,
    "learning_rate": 0.1,
    "min_learning_rate": 1e-2, # early stopping!
    "l2_regularization_coeff": 1e-6,
}
epochs = 100
optuna_steps = 45

# Reuse the `device` the datasets were already moved to instead of picking one again.
hyper_burrito = framework.HyperBurrito(device, train_dataset, val_dataset, models.CNNModel, epochs=epochs)

hyper_burrito.optuna_optimize(optuna_steps, cat_params, int_params, float_params, log_float_params, fixed_hyperparams)

[I 2023-11-21 19:51:06,381] A new study created in memory with name: no-name-b30516ae-93e7-4434-b018-9d695d924fe4


Using Metal Performance Shaders


[I 2023-11-21 19:55:30,036] Trial 0 finished with value: 0.060838413528787036 and parameters: {'kernel_size': 9, 'dropout': 0.2, 'embedding_dim': 10, 'num_filters': 7}. Best is trial 0 with value: 0.060838413528787036.
[I 2023-11-21 20:00:42,750] Trial 1 finished with value: 0.060794451621188635 and parameters: {'kernel_size': 7, 'dropout': 0.0, 'embedding_dim': 27, 'num_filters': 7}. Best is trial 1 with value: 0.060794451621188635.
[I 2023-11-21 20:04:52,711] Trial 2 finished with value: 0.060769484768137524 and parameters: {'kernel_size': 11, 'dropout': 0.2, 'embedding_dim': 12, 'num_filters': 6}. Best is trial 2 with value: 0.060769484768137524.
[I 2023-11-21 20:07:25,931] Trial 3 finished with value: 0.06631180427801044 and parameters: {'kernel_size': 15, 'dropout': 0.2, 'embedding_dim': 30, 'num_filters': 12}. Best is trial 2 with value: 0.060769484768137524.
[I 2023-11-21 20:12:39,330] Trial 4 finished with value: 0.060504133231961806 and parameters: {'kernel_size': 11, 'dropout

Best Hyperparameters: {'kernel_size': 13, 'dropout': 0.2, 'embedding_dim': 21, 'num_filters': 13}
Best Validation Loss: 0.060420312966900216
