In [2]:
# automatically upload modules
%load_ext autoreload
%autoreload 2

In [3]:
from argparse import Namespace
import os
import pandas as pd
import pickle
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import torch
import pickle

from genome_embeddings import data_viz
from genome_embeddings import evaluate
from genome_embeddings import models
from genome_embeddings import train_test
from genome_embeddings import util
from genome_embeddings import trainable # import before ray (?)
import ray

In [4]:
flags = Namespace(
    DATA_FP = '/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/genome_embeddings/data/', 
    SAVE_FP = '/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/genome_embeddings/',
    KEGG_FP = '/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/kegg_dataset/',
    data_source = 'kegg', #['get_homologues' | 'kegg']
    n_test = 0.1, # train-test split, n * 100 = % of data that goes into test set (e.g.: 0.1 -> 10%)
    num_epochs = 3, # IF CIRRICULUM, DO NOT SET < 3
    batch_size = 128,
    lr = 1e-3,
    kfolds = 10, # number of folds for cross-validation
    print_every = 50, # print loss every n batches during training (5)
    replacement_threshold = 0.5, # probability over which binarizer converts to a 1
    num_corruptions = 100, # number of corrupted versions of a genome to produce
    corruption_fraction = 0.5, # fraction of genes to retain during corruption process
    phy_mode = "bacteria_only", # training with only bacteria vs also euk/arch
    cirriculum = False, # implement cirriculum learning based on gene count
    rare_threshold = 10, # drop features that occur fewer than this times in training ds 
    weight_decay=0.1 # L2 regularization
    )

### Data exploration + preprocessing 

In [None]:
# # First create genome representations (very slow)
# # Each genome is a list of KO's and/or KEGG modules
# if os.path.isfile(flags.DATA_FP+'genome_to_mod.csv'):
#     print("Genome representations already exist")
# else:
#     genome_rep.genome_kos(flags.KEGG_FP)
#     print("Must generate genome representations from scratch. This will take several hours.")

In [None]:
# df, cluster_names = util.load_data(flags.DATA_FP, flags.data_source)
# genome_to_tax = util.genome_to_tax(df)
# with open('cluster_names.txt', 'wb') as filehandle:
#     pickle.dump(cluster_names, filehandle)


In [None]:
#data_viz.tax_distrib(df, genome_to_tax)

In [None]:
#data_viz.module_stats(df)

In [None]:
#data_viz.genes_per_genome(df)

In [None]:
# # Split train-test sets in a phylogenetically balanced manner 
# if os.path.isfile(flags.DATA_FP+'uncorrupted_train_balanced.csv'):
#     print("Train-test split already exists, loading from file")
#     train_orig = pd.read_csv(flags.DATA_FP+"uncorrupted_train_balanced.csv", index_col=0)    
#     test_orig = pd.read_csv(flags.DATA_FP+"uncorrupted_test_balanced.csv", index_col=0)    

# else:
#     # Create dict mapping each genome to a unique numerical ID
#     genome_to_num ={}
#     for i,genome in enumerate(df.index):
#         genome_to_num[genome] = i

#     num_to_genome = {v: k for k, v in genome_to_num.items()}
        
#     print("Generating train-test split")
#     train_orig, test_orig = util.balanced_split(df, flags.n_test, genome_to_tax, 
#                                                 num_to_genome, flags.DATA_FP)    
#     train_orig.to_csv(flags.DATA_FP+'uncorrupted_train_balanced.csv')
#     test_orig.to_csv(flags.DATA_FP+'uncorrupted_test_balanced.csv')

In [None]:
#data_viz.hist_prob_ko(train_orig)

In [None]:
# if flags.phy_mode == "bacteria_only":
#     train_genomes = train_orig.index.to_list()
#     test_genomes = test_orig.index.to_list()
    
#     unf_train_data, train_tax_dict = util.bacteria_only(train_orig, train_genomes, genome_to_tax)
#     unf_test_data, test_tax_dict = util.bacteria_only(test_orig, test_genomes, genome_to_tax)

In [None]:
# # Remove rare features from train + test datasets
# # Rare = fewer than n occurences in training dataset
# # Last argument specifies n, set to correspond to 1% of genomes (3432 genomes -> n = 34)
# train_data, test_data, cluster_names = util.remove_rare(unf_train_data, unf_test_data, 
#                                                         cluster_names, unf_train_data.shape[0]*0.01)

In [5]:
# Produce corrupted genomes
# Could eventually do re-sampling / extra-corrupting to have more examples of "rare" genome types
#    e.g.: those from underrepresented groups M00003   

if os.path.isfile(flags.DATA_FP+'corrupted_train_0607.pt'):
    print("Corrupted genomes already exist")
    train_data = torch.load(flags.DATA_FP+"corrupted_train_0607.pt")
    test_data = torch.load(flags.DATA_FP+"corrupted_test_0607.pt")
    genome_idx_train = torch.load(flags.DATA_FP+"genome_idx_train_0607.pt")
    genome_idx_test = torch.load(flags.DATA_FP+"genome_idx_test_0607.pt")
else:
    print("Generating corrupted dataset from scratch with",flags.num_corruptions,"corrupted versions of each genome")
    train_data, genome_idx_train = util.corrupt(train_data, flags.num_corruptions, flags.corruption_fraction, 
                                                cluster_names, "train", flags.DATA_FP)

    test_data, genome_idx_test = util.corrupt(test_data, flags.num_corruptions, flags.corruption_fraction, 
                                              cluster_names, "test", flags.DATA_FP)

Corrupted genomes already exist


In [None]:
# print(("There are %s genomes and %s features in the training dataset") % 
#       (train_data.shape[0],int(train_data.shape[1]/2)))

# print(("There are %s genomes and %s features in the test dataset") % 
#       (test_data.shape[0],int(test_data.shape[1]/2)))

In [None]:
# if flags.cirriculum:
#     loaders = util.cirriculum_load(train_data, test_data, flags.batch_size, 
#                            flags.batch_size, cluster_names)
# else:
#     loaders = util.dataloaders(train_data, test_data, flags.batch_size, 
#                                flags.batch_size, cluster_names)

In [None]:
# from skorch.dataset import CVSplit
# from torch.utils.data import DataLoader, TensorDataset

In [None]:
# num_features = int(train_data.shape[1]/2)
# X = train_data[:,:num_features] # corrupted genomes in first half of matrix columns
# y = train_data[:,num_features:] # uncorrupted in second half of matrix columns

In [None]:
# # Create dataloader with folds 
# train_ds = TensorDataset(X, y)
# splitter = CVSplit(cv=3)
# train_dl = splitter(train_ds)

In [None]:
# type(train_dl), type(train_dl[0])

In [None]:
# import numpy as np
# k = 3
# idx_genomes = [i for i in range(len(X))]
# num_test = int(len(idx_genomes) / k )
# num_train = len(idx_genomes) - num_test

In [None]:
# test_idx = np.random.choice(idx_genomes, num_test, replace=False)
# train_idx = list(set(idx_genomes) - set(test_idx))

In [None]:
# a = X[train_idx]

In [None]:
# a.shape

In [None]:
# train_set = train_dl[0]
# cv_set = train_dl[1]

### Define and train network

In [6]:
train_data.shape[1]

14130

In [7]:
num_features = int(train_data.shape[1]/2) # Number of features in the entire dataset (train + test)

In [21]:
# define the network
model = models.AutoEncoder(num_features, 6)
print(model)

AutoEncoder(
  (layers): ModuleList(
    (0): Linear(in_features=7065, out_features=3532, bias=True)
    (1): Linear(in_features=3532, out_features=1766, bias=True)
    (2): Linear(in_features=1766, out_features=1177, bias=True)
    (3): Linear(in_features=1177, out_features=1766, bias=True)
    (4): Linear(in_features=1766, out_features=3532, bias=True)
    (5): Linear(in_features=3532, out_features=7065, bias=True)
  )
)


In [23]:
import torch.optim as optim
import torch.nn as nn

for epoch in range(1):
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model.train()
    batch_size = 32
    loaders = trainable.cv_dataloader(batch_size, num_features, 10)
    
    model = model.to(device)
    optimizer = optim.Adam(
        model.parameters(),
        lr=0.01,
        weight_decay=0.1
        )
    criterion = nn.BCELoss(reduction='sum')
    
    
    # enumerate batches in epoch
    for batch_idx, (data, target) in enumerate(loaders["train"]):

        if batch_idx > 9: break
        
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        pred = model(data)
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()
        
        print("batch_idx", batch_idx, loss.item())

batch_idx 0 3164000.0
batch_idx 1 3184500.0
batch_idx 2 3198500.0
batch_idx 3 3159300.0
batch_idx 4 3204051.0
batch_idx 5 18199942.0
batch_idx 6 2638798.75
batch_idx 7 3220659.75
batch_idx 8 3085895.5
batch_idx 9 3330005.5


In [24]:
train_vars = {"batch_size": 32, #tune.grid_search([32, 64]), # [32, 64, 128, 256]
              "num_epochs": flags.num_epochs,
              "replacement_threshold": flags.replacement_threshold,
              "kfolds": flags.kfolds,
              "lr": tune.grid_search([0.1, 0.0001]),
              "weight_decay": 0.01, #tune.grid_search([0.01, 0.00001]),
              "nn_layers": tune.grid_search([6,8])
}

In [25]:
ray.shutdown()
ray.init(local_mode=True)

2020-07-09 11:22:15,906	INFO resource_spec.py:212 -- Starting Ray with 8.74 GiB memory available for workers and up to 4.39 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-09 11:22:16,391	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.2.23',
 'raylet_ip_address': '192.168.2.23',
 'redis_address': '192.168.2.23:56511',
 'object_store_address': '/tmp/ray/session_2020-07-09_11-22-15_787695_80208/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-07-09_11-22-15_787695_80208/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-07-09_11-22-15_787695_80208'}

In [None]:
analysis = tune.run(
    trainable.train_AE, 
    name="exp_1",
    config=train_vars, 
    verbose=2, 
    resources_per_trial={
            "cpu": 2,
            "gpu": 0
    },
    num_samples=2,
    scheduler=ASHAScheduler(metric="test_f1", mode="max", grace_period=1, time_attr="n_batch"),
    local_dir=flags.SAVE_FP+"TUNE_RESULT_DIR"
    )

print("Best config is:", analysis.get_best_config(metric="test_f1"))

2020-07-09 11:38:15,858	INFO trainable.py:217 -- Getting current IP.


batch_size 32




f1 0.5995521444497024


Trial name,status,loc,lr,nn_layers
train_AE_00000,RUNNING,,0.1,6
train_AE_00001,PENDING,,0.0001,6
train_AE_00002,PENDING,,0.1,8
train_AE_00003,PENDING,,0.0001,8
train_AE_00004,PENDING,,0.1,6
train_AE_00005,PENDING,,0.0001,6
train_AE_00006,PENDING,,0.1,8
train_AE_00007,PENDING,,0.0001,8


2020-07-09 11:38:31,145	INFO trainable.py:217 -- Getting current IP.


batch_size 32




f1 0.31932410215133855


Trial name,status,loc,lr,nn_layers
train_AE_00000,RUNNING,,0.1,6
train_AE_00001,RUNNING,,0.0001,6
train_AE_00002,PENDING,,0.1,8
train_AE_00003,PENDING,,0.0001,8
train_AE_00004,PENDING,,0.1,6
train_AE_00005,PENDING,,0.0001,6
train_AE_00006,PENDING,,0.1,8
train_AE_00007,PENDING,,0.0001,8


2020-07-09 11:38:45,282	INFO trainable.py:217 -- Getting current IP.


batch_size 32




f1 0.5884612574506558


Trial name,status,loc,lr,nn_layers
train_AE_00000,RUNNING,,0.1,6
train_AE_00001,RUNNING,,0.0001,6
train_AE_00002,RUNNING,,0.1,8
train_AE_00003,PENDING,,0.0001,8
train_AE_00004,PENDING,,0.1,6
train_AE_00005,PENDING,,0.0001,6
train_AE_00006,PENDING,,0.1,8
train_AE_00007,PENDING,,0.0001,8


2020-07-09 11:39:00,488	INFO trainable.py:217 -- Getting current IP.


batch_size 32




f1 0.322881657915541


Trial name,status,loc,lr,nn_layers
train_AE_00000,RUNNING,,0.1,6
train_AE_00001,RUNNING,,0.0001,6
train_AE_00002,RUNNING,,0.1,8
train_AE_00003,RUNNING,,0.0001,8
train_AE_00004,PENDING,,0.1,6
train_AE_00005,PENDING,,0.0001,6
train_AE_00006,PENDING,,0.1,8
train_AE_00007,PENDING,,0.0001,8


2020-07-09 11:39:18,312	INFO trainable.py:217 -- Getting current IP.


batch_size 32




f1 0.5644379376981854


Trial name,status,loc,lr,nn_layers
train_AE_00000,RUNNING,,0.1,6
train_AE_00001,RUNNING,,0.0001,6
train_AE_00002,RUNNING,,0.1,8
train_AE_00003,RUNNING,,0.0001,8
train_AE_00004,RUNNING,,0.1,6
train_AE_00005,PENDING,,0.0001,6
train_AE_00006,PENDING,,0.1,8
train_AE_00007,PENDING,,0.0001,8


2020-07-09 11:39:36,973	INFO trainable.py:217 -- Getting current IP.


batch_size 32




f1 0.3322516271881666


Trial name,status,loc,lr,nn_layers
train_AE_00000,RUNNING,,0.1,6
train_AE_00001,RUNNING,,0.0001,6
train_AE_00002,RUNNING,,0.1,8
train_AE_00003,RUNNING,,0.0001,8
train_AE_00004,RUNNING,,0.1,6
train_AE_00005,RUNNING,,0.0001,6
train_AE_00006,PENDING,,0.1,8
train_AE_00007,PENDING,,0.0001,8




Result for train_AE_00000:
  date: 2020-07-09_11-38-31
  done: false
  experiment_id: 9ce4c544889c407f96d27175673e40c4
  experiment_tag: 0_lr=0.1,nn_layers=6
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 80208
  test_f1: 0.5995521444497024
  test_loss: 3175500.0
  time_since_restore: 15.266225099563599
  time_this_iter_s: 15.266225099563599
  time_total_s: 15.266225099563599
  timestamp: 1594309111
  timesteps_since_restore: 0
  train_f1: 0.29197380787193256
  train_loss: 158866.671875
  training_iteration: 1
  trial_id: '00000'
  




f1 0.591020224215779


Trial name,status,loc,lr,nn_layers,iter,total time (s)
train_AE_00000,RUNNING,192.168.2.23:80208,0.1,6,1.0,15.2662
train_AE_00001,RUNNING,,0.0001,6,,
train_AE_00002,RUNNING,,0.1,8,,
train_AE_00003,RUNNING,,0.0001,8,,
train_AE_00004,RUNNING,,0.1,6,,
train_AE_00005,RUNNING,,0.0001,6,,
train_AE_00006,PENDING,,0.1,8,,
train_AE_00007,PENDING,,0.0001,8,,


Result for train_AE_00005:
  date: 2020-07-09_11-39-52
  done: false
  experiment_id: eabf8e9b808c48db932b404b2c64a061
  experiment_tag: 5_lr=0.0001,nn_layers=6
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 80208
  test_f1: 0.3322516271881666
  test_loss: 152357.46875
  time_since_restore: 15.468389987945557
  time_this_iter_s: 15.468389987945557
  time_total_s: 15.468389987945557
  timestamp: 1594309192
  timesteps_since_restore: 0
  train_f1: 0.2821531493498646
  train_loss: 157891.25
  training_iteration: 1
  trial_id: '00005'
  




f1 0.6034795316218932


Trial name,status,loc,lr,nn_layers,iter,total time (s)
train_AE_00000,RUNNING,192.168.2.23:80208,0.1,6,1.0,15.2662
train_AE_00001,RUNNING,,0.0001,6,,
train_AE_00002,RUNNING,,0.1,8,,
train_AE_00003,RUNNING,,0.0001,8,,
train_AE_00004,RUNNING,,0.1,6,,
train_AE_00005,RUNNING,192.168.2.23:80208,0.0001,6,1.0,15.4684
train_AE_00006,PENDING,,0.1,8,,
train_AE_00007,PENDING,,0.0001,8,,


Result for train_AE_00002:
  date: 2020-07-09_11-39-00
  done: false
  experiment_id: 2fb0edae845d46ab9a480aa001da45a5
  experiment_tag: 2_lr=0.1,nn_layers=8
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 80208
  test_f1: 0.5884612574506558
  test_loss: 2999800.0
  time_since_restore: 15.102816820144653
  time_this_iter_s: 15.102816820144653
  time_total_s: 15.102816820144653
  timestamp: 1594309140
  timesteps_since_restore: 0
  train_f1: 0.26146570824273047
  train_loss: 159152.515625
  training_iteration: 1
  trial_id: '00002'
  




Trial name,status,loc,lr,nn_layers,iter,total time (s)
train_AE_00000,RUNNING,192.168.2.23:80208,0.1,6,1.0,15.2662
train_AE_00001,RUNNING,,0.0001,6,,
train_AE_00002,RUNNING,192.168.2.23:80208,0.1,8,1.0,15.1028
train_AE_00003,RUNNING,,0.0001,8,,
train_AE_00004,RUNNING,,0.1,6,,
train_AE_00005,RUNNING,192.168.2.23:80208,0.0001,6,1.0,15.4684
train_AE_00006,PENDING,,0.1,8,,
train_AE_00007,PENDING,,0.0001,8,,


2020-07-09 11:40:23,980	ERROR trial_runner.py:519 -- Trial train_AE_00002: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 467, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 431, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 1515, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: [36mray::WrappedFunc.train()[39m (pid=80208, ip=192.168.2.23)
  File "python/ray/_raylet.pyx", line 417, in ray._raylet.execute_task.function_executor
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trainable.py", line 261, in train
    result = self._train()
  File "/usr/local/lib/python3.7/site-packages/ray/tune/function_runner.py", line 199, in _train
    block=True, timeout=RESULT_

batch_size 32
f1 0.5633717274774457




Trial name,status,loc,lr,nn_layers,iter,total time (s)
train_AE_00000,RUNNING,192.168.2.23:80208,0.1,6,1.0,15.2662
train_AE_00001,RUNNING,,0.0001,6,,
train_AE_00002,ERROR,,0.1,8,1.0,15.1028
train_AE_00003,RUNNING,,0.0001,8,,
train_AE_00004,RUNNING,,0.1,6,,
train_AE_00005,RUNNING,192.168.2.23:80208,0.0001,6,1.0,15.4684
train_AE_00006,RUNNING,,0.1,8,,
train_AE_00007,PENDING,,0.0001,8,,

Trial name,# failures,error file
train_AE_00002,1,"/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/genome_embeddings/TUNE_RESULT_DIR/exp_1/train_AE_2_lr=0.1,nn_layers=8_2020-07-09_11-38-45b8ze7tz5/error.txt"


2020-07-09 11:40:30,016	ERROR trial_runner.py:519 -- Trial train_AE_00006: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 467, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 431, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 1515, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: [36mray::WrappedFunc.train()[39m (pid=80208, ip=192.168.2.23)
  File "python/ray/_raylet.pyx", line 417, in ray._raylet.execute_task.function_executor
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trainable.py", line 261, in train
    result = self._train()
  File "/usr/local/lib/python3.7/site-packages/ray/tune/function_runner.py", line 199, in _train
    block=True, timeout=RESULT_

batch_size 32
Result for train_AE_00003:
  date: 2020-07-09_11-39-18
  done: false
  experiment_id: 669ac964867040f7b81ff5dd99798811
  experiment_tag: 3_lr=0.0001,nn_layers=8
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 80208
  test_f1: 0.322881657915541
  test_loss: 153437.0625
  time_since_restore: 17.795568704605103
  time_this_iter_s: 17.795568704605103
  time_total_s: 17.795568704605103
  timestamp: 1594309158
  timesteps_since_restore: 0
  train_f1: 0.29502811018450087
  train_loss: 158562.828125
  training_iteration: 1
  trial_id: '00003'
  


2020-07-09 11:40:30,507	ERROR trial_runner.py:519 -- Trial train_AE_00007: Error processing event.
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trial_runner.py", line 467, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/usr/local/lib/python3.7/site-packages/ray/tune/ray_trial_executor.py", line 431, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/usr/local/lib/python3.7/site-packages/ray/worker.py", line 1515, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: [36mray::WrappedFunc.train()[39m (pid=80208, ip=192.168.2.23)
  File "python/ray/_raylet.pyx", line 417, in ray._raylet.execute_task.function_executor
  File "/usr/local/lib/python3.7/site-packages/ray/tune/trainable.py", line 261, in train
    result = self._train()
  File "/usr/local/lib/python3.7/site-packages/ray/tune/function_runner.py", line 199, in _train
    block=True, timeout=RESULT_

Result for train_AE_00004:
  date: 2020-07-09_11-39-36
  done: false
  experiment_id: 120fb477ad66449ca9fc3748fb24a37f
  experiment_tag: 4_lr=0.1,nn_layers=6
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 80208
  test_f1: 0.5644379376981854
  test_loss: 3028700.0
  time_since_restore: 18.62154221534729
  time_this_iter_s: 18.62154221534729
  time_total_s: 18.62154221534729
  timestamp: 1594309176
  timesteps_since_restore: 0
  train_f1: 0.26497840955883006
  train_loss: 158402.46875
  training_iteration: 1
  trial_id: '00004'
  
Result for train_AE_00001:
  date: 2020-07-09_11-38-45
  done: false
  experiment_id: 3a8e48ffe70b48219976ad892b640dde
  experiment_tag: 1_lr=0.0001,nn_layers=6
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 80208
  test_f1: 0.31932410215133855
  test_loss: 152894.234375
  time_since_restore: 14.113343000411987
  time_this_iter_s: 14.113343000411987
  time_

In [None]:
for i in train_vars:
    if isinstance(train_vars[i], dict):
        print("Best "+i+":", analysis.get_best_config(metric="test_f1")[i])

In [None]:
analysis.dataframe()["logdir"][0]

In [None]:
# train the model
# train_losses, test_losses, train_f1_scores, test_f1_scores = train_test.train_model(loaders, 
#         model, flags.num_epochs, flags.print_every,
#         flags.SAVE_FP, flags.replacement_threshold, cluster_names, flags.cirriculum, train_data[:,:len(cluster_names)],
#         search_space)
#train_losses, test_losses, train_f1_scores, test_f1_scores = train_test.train_model(train_vars, hyperparams)

### Evaluate model performance

In [None]:
# # evaluate model performance
# perf_lc = data_viz.learning_curve(train_f1_scores, test_f1_scores, "performance", flags.cirriculum)

In [None]:
# # evaluate model performance
# optim_lc = data_viz.learning_curve(train_losses, test_losses, "optimization", flags.cirriculum)

In [None]:
# # first convert test_data from subset -> tensor, split corrupt vs target sets
# tensor_test_data = torch.tensor([i.numpy() for i in test_data]).float()
# corrupt_test_data = tensor_test_data[:,:len(cluster_names)]
# target = tensor_test_data[:,len(cluster_names):].detach().numpy()

In [None]:
# # Generate probabilities for ROC curve
# model.eval()
# with torch.no_grad():
#     y_probas = model(corrupt_test_data) # predicted probabilities generated by model

In [None]:
# roc = data_viz.my_roc_curve(target, y_probas.numpy())

In [None]:
# util.log_results(roc, optim_lc, perf_lc, flags, model)

In [None]:
# # create embeddings for test set
# #uncorrupt_test_data = tensor_test_data[:,len(cluster_names):]
# #tensor_test_data = torch.tensor([i.numpy() for i in test_data]).float()
# embeddings = train_test.generate_embeddings(model, corrupt_test_data)

In [None]:
#data_viz.plot_tSNE(embeddings, test_data, num_to_genome, genome_to_tax, test_tax_dict)

In [None]:
# # tSNE for corrupted genomes passed through untrained model
# untrained_model = models.AutoEncoder(len(cluster_names))
# untr_embeddings = train_test.generate_embeddings(untrained_model, corrupt_test_data)

In [None]:
#data_viz.plot_tSNE(untr_embeddings, test_data, num_to_genome, genome_to_tax, test_tax_dict)
# data_viz.plot_tSNE(untr_embeddings, test_data, num_to_genome, genome_to_tax, genome_idx_test)

In [None]:
# # Evaluate model and compare against baselines
# # Get corrupted input set, target set, and predictions set (binarized to 1's and 0's)
# #corrupt_test_data = tensor_test_data[:,:len(cluster_names)]

# model.eval()
# with torch.no_grad():
#     pred = model.forward(corrupt_test_data).detach().numpy()
# b_pred = train_test.binarize(pred, flags.replacement_threshold)

In [None]:
# # Generate confusion matrix
# cm = evaluate.dom_confusion_matrix(b_pred, target, num_to_genome, genome_to_tax, test_tax_dict, genome_idx_test)

In [None]:
# util.log_results(roc, optim_lc, perf_lc, flags, model, cm)

In [None]:
# # Baseline 1: untrained DAE
# # Generate predictions using an untrained DAE model
# model.eval()
# with torch.no_grad():
#     untr_pred = untrained_model.forward(corrupt_test_data).detach().numpy()
# untr_b_preds = train_test.binarize(untr_pred, flags.replacement_threshold)

In [None]:
# # if os.path.isfile(flags.DATA_FP+"rand_b_pred.pt"):
# #     print("Loading random predictions from file")
# #     rand_b_pred = torch.load(flags.DATA_FP+"rand_b_pred.pt")
# # else: 
# #     # This is slow
# #     print("Generating random predictions, this will take a while (~30 min)")
# #     rand_b_pred = evaluate.generate_baseline(num_features, train_data, 
# #                                              corrupt_test_data, "base_random", cluster_names)
# #     torch.save(rand_b_pred, flags.DATA_FP+"rand_b_pred.pt")

# rand_b_pred = evaluate.generate_baseline(num_features, train_data, 
#                                          corrupt_test_data, "base_random", cluster_names)

In [None]:
# torch.save(rand_b_pred, flags.DATA_FP+"rand_b_pred.pt")

In [None]:
# # if os.path.isfile(flags.DATA_FP+"smart_b_pred.pt"):
# #     print("Loading smart random predictions from file")
# #     smart_b_pred = torch.load(flags.DATA_FP+"smart_b_pred.pt")
# # else:
# #     print("Generating smart random predictions, this will take a while (~30 min)")
# #     smart_b_pred = evaluate.generate_baseline(num_features, train_data, 
# #                                           corrupt_test_data, "smart_random", cluster_names)
# #     torch.save(smart_b_pred, flags.DATA_FP+"smart_b_pred.pt")

# smart_b_pred = evaluate.generate_baseline(num_features, train_data, 
#                                       corrupt_test_data, "smart_random", cluster_names)

In [None]:
# torch.save(smart_b_pred, flags.DATA_FP+"smart_b_pred.pt")

In [None]:
# import numpy as np
# np.sum(smart_b_pred == rand_b_pred), np.sum(smart_b_pred != rand_b_pred)

In [None]:
# import pandas as pd
# hs = evaluate.hamming(target, b_pred)
# hs_stats = [round(sum(hs)/len(hs),2), round(min(hs),2), round(max(hs),2)]

# untr_hs = evaluate.hamming(target, untr_b_preds)
# untr_hs_stats = [round(sum(untr_hs)/len(untr_hs),2), round(min(untr_hs),2), round(max(untr_hs),2)]

# rand_hs = evaluate.hamming(target, rand_b_pred)
# rand_hs_stats = [round(sum(rand_hs)/len(rand_hs),2), round(min(rand_hs),2), round(max(rand_hs),2)]

# smart_hs = evaluate.hamming(target, smart_b_pred)
# smart_hs_stats = [round(sum(smart_hs)/len(smart_hs),2), round(min(smart_hs),2), round(max(smart_hs),2)]


# hamming_df = pd.DataFrame([hs_stats, untr_hs_stats, rand_hs_stats, smart_hs_stats], columns=['mean', 'min', 'max'], 
#                             index=["DAE trained", "DAE untrained", "Random chance", "Smart random chance"])
# hamming_df