In [1]:
# Automatically reload imported modules before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import argparse # for ray distributed training
import os
import sys
from argparse import Namespace

import ConfigSpace as CS
import pandas as pd
#import pickle
import torch
from ray import tune
from ray.tune.schedulers import HyperBandForBOHB
from ray.tune.suggest.bohb import TuneBOHB

from genome_embeddings import data_viz
from genome_embeddings import evaluate
from genome_embeddings import models
from genome_embeddings import train_test
from genome_embeddings import util
from genome_embeddings import trainable # import before ray (?)
import ray

In [3]:
# Root directory that contains the `genome_embeddings/` and `kegg_dataset/` folders.
# Set the NCBI_GENOMES_ROOT environment variable to run the notebook on another
# machine; the fallback is the original author's local directory.
_NCBI_GENOMES_ROOT = os.environ.get(
    "NCBI_GENOMES_ROOT", "/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes"
)

# Global experiment settings shared by the cells below.
flags = Namespace(
    # The trailing '' keeps a trailing path separator on each directory: later cells
    # build file names by plain string concatenation (e.g. flags.DATA_FP + 'file.pt').
    DATA_FP = os.path.join(_NCBI_GENOMES_ROOT, 'genome_embeddings', 'data', ''),
    SAVE_FP = os.path.join(_NCBI_GENOMES_ROOT, 'genome_embeddings', ''),
    KEGG_FP = os.path.join(_NCBI_GENOMES_ROOT, 'kegg_dataset', ''),
    data_source = 'kegg', #['get_homologues' | 'kegg']
    n_test = 0.1, # train-test split, n * 100 = % of data that goes into test set (e.g.: 0.1 -> 10%)
    num_epochs = 2, # IF CURRICULUM, DO NOT SET < 3
    batch_size = 128,
    lr = 1e-3,
    kfolds = 10, # number of folds for cross-validation
    print_every = 50, # print loss every n batches during training (5)
    replacement_threshold = 0.5, # probability over which binarizer converts to a 1
    num_corruptions = 100, # number of corrupted versions of a genome to produce
    corruption_fraction = 0.5, # fraction of genes to retain during corruption process
    phy_mode = "bacteria_only", # training with only bacteria vs also euk/arch
    cirriculum = False, # implement curriculum learning based on gene count (attribute name keeps its original spelling)
    rare_threshold = 10, # drop features that occur fewer than this many times in the training dataset
    weight_decay=0.1 # L2 regularization
    )

### Data exploration + preprocessing 

In [4]:
# # First create genome representations (very slow)
# # Each genome is a list of KO's and/or KEGG modules
# if os.path.isfile(flags.DATA_FP+'genome_to_mod.csv'):
#     print("Genome representations already exist")
# else:
#     genome_rep.genome_kos(flags.KEGG_FP)
#     print("Must generate genome representations from scratch. This will take several hours.")

In [5]:
# df, cluster_names = util.load_data(flags.DATA_FP, flags.data_source)
# genome_to_tax = util.genome_to_tax(df)
# with open('cluster_names.txt', 'wb') as filehandle:
#     pickle.dump(cluster_names, filehandle)


In [6]:
#data_viz.tax_distrib(df, genome_to_tax)

In [7]:
#data_viz.module_stats(df)

In [8]:
#data_viz.genes_per_genome(df)

In [9]:
# # Split train-test sets in a phylogenetically balanced manner 
# if os.path.isfile(flags.DATA_FP+'uncorrupted_train_balanced.csv'):
#     print("Train-test split already exists, loading from file")
#     train_orig = pd.read_csv(flags.DATA_FP+"uncorrupted_train_balanced.csv", index_col=0)    
#     test_orig = pd.read_csv(flags.DATA_FP+"uncorrupted_test_balanced.csv", index_col=0)    

# else:
#     # Create dict mapping each genome to a unique numerical ID
#     genome_to_num ={}
#     for i,genome in enumerate(df.index):
#         genome_to_num[genome] = i

#     num_to_genome = {v: k for k, v in genome_to_num.items()}
        
#     print("Generating train-test split")
#     train_orig, test_orig = util.balanced_split(df, flags.n_test, genome_to_tax, 
#                                                 num_to_genome, flags.DATA_FP)    
#     train_orig.to_csv(flags.DATA_FP+'uncorrupted_train_balanced.csv')
#     test_orig.to_csv(flags.DATA_FP+'uncorrupted_test_balanced.csv')

In [10]:
#data_viz.hist_prob_ko(train_orig)

In [11]:
# if flags.phy_mode == "bacteria_only":
#     train_genomes = train_orig.index.to_list()
#     test_genomes = test_orig.index.to_list()
    
#     unf_train_data, train_tax_dict = util.bacteria_only(train_orig, train_genomes, genome_to_tax)
#     unf_test_data, test_tax_dict = util.bacteria_only(test_orig, test_genomes, genome_to_tax)

In [12]:
# # Remove rare features from train + test datasets
# # Rare = fewer than n occurrences in training dataset
# # Last argument specifies n, set to correspond to 1% of genomes (3432 genomes -> n = 34)
# train_data, test_data, cluster_names = util.remove_rare(unf_train_data, unf_test_data, 
#                                                         cluster_names, unf_train_data.shape[0]*0.01)

In [13]:
# Produce corrupted genomes: load them from the on-disk cache when it is complete,
# otherwise generate them from the uncorrupted train/test data.
# Could eventually do re-sampling / extra-corrupting to have more examples of "rare" genome types
#    e.g.: those from underrepresented groups M00003

# All four cached tensors are needed below, so the cache only counts as present
# when every file exists (not just the corrupted training set).
corrupted_cache = {
    "train_data": flags.DATA_FP+"corrupted_train_0607.pt",
    "test_data": flags.DATA_FP+"corrupted_test_0607.pt",
    "genome_idx_train": flags.DATA_FP+"genome_idx_train_0607.pt",
    "genome_idx_test": flags.DATA_FP+"genome_idx_test_0607.pt",
}

if all(os.path.isfile(cache_fp) for cache_fp in corrupted_cache.values()):
    print("Corrupted genomes already exist")
    train_data = torch.load(corrupted_cache["train_data"])
    test_data = torch.load(corrupted_cache["test_data"])
    genome_idx_train = torch.load(corrupted_cache["genome_idx_train"])
    genome_idx_test = torch.load(corrupted_cache["genome_idx_test"])
else:
    # Generation needs the uncorrupted `train_data`, `test_data` and `cluster_names`
    # produced by the preprocessing cells earlier in the notebook. Those cells are
    # currently commented out, so fail with an actionable message instead of a bare
    # NameError when they have not been run.
    missing_inputs = [name for name in ("train_data", "test_data", "cluster_names")
                      if name not in globals()]
    if missing_inputs:
        raise NameError(
            "Cannot generate corrupted genomes: " + ", ".join(missing_inputs)
            + " not defined. Re-enable and run the preprocessing cells first."
        )

    print("Generating corrupted dataset from scratch with",flags.num_corruptions,"corrupted versions of each genome")
    # NOTE(review): util.corrupt receives a split label and flags.DATA_FP, presumably
    # to write the *_0607.pt cache files -- confirm, otherwise this branch reruns every time.
    train_data, genome_idx_train = util.corrupt(train_data, flags.num_corruptions, flags.corruption_fraction, 
                                                cluster_names, "train", flags.DATA_FP)

    test_data, genome_idx_test = util.corrupt(test_data, flags.num_corruptions, flags.corruption_fraction, 
                                              cluster_names, "test", flags.DATA_FP)

Corrupted genomes already exist


In [14]:
# print(("There are %s genomes and %s features in the training dataset") % 
#       (train_data.shape[0],int(train_data.shape[1]/2)))

# print(("There are %s genomes and %s features in the test dataset") % 
#       (test_data.shape[0],int(test_data.shape[1]/2)))

In [15]:
# if flags.cirriculum:
#     loaders = util.cirriculum_load(train_data, test_data, flags.batch_size, 
#                            flags.batch_size, cluster_names)
# else:
#     loaders = util.dataloaders(train_data, test_data, flags.batch_size, 
#                                flags.batch_size, cluster_names)

In [16]:
# from skorch.dataset import CVSplit
# from torch.utils.data import DataLoader, TensorDataset

In [17]:
# num_features = int(train_data.shape[1]/2)
# X = train_data[:,:num_features] # corrupted genomes in first half of matrix columns
# y = train_data[:,num_features:] # uncorrupted in second half of matrix columns

In [18]:
# # Create dataloader with folds 
# train_ds = TensorDataset(X, y)
# splitter = CVSplit(cv=3)
# train_dl = splitter(train_ds)

In [19]:
# type(train_dl), type(train_dl[0])

In [20]:
# import numpy as np
# k = 3
# idx_genomes = [i for i in range(len(X))]
# num_test = int(len(idx_genomes) / k )
# num_train = len(idx_genomes) - num_test

In [21]:
# test_idx = np.random.choice(idx_genomes, num_test, replace=False)
# train_idx = list(set(idx_genomes) - set(test_idx))

In [22]:
# a = X[train_idx]

In [23]:
# a.shape

In [24]:
# train_set = train_dl[0]
# cv_set = train_dl[1]

### Define and train network

In [25]:
# Display the number of columns in the training matrix. The next cell halves this
# value to obtain the feature count.
train_data.shape[1]

14130

In [26]:
# Number of features in the entire dataset (train + test): half of the matrix width.
num_features = train_data.shape[1] // 2

In [27]:
# define the network
# model = models.AutoEncoder(num_features, 6)
# print(model)

In [28]:
# import torch.optim as optim
# import torch.nn as nn

# for epoch in range(1):
#     use_cuda = torch.cuda.is_available()
#     device = torch.device("cuda" if use_cuda else "cpu")
#     model.train()
#     batch_size = 32
#     loaders = trainable.cv_dataloader(batch_size, num_features, 10)
    
#     model = model.to(device)
#     optimizer = optim.Adam(
#         model.parameters(),
#         lr=0.01,
#         weight_decay=0.1
#         )
#     criterion = nn.BCELoss(reduction='sum')
    
    
#     # enumerate batches in epoch
#     for batch_idx, (data, target) in enumerate(loaders["train"]):

#         if batch_idx > 9: break
        
#         data, target = data.to(device), target.to(device)
#         optimizer.zero_grad()
#         pred = model(data)
#         loss = criterion(pred, target)
#         loss.backward()
#         optimizer.step()
        
#         print("batch_idx", batch_idx, loss.item())

In [29]:
# Grid-search style trial settings; commented alternatives show the wider grids.
# NOTE(review): `config` is assigned again in a later cell before `tune.run`
# is called, so these values are replaced when the notebook runs top to bottom.
config = {
    "batch_size": 32,  # tune.grid_search([32, 64, 128, 256]),
    "num_epochs": flags.num_epochs,
    "replacement_threshold": flags.replacement_threshold,
    "kfolds": flags.kfolds,
    "lr": tune.grid_search([0.1, 0.0001]),
    "weight_decay": 0.01,  # tune.grid_search([1e-2, 1e-3, 1e-4, 1e-5]),
    "nn_layers": 6,  # tune.grid_search([6,8]), # tune.grid_search([6, 8, 10, 12]),
    "optimizer": "adam",  # tune.grid_search(["sgd", "adam"])
}

In [67]:
# Search space for BOHB. Single-choice categoricals pin a value for now; the wider
# choice lists are kept in the trailing comments.
# NOTE(review): the nn_layers choices are strings ('6') here, while the earlier
# `config` cell uses the int 6 -- confirm which type trainable.train_AE expects.
config_space = CS.ConfigurationSpace()

search_dimensions = [
    CS.CategoricalHyperparameter(name='nn_layers', choices=['6']),  # , '8', '10', '12'
    CS.CategoricalHyperparameter(name='batch_size', choices=[32]),  # , 64, 128, 256
    CS.UniformFloatHyperparameter('lr', lower=1e-4, upper=1e-1, log=True),
    CS.UniformFloatHyperparameter('weight_decay', lower=1e-5, upper=1e-2, log=True),
    CS.CategoricalHyperparameter(name='optimizer', choices=["adam"]),  # , "sgd"
]
for dimension in search_dimensions:
    config_space.add_hyperparameter(dimension)

# The search algorithm and the scheduler share the same metric and mode.
objective = dict(metric='test_f1', mode='max')

algo = TuneBOHB(config_space, max_concurrent=4, **objective)

bohb = HyperBandForBOHB(
    time_attr='training_iteration',
    max_t=100,
    reduction_factor=3,
    **objective)

In [31]:
# Stop any Ray runtime left over from a previous execution so this cell can be re-run.
ray.shutdown()
#ray.init(local_mode=True)

if "ip_head" in os.environ:
    # A head-node address is advertised through the environment: attach to that
    # cluster. The Redis password and CPU count are read from the command line.
    # NOTE(review): inside an interactive kernel sys.argv holds the kernel's own
    # arguments, so this branch presumably only works when the notebook is run as a script.
    if len(sys.argv) < 3:
        raise RuntimeError(
            "Expected <redis_password> <num_cpus> as command-line arguments "
            "when connecting to the Ray cluster at " + os.environ["ip_head"]
        )
    redis_password = sys.argv[1]
    num_cpus = int(sys.argv[2])
    ray_info = ray.init(address=os.environ["ip_head"], redis_password=redis_password)
else:
    # No head node advertised (e.g. a laptop session): start a local Ray runtime
    # instead of failing with a KeyError on os.environ["ip_head"].
    ray_info = ray.init()

# Show the connection details as the cell output.
ray_info

2020-07-10 10:33:01,143	INFO resource_spec.py:212 -- Starting Ray with 9.28 GiB memory available for workers and up to 4.65 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-07-10 10:33:01,534	INFO services.py:1170 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.2.23',
 'raylet_ip_address': '192.168.2.23',
 'redis_address': '192.168.2.23:61372',
 'object_store_address': '/tmp/ray/session_2020-07-10_10-33-01_061259_9573/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-07-10_10-33-01_061259_9573/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-07-10_10-33-01_061259_9573'}

In [68]:
# Fixed (non-searched) settings passed to `tune.run` as `config`.
config = {
    # "min_iter_time_s": 15,
    "num_workers": 1,
    "num_epochs": flags.num_epochs,
    "kfolds": flags.kfolds,
    "replacement_threshold": flags.replacement_threshold,
}

In [76]:
# Resources reserved for each trial.
trial_resources = {"cpu": 2, "gpu": 0}
# Directory where Tune writes the experiment results.
results_dir = flags.SAVE_FP+"TUNE_RESULT_DIR"
# Stop criterion on test_f1 (intended to stop once results stop improving).
early_stop = trainable.EarlyStopping("test_f1")

# Launch the search: `algo` proposes configurations, `bohb` schedules the trials.
analysis = tune.run(
    trainable.train_AE,
    name="exp_1",
    config=config,
    search_alg=algo,
    scheduler=bohb,
    stop=early_stop,
    num_samples=2,  # 1000
    resources_per_trial=trial_resources,
    local_dir=results_dir,
    verbose=2,
)

#print("Best config is:", analysis.get_best_config(metric="test_f1"))

2020-07-10 11:13:49,042	INFO trainable.py:217 -- Getting current IP.


{'min_iter_time_s': 15, 'num_workers': 1, 'num_epochs': 2, 'kfolds': 10, 'replacement_threshold': 0.5, 'batch_size': 32, 'lr': 0.04170733456212282, 'nn_layers': '6', 'optimizer': 'adam', 'weight_decay': 0.0016696052354833388}
f1 0.5794490601746773




Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay
train_AE_f4d198e6,RUNNING,,32,0.0417073,6,adam,0.00166961
train_AE_f4d21cee,PENDING,,32,0.0248967,6,adam,0.00155036


2020-07-10 11:14:38,973	INFO trainable.py:217 -- Getting current IP.


{'min_iter_time_s': 15, 'num_workers': 1, 'num_epochs': 2, 'kfolds': 10, 'replacement_threshold': 0.5, 'batch_size': 32, 'lr': 0.0248966568261999, 'nn_layers': '6', 'optimizer': 'adam', 'weight_decay': 0.0015503565634920242}
f1 0.6166763667900373




Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay
train_AE_f4d198e6,RUNNING,,32,0.0417073,6,adam,0.00166961
train_AE_f4d21cee,RUNNING,,32,0.0248967,6,adam,0.00155036




Result for train_AE_f4d198e6:
  date: 2020-07-10_11-14-38
  done: false
  experiment_id: ce899e416ae843a8a30b4e50d1aaee5f
  experiment_tag: 1_batch_size=32,kfolds=10,lr=0.041707,min_iter_time_s=15,nn_layers=6,num_epochs=2,num_workers=1,optimizer=adam,replacement_threshold=0.5,weight_decay=0.0016696
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 9573
  test_f1: 0.5794490601746773
  test_loss: 3389400.0
  time_since_restore: 49.90380001068115
  time_this_iter_s: 49.90380001068115
  time_total_s: 49.90380001068115
  timestamp: 1594394078
  timesteps_since_restore: 0
  train_f1: 0.27150720692052244
  train_loss: 157917.4375
  training_iteration: 1
  trial_id: f4d198e6
  
f1 0.5786427960616621




Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay,iter,total time (s)
train_AE_f4d198e6,RUNNING,192.168.2.23:9573,32,0.0417073,6,adam,0.00166961,1.0,49.9038
train_AE_f4d21cee,RUNNING,,32,0.0248967,6,adam,0.00155036,,


Result for train_AE_f4d21cee:
  date: 2020-07-10_11-15-33
  done: false
  experiment_id: a0300dba6e2f47be8cb5332b11e38dad
  experiment_tag: 2_batch_size=32,kfolds=10,lr=0.024897,min_iter_time_s=15,nn_layers=6,num_epochs=2,num_workers=1,optimizer=adam,replacement_threshold=0.5,weight_decay=0.0015504
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 1
  node_ip: 192.168.2.23
  pid: 9573
  test_f1: 0.6166763667900373
  test_loss: 3251800.0
  time_since_restore: 54.12103486061096
  time_this_iter_s: 54.12103486061096
  time_total_s: 54.12103486061096
  timestamp: 1594394133
  timesteps_since_restore: 0
  train_f1: 0.3026170815000782
  train_loss: 158816.4375
  training_iteration: 1
  trial_id: f4d21cee
  
f1 0.6026001154821794




Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay,iter,total time (s)
train_AE_f4d198e6,RUNNING,192.168.2.23:9573,32,0.0417073,6,adam,0.00166961,1,49.9038
train_AE_f4d21cee,RUNNING,192.168.2.23:9573,32,0.0248967,6,adam,0.00155036,1,54.121




Result for train_AE_f4d21cee:
  date: 2020-07-10_11-17-00
  done: false
  experiment_id: a0300dba6e2f47be8cb5332b11e38dad
  experiment_tag: 2_batch_size=32,kfolds=10,lr=0.024897,min_iter_time_s=15,nn_layers=6,num_epochs=2,num_workers=1,optimizer=adam,replacement_threshold=0.5,weight_decay=0.0015504
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 2
  node_ip: 192.168.2.23
  pid: 9573
  test_f1: 0.6026001154821794
  test_loss: 3182700.0
  time_since_restore: 141.45698285102844
  time_this_iter_s: 87.33594799041748
  time_total_s: 141.45698285102844
  timestamp: 1594394220
  timesteps_since_restore: 0
  train_f1: 0.5783126045688602
  train_loss: 3209300.0
  training_iteration: 2
  trial_id: f4d21cee
  




Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay,iter,total time (s)
train_AE_f4d198e6,RUNNING,192.168.2.23:9573,32,0.0417073,6,adam,0.00166961,1,49.9038
train_AE_f4d21cee,RUNNING,192.168.2.23:9573,32,0.0248967,6,adam,0.00155036,2,141.457


Result for train_AE_f4d198e6:
  date: 2020-07-10_11-16-17
  done: false
  experiment_id: ce899e416ae843a8a30b4e50d1aaee5f
  experiment_tag: 1_batch_size=32,kfolds=10,lr=0.041707,min_iter_time_s=15,nn_layers=6,num_epochs=2,num_workers=1,optimizer=adam,replacement_threshold=0.5,weight_decay=0.0016696
  hostname: natashas-MacBook-Pro.local
  iterations_since_restore: 2
  node_ip: 192.168.2.23
  pid: 9573
  test_f1: 0.5786427960616621
  test_loss: 3370500.0
  time_since_restore: 148.44221377372742
  time_this_iter_s: 98.53841376304626
  time_total_s: 148.44221377372742
  timestamp: 1594394177
  timesteps_since_restore: 0
  train_f1: 0.587577747628062
  train_loss: 3200800.0
  training_iteration: 2
  trial_id: f4d198e6
  




Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay,iter,total time (s)
train_AE_f4d198e6,RUNNING,192.168.2.23:9573,32,0.0417073,6,adam,0.00166961,2,148.442
train_AE_f4d21cee,TERMINATED,,32,0.0248967,6,adam,0.00155036,2,141.457


Trial name,status,loc,batch_size,lr,nn_layers,optimizer,weight_decay,iter,total time (s)
train_AE_f4d198e6,TERMINATED,,32,0.0417073,6,adam,0.00166961,2,148.442
train_AE_f4d21cee,TERMINATED,,32,0.0248967,6,adam,0.00155036,2,141.457


In [73]:
# Report the full trial configuration that `analysis` ranks best on test_f1.
# NOTE(review): this cell's execution count (73) precedes that of the tune.run
# cell (76), so the saved output below may come from an earlier run -- re-run to confirm.
print("Best config is:", analysis.get_best_config(metric="test_f1"))

Best config is: {'min_iter_time_s': 15, 'num_workers': 1, 'num_epochs': 2, 'kfolds': 10, 'replacement_threshold': 0.5, 'batch_size': 32, 'lr': 0.09359780051577608, 'nn_layers': '6', 'optimizer': 'adam', 'weight_decay': 0.0005925090175702109}


In [74]:
# for i in train_vars:
#     if isinstance(train_vars[i], dict):
#         print("Best "+i+":", analysis.get_best_config(metric="test_f1")[i])
# Print the best value found for each searched hyperparameter.
# The best configuration is looked up once rather than on every loop iteration.
best_config = analysis.get_best_config(metric="test_f1")
for i in config_space:
    print("Best "+i+":", best_config[i])

Best batch_size: 32
Best lr: 0.09359780051577608
Best nn_layers: 6
Best optimizer: adam
Best weight_decay: 0.0005925090175702109


In [35]:
# analysis = tune.run(
#     trainable.train_AE, 
#     name="exp_1",
#     config=train_vars, 
#     verbose=2, 
#     resources_per_trial={
#             "cpu": 2,
#             "gpu": 0
#     },
#     num_samples=2,
#     scheduler=ASHAScheduler(metric="test_f1", mode="max", grace_period=1, time_attr="n_batch"),
#     local_dir=flags.SAVE_FP+"TUNE_RESULT_DIR"
#     )

# print("Best config is:", analysis.get_best_config(metric="test_f1"))

In [37]:
# Show the "logdir" entry at label 0 of the analysis results table.
analysis.dataframe()["logdir"][0]

'/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/genome_embeddings/TUNE_RESULT_DIR/exp_1/train_AE_1_batch_size=32,kfolds=10,lr=0.01713,min_iter_time_s=15,nn_layers=6,num_epochs=2,num_workers=1,optimizer=adam,replacement_2020-07-10_10-33-02dbyoijom'

In [38]:
# train the model
# train_losses, test_losses, train_f1_scores, test_f1_scores = train_test.train_model(loaders, 
#         model, flags.num_epochs, flags.print_every,
#         flags.SAVE_FP, flags.replacement_threshold, cluster_names, flags.cirriculum, train_data[:,:len(cluster_names)],
#         search_space)
#train_losses, test_losses, train_f1_scores, test_f1_scores = train_test.train_model(train_vars, hyperparams)

### Evaluate model performance

In [39]:
# # evaluate model performance
# perf_lc = data_viz.learning_curve(train_f1_scores, test_f1_scores, "performance", flags.cirriculum)

In [40]:
# # evaluate model performance
# optim_lc = data_viz.learning_curve(train_losses, test_losses, "optimization", flags.cirriculum)

In [41]:
# # first convert test_data from subset -> tensor, split corrupt vs target sets
# tensor_test_data = torch.tensor([i.numpy() for i in test_data]).float()
# corrupt_test_data = tensor_test_data[:,:num_features]
# target = tensor_test_data[:,num_features:].detach().numpy()

In [42]:
# # Generate probabilities for ROC curve
# model.eval()
# with torch.no_grad():
#     y_probas = model(corrupt_test_data) # predicted probabilities generated by model

In [43]:
# roc = data_viz.my_roc_curve(target, y_probas.numpy())

In [44]:
# util.log_results(roc, optim_lc, perf_lc, flags, model)

In [45]:
# # create embeddings for test set
# #uncorrupt_test_data = tensor_test_data[:,len(cluster_names):]
# #tensor_test_data = torch.tensor([i.numpy() for i in test_data]).float()
# embeddings = train_test.generate_embeddings(model, corrupt_test_data)

In [46]:
#data_viz.plot_tSNE(embeddings, test_data, num_to_genome, genome_to_tax, test_tax_dict)

In [47]:
# # tSNE for corrupted genomes passed through untrained model
# untrained_model = models.AutoEncoder(len(cluster_names))
# untr_embeddings = train_test.generate_embeddings(untrained_model, corrupt_test_data)

In [48]:
#data_viz.plot_tSNE(untr_embeddings, test_data, num_to_genome, genome_to_tax, test_tax_dict)
# data_viz.plot_tSNE(untr_embeddings, test_data, num_to_genome, genome_to_tax, genome_idx_test)

In [49]:
# # Evaluate model and compare against baselines
# # Get corrupted input set, target set, and predictions set (binarized to 1's and 0's)
# #corrupt_test_data = tensor_test_data[:,:len(cluster_names)]

# model.eval()
# with torch.no_grad():
#     pred = model.forward(corrupt_test_data).detach().numpy()
# b_pred = train_test.binarize(pred, flags.replacement_threshold)

In [50]:
# # Generate confusion matrix
# cm = evaluate.dom_confusion_matrix(b_pred, target, num_to_genome, genome_to_tax, test_tax_dict, genome_idx_test)

In [51]:
# util.log_results(roc, optim_lc, perf_lc, flags, model, cm)

In [52]:
# # Baseline 1: untrained DAE
# # Generate predictions using an untrained DAE model
# model.eval()
# with torch.no_grad():
#     untr_pred = untrained_model.forward(corrupt_test_data).detach().numpy()
# untr_b_preds = train_test.binarize(untr_pred, flags.replacement_threshold)

In [53]:
# # if os.path.isfile(flags.DATA_FP+"rand_b_pred.pt"):
# #     print("Loading random predictions from file")
# #     rand_b_pred = torch.load(flags.DATA_FP+"rand_b_pred.pt")
# # else: 
# #     # This is slow
# #     print("Generating random predictions, this will take a while (~30 min)")
# #     rand_b_pred = evaluate.generate_baseline(num_features, train_data, 
# #                                              corrupt_test_data, "base_random", cluster_names)
# #     torch.save(rand_b_pred, flags.DATA_FP+"rand_b_pred.pt")

# rand_b_pred = evaluate.generate_baseline(num_features, train_data, 
#                                          corrupt_test_data, "base_random", cluster_names)

In [54]:
# torch.save(rand_b_pred, flags.DATA_FP+"rand_b_pred.pt")

In [55]:
# # if os.path.isfile(flags.DATA_FP+"smart_b_pred.pt"):
# #     print("Loading smart random predictions from file")
# #     smart_b_pred = torch.load(flags.DATA_FP+"smart_b_pred.pt")
# # else:
# #     print("Generating smart random predictions, this will take a while (~30 min)")
# #     smart_b_pred = evaluate.generate_baseline(num_features, train_data, 
# #                                           corrupt_test_data, "smart_random", cluster_names)
# #     torch.save(smart_b_pred, flags.DATA_FP+"smart_b_pred.pt")

# smart_b_pred = evaluate.generate_baseline(num_features, train_data, 
#                                       corrupt_test_data, "smart_random", cluster_names)

In [56]:
# torch.save(smart_b_pred, flags.DATA_FP+"smart_b_pred.pt")

In [57]:
# import numpy as np
# np.sum(smart_b_pred == rand_b_pred), np.sum(smart_b_pred != rand_b_pred)

In [58]:
# import pandas as pd
# hs = evaluate.hamming(target, b_pred)
# hs_stats = [round(sum(hs)/len(hs),2), round(min(hs),2), round(max(hs),2)]

# untr_hs = evaluate.hamming(target, untr_b_preds)
# untr_hs_stats = [round(sum(untr_hs)/len(untr_hs),2), round(min(untr_hs),2), round(max(untr_hs),2)]

# rand_hs = evaluate.hamming(target, rand_b_pred)
# rand_hs_stats = [round(sum(rand_hs)/len(rand_hs),2), round(min(rand_hs),2), round(max(rand_hs),2)]

# smart_hs = evaluate.hamming(target, smart_b_pred)
# smart_hs_stats = [round(sum(smart_hs)/len(smart_hs),2), round(min(smart_hs),2), round(max(smart_hs),2)]


# hamming_df = pd.DataFrame([hs_stats, untr_hs_stats, rand_hs_stats, smart_hs_stats], columns=['mean', 'min', 'max'], 
#                             index=["DAE trained", "DAE untrained", "Random chance", "Smart random chance"])
# hamming_df