In [1]:
# automatically upload modules
%load_ext autoreload
%autoreload 2

In [89]:
import numpy as np

In [3]:
from argparse import Namespace
import os, sys
import pandas as pd
#import pickle
from ray import tune
import ConfigSpace as CS
from ray.tune.suggest.bohb import TuneBOHB
from ray.tune.schedulers import HyperBandForBOHB
import torch
import argparse # for ray distributed training


from genome_embeddings import data_viz
from genome_embeddings import evaluate
from genome_embeddings import models
from genome_embeddings import train_test
from genome_embeddings import util
from genome_embeddings import trainable # import before ray (?)
import ray

loading genome_to_tax


In [None]:
os.system("rm file_out")
os.system("rm file_err")

sys.stdout = open('file_out', 'w')
sys.stderr = open('file_err', 'w')

In [4]:
# settings = Namespace(
#     DATA_FP = '/home/ndudek/projects/def-dprecup/ndudek/',
#     SAVE_FP = '/home/ndudek/projects/def-dprecup/ndudek/',
#     num_epochs = 2,
#     num_cpus=20)

settings = Namespace(
    DATA_FP = '/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/genome_embeddings/data/', 
    SAVE_FP = '/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/genome_embeddings/',
    num_epochs = 2,
    num_cpus=5)

In [5]:
flags = Namespace(
    KEGG_FP = '/Users/natasha/Desktop/mcgill_postdoc/ncbi_genomes/kegg_dataset/',
    data_source = 'kegg', #['get_homologues' | 'kegg']
    n_test = 0.1, # train-test split, n * 100 = % of data that goes into test set (e.g.: 0.1 -> 10%)
    batch_size = 128,
    lr = 1e-3,
    kfolds = 10, # number of folds for cross-validation
    print_every = 50, # print loss every n batches during training (5)
    replacement_threshold = 0.5, # probability over which binarizer converts to a 1
    num_corruptions = 100, # number of corrupted versions of a genome to produce
    corruption_fraction = 0.5, # fraction of genes to retain during corruption process
    phy_mode = "bacteria_only", # training with only bacteria vs also euk/arch
    cirriculum = False, # implement cirriculum learning based on gene count
    rare_threshold = 10, # drop features that occur fewer than this times in training ds 
    weight_decay=0.1 # L2 regularization
    )

In [None]:
print('done loading modules and setting Namespace variables')

### Data exploration + preprocessing 

In [None]:
# # First create genome representations (very slow)
# # Each genome is a list of KO's and/or KEGG modules
# if os.path.isfile(settings.DATA_FP+'genome_to_mod.csv'):
#     print("Genome representations already exist")
# else:
#     genome_rep.genome_kos(flags.KEGG_FP)
#     print("Must generate genome representations from scratch. This will take several hours.")

In [6]:
df, cluster_names = util.load_data(settings.DATA_FP, flags.data_source)
genome_to_tax = util.genome_to_tax(df)
#np.save(settings.DATA_FP+'genome_to_tax.npy', genome_to_tax) 
# with open('cluster_names.txt', 'wb') as filehandle:
#     pickle.dump(cluster_names, filehandle)


In [None]:
#data_viz.tax_distrib(df, genome_to_tax)

In [None]:
#data_viz.module_stats(df)

In [None]:
#data_viz.genes_per_genome(df)

In [None]:
# genome_to_tax = util.genome_to_tax(df)

In [None]:
# train_orig = pd.read_csv(settings.DATA_FP+"uncorrupted_train_balanced.csv", index_col=0)

In [None]:
# train_genomes = list(train_orig.index)

In [7]:
# Split train-test sets in a phylogenetically balanced manner 
if os.path.isfile(settings.DATA_FP+'uncorrupted_train_balanced.csv'):
    print("Train-test split already exists, loading from file")
    train_orig = pd.read_csv(settings.DATA_FP+"uncorrupted_train_balanced.csv", index_col=0)    
    test_orig = pd.read_csv(settings.DATA_FP+"uncorrupted_test_balanced.csv", index_col=0)    

else:
    # Create dict mapping each genome to a unique numerical ID
    genome_to_num ={}
    for i,genome in enumerate(df.index):
        genome_to_num[genome] = i

    num_to_genome = {v: k for k, v in genome_to_num.items()}
        
    print("Generating train-test split")
    train_orig, test_orig = util.balanced_split(df, flags.n_test, genome_to_tax, 
                                                num_to_genome, settings.DATA_FP)    
    train_orig.to_csv(settings.DATA_FP+'uncorrupted_train_balanced.csv')
    test_orig.to_csv(settings.DATA_FP+'uncorrupted_test_balanced.csv')

Train-test split already exists, loading from file


In [None]:
#data_viz.hist_prob_ko(train_orig)

In [None]:
# if flags.phy_mode == "bacteria_only":
#     train_genomes = train_orig.index.to_list()
#     test_genomes = test_orig.index.to_list()
    
#     unf_train_data, train_tax_dict = util.bacteria_only(train_orig, train_genomes, genome_to_tax)
#     unf_test_data, test_tax_dict = util.bacteria_only(test_orig, test_genomes, genome_to_tax)

In [None]:
sys.stdout.flush()
sys.stderr.flush()

In [None]:
# # Remove rare features from train + test datasets
# # Rare = fewer than n occurences in training dataset
# # Last argument specifies n, set to correspond to 1% of genomes (3432 genomes -> n = 34)
# # Remove genes occuring in <1.1% of genomes ---> extra 0.1 is to make there be an even number of features
# #     An even number of features is essential for having the autoencoder layers work out properly
# train_data, test_data, cluster_names = util.remove_rare(unf_train_data, unf_test_data, 
#                                                         cluster_names, unf_train_data.shape[0]*0.01)

In [92]:
# Produce corrupted genomes
# Could eventually do re-sampling / extra-corrupting to have more examples of "rare" genome types
#    e.g.: those from underrepresented groups M00003   

if os.path.isfile(settings.DATA_FP+'corrupted_train_07-17-20.pt'):
    print("Corrupted genomes already exist")
    train_data = torch.load(settings.DATA_FP+"corrupted_train_07-17-20.pt")
    test_data = torch.load(settings.DATA_FP+"corrupted_test_07-17-20.pt")
    genome_idx_train = torch.load(settings.DATA_FP+"genome_idx_train_07-17-20.pt")
    genome_idx_test = torch.load(settings.DATA_FP+"genome_idx_test_07-17-20.pt")
else:
    print("Generating corrupted dataset from scratch with",flags.num_corruptions,"corrupted versions of each genome")
    train_data, genome_idx_train = util.corrupt(train_data, flags.num_corruptions, flags.corruption_fraction, 
                                                    cluster_names, "train", settings.DATA_FP)
    print("Finished training data, starting test")
    test_data, genome_idx_test = util.corrupt(test_data, flags.num_corruptions, flags.corruption_fraction, 
                                                  cluster_names, "test", settings.DATA_FP)

Corrupted genomes already exist


In [126]:
train_data_save = train_data.numpy()#.random.shuffle()[:100, :]
test_data_save = test_data.numpy()#.random.shuffle()[:100, :]

# np.savetxt(settings.DATA_FP+'mini_corrupted_train.txt', train_data_save) 
# np.savetxt(settings.DATA_FP+'mini_corrupted_test.txt', test_data_save) 

In [127]:
from sklearn.utils import shuffle
test_data_save2 = shuffle(test_data_save, random_state=0)

In [131]:
df_train_data = pd.DataFrame(test_data_save2)

RangeIndex(start=0, stop=28800, step=1)

In [116]:
df_train_data = pd.DataFrame(test_data_save2)  #### NOT NUMPY

genome_to_num ={}
for i,genome in enumerate(df.index):
    genome_to_num[genome] = i

num_to_genome = {v: k for k, v in genome_to_num.items()}

In [117]:
type(df_train_data)

pandas.core.frame.DataFrame

In [124]:
genome_to_tax2 = {}

for i in df_train_data.index:
    genome_num = genome_idx_train[i]
    genome_id = num_to_genome[genome_num] 
    tax = genome_to_tax[genome_id]
    genome_to_tax2[i] = tax	
    print(i, genome_num, genome_id, tax)

tensor_df = torch.tensor(df_train_data.values)


0 0 T04989 k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Pseudoalteromonadaceae;g__Pseudoalteromonas;s__Pseudoalteromonas_espejiana*
1 0 T04989 k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Pseudoalteromonadaceae;g__Pseudoalteromonas;s__Pseudoalteromonas_espejiana*
2 0 T04989 k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Pseudoalteromonadaceae;g__Pseudoalteromonas;s__Pseudoalteromonas_espejiana*
3 0 T04989 k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Pseudoalteromonadaceae;g__Pseudoalteromonas;s__Pseudoalteromonas_espejiana*
4 0 T04989 k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Pseudoalteromonadaceae;g__Pseudoalteromonas;s__Pseudoalteromonas_espejiana*
5 0 T04989 k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Alteromonadales;f__Pseudoalteromonadaceae;g__Pseudoalteromonas;s__Pseudoalteromonas_espejiana*
6 0 T04989 k__Bacteria;p__Pr

In [119]:
from collections import defaultdict
phylum_to_genomes = defaultdict(list)
for genome in genome_to_tax2:
    phylum = genome_to_tax2[genome].split(";")[1][3:].strip("*")
    if phylum == "Proteobacteria":
        phylum = genome_to_tax2[genome].split(";")[2][3:].strip("*")
    phylum_to_genomes[phylum].append(genome)


In [122]:
bacteria = defaultdict(lambda: defaultdict(int))
for i in genome_to_tax2:
    print(i)
    domain = genome_to_tax2[i].split(";")[0][3:] # [3:] gets rid of k__ in k__Phylum_name
    if domain == "TM6":
        domain = "Bacteria"
    phylum = genome_to_tax2[i].split(";")[1][3:].strip("*")
    if phylum == "Proteobacteria":
        phylum = genome_to_tax2[i].split(";")[2][3:].strip("*")
    bacteria[domain][phylum] += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [121]:
bacteria

defaultdict(<function __main__.<lambda>()>,
            {'Bacteria': defaultdict(int, {'Gammaproteobacteria': 100})})

In [84]:
n_test = 0.1
import random
test_train = {'train': [], 'test': []}
for domain in bacteria:
    num_test = round(sum(bacteria[domain].values())*n_test)
    num_train = sum(bacteria[domain].values()) - num_test
    
    print("num_test",num_test, "num_train",num_train)
    for phylum in bacteria[domain]:
        print(phylum)
        num_genomes = bacteria[domain][phylum]

        #print(phylum_to_genomes[phylum])
        
        if 1 < num_genomes < 5:
        #if num_genomes < 10 and num_genomes > 1:
            # 50-50 split to training vs test set
            p_test = round(bacteria[domain][phylum]*0.5)
            p_train = bacteria[domain][phylum] - p_test

        elif num_genomes == 1:
            #p_test = round(bacteria[domain][phylum]*1)
            #p_train = bacteria[domain][phylum] - p_test
            p_test = 0
            p_train = 1

        else:
            # phylum: [T1230, T327891, T32780]
            p_test = round(bacteria[domain][phylum]*n_test)
            p_train = bacteria[domain][phylum] - p_test

        #print(domain, phylum, num_genomes, p_train, p_test)
        p_train_ds = random.sample(phylum_to_genomes[phylum], p_train)

        p_test_ds = []
        for genome in phylum_to_genomes[phylum]:
            if genome not in p_train_ds:
                p_test_ds.append(genome)

        #print (len(p_train_ds), len(p_test_ds))
        test_train['train'].extend(p_train_ds)
        test_train['test'].extend(p_test_ds)

num_test 10 num_train 90
Gammaproteobacteria


In [83]:
train_split = {}
test_split = {}
for i in range(len(tensor_df)):
    genome = num_to_genome[i]
    print(genome)
    if genome in test_train['train']:
        print("got in train")
        train_split[genome] = tensor_df[i].tolist()
    else:
        print("got in test")
        test_split[genome] = tensor_df[i].tolist()

train_df = pd.DataFrame.from_dict(train_split, orient='index')
test_df = pd.DataFrame.from_dict(test_split, orient='index')


T04989
got in test
T03060
got in test
T05571
got in test
T01445
got in test
T05547
got in test
T04735
got in test
T01821
got in test
T00013
got in test
T01053
got in test
T03184
got in test
T01320
got in test
T04343
got in test
T06060
got in test
T04973
got in test
T05291
got in test
T04298
got in test
T06309
got in test
T04397
got in test
T00929
got in test
T05678
got in test
T01290
got in test
T06426
got in test
T00730
got in test
T05523
got in test
T00447
got in test
T03858
got in test
T00451
got in test
T04403
got in test
T05950
got in test
T03925
got in test
T00352
got in test
T04394
got in test
T05652
got in test
T03963
got in test
T05073
got in test
T04463
got in test
T05576
got in test
T04881
got in test
T05798
got in test
T01611
got in test
T04138
got in test
T01180
got in test
T00380
got in test
T03136
got in test
T03288
got in test
T03095
got in test
T05472
got in test
T06427
got in test
T00600
got in test
T06474
got in test
T00667
got in test
T00868
got in test
T06502
got i

In [81]:
test_split

{'T04989': [1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  1.0,
  1.0,
  0.0,
  1.0,
  0

In [70]:
n = 15
num_to_genome[n]
train_split[genome] = tensor_df[n].tolist()
train_split[genome]
train_df = pd.DataFrame.from_dict(train_split, orient='index')


In [71]:
train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14120,14121,14122,14123,14124,14125,14126,14127,14128,14129
T03640,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
ninefold, onefold = util.balanced_split(df=df_train_data, n_test=0.1, 
                                        genome_to_tax=genome_to_tax, 
                                        num_to_genome=num_to_genome, 
                                        path=None, 
                                        genome_idx_train=genome_idx_train)

Number of training examples: 90 (90%), Number of test examples: 10 (10%)


In [30]:
num_features = train_data.shape[1]
train_set = ninefold.values  # ugg CC has pandas==0.23.4, does not support to_numpy()
cv_set = onefold.values
X_train = train_set[:,:num_features] # corrupted genomes in first half of matrix columns
y_train = train_set[:,num_features:] # uncorrupted in second half of matrix columns
X_cv = cv_set[:,:num_features]
y_cv = cv_set[:,num_features:]

In [34]:
from torch.utils.data import DataLoader, TensorDataset

In [39]:
df_train_data.shape

(100, 14130)

In [35]:
train_ds = TensorDataset(X_train, y_train)

TypeError: 'int' object is not callable

In [None]:
sys.stdout.flush()
sys.stderr.flush()

In [None]:
#import numpy as np # this is slooooow
#np.savetxt(settings.DATA_FP+"corrupted_train_1407.txt", train_data.numpy())
#np.savetxt(settings.DATA_FP+"corrupted_test_1407.txt", test_data.numpy())

In [None]:
# print(("There are %s genomes and %s features in the training dataset") % 
#       (train_data.shape[0],int(train_data.shape[1]/2)))

# print(("There are %s genomes and %s features in the test dataset") % 
#       (test_data.shape[0],int(test_data.shape[1]/2)))

### Define and train network

In [None]:
# print(train_data.shape[1])

In [None]:
# num_features = int(train_data.shape[1]/2) # Number of features in the entire dataset (train + test)

In [None]:
# # define the network
# model = models.AutoEncoder(num_features, 6)
# print(model)

In [None]:
sys.stdout.flush()
sys.stderr.flush()

In [None]:
config_space = CS.ConfigurationSpace()

config_space.add_hyperparameter(
    CS.CategoricalHyperparameter(name='nn_layers', choices=['6'])) #, '8', '10', '12']))

config_space.add_hyperparameter(
    CS.CategoricalHyperparameter(name='batch_size', choices=[32])) #, 64, 128, 256]))

#Optimizer = Adam -- LR less important
config_space.add_hyperparameter(
    CS.UniformFloatHyperparameter('lr', lower=1e-4, upper=1e-1, log=True))

config_space.add_hyperparameter(
    CS.UniformFloatHyperparameter('weight_decay', lower=1e-5, upper=1e-2, log=True))

algo = TuneBOHB(
    config_space, max_concurrent=4, metric='test_f1', mode='max')

bohb = HyperBandForBOHB(
    time_attr='training_iteration',
    metric='test_f1',
    mode='max',
    max_t=100, 
    reduction_factor=3)

In [None]:
ray.shutdown()
ray.init(local_mode=True)

# redis_password = sys.argv[1]
# num_cpus = int(sys.argv[2])

# ray.init(address=os.environ["ip_head"], redis_password=redis_password)

In [None]:
config = {"num_workers": 1,
         "num_epochs":settings.num_epochs,
         "kfolds":flags.kfolds,
         "replacement_threshold": flags.replacement_threshold}

In [None]:
analysis = tune.run(
    trainable.train_AE, 
    name="exp_1",
    config=config,
    search_alg=algo,
    verbose=2, 
    resources_per_trial={
            "cpu": settings.num_cpus,
            "gpu": 0
    },
    num_samples=10,  #BUMP UP TO 1000
    scheduler=bohb,
    local_dir=settings.SAVE_FP+"TUNE_RESULT_DIR",
    stop=trainable.EarlyStopping("test_f1") # if search results aren't improving anymore
    )

#print("Best config is:", analysis.get_best_config(metric="test_f1"))

In [None]:
print("Best config is:", analysis.get_best_config(metric="test_f1"))

In [None]:
# for i in train_vars:
#     if isinstance(train_vars[i], dict):
#         print("Best "+i+":", analysis.get_best_config(metric="test_f1")[i])
for i in config_space:
    print("Best "+i+":", analysis.get_best_config(metric="test_f1")[i])

In [None]:
# analysis = tune.run(
#     trainable.train_AE, 
#     name="exp_1",
#     config=train_vars, 
#     verbose=2, 
#     resources_per_trial={
#             "cpu": 2,
#             "gpu": 0
#     },
#     num_samples=2,
#     scheduler=ASHAScheduler(metric="test_f1", mode="max", grace_period=1, time_attr="n_batch"),
#     local_dir=settings.SAVE_FP+"TUNE_RESULT_DIR"
#     )

# print("Best config is:", analysis.get_best_config(metric="test_f1"))

In [None]:
# analysis.dataframe()["logdir"][0]

In [None]:
# train the model
# train_losses, test_losses, train_f1_scores, test_f1_scores = train_test.train_model(loaders, 
#         model, settings.num_epochs, flags.print_every,
#         settings.SAVE_FP, flags.replacement_threshold, cluster_names, flags.cirriculum, train_data[:,:len(cluster_names)],
#         search_space)
#train_losses, test_losses, train_f1_scores, test_f1_scores = train_test.train_model(train_vars, hyperparams)

### Evaluate model performance

In [None]:
# # evaluate model performance
# perf_lc = data_viz.learning_curve(train_f1_scores, test_f1_scores, "performance", flags.cirriculum)

In [None]:
# # evaluate model performance
# optim_lc = data_viz.learning_curve(train_losses, test_losses, "optimization", flags.cirriculum)

In [None]:
# # first convert test_data from subset -> tensor, split corrupt vs target sets
# tensor_test_data = torch.tensor([i.numpy() for i in test_data]).float()
# corrupt_test_data = tensor_test_data[:,:num_features]
# target = tensor_test_data[:,num_features:].detach().numpy()

In [None]:
# # Generate probabilities for ROC curve
# model.eval()
# with torch.no_grad():
#     y_probas = model(corrupt_test_data) # predicted probabilities generated by model

In [None]:
# roc = data_viz.my_roc_curve(target, y_probas.numpy())

In [None]:
# util.log_results(roc, optim_lc, perf_lc, flags, model)

In [None]:
# # create embeddings for test set
# #uncorrupt_test_data = tensor_test_data[:,len(cluster_names):]
# #tensor_test_data = torch.tensor([i.numpy() for i in test_data]).float()
# embeddings = train_test.generate_embeddings(model, corrupt_test_data)

In [None]:
#data_viz.plot_tSNE(embeddings, test_data, num_to_genome, genome_to_tax, test_tax_dict)

In [None]:
# # tSNE for corrupted genomes passed through untrained model
# untrained_model = models.AutoEncoder(len(cluster_names))
# untr_embeddings = train_test.generate_embeddings(untrained_model, corrupt_test_data)

In [None]:
#data_viz.plot_tSNE(untr_embeddings, test_data, num_to_genome, genome_to_tax, test_tax_dict)
# data_viz.plot_tSNE(untr_embeddings, test_data, num_to_genome, genome_to_tax, genome_idx_test)

In [None]:
# # Evaluate model and compare against baselines
# # Get corrupted input set, target set, and predictions set (binarized to 1's and 0's)
# #corrupt_test_data = tensor_test_data[:,:len(cluster_names)]

# model.eval()
# with torch.no_grad():
#     pred = model.forward(corrupt_test_data).detach().numpy()
# b_pred = train_test.binarize(pred, flags.replacement_threshold)

In [None]:
# # Generate confusion matrix
# cm = evaluate.dom_confusion_matrix(b_pred, target, num_to_genome, genome_to_tax, test_tax_dict, genome_idx_test)

In [None]:
# util.log_results(roc, optim_lc, perf_lc, flags, model, cm)

In [None]:
# # Baseline 1: untrained DAE
# # Generate predictions using an untrained DAE model
# model.eval()
# with torch.no_grad():
#     untr_pred = untrained_model.forward(corrupt_test_data).detach().numpy()
# untr_b_preds = train_test.binarize(untr_pred, flags.replacement_threshold)

In [None]:
# # if os.path.isfile(settings.DATA_FP+"rand_b_pred.pt"):
# #     print("Loading random predictions from file")
# #     rand_b_pred = torch.load(settings.DATA_FP+"rand_b_pred.pt")
# # else: 
# #     # This is slow
# #     print("Generating random predictions, this will take a while (~30 min)")
# #     rand_b_pred = evaluate.generate_baseline(num_features, train_data, 
# #                                              corrupt_test_data, "base_random", cluster_names)
# #     torch.save(rand_b_pred, settings.DATA_FP+"rand_b_pred.pt")

# rand_b_pred = evaluate.generate_baseline(num_features, train_data, 
#                                          corrupt_test_data, "base_random", cluster_names)

In [None]:
# torch.save(rand_b_pred, settings.DATA_FP+"rand_b_pred.pt")

In [None]:
# # if os.path.isfile(settings.DATA_FP+"smart_b_pred.pt"):
# #     print("Loading smart random predictions from file")
# #     smart_b_pred = torch.load(settings.DATA_FP+"smart_b_pred.pt")
# # else:
# #     print("Generating smart random predictions, this will take a while (~30 min)")
# #     smart_b_pred = evaluate.generate_baseline(num_features, train_data, 
# #                                           corrupt_test_data, "smart_random", cluster_names)
# #     torch.save(smart_b_pred, settings.DATA_FP+"smart_b_pred.pt")

# smart_b_pred = evaluate.generate_baseline(num_features, train_data, 
#                                       corrupt_test_data, "smart_random", cluster_names)

In [None]:
# torch.save(smart_b_pred, settings.DATA_FP+"smart_b_pred.pt")

In [None]:
# import numpy as np
# np.sum(smart_b_pred == rand_b_pred), np.sum(smart_b_pred != rand_b_pred)

In [None]:
# import pandas as pd
# hs = evaluate.hamming(target, b_pred)
# hs_stats = [round(sum(hs)/len(hs),2), round(min(hs),2), round(max(hs),2)]

# untr_hs = evaluate.hamming(target, untr_b_preds)
# untr_hs_stats = [round(sum(untr_hs)/len(untr_hs),2), round(min(untr_hs),2), round(max(untr_hs),2)]

# rand_hs = evaluate.hamming(target, rand_b_pred)
# rand_hs_stats = [round(sum(rand_hs)/len(rand_hs),2), round(min(rand_hs),2), round(max(rand_hs),2)]

# smart_hs = evaluate.hamming(target, smart_b_pred)
# smart_hs_stats = [round(sum(smart_hs)/len(smart_hs),2), round(min(smart_hs),2), round(max(smart_hs),2)]


# hamming_df = pd.DataFrame([hs_stats, untr_hs_stats, rand_hs_stats, smart_hs_stats], columns=['mean', 'min', 'max'], 
#                             index=["DAE trained", "DAE untrained", "Random chance", "Smart random chance"])
# hamming_df

In [None]:
sys.stdout.flush()
sys.stderr.flush()