### Testing out Hyperspace on MNIST Data - Model Resiliency Test

In [1]:
from hyperspace import create_hyperspace
from ray import tune
import tensorflow as tf
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from ray.tune.suggest.skopt import SkOptSearch
from skopt import Optimizer
import ray
from tqdm import tqdm
import torch
import torchvision
import statistics
import pandas as pd



In [2]:
ray.init()

2020-10-09 10:25:13,294	INFO services.py:1164 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.1.240',
 'raylet_ip_address': '192.168.1.240',
 'redis_address': '192.168.1.240:63779',
 'object_store_address': '/tmp/ray/session_2020-10-09_10-25-12_693516_81011/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-10-09_10-25-12_693516_81011/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2020-10-09_10-25-12_693516_81011',
 'metrics_export_port': 60740}

### Tensorflow Model Objective Function Definition

In [3]:
def mnist_tf_objective(config):
    mnist = tf.keras.datasets.mnist

    (x_train, y_train),(x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    model = tf.keras.models.Sequential([
      tf.keras.layers.Flatten(input_shape=(28, 28)),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dropout(config['dropout']),
      tf.keras.layers.Dense(10, activation='softmax')
    ])

    opt = tf.keras.optimizers.Adam(learning_rate=config['learning_rate'])

    model.compile(optimizer=opt,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    res = model.fit(x_train, y_train, epochs=config['epochs'], batch_size=config['batch_size'])
    res_test = model.evaluate(x_test, y_test)
    # res test[0] reports the loss from the evaluation, res_test[1] reports the accuracy
    tune.report(test_loss = res_test[0])
    return res_test[0]

In [7]:
### Defining the hyperspace
hyperparameters = [(0.00001, 0.1),  # learning_rate
                   (0.2, 0.9),  # dropout
                   (10, 100),  # epochs 
                   (10, 1000)]  # batch size
space = create_hyperspace(hyperparameters)

In [12]:
space

[Space([Real(low=0.03750625, high=0.1, prior='uniform', transform='identity'),
        Real(low=0.4625, high=0.9, prior='uniform', transform='identity'),
        Integer(low=44, high=100),
        Integer(low=382, high=1000)]),
 Space([Real(low=1e-05, high=0.06250375000000001, prior='uniform', transform='identity'),
        Real(low=0.4625, high=0.9, prior='uniform', transform='identity'),
        Integer(low=44, high=100),
        Integer(low=382, high=1000)]),
 Space([Real(low=0.03750625, high=0.1, prior='uniform', transform='identity'),
        Real(low=0.2, high=0.6375000000000001, prior='uniform', transform='identity'),
        Integer(low=44, high=100),
        Integer(low=382, high=1000)]),
 Space([Real(low=1e-05, high=0.06250375000000001, prior='uniform', transform='identity'),
        Real(low=0.2, high=0.6375000000000001, prior='uniform', transform='identity'),
        Integer(low=44, high=100),
        Integer(low=382, high=1000)]),
 Space([Real(low=0.03750625, high=0.1, pri

#### Run hypertune for Tensorflow Model...

In [15]:
%%capture tf_run_output

### for each space in hyperspace, we want to search the space using ray tune
results = []
for section in tqdm(space):
    # create a skopt gp minimize object
    optimizer = Optimizer(section)
    search_algo = SkOptSearch(optimizer, ['learning_rate', 'dropout', 'epochs', 'batch_size'],
                              metric='test_loss', mode='min')
    # not using a gpu because running on local
    analysis = tune.run(mnist_tf_objective, search_alg=search_algo, num_samples=20, local_dir="~/Documents/hyper_resilient/experiments/exp1")
    results.append(analysis)

# # print out the best result
# i = 0
# for a in results:
#     print("Best config for space "+str(i)+": "+a.get_best_config(metric="test_loss", mode="min"))
#     i +=1

2020-10-05 14:30:53,530	INFO (unknown file):0 -- gc.collect() freed 109 refs in 3.0544720499999585 seconds
2020-10-05 14:44:31,871	INFO (unknown file):0 -- gc.collect() freed 435 refs in 6.933784093999975 seconds
2020-10-05 15:03:52,497	INFO (unknown file):0 -- gc.collect() freed 275 refs in 1.126252144999853 seconds
2020-10-05 15:15:26,217	INFO (unknown file):0 -- gc.collect() freed 601 refs in 0.9940709779998542 seconds
2020-10-05 15:34:02,505	INFO (unknown file):0 -- gc.collect() freed 221 refs in 0.9932282859999759 seconds
2020-10-05 15:56:18,772	INFO (unknown file):0 -- gc.collect() freed 568 refs in 0.9389973069992266 seconds
2020-10-05 15:58:28,459	INFO (unknown file):0 -- gc.collect() freed 275 refs in 0.13785563399960665 seconds
2020-10-05 16:01:12,978	INFO (unknown file):0 -- gc.collect() freed 568 refs in 3.50938840300023 seconds
2020-10-05 16:04:51,728	INFO (unknown file):0 -- gc.collect() freed 369 refs in 1.023181859000033 seconds
2020-10-05 16:10:23,387	INFO (unknown fil

In [19]:
tf_results = results

In [20]:
tf_results

[<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6c8f408b0>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6c8fdc910>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6c8cd75e0>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6c9a38340>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6ca51c4c0>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6ca478520>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6ca35c7f0>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6ca478b20>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6c8f07250>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6caabf400>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6c8cd7250>,
 <ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7fa6cc783be0>,
 <ray.tune.analysis.experiment_analysis.ExperimentAn

In [21]:
all_tf_results = tf_results[0].results_df
for i in range(1, len(tf_results)):
    all_tf_results = all_tf_results.append(tf_results[i].results_df)

In [23]:
all_tf_results.to_csv('full_tf_results.csv')

### Pytorch Model Objective Function Definition

In [17]:
class NumberNet(pl.LightningModule):
    def __init__(self, config):
        super().__init__()
        self.model = nn.Sequential(
            nn.Flatten(), 
            nn.Linear(784, 128), 
            nn.ReLU(), 
            nn.Dropout(config['dropout']), 
            nn.Linear(128, 10), 
            nn.Softmax())
        self.criterion = nn.CrossEntropyLoss()
        self.config = config
        self.test_loss = None
    
    def train_dataloader(self):
        return torch.utils.data.DataLoader(torchvision.datasets.MNIST("~/resiliency/", train=True, 
                                                                      transform=torchvision.transforms.ToTensor(), target_transform=None, download=True), 
                                           batch_size=int(self.config['batch_size']))
    
    def test_dataloader(self):
        return torch.utils.data.DataLoader(torchvision.datasets.MNIST("~/resiliency/", train=True, 
                                                                      transform=torchvision.transforms.ToTensor(), target_transform=None, download=True), 
                                           batch_size=int(self.config['batch_size']))
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.config['learning_rate'])
        return optimizer
    
    def forward(self, x):
        return self.model(x)
    
    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        logs = {'train_loss': loss}
        return {'loss': loss}
    
    def test_step(self, test_batch, batch_idx):
        x, y = test_batch
        logits = self.forward(x)
        loss = self.criterion(logits, y)
        logs = {'test_loss': loss}
        return {'test_loss': loss, 'logs': logs}
    
    def test_epoch_end(self, outputs):
        loss = []
        for x in outputs:
            loss.append(float(x['test_loss']))
        avg_loss = statistics.mean(loss)
        tensorboard_logs = {'test_loss': avg_loss}
        self.test_loss = avg_loss
        return {'avg_test_loss': avg_loss, 'log': tensorboard_logs}


In [18]:
def mnist_pt_objective(config):
    model = NumberNet(config)
    trainer = pl.Trainer(max_epochs=config['epochs'])
    trainer.fit(model)
    trainer.test(model)
    tune.report(test_loss=model.test_loss)
    return model.test_loss

In [19]:
%%capture pt_run_output
# hyperparameters = [(0.00000001, 0.1),  # learning_rate
#                    (0.0, 0.9),  # dropout
#                    (10, 100),  # epochs 
#                    (10, 1000)]  # batch size
# space = create_hyperspace(hyperparameters)

### for each space in hyperspace, we want to search the space using ray tune

results = []
for section in tqdm(space):
    # create a skopt gp minimize object
    optimizer = Optimizer(section)
    search_algo = SkOptSearch(optimizer, ['learning_rate', 'dropout', 'epochs', 'batch_size'],
                              metric='test_loss', mode='min')
    # not using a gpu because running on local
    analysis = tune.run(mnist_pt_objective, search_alg=search_algo, num_samples=20)
    results.append(analysis)

# print out the best result
# i = 0
# for a in results:
#     print("Best config for space "+str(i)+": "+a.get_best_config(metric="avg_test_loss", mode="min"))
#     i +=1

2020-10-05 02:10:28,782	INFO (unknown file):0 -- gc.collect() freed 455 refs in 0.2666840870078886 seconds
2020-10-05 04:35:02,320	INFO (unknown file):0 -- gc.collect() freed 434 refs in 0.3666324420046294 seconds
2020-10-05 04:38:43,132	INFO (unknown file):0 -- gc.collect() freed 137 refs in 0.3556398209912004 seconds
2020-10-05 04:46:25,022	INFO (unknown file):0 -- gc.collect() freed 578 refs in 0.28180963201157283 seconds
2020-10-05 05:22:25,641	INFO (unknown file):0 -- gc.collect() freed 771 refs in 0.33821831599925645 seconds
2020-10-05 07:18:19,857	INFO (unknown file):0 -- gc.collect() freed 578 refs in 0.3425632010039408 seconds
2020-10-05 07:47:28,802	INFO (unknown file):0 -- gc.collect() freed 401 refs in 0.3670527409994975 seconds
2020-10-05 08:16:37,472	INFO (unknown file):0 -- gc.collect() freed 771 refs in 0.36160219201701693 seconds


TypeError: can only concatenate str (not "NoneType") to str

In [20]:
pt_results = results

In [18]:
pt_results

NameError: name 'pt_results' is not defined

In [29]:
all_pt_results = pt_results[0].results_df
for i in range(1, len(pt_results)):
    all_pt_results = all_pt_results.append(pt_results[i].results_df)

In [30]:
all_pt_results

Unnamed: 0_level_0,test_loss,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,time_total_s,...,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,experiment_tag,config.learning_rate,config.dropout,config.epochs,config.batch_size
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3f34fb72,1.560013,758.230129,True,,,1,b6eced49168f4073923784cbd55b2930,2020-10-05_01-01-53,1601884913,758.230129,...,CSI0354806,192.168.1.240,758.230129,0,1,"1_batch_size=668,dropout=0.62961,epochs=73,lea...",0.054076,0.629609,73,668
3f3549d8,1.570882,942.381004,True,,,1,ee4347a8603246e189280fe2e0f6bfb5,2020-10-05_01-04-57,1601885097,942.381004,...,CSI0354806,192.168.1.240,942.381004,0,1,"2_batch_size=730,dropout=0.51978,epochs=91,lea...",0.068914,0.519781,91,730
3f3588b2,1.597828,864.161679,True,,,1,f6dbe4679d024e828d99cb25e81af958,2020-10-05_01-03-39,1601885019,864.161679,...,CSI0354806,192.168.1.240,864.161679,0,1,"3_batch_size=837,dropout=0.62676,epochs=84,lea...",0.079474,0.626760,84,837
3f35c926,1.621406,894.386705,True,,,1,d057cfd110154abfa0581e77cd8c28f7,2020-10-05_01-04-09,1601885049,894.386705,...,CSI0354806,192.168.1.240,894.386705,0,1,"4_batch_size=841,dropout=0.53659,epochs=87,lea...",0.094646,0.536590,87,841
3f360706,1.650161,517.897673,True,,,1,f60e033dd9854d629ae64912ab9896af,2020-10-05_00-57-54,1601884674,517.897673,...,CSI0354806,192.168.1.240,517.897673,0,1,"5_batch_size=946,dropout=0.74561,epochs=50,lea...",0.097394,0.745606,50,946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88a6f3de,1.501839,395.798915,True,,,1,307cd180a1dd4f768fedabff9d98c876,2020-10-05_08-40-32,1601912432,395.798915,...,CSI0354806,192.168.1.240,395.798915,0,1,"16_batch_size=479,dropout=0.00095588,epochs=34...",0.027286,0.000956,34,479
88a74afa,1.918050,351.211188,True,,,1,7e885715c9f24e919920a6fd2ff0aa13,2020-10-05_08-39-51,1601912391,351.211188,...,CSI0354806,192.168.1.240,351.211188,0,1,"17_batch_size=29,dropout=0.30593,epochs=16,lea...",0.042727,0.305934,16,29
88a7a1b2,1.507322,477.634328,True,,,1,b8081cbbdffa4f12aeb42c6d57fc313e,2020-10-05_08-44-00,1601912640,477.634328,...,CSI0354806,192.168.1.240,477.634328,0,1,"18_batch_size=458,dropout=0.4681,epochs=52,lea...",0.019268,0.468100,52,458
88a7fd88,1.555737,457.732046,True,,,1,c86f3d90bb574335b2d922813c7b2505,2020-10-05_08-46-13,1601912773,457.732046,...,CSI0354806,192.168.1.240,457.732046,0,1,"19_batch_size=293,dropout=0.29638,epochs=64,le...",0.033494,0.296376,64,293


In [33]:
just_pt_results = all_pt_results[['config.learning_rate','config.dropout', 'config.epochs', 'config.batch_size', 'test_loss']]