In [1]:
# Select matplotlib's interactive "notebook" backend for the figures created in later cells.
%matplotlib notebook

In [2]:
# Shell escape: list the GPUs on this host together with their current memory use and load.
!nvidia-smi

Thu Jul  2 11:47:47 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             Off  | 00000000:03:00.0 Off |                  N/A |
| 28%   32C    P8    23W / 250W |     12MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   32C    P0    27W / 250W |     10MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             Off  | 00000000:84:00.0 Off |                  N/A |
| 30%   

In [3]:
import numpy as np
import torch
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

import mlflow

from model.collectdata_mdsA import collect_data
from model.alt_loss_A import Loss
from model.training import trainNet, select_gpu
from model.model_29June2020_B import UNet4SC as Model
from model.training import trainNet, select_gpu, Results
from model.utilities import load_full_state, count_parameters
from model.plots import dual_train_plots, replace_in_ax
from model.utilities import count_parameters, Params
import hiddenlayer as HL
from torchsummary import summary

In [4]:
# MLflow bookkeeping: runs are recorded in a file-based store, under the
# 'Weird U-Net' experiment.
mlflow.set_tracking_uri('file:/share/lazy/pv-finder_model_repo')
mlflow.set_experiment('Weird U-Net')

# GPU that the models and the validation loader in the cells below are given.
device = torch.device('cuda:0')

In [5]:
# Training sample: the HDF5 files below are combined into one shuffled loader.
# Note that, unlike the validation loader, no `device` is passed here.
train_files = [
    '/share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5',
    '/share/lazy/will/ML/June30_2020_80k_1.h5',
    # 'dataAA/Oct03_80K2_train.h5',   (disabled)
]

# Keyword options that both loaders have in common.
loader_options = dict(batch_size=64,
                      masking=True,
                      load_XandXsq=False,
                      load_xy=False)

train_loader = collect_data(*train_files, shuffle=True, **loader_options)

# Validation sample: restricted to the first 256 * 39 = 9984 events, not shuffled.
val_loader = collect_data('/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5',
                          slice=slice(256 * 39),
                          device=device,
                          shuffle=False,
                          **loader_options)

Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5 in 13.47 s
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5 in 15.66 s
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5 in 7.529 s
Loaded /share/lazy/will/ML/June30_2020_80k_1.h5 in 15.51 s
Constructing 280000 event dataset took 1.635 s
Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5 in 3.752 s
Constructing 9984 event dataset took 3.089 s


In [6]:
%%writefile architecture.txt
'''
UNet model with the following properties:
- 4 skip connections
- 1/2 downsample rate
**- No MaxPool layers; all downsampling occurs in the convolutional layers** (i.e. stride=2)
- Kernel size decay (i.e. kernel size in convolutional layers proportionally decreases with downsampling)
'''
class ConvBNrelu(nn.Sequential):
    """Stride-2 Conv1d -> BatchNorm1d -> ReLU.

    The convolution is padded with ``(k_size - 2) // 2`` zeros on each side, so
    for an even ``k_size`` and an even input length the output is half as long.
    """
    def __init__(self, in_channels, out_channels, k_size):
        # The convolution is built first so parameter initialisation happens in
        # the same order as the layers appear in the sequence.
        strided_conv = nn.Conv1d(in_channels, out_channels,
                                 kernel_size=k_size,
                                 stride=2,
                                 padding=(k_size - 2) // 2)
        layers = [strided_conv, nn.BatchNorm1d(out_channels), nn.ReLU()]
        super().__init__(*layers)
    
class ConvBNreluSame(nn.Sequential):
    """Stride-1 Conv1d -> BatchNorm1d -> ReLU.

    With padding ``(k_size - 1) // 2`` an odd ``k_size`` leaves the sequence
    length unchanged.
    """
    def __init__(self, in_channels, out_channels, k_size):
        same_padding = (k_size - 1) // 2
        layers = (
            nn.Conv1d(in_channels, out_channels,
                      kernel_size=k_size, stride=1, padding=same_padding),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
        )
        super().__init__(*layers)
    
class Up(nn.Sequential):
    """Decoder stage: ConvTranspose1d (x2 upsampling) -> Conv1d -> BatchNorm1d -> ReLU.

    The transposed convolution (kernel 2, stride 2) doubles the sequence length
    and maps ``inc`` to ``outc`` channels; the following stride-1 convolution
    keeps the channel count and, for odd ``k_size``, the length.
    """
    def __init__(self, inc, outc, k_size):
        same_padding = (k_size - 1) // 2
        stages = (
            nn.ConvTranspose1d(inc, outc, kernel_size=2, stride=2),
            nn.Conv1d(outc, outc, kernel_size=k_size, stride=1, padding=same_padding),
            nn.BatchNorm1d(outc),
            nn.ReLU(),
        )
        super().__init__(*stages)


class UNet4SC(nn.Module):
    """1D U-Net with four skip connections and strided-convolution downsampling.

    Encoder: one length-preserving block followed by four stride-2 blocks whose
    kernel size shrinks with the sequence length. Decoder: four ``Up`` stages
    whose kernel size grows back; every stage after the first receives the
    previous decoder output concatenated (channel axis) with the encoder output
    of the same length. A final convolution followed by Softplus produces one
    positive value per input position.

    Args:
        n: number of channels used by every hidden layer.
    """
    def __init__(self, n=24):
        super().__init__()

        # Encoder
        self.rcbn1 = ConvBNreluSame(1, n, k_size=25)
        self.rcbn2 = ConvBNrelu(n, n, k_size=12)
        self.rcbn3 = ConvBNrelu(n, n, k_size=6)
        self.rcbn4 = ConvBNrelu(n, n, k_size=4)
        self.rcbn5 = ConvBNrelu(n, n, k_size=4)

        # Decoder. up2..up4 and outc take 2*n channels because their input is the
        # concatenation of a decoder output with a skip connection.
        # (up2 used to be assigned twice; only the second assignment, k_size=7,
        # ever took effect, so the dead k_size=3 one has been dropped.)
        self.up1 = Up(n, n, k_size=3)
        self.up2 = Up(n*2, n, k_size=7)
        self.up3 = Up(n*2, n, k_size=13)
        self.up4 = Up(n*2, n, k_size=25)
        self.outc = nn.Conv1d(n*2, 1, 3, padding=1)

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, 1, length).

        The length comments below are for a 4000-bin input. The skip
        concatenations require the encoder and decoder lengths to match.
        """
        x1 = self.rcbn1(x)   # 4000
        x2 = self.rcbn2(x1)  # 2000
        x3 = self.rcbn3(x2)  # 1000
        x4 = self.rcbn4(x3)  # 500
        x = self.rcbn5(x4)   # 250

        x = self.up1(x)                      # 500
        x = self.up2(torch.cat([x, x4], 1))  # 1000
        x = self.up3(torch.cat([x, x3], 1))  # 2000
        x = self.up4(torch.cat([x, x2], 1))  # 4000

        logits_x0 = self.outc(torch.cat([x, x1], 1))

        # NOTE(review): the bare squeeze() removes the singleton channel axis, but
        # for a batch of one it also removes the batch axis. Kept as is so the
        # output shape seen by existing callers does not change.
        ret = torch.nn.Softplus()(logits_x0).squeeze()
        return ret

Overwriting architecture.txt


In [7]:
# Every run pairs a model (already moved to `device`) with its Params.
# Params positional order: batch size, epochs, lr, and a fourth value that is
# presumably epoch_start -- TODO confirm against the Params definition.
run_settings = [
    # (model width n, learning rate); commented pairs are disabled runs
    # (24, 5e-4),
    # (16, 5e-4),
    # (12, 5e-4),
    # (24, 5e-3),
    # (16, 5e-3),
    (12, 5e-3),
    (24, 1e-3),
    (16, 1e-3),
    (12, 1e-3),
]

runs = [
    (Model(width).to(device), Params(64, 200, lr, 0))
    for width, lr in run_settings
]

In [8]:
# Loss function handed to trainNet in the training cell (the optimizer itself is
# created there, once per run).
loss = Loss(epsilon=1e-5, coefficient=2.5)

# Accumulators, both starting at zero (not read by any cell visible here).
eff_avg = fp_avg = 0

In [None]:
# Training figure: `ax` carries the train/validation loss curves, while `tax` and
# `lax` are the axes on which the false-positive and efficiency curves are redrawn.
ax, tax, lax, lines = dual_train_plots()
fig = ax.figure
plt.tight_layout()

# Per-epoch records are kept in a plain list and the DataFrame is rebuilt from it;
# DataFrame.append (used previously) was deprecated in pandas 1.4 and removed in 2.0.
# NOTE(review): the history is shared by all runs in `runs`, so the curves (and the
# plot.png logged to each run) also contain the epochs of earlier runs -- confirm
# that this is intended.
history = []
results = pd.DataFrame(history, columns=Results._fields)

run_name = 'No MaxPool, kernel size decay'

for (model, args) in runs:

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    with mlflow.start_run(run_name=run_name) as run:

        # Hyper-parameters of this run (also echoed to the cell output).
        for key, value in vars(args).items():
            print(key, value)
            mlflow.log_param(key, value)

        mlflow.log_param('Parameters', count_parameters(model))

        # Everything below up to the training loop is identical for every epoch
        # of a run, so it is logged once per run instead of once per epoch.
        mlflow.set_tag('Optimizer', 'Adam')
        mlflow.set_tag('Kernel size', 'Mixed')
        mlflow.set_tag('Skip connections', '4')
        mlflow.set_tag('Activation', 'Softplus')
        mlflow.set_tag('Mid Activation', 'Relu')

        # Diagram of the architecture. The graph is traced with a dummy all-zero
        # batch; the model is put in eval mode for the trace so that forward pass
        # cannot update the BatchNorm running statistics, and its previous mode
        # is restored afterwards.
        was_training = model.training
        model.eval()
        try:
            HL.build_graph(model, torch.zeros([args.batch_size, 1, 4000]).to(device)).save('architecture', format='png')
        finally:
            model.train(was_training)
        mlflow.log_artifact('architecture.png')

        # log the code for the model architecture
        mlflow.log_artifact('architecture.txt')

        for result in trainNet(model, optimizer, loss,
                               train_loader, val_loader,
                               args.epochs+args.epoch_start, epoch_start=args.epoch_start, notebook=False, device=device):

            result = result._asdict()

            # plotting code block ===============================
            history.append(result)
            results = pd.DataFrame(history, columns=Results._fields)
            xs = results.index

            # Update the loss curves
            lines['train'].set_data(xs, results.cost)
            lines['val'].set_data(xs, results.val)

            # Ignore the very first training cost when choosing the upper limit
            # (it can be really large).
            costs_for_limit = results.cost if len(results.cost) < 2 else results.cost[1:]
            max_cost = max(max(costs_for_limit), max(results.val))
            min_cost = min(min(results.cost), min(results.val))

            # The plot limits need updating too
            ax.set_ylim(min_cost*.9, max_cost*1.1)
            ax.set_xlim(-.5, len(results.cost) - .5)
            replace_in_ax(lax, lines['eff'], xs, results['eff_val'].apply(lambda x: x.eff_rate))
            replace_in_ax(tax, lines['fp'], xs, results['eff_val'].apply(lambda x: x.fp_rate))

            # The figure is written to disk (and logged below) rather than redrawn in place.
            fig.savefig('plot.png')
            # plotting code block ===============================

            # Log metrics
            # NOTE(review): both losses are logged at twice the value reported by
            # trainNet -- confirm this scaling is intended.
            mlflow.log_metric('Efficiency', result['eff_val'].eff_rate, result['epoch'])
            mlflow.log_metric('False Positive Rate',  result['eff_val'].fp_rate, result['epoch'])
            mlflow.log_metric('Validation Loss',  result['val']*2, result['epoch'])
            mlflow.log_metric('Training Loss',  result['cost']*2, result['epoch'])

            # Save model AND optimizer state_dict AND epoch number.
            # NOTE(review): the stored 'epoch' is args.epochs + the current epoch,
            # not the current epoch itself -- verify against the code that
            # resumes from this checkpoint.
            torch.save({
                'model':model.state_dict(),
                'optimizer':optimizer.state_dict(),
                'epoch':args.epochs+result['epoch']
                }, 'run_stats.pyt')
            mlflow.log_artifact('run_stats.pyt')

            # save plot that mike likes
            mlflow.log_artifact('plot.png')

<IPython.core.display.Javascript object>

epoch_start 0
batch_size 64
epochs 200
lr 0.005
n_epochs: 200
batch_size: 64 events
dataset_train: 280000 events
dataset_val: 9984 events
loss: Loss()
optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.005
    weight_decay: 0
)
model: UNet4SC(
  (rcbn1): ConvBNreluSame(
    (0): Conv1d(1, 12, kernel_size=(25,), stride=(1,), padding=(12,))
    (1): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (rcbn2): ConvBNrelu(
    (0): Conv1d(12, 12, kernel_size=(12,), stride=(2,), padding=(5,))
    (1): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (rcbn3): ConvBNrelu(
    (0): Conv1d(12, 12, kernel_size=(6,), stride=(2,), padding=(2,))
    (1): BatchNorm1d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (rcbn4): ConvBNrelu(
    (0): Conv1d(12, 12, kernel_size=(4,), stride=(2,), padding=(1,))
  

Epoch 44: train=3.6238, val=3.57207, took 92.591 s
  Validation Found 51744 of 54504, added 2088 (eff 94.94%) (0.209 FP/event)
Epoch 45: train=3.62309, val=3.59774, took 89.894 s
  Validation Found 51543 of 54504, added 1866 (eff 94.57%) (0.187 FP/event)
Epoch 46: train=3.62201, val=3.59711, took 91.453 s
  Validation Found 51587 of 54504, added 1824 (eff 94.65%) (0.183 FP/event)
Epoch 47: train=3.62047, val=3.57918, took 88.866 s
  Validation Found 51651 of 54504, added 1947 (eff 94.77%) (0.195 FP/event)
Epoch 48: train=3.6217, val=3.69371, took 90.499 s
  Validation Found 51583 of 54504, added 1971 (eff 94.64%) (0.197 FP/event)
Epoch 49: train=3.61923, val=3.61739, took 89.331 s
  Validation Found 51691 of 54504, added 2123 (eff 94.84%) (0.213 FP/event)
Epoch 50: train=3.61882, val=3.61514, took 90.761 s
  Validation Found 51737 of 54504, added 2108 (eff 94.92%) (0.211 FP/event)
Epoch 51: train=3.61728, val=3.63046, took 91.314 s
  Validation Found 51630 of 54504, added 1913 (eff 94.

In [None]:
##quit()