In [1]:
# Select the interactive notebook backend; the training cell below redraws its
# figure in place after every epoch via fig.canvas.draw().
%matplotlib notebook

In [2]:
# Show the GPUs on this host and their current memory/utilisation; the device
# index used later in this notebook refers to the numbering printed here.
!nvidia-smi

Thu Jul  2 11:47:43 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 440.64.00    CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  TITAN V             Off  | 00000000:03:00.0 Off |                  N/A |
| 28%   32C    P8    23W / 250W |     12MiB / 12066MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE...  Off  | 00000000:83:00.0 Off |                    0 |
| N/A   32C    P0    27W / 250W |     10MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  TITAN V             Off  | 00000000:84:00.0 Off |                  N/A |
| 30%   

In [3]:
import numpy as np
import torch
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd

import mlflow

from model.collectdata_mdsA import collect_data
from model.alt_loss_A import Loss
from model.training import trainNet, select_gpu
from model.model_29June2020_A import UNet3SC as Model
from model.training import trainNet, select_gpu, Results
from model.utilities import load_full_state, count_parameters
from model.plots import dual_train_plots, replace_in_ax
from model.utilities import count_parameters, Params
import hiddenlayer as HL
from torchsummary import summary

In [4]:
# GPU used for the model and the validation data (index 2 in the nvidia-smi listing above).
device = torch.device('cuda:2')    
# MLflow bookkeeping: runs are written to a file-based tracking store on the shared disk
# and grouped under the 'Weird U-Net' experiment.
mlflow.tracking.set_tracking_uri('file:/share/lazy/pv-finder_model_repo')
mlflow.set_experiment('Weird U-Net')

In [5]:
# Training sample: every file listed here is handed to a single collect_data call.
train_files = [
    '/share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
    '/share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5',
    '/share/lazy/will/ML/June30_2020_80k_1.h5',
    # disabled: 'dataAA/Oct03_80K2_train.h5',
]
val_file = '/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5'

# Options shared by the training and validation loaders.
loader_opts = dict(batch_size=64, masking=True, load_XandXsq=False, load_xy=False)

# Training loader: shuffled; no device is passed here.
train_loader = collect_data(*train_files, shuffle=True, **loader_opts)

# Validation loader: first 256 * 39 = 9984 events, fixed order, created with device=device.
val_loader = collect_data(val_file,
                          slice=slice(256 * 39),
                          device=device,
                          shuffle=False,
                          **loader_opts)

Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Aug14_80K_train.h5 in 14.23 s
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5 in 15.28 s
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5 in 6.863 s
Loaded /share/lazy/will/ML/June30_2020_80k_1.h5 in 13.54 s
Constructing 280000 event dataset took 1.251 s
Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5 in 3.215 s
Constructing 9984 event dataset took 2.557 s


In [6]:
%%writefile architecture.txt
'''
UNet model with the following properties:
- 3 skip connections
- **1/2 -> 1/4 -> 1/4 downsample rates for the three pooling steps** (i.e. MaxPool1d increased to kernel_size=4, stride=4 for the second and third steps)
- Kernel size decay (i.e. kernel size in convolutional layers proportionally decreases with downsampling)
'''

from torch import nn
import torch

class ConvBNrelu(nn.Sequential):
    """Conv1d -> BatchNorm1d -> ReLU, applied in that order.

    The convolution uses stride 1 and padding ``(k_size - 1) // 2``, so for an
    odd ``k_size`` the sequence length is unchanged.

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of output channels (also the BatchNorm width).
        k_size: convolution kernel size.
    """
    def __init__(self, in_channels, out_channels, k_size):
        half_width = (k_size - 1) // 2
        conv = nn.Conv1d(in_channels, out_channels, kernel_size=k_size,
                         stride=1, padding=half_width)
        norm = nn.BatchNorm1d(out_channels)
        activation = nn.ReLU()
        # Positional order fixes the sub-module indices 0, 1, 2.
        super().__init__(conv, norm, activation)
    

class Up(nn.Sequential):
    """Upsampling stage: ConvTranspose1d followed by a ConvBNrelu block.

    Args:
        inc: input channels of the transposed convolution.
        outc: output channels of the transposed convolution and of the
            following ConvBNrelu block.
        k_size: kernel size of the transposed convolution.
        s: stride of the transposed convolution.
    """
    def __init__(self, inc, outc, k_size, s):
        upsample = nn.ConvTranspose1d(inc, outc, kernel_size=k_size, stride=s)
        # The refinement convolution always uses kernel size 5, independent of k_size.
        refine = ConvBNrelu(outc, outc, k_size=5)
        super().__init__(upsample, refine)


class UNet3SC(nn.Module):
    """1-D U-Net-style network with three skip connections.

    The encoder applies four ConvBNrelu blocks (kernel sizes 25, 7, 5, 3) with
    max-pooling by 2, 4 and 4 after the second, third and fourth block.  The
    decoder upsamples by 4, 4 and 2, concatenating the matching encoder
    feature map along the channel axis before each of the last two upsampling
    stages and before the final convolution.

    Args:
        n: number of feature channels used by every intermediate layer.

    Input:  tensor of shape (batch, 1, length).  ``length`` must be divisible
            by 32 (2 * 4 * 4) so the skip connections line up; the inline
            size comments assume length 4000.
    Output: tensor of shape (batch, length), strictly non-negative because a
            softplus is applied to the final convolution's output.
    """
    def __init__(self, n=24):
        super().__init__()
        # Pooling layers (no parameters): factor 2 and factor 4 downsampling.
        self.d2 = nn.MaxPool1d(kernel_size=2, stride=2)
        self.d4 = nn.MaxPool1d(kernel_size=4, stride=4)

        # Encoder: kernel size shrinks as the resolution shrinks.
        self.rcbn1 = ConvBNrelu(1, n, k_size=25)
        self.rcbn2 = ConvBNrelu(n, n, k_size=7)
        self.rcbn3 = ConvBNrelu(n, n, k_size=5)
        self.rcbn4 = ConvBNrelu(n, n, k_size=3)

        # Decoder: up2/up3/outc take 2*n channels because their input is the
        # previous decoder output concatenated with an encoder skip tensor.
        self.up1 = Up(n, n, k_size=4, s=4)
        self.up2 = Up(n*2, n, k_size=4, s=4)
        self.up3 = Up(n*2, n, k_size=2, s=2)
        self.outc = nn.Conv1d(n*2, 1, 5, padding=2)

    def forward(self, x):
        """Run the network on ``x`` of shape (batch, 1, length); returns (batch, length)."""
        x1 = self.rcbn1(x)            # 4000
        x2 = self.d2(self.rcbn2(x1))  # 2000
        x3 = self.d4(self.rcbn3(x2))  # 500
        x = self.d4(self.rcbn4(x3))   # 125

        x = self.up1(x)                      # 500
        x = self.up2(torch.cat([x, x3], 1))  # 2000
        x = self.up3(torch.cat([x, x2], 1))  # 4000

        logits_x0 = self.outc(torch.cat([x, x1], 1))

        # Drop only the singleton channel axis.  A bare .squeeze() would also
        # drop the batch axis for a batch of one, returning (length,) instead
        # of (1, length).  The functional softplus avoids constructing a new
        # Softplus module on every call and uses the same defaults.
        return nn.functional.softplus(logits_x0).squeeze(1)

Overwriting architecture.txt


In [7]:
# params order - batch size, epochs, lr, epoch_start (which is usually set to 0)
# Each entry pairs a model already moved to `device` with the hyper-parameters for its run.
# NOTE(review): in this notebook the batch size given here (128) is only logged to MLflow and
# used to size the dummy input for the architecture diagram; it is not passed to trainNet, and
# both data loaders are built with batch_size=64. Confirm which value is intended.
runs = [
    (Model().to(device), Params(128, 200, 1e-3, 0))
]

In [8]:
# Define the loss. The optimizer is created per run inside the training loop,
# which also re-creates this same loss with identical arguments.
loss = Loss(epsilon=1e-5,coefficient=2.5)

In [None]:
# Train every (model, params) pair in the `runs` list; each pair becomes one MLflow run.
for (model, args) in runs:
    ##  mds 200121 loss = Loss(epsilon=1e-5,coefficient=1.0)
    loss = Loss(epsilon=1e-5,coefficient=2.5)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    # Build the live figure and an empty history here so the cell does not depend on
    # variables left over in the kernel from cells that are no longer in the notebook.
    # NOTE(review): assumes dual_train_plots() returns (ax, tax, lax, lines) with `lines`
    # keyed by 'train', 'val', 'eff' and 'fp' - confirm against model/plots.py.
    ax, tax, lax, lines = dual_train_plots()
    fig = ax.figure
    history = []  # one dict per finished epoch

    run_name = 'SCNN-ACNN freeze'
    # Create an mlflow run
    with mlflow.start_run(run_name=run_name) as run:
        # Log parameters of the model
        for key, value in vars(args).items():
            print(key, value)
            mlflow.log_param(key, value)

        # Log parameter count in the model
        mlflow.log_param('Parameters', count_parameters(model))

        # Begin run: trainNet yields one result record per epoch
        for result in trainNet(model, optimizer, loss,
                               train_loader, val_loader,
                               args.epochs+args.epoch_start, epoch_start=args.epoch_start,
                               notebook=True, device=device):

            result = result._asdict()
            # DataFrame.append was removed in pandas 2.0; keep the records in a list
            # and rebuild the (small) frame each epoch instead.
            history.append(result)
            results = pd.DataFrame(history)
            xs = results.index

            # Update the loss curves
            lines['train'].set_data(xs, results.cost)
            lines['val'].set_data(xs, results.val)

            # filter first cost epoch (can be really large)
            cost_for_max = results.cost if len(results.cost) < 2 else results.cost.iloc[1:]
            max_cost = max(cost_for_max.max(), results.val.max())
            min_cost = min(results.cost.min(), results.val.min())

            # The plot limits need updating too
            ax.set_ylim(min_cost*.9, max_cost*1.1)
            ax.set_xlim(-.5, len(results.cost) - .5)

            replace_in_ax(lax, lines['eff'], xs, results['eff_val'].apply(lambda x: x.eff_rate))
            replace_in_ax(tax, lines['fp'], xs, results['eff_val'].apply(lambda x: x.fp_rate))

            # Redraw the figure
            fig.canvas.draw()
            fig.tight_layout()
            fig.savefig('plot.png')


            ## MLFLOW ##
            # Log metrics
            mlflow.log_metric('Efficiency', result['eff_val'].eff_rate, result['epoch'])
            mlflow.log_metric('False Positive Rate',  result['eff_val'].fp_rate, result['epoch'])
            mlflow.log_metric('Validation Loss',  result['val'], result['epoch'])
            mlflow.log_metric('Training Loss',  result['cost'], result['epoch'])

            # Log tags
#            mlflow.set_tag('Optimizer', 'Adam')
#            mlflow.set_tag('Kernel size', 'Mixed')
#            mlflow.set_tag('Skip connections', '4')
#            mlflow.set_tag('Activation', 'Softplus')
#            mlflow.set_tag('Mid Activation', 'Relu')

            # Save model state dictionary, optimizer state dictionary, and epoch number
            # NOTE(review): 'epoch' is stored as args.epochs + result['epoch'] (e.g. 200 + 3),
            # not the epoch index itself - confirm this is what a resume expects.
            torch.save({
                'model':model.state_dict(),
                'optimizer':optimizer.state_dict(),
                'epoch':args.epochs+result['epoch']
                }, 'run_stats.pyt')
            # Save the run stats into mlflow
            mlflow.log_artifact('run_stats.pyt')

            # The architecture does not change between epochs, so render the diagram and
            # log the architecture files once (after the first epoch) rather than every
            # epoch. The original also had a bare `HL.transforms.Fold("Conv", "Conv"),`
            # statement here; it built a tuple that was discarded and has been removed.
            if len(history) == 1:
                # Save a diagram of the architecture
                HL.build_graph(model, torch.zeros([args.batch_size, 1, 4000]).to(device)).save('architecture', format='png')
                mlflow.log_artifact('architecture.png')

                # log the code for the model architecture
                mlflow.log_artifact('architecture.txt')

            # save plot that mike likes
            mlflow.log_artifact('plot.png')

            # Save each model state dictionary
            # NOTE(review): `output` and `name` are not defined in any cell of this notebook;
            # define them (e.g. output = Path(...), name = '...') before running on a fresh
            # kernel, otherwise this line raises NameError at the end of the first epoch.
            torch.save(model.state_dict(), (output / f'{name}_{args.epochs}.pyt'))

<IPython.core.display.Javascript object>

epoch_start 0
batch_size 64
epochs 200
lr 0.001
n_epochs: 200
batch_size: 64 events
dataset_train: 280000 events
dataset_val: 9984 events
loss: Loss()
optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)
model: UNet3SC(
  (d2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (d4): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (rcbn1): ConvBNrelu(
    (0): Conv1d(1, 16, kernel_size=(25,), stride=(1,), padding=(12,))
    (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (rcbn2): ConvBNrelu(
    (0): Conv1d(16, 16, kernel_size=(7,), stride=(1,), padding=(3,))
    (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (rcbn3): ConvBNrelu(
    (0): Conv1d(16, 16, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(16, eps=1e-05, moment

Epoch 46: train=3.6339, val=3.613, took 194.84 s
  Validation Found 51637 of 54504, added 1936 (eff 94.74%) (0.194 FP/event)
Epoch 47: train=3.63269, val=3.63221, took 195.51 s
  Validation Found 51503 of 54504, added 1758 (eff 94.49%) (0.176 FP/event)
Epoch 48: train=3.63289, val=3.62058, took 196.07 s
  Validation Found 51558 of 54504, added 1818 (eff 94.59%) (0.182 FP/event)
Epoch 49: train=3.63191, val=3.71149, took 196.0 s
  Validation Found 51307 of 54504, added 1473 (eff 94.13%) (0.148 FP/event)


In [None]:
##quit()