In [2]:
%pip install torch parameterfree pandas numpy scikit-learn matplotlib --quiet

Note: you may need to restart the kernel to use updated packages.


### Load the data

In [2]:
# Load the feature table from a path relative to the notebook's working directory.
# `df` is read as a module-level global by every `local(...)` sweep in this notebook.
import pandas as pd
df = pd.read_csv('sims_features.csv')

### Utility Methods

In [3]:
def is_pow2(i):
    """Return True when ``i`` is a positive power of two (1, 2, 4, 8, ...).

    A power of two has exactly one bit set, so clearing its lowest set bit
    with ``i & (i-1)`` leaves zero.  Zero itself also satisfies that bit
    test, so it is excluded explicitly; negative values are rejected too.

    Args:
        i: integer to test.

    Returns:
        bool: whether ``i`` is in {1, 2, 4, 8, ...}.
    """
    return i > 0 and i & (i-1) == 0

class OnlineMean:
    """Incrementally maintained arithmetic mean.

    ``state`` is a two-element list ``[count, mean]``; ``add`` folds one more
    observation into it without storing the observations themselves.  The
    string form is the current mean rounded to four decimals.
    """

    def __init__(self):
        # [number of values seen so far, running mean]
        self.state = [0,0]

    def add(self, num):
        """Fold ``num`` into the running mean."""
        seen, mean = self.state
        seen += 1
        # mean_new = mean_old + (x - mean_old) / n
        self.state[1] += (num-mean)/seen
        self.state[0] = seen

    def __str__(self):
        current = self.state[1]
        return str(round(current,4))

    # Same text for str() and repr().
    __repr__ = __str__

def plot_results(args,xlabel=None,ylabel='AUC',llabel=None):
    """Draw one line per label from ``(label, x, y)`` tuples and show the figure.

    The tuples are sorted, grouped by their first element, and each group's
    remaining two elements are plotted as a line labelled with that element.
    Only the length of the first (sorted) tuple is inspected; when it is not
    3 nothing is plotted, but the axes labels, legend and ``show`` still run.

    Args:
        args: iterable of tuples, expected as ``(label, x, y)``.
        xlabel: x-axis label.
        ylabel: y-axis label.
        llabel: legend title.
    """
    from itertools import groupby
    from operator import itemgetter
    from matplotlib import pyplot as plt

    rows = sorted(args)
    is_labelled = len(rows[0]) == 3

    if is_labelled:
        # groupby needs equal labels to be adjacent, which the sort guarantees.
        for label, members in groupby(rows,key=itemgetter(0)):
            xs,ys = zip(*[row[1:] for row in members])
            plt.plot(xs,ys,label=label)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(title=llabel)
    plt.show()

### Helpful Sources about Architectures

+ If normalizing do it before activation https://forums.fast.ai/t/why-perform-batch-norm-before-relu-and-not-after/81293/4
+ What are residual networks https://arxiv.org/pdf/1512.03385.pdf
+ When building residual networks use pre-activation https://arxiv.org/abs/1603.05027
+ There is not one universal best normalization method https://proceedings.neurips.cc/paper/2021/hash/2578eb9cdf020730f77793e8b58e165a-Abstract.html
+ Adam is the standard optimizer but takes a lot of tuning. COCOB tunes automatically https://github.com/bremen79/parameterfree
+ Smaller batches tend to be better? (This seems to be with a fixed number of epochs, though, so it may not be relevant.) https://arxiv.org/pdf/1804.07612.pdf

### Basic Architectures

In [126]:
from itertools import chain
import torch

class Linear(torch.nn.Module):
    """A single linear map, optionally preceded by input normalisation.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        norm: ``'l'`` puts a ``LayerNorm`` in front of the linear layer,
            ``'b'`` a ``BatchNorm1d``; a falsy value means no normalisation.
            Any other truthy value is unpacked as-is into the ``Sequential``.
    """

    def __init__(self, in_features, out_features, norm='l'):
        super().__init__()

        if norm == 'l':
            norm = [torch.nn.LayerNorm(in_features)]
        elif norm == 'b':
            norm = [torch.nn.BatchNorm1d(in_features)]

        prefix     = norm if norm else []
        projection = torch.nn.Linear(in_features=in_features, out_features=out_features)

        self.layers = torch.nn.Sequential(*prefix, projection)

    def forward(self, Xs):
        """Apply the (optional) normalisation followed by the linear layer."""
        return self.layers(Xs)

class Mlp(torch.nn.Module):
    """Plain multi-layer perceptron.

    Layout: one input block, ``depth`` hidden blocks and a final bare
    ``Linear``.  Every block is ``Linear -> [norm] -> ReLU``, i.e. the
    normalisation sits between the linear map and the activation.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        depth: number of hidden blocks (in addition to the input block).
        width: hidden width; falls back to ``in_features`` when falsy.
        norm: ``'l'`` for ``LayerNorm``, any other truthy value for
            ``BatchNorm1d``, falsy for no normalisation.
    """

    def __init__(self, in_features, out_features, depth=3, width=None, norm='l'):
        super().__init__()

        def block(n_in, n_out):
            # Linear first, then the optional normalisation over its output, then ReLU.
            parts = [torch.nn.Linear(in_features=n_in, out_features=n_out)]
            if norm:
                if norm == 'l':
                    parts.append(torch.nn.LayerNorm(n_out))
                else:
                    parts.append(torch.nn.BatchNorm1d(n_out))
            parts.append(torch.nn.ReLU())
            return torch.nn.Sequential(*parts)

        hidden = width if width else in_features

        stack  = [block(in_features, hidden)]
        stack += [block(hidden, hidden) for _ in range(depth)]
        stack.append(torch.nn.Linear(in_features=hidden, out_features=out_features))

        self.layers = torch.nn.Sequential(*stack)

    def forward(self, Xs):
        """Run ``Xs`` through every block and the final linear layer."""
        return self.layers(Xs)

class ResNet(torch.nn.Module):
    """Fully connected pre-activation residual network.

    Layout: ``Linear(in_features, width)`` -> ``depth`` residual blocks ->
    output layer.  Every layer after the input projection is
    ``[norm] -> ReLU -> Linear`` (normalisation and activation come *before*
    the linear map).  A residual block computes ``x + f(x)`` where ``f`` is
    two such layers of equal width.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        depth: number of residual blocks.
        width: hidden width; falls back to ``in_features`` when falsy.
        norm: ``'l'`` for ``LayerNorm``, any other truthy value for
            ``BatchNorm1d``, falsy for no normalisation.
    """
    def __init__(self, in_features, out_features, depth=3, width=None, norm='l'):
        super().__init__()

        def norm_layer(in_width, out_width, norm):
            # The normalisation acts on this layer's input, so it must be sized
            # by `in_width`.  (BatchNorm1d previously read the enclosing `width`
            # instead, which only worked because every caller passes width.)
            norm = torch.nn.LayerNorm(in_width) if norm == 'l' else torch.nn.BatchNorm1d(in_width)
            return torch.nn.Sequential(norm,torch.nn.ReLU(),torch.nn.Linear(in_features=in_width,out_features=out_width))

        def no_norm_layer(in_width, out_width):
            return torch.nn.Sequential(torch.nn.ReLU(),torch.nn.Linear(in_features=in_width, out_features=out_width))

        def layer(in_width, out_width, norm):
            return norm_layer(in_width, out_width, norm) if norm else no_norm_layer(in_width, out_width)

        class PreActivationResidualBlock(torch.nn.Module):
            """Identity shortcut around two pre-activation layers of width ``in_width``."""
            def __init__(self, in_width, norm) -> None:
                super().__init__()
                self.layers = torch.nn.Sequential(layer(in_width,in_width,norm),layer(in_width,in_width,norm))
            def forward(self, Xs):
                return Xs+self.layers(Xs)

        width  = width or in_features

        input_layer   = torch.nn.Linear(in_features=in_features, out_features=width)
        hidden_layers = [PreActivationResidualBlock(width,norm) for _ in range(depth)]
        output_layer  = layer(width,out_features,norm)

        self.layers  = torch.nn.Sequential(input_layer,*hidden_layers,output_layer)

    def forward(self, Xs):
        """Run ``Xs`` through the input projection, the residual blocks and the output layer."""
        return self.layers(Xs)

class ResNetDropout(torch.nn.Module):
    """Pre-activation residual network regularised with dropout instead of normalisation.

    Layout: ``Dropout(drop[0])`` on the raw input -> ``Linear(in_features, width)``
    -> ``depth`` residual blocks -> output layer.  Every layer after the input
    projection is ``ReLU -> Dropout(drop[1]) -> Linear``; a residual block
    computes ``x + f(x)`` with ``f`` made of two such layers.

    Args:
        in_features: size of each input sample.
        out_features: size of each output sample.
        depth: number of residual blocks.
        width: hidden width; falls back to ``in_features`` when falsy.
        drop: indexable pair ``[input dropout rate, hidden dropout rate]``.
    """

    def __init__(self, in_features, out_features, depth=3, width=None, drop=[.2,.5]):
        super().__init__()

        def drop_layer(n_in, n_out, p):
            # Pre-activation ordering: activation and dropout precede the linear map.
            parts = [torch.nn.ReLU(), torch.nn.Dropout(p), torch.nn.Linear(in_features=n_in, out_features=n_out)]
            return torch.nn.Sequential(*parts)

        class PreActivationResidualBlock(torch.nn.Module):
            """Identity shortcut around two dropout layers of equal width."""
            def __init__(self, n_units, p) -> None:
                super().__init__()
                first  = drop_layer(n_units, n_units, p)
                second = drop_layer(n_units, n_units, p)
                self.layers = torch.nn.Sequential(first, second)
            def forward(self, Xs):
                update = self.layers(Xs)
                return Xs+update

        hidden = width if width else in_features

        stem   = torch.nn.Linear(in_features=in_features, out_features=hidden)
        blocks = [PreActivationResidualBlock(hidden, drop[1]) for _ in range(depth)]
        head   = drop_layer(hidden, out_features, drop[1])

        self.layers = torch.nn.Sequential(torch.nn.Dropout(drop[0]), stem, *blocks, head)

    def forward(self, Xs):
        """Run ``Xs`` through input dropout, the projection, the residual blocks and the output layer."""
        return self.layers(Xs)


### Simple Training Loop

In [6]:
from torch.utils.data import TensorDataset, DataLoader

def train_model(Xs, ys, model, opt, sched, loss, batch=8, epoch=1, device='cpu',autotype=None):
    """Train ``model`` in place with shuffled mini-batches and return it in eval mode.

    Args:
        Xs: feature tensor; first dimension indexes samples.
        ys: target tensor with the same first dimension as ``Xs``.
        model: ``torch.nn.Module`` to train (already on ``device``).
        opt: optimizer over ``model``'s parameters.
        sched: optional scheduler, stepped once per epoch; falsy to disable.
        loss: callable ``loss(prediction, target)`` returning a scalar tensor.
        batch: mini-batch size; a trailing incomplete batch is dropped.
        epoch: number of passes over the data.
        device: device string (``'cpu'``, ``'cuda'``, ``'cuda:0'``, ...) or
            ``torch.device`` that each batch is moved to.
        autotype: optional dtype; when given the forward pass and the loss
            run under ``torch.autocast`` with that dtype.

    Returns:
        The same ``model`` object, switched to eval mode.
    """
    # autocast wants the bare device *type* ('cuda'), not 'cuda:0' or a
    # torch.device object; the pin_memory test needs the same normalisation.
    device_type = torch.device(device).type

    loader = DataLoader(TensorDataset(Xs,ys),batch_size=batch,pin_memory=(device_type!='cpu'),drop_last=True,shuffle=True)

    # This function hands back model.eval(); make sure a model that is passed
    # in again still trains with BatchNorm/Dropout in training mode.
    model.train()

    for _ in range(epoch):
        for X,y in loader:
            opt.zero_grad()
            X,y = X.to(device),y.to(device)
            if not autotype:
                l = loss(model(X),y)
            else:
                with torch.autocast(device_type=device_type,dtype=autotype):
                    l = loss(model(X),y)
            l.backward()
            opt.step()
        if sched: sched.step()
    return model.eval()

### Hyperparameter Sweeps

In [15]:
def local(device):
    """Leave-one-participant-out sweep over ResNet depth, width and normalisation.

    Reads the module-level ``df``.  Features are every column from index 7 on;
    the binary label is 1 where ``experience_id != 1`` and ``phase_id == 1``.
    For each (depth, width, norm) combination a fresh ``ResNet`` is trained
    with COCOB (batch size 24, 3 epochs) on all participants but one and
    scored on the held-out participant.  The raw logits of all held-out
    participants are pooled into a single ROC AUC per combination.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, auc)`` tuples, one per combination.
    """
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: the folds are the same for every configuration.
    participants = sorted(set(df.participant_id))

    outs = []
    for d,w,n in product([2,4,6],[8,16,24],['l','b',None]):
        scores,labels=[],[]
        for pid in participants:
            # One comparison per fold; its negation selects the training rows.
            is_tst = df.participant_id == pid
            X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
            X_tst,y_tst = X_all[is_tst],y_all[is_tst]

            # Input width follows the feature matrix instead of a hard-coded 13.
            model = ResNet(X_all.shape[1],1,depth=d,width=w,norm=n).to(device)
            loss  = torch.nn.BCEWithLogitsLoss()
            opt   = parameterfree.COCOB(model.parameters())

            model = train_model(X_trn,y_trn,model,opt,None,loss,24,3,device)
            with torch.no_grad():
                # flatten() rather than squeeze(): for a one-row fold squeeze()
                # gives a 0-d tensor whose tolist() is a bare float, which
                # list.extend() rejects.
                scores.extend(model(X_tst.to(device)).flatten().tolist())
                labels.extend(y_tst.flatten().tolist())

        outs.append((d,w,n,roc_auc_score(labels,scores)))
    return outs
# Run the sweep on CPU and list the (depth, width, norm, AUC) rows, best AUC first.
outs1 = local('cpu')
from operator import itemgetter
sorted(outs1,key=itemgetter(-1),reverse=True)

[(4, 24, 'b', 0.6274477844250891),
 (4, 16, 'l', 0.6249837388295325),
 (6, 24, 'b', 0.6247434561223881),
 (2, 16, 'b', 0.6241366275021158),
 (2, 16, 'l', 0.6236882018129866),
 (6, 16, 'b', 0.6228793520631408),
 (2, 24, None, 0.6227844633507652),
 (6, 8, None, 0.6224531180890026),
 (2, 24, 'b', 0.6212180343650184),
 (4, 16, 'b', 0.6212126777441586),
 (6, 16, None, 0.6209134722075553),
 (6, 16, 'l', 0.6198742877607335),
 (6, 24, None, 0.618057628057674),
 (4, 24, None, 0.6173256840787453),
 (4, 16, None, 0.616851240516868),
 (2, 24, 'l', 0.6157642290980828),
 (2, 16, None, 0.612146214322992),
 (4, 24, 'l', 0.6117957382724439),
 (6, 8, 'b', 0.6112187536826769),
 (6, 24, 'l', 0.6107703279935476),
 (2, 8, None, 0.6097028299793235),
 (4, 8, 'l', 0.6047758866355369),
 (4, 8, None, 0.6010951228732302),
 (4, 8, 'b', 0.6002973689808799),
 (2, 8, 'l', 0.5992711934704322),
 (2, 8, 'b', 0.5971844070297231),
 (6, 8, 'l', 0.5901519290722179)]

In [5]:
def local(device):
    """Leave-one-participant-out sweep over ResNetDropout depth, width, dropout and epochs.

    Reads the module-level ``df``.  Features are every column from index 7 on;
    the binary label is 1 where ``experience_id != 1`` and ``phase_id == 1``.
    For each (depth, width, hidden dropout, epochs) combination a fresh
    ``ResNetDropout`` (input dropout fixed at 0.2) is trained with COCOB
    (batch size 24) on all participants but one and scored on the held-out
    participant.  The raw logits of all held-out participants are pooled into
    a single ROC AUC per combination.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, hidden_dropout, epochs, auc)`` tuples.
    """
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: the folds are the same for every configuration.
    participants = sorted(set(df.participant_id))

    outs = []
    for d,w,n,e in product([4,6,8],[16,24,32],[.2,.4],[3,6]):
        scores,labels=[],[]
        for pid in participants:
            # One comparison per fold; its negation selects the training rows.
            is_tst = df.participant_id == pid
            X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
            X_tst,y_tst = X_all[is_tst],y_all[is_tst]

            # Input width follows the feature matrix instead of a hard-coded 13.
            model = ResNetDropout(X_all.shape[1],1,depth=d,width=w,drop=[.2,n]).to(device)
            loss  = torch.nn.BCEWithLogitsLoss()
            opt   = parameterfree.COCOB(model.parameters())

            model = train_model(X_trn,y_trn,model,opt,None,loss,24,e,device)
            with torch.no_grad():
                # flatten() rather than squeeze(): for a one-row fold squeeze()
                # gives a 0-d tensor whose tolist() is a bare float, which
                # list.extend() rejects.
                scores.extend(model(X_tst.to(device)).flatten().tolist())
                labels.extend(y_tst.flatten().tolist())

        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
# Run the dropout sweep on CPU and list the (depth, width, dropout, epochs, AUC) rows, best AUC first.
outs2 = local('cpu')
from operator import itemgetter
sorted(outs2,key=itemgetter(-1),reverse=True)

[(6, 32, 0.2, 6, 0.6316634450418352),
 (4, 32, 0.4, 6, 0.6297855668146625),
 (4, 16, 0.2, 3, 0.628193885187719),
 (6, 32, 0.4, 6, 0.6228797346789166),
 (8, 32, 0.2, 6, 0.6222977760840653),
 (6, 24, 0.4, 6, 0.620748947424001),
 (8, 16, 0.2, 6, 0.6207160424672902),
 (8, 16, 0.4, 6, 0.6206747199635138),
 (6, 32, 0.2, 3, 0.6193355647485372),
 (4, 16, 0.2, 6, 0.6183920342456423),
 (8, 24, 0.2, 6, 0.6179512608720272),
 (8, 32, 0.4, 6, 0.6177836751622673),
 (6, 24, 0.2, 3, 0.6177779359256317),
 (4, 24, 0.4, 6, 0.6169564598551875),
 (6, 16, 0.2, 3, 0.6167865784507734),
 (8, 24, 0.4, 6, 0.6156701056172588),
 (6, 32, 0.4, 3, 0.6155859301466031),
 (6, 24, 0.2, 6, 0.6155407814850696),
 (8, 32, 0.4, 3, 0.6148130462796737),
 (4, 24, 0.2, 6, 0.6140524061175672),
 (8, 32, 0.2, 3, 0.6133797675838731),
 (6, 24, 0.4, 3, 0.6133782371207703),
 (6, 16, 0.2, 6, 0.6129183329583698),
 (4, 32, 0.2, 3, 0.6120635693154391),
 (4, 24, 0.4, 3, 0.6104190867114481),
 (8, 16, 0.2, 3, 0.6104175562483452),
 (6, 16, 0.4, 

In [6]:
def local(device):
    """Repeated leave-one-participant-out sweep over ResNet width and epochs (depth 4, BatchNorm).

    Reads the module-level ``df``.  Features are every column from index 7 on;
    the binary label is 1 where ``experience_id != 1`` and ``phase_id == 1``.
    For each (depth, width, norm, epochs) combination the full
    leave-one-participant-out loop is run 3 times with freshly initialised
    ``ResNet`` models trained with COCOB (batch size 24).  The raw logits of
    all held-out folds from all 3 repeats are pooled into a single ROC AUC.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, epochs, auc)`` tuples.
    """
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: the folds are the same for every configuration and repeat.
    participants = sorted(set(df.participant_id))

    outs = []
    for d,w,n,e in product([4],[16,24],['b'],[3,4,5]):
        scores,labels=[],[]
        for _ in range(3):
            for pid in participants:
                # One comparison per fold; its negation selects the training rows.
                is_tst = df.participant_id == pid
                X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
                X_tst,y_tst = X_all[is_tst],y_all[is_tst]

                # Input width follows the feature matrix instead of a hard-coded 13.
                model = ResNet(X_all.shape[1],1,depth=d,width=w,norm=n).to(device)
                loss  = torch.nn.BCEWithLogitsLoss()
                opt   = parameterfree.COCOB(model.parameters())

                model = train_model(X_trn,y_trn,model,opt,None,loss,24,e,device)
                with torch.no_grad():
                    # flatten() rather than squeeze(): for a one-row fold
                    # squeeze() gives a 0-d tensor whose tolist() is a bare
                    # float, which list.extend() rejects.
                    scores.extend(model(X_tst.to(device)).flatten().tolist())
                    labels.extend(y_tst.flatten().tolist())
        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
# Run the repeated sweep on CPU and list the (depth, width, norm, epochs, AUC) rows, best AUC first.
outs3 = local('cpu')
from operator import itemgetter
sorted(outs3,key=itemgetter(-1),reverse=True)

[(4, 16, 'b', 5, 0.6304536139590479),
 (4, 24, 'b', 5, 0.6247887748353774),
 (4, 16, 'b', 4, 0.6229591487088078),
 (4, 24, 'b', 3, 0.6180165181182173),
 (4, 24, 'b', 4, 0.6167185578684254),
 (4, 16, 'b', 3, 0.6072754304725066)]

In [7]:
def local(device):
    """Repeated leave-one-participant-out sweep over ResNet depth and epochs (width 16, BatchNorm).

    Reads the module-level ``df``.  Features are every column from index 7 on;
    the binary label is 1 where ``experience_id != 1`` and ``phase_id == 1``.
    For each (depth, width, norm, epochs) combination the full
    leave-one-participant-out loop is run 3 times with freshly initialised
    ``ResNet`` models trained with COCOB (batch size 24).  The raw logits of
    all held-out folds from all 3 repeats are pooled into a single ROC AUC.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, epochs, auc)`` tuples.
    """
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: the folds are the same for every configuration and repeat.
    participants = sorted(set(df.participant_id))

    outs = []
    for d,w,n,e in product([4,5],[16],['b'],[6,7]):
        scores,labels=[],[]
        for _ in range(3):
            for pid in participants:
                # One comparison per fold; its negation selects the training rows.
                is_tst = df.participant_id == pid
                X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
                X_tst,y_tst = X_all[is_tst],y_all[is_tst]

                # Input width follows the feature matrix instead of a hard-coded 13.
                model = ResNet(X_all.shape[1],1,depth=d,width=w,norm=n).to(device)
                loss  = torch.nn.BCEWithLogitsLoss()
                opt   = parameterfree.COCOB(model.parameters())

                model = train_model(X_trn,y_trn,model,opt,None,loss,24,e,device)
                with torch.no_grad():
                    # flatten() rather than squeeze(): for a one-row fold
                    # squeeze() gives a 0-d tensor whose tolist() is a bare
                    # float, which list.extend() rejects.
                    scores.extend(model(X_tst.to(device)).flatten().tolist())
                    labels.extend(y_tst.flatten().tolist())
        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
# Run the sweep on CPU and list the rows, best AUC first.
# NOTE(review): this rebinds `outs3`, discarding the results stored under the same name by the previous sweep cell.
outs3 = local('cpu')
from operator import itemgetter
sorted(outs3,key=itemgetter(-1),reverse=True)

[(4, 16, 'b', 6, 0.6272364954911707),
 (5, 16, 'b', 7, 0.624532804881429),
 (4, 16, 'b', 7, 0.619482446693545),
 (5, 16, 'b', 6, 0.6168667151993521)]

In [522]:
def local(device):
    """Quick single-fold check of ResNet depth and epochs, scoring with sigmoid probabilities.

    Reads the module-level ``df``.  Features are every column from index 7 on;
    the binary label is 1 where ``experience_id != 1`` and ``phase_id == 1``.
    Only the first participant (in sorted order) is held out, once, so each
    reported ROC AUC comes from that single participant's rows.  Scores are
    the sigmoid of the model's logits.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, epochs, auc)`` tuples.
    """
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    sigmoid = torch.nn.Sigmoid()

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: only the first participant is used as a test fold.
    participants = sorted(set(df.participant_id))[:1]

    outs = []
    for d,w,n,e in product([4,5],[16],['b'],[6,7]):
        scores,labels=[],[]
        for _ in range(1):
            for pid in participants:
                # One comparison per fold; its negation selects the training rows.
                is_tst = df.participant_id == pid
                X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
                X_tst,y_tst = X_all[is_tst],y_all[is_tst]

                # Input width follows the feature matrix instead of a hard-coded 13.
                model = ResNet(X_all.shape[1],1,depth=d,width=w,norm=n).to(device)
                loss  = torch.nn.BCEWithLogitsLoss()
                opt   = parameterfree.COCOB(model.parameters())

                model = train_model(X_trn,y_trn,model,opt,None,loss,24,e,device)
                with torch.no_grad():
                    # flatten() rather than squeeze(): for a one-row fold
                    # squeeze() gives a 0-d tensor whose tolist() is a bare
                    # float, which list.extend() rejects.
                    scores.extend(sigmoid(model(X_tst.to(device))).flatten().tolist())
                    labels.extend(y_tst.flatten().tolist())
        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
# Run the single-fold check on CPU and list the rows, best AUC first.
# NOTE(review): `outs3` is rebound again here; the `local` defined in this cell scores one held-out
# participant only (`[:1]`), so these AUCs are not comparable to the pooled ones from earlier cells.
outs3 = local('cpu')
from operator import itemgetter
sorted(outs3,key=itemgetter(-1),reverse=True)

[(5, 16, 'b', 7, 0.7040816326530612),
 (4, 16, 'b', 7, 0.5918367346938775),
 (4, 16, 'b', 6, 0.3137755102040816),
 (5, 16, 'b', 6, 0.24744897959183673)]

In [148]:
import timeit


def func1():
    """Benchmark body: one COCOB update applied to module-level tensors, then print ``data``.

    Performs the same sequence of tensor operations as the update inside
    ``COCOB.step``, with the global ``data`` standing in for ``p.data`` and
    the global ``alpha`` for ``self._alpha``.  Mutates the globals ``L``,
    ``sum_negative_gradients``, ``grad_norm_sum``, ``reward`` and ``data`` in
    place; ``grad`` and ``x0`` are only read.  Takes no arguments so it can
    be handed directly to ``timeit.timeit``.
    """
    # running maximum of |grad|
    torch.max(L, torch.abs(grad), out=L)
    # running sum of negative gradients
    sum_negative_gradients.sub_(grad)
    # running sum of |grad|
    grad_norm_sum.add_(torch.abs(grad))
    # reward -= grad * (data - x0), floored at zero on the next line
    reward.addcmul_(grad, data.sub(x0), value=-1)
    torch.maximum(reward, torch.zeros_like(reward), out=reward)
    den = torch.maximum(grad_norm_sum.add(L), L.mul(alpha)).mul(L)
    # new iterate is recomputed from scratch as an offset from x0
    data.copy_(reward.add(L).mul(sum_negative_gradients).div(den).add(x0))
    print(data)
    
def func2():
    """Benchmark body: alternative formulation of the update in ``func1``, then print ``data``.

    Differences from ``func1`` visible here: ``abs(grad)`` is computed once,
    the reward is floored with ``clamp_(0)``, and ``data`` is moved by
    ``new - old`` using the global ``old`` (the previous offset) instead of
    being rebuilt from ``x0``.  Mutates the globals ``L``,
    ``sum_negative_gradients``, ``grad_norm_sum``, ``reward``, ``data`` and
    ``old`` in place; ``grad`` is only read.
    """
    absgrad = torch.abs(grad)
    torch.maximum(L, absgrad, out=L)
    sum_negative_gradients.sub_(grad)
    grad_norm_sum.add_(absgrad)
    # `old` takes the place of func1's data.sub(x0)
    reward.sub_(grad*old).clamp_(0)
    den = torch.maximum(grad_norm_sum+L, L*alpha)*L
    new = reward.add(L)*sum_negative_gradients.div(den)
    # replace the previous offset by the new one, then remember it for the next call
    data.sub_(old).add_(new)
    old.copy_(new)
    print(data)

# NOTE(review): `state` and `p` are not defined in this cell; they must already exist in the kernel
# (presumably the `(p, state)` pair returned by the instrumented COCOB.step -- confirm).
# Every tensor is cloned before timing, so func1/func2 mutate copies and leave `state`/`p` untouched.
sum_negative_gradients, grad_norm_sum, L, reward, x0 = state['sum_negative_gradients'],state['grad_norm_sum'],state['L'],state['reward'],state['x0']
data,grad,sum_negative_gradients = torch.clone(p.data),torch.clone(p.grad),torch.clone(sum_negative_gradients)
grad_norm_sum,L,reward,x0 = torch.clone(grad_norm_sum),torch.clone(L),torch.clone(reward),torch.clone(x0)
# Z and O are not referenced by func1 or func2.
alpha,Z,O = 100,torch.zeros_like(reward),torch.ones_like(reward)
# number=3 applies three successive updates to the same clones (one printed tensor per call).
print(timeit.timeit(func1,number=3))

# Reset to fresh clones of the same starting state so func2 begins where func1 began.
sum_negative_gradients, grad_norm_sum, L, reward, x0 = state['sum_negative_gradients'],state['grad_norm_sum'],state['L'],state['reward'],state['x0']
data,grad,sum_negative_gradients = torch.clone(p.data),torch.clone(p.grad),torch.clone(sum_negative_gradients)
grad_norm_sum,L,reward,x0 = torch.clone(grad_norm_sum),torch.clone(L),torch.clone(reward),torch.clone(x0)
# func2 tracks the current offset from x0 explicitly.
old = p.data-x0
alpha,Z = 100,torch.zeros_like(reward)
print(timeit.timeit(func2,number=3))

tensor([-0.2075])
tensor([-0.2182])
tensor([-0.2298])
0.0032805311493575573
tensor([-0.2075])
tensor([-0.2182])
tensor([-0.2298])
0.0017711040563881397


In [304]:
%%time
from torch.utils.data import TensorDataset, DataLoader
from itertools import islice

def train_model(Xs, ys, model, opt, sched, loss, batch=8, epoch=1, device='cpu',autotype=None):
    """Train ``model`` in place with mini-batches in fixed order and return it in eval mode.

    Batches are taken in dataset order (``shuffle=False``), so two runs that
    start from identical models see identical batches.

    Args:
        Xs: feature tensor; first dimension indexes samples.
        ys: target tensor with the same first dimension as ``Xs``.
        model: ``torch.nn.Module`` to train (already on ``device``).
        opt: optimizer over ``model``'s parameters.
        sched: optional scheduler, stepped once per epoch; falsy to disable.
        loss: callable ``loss(prediction, target)`` returning a scalar tensor.
        batch: mini-batch size; a trailing incomplete batch is dropped.
        epoch: number of passes over the data.
        device: device string (``'cpu'``, ``'cuda'``, ``'cuda:0'``, ...) or
            ``torch.device`` that each batch is moved to.
        autotype: optional dtype; when given the forward pass and the loss
            run under ``torch.autocast`` with that dtype.

    Returns:
        The same ``model`` object, switched to eval mode.
    """
    # autocast wants the bare device *type* ('cuda'), not 'cuda:0' or a
    # torch.device object; the pin_memory test needs the same normalisation.
    device_type = torch.device(device).type

    loader = DataLoader(TensorDataset(Xs,ys),batch_size=batch,pin_memory=(device_type!='cpu'),drop_last=True,shuffle=False)

    # This function hands back model.eval(); make sure a model that is passed
    # in again still trains with BatchNorm/Dropout in training mode.
    model.train()

    for _ in range(epoch):
        for X,y in loader:
            opt.zero_grad()
            X,y = X.to(device),y.to(device)
            if not autotype:
                l = loss(model(X),y)
            else:
                with torch.autocast(device_type=device_type,dtype=autotype):
                    l = loss(model(X),y)
            l.backward()
            opt.step()
        if sched: sched.step()
    return model.eval()

def local(device):
    """Leave-one-participant-out baseline with ``parameterfree.COCOB`` from a fixed initial model.

    Reads the module-level ``df`` and the module-level model ``M``.  Every
    fold starts from a deep copy of ``M`` and is trained for one epoch with
    batch size 16, so the ``d, w, n, e`` values from the product only label
    the output rows; they do not configure the model or the training.
    Scores are the sigmoid of the model's logits, pooled over all held-out
    participants into one ROC AUC per row.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, epochs, auc)`` tuples.
    """
    # `copy` was previously picked up from a later cell's global import.
    import copy
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    sigmoid = torch.nn.Sigmoid()

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: the folds are the same for every row.
    participants = sorted(set(df.participant_id))

    outs = []
    for d,w,n,e in product([4,5],[16],['b'],[1]):
        scores,labels=[],[]
        for _ in range(1):
            for pid in participants:
                # One comparison per fold; its negation selects the training rows.
                is_tst = df.participant_id == pid
                X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
                X_tst,y_tst = X_all[is_tst],y_all[is_tst]

                #model = ResNet(13,1,depth=d,width=w,norm=n).to(device)
                model = copy.deepcopy(M).to(device)
                loss  = torch.nn.BCEWithLogitsLoss()

                opt   = parameterfree.COCOB(model.parameters())
                #opt   = COCOB(model.parameters(),i=None)

                model = train_model(X_trn,y_trn,model,opt,None,loss,16,1,device)
                #model = train_model(X_trn,y_trn,model,opt,None,loss,16,e,device,autotype=torch.bfloat16)

                with torch.no_grad():
                    # flatten() rather than squeeze(): for a one-row fold
                    # squeeze() gives a 0-d tensor whose tolist() is a bare
                    # float, which list.extend() rejects.
                    scores.extend(sigmoid(model(X_tst.to(device))).flatten().tolist())
                    labels.extend(y_tst.flatten().tolist())
        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
local('cpu')

CPU times: user 39min 50s, sys: 1.2 s, total: 39min 51s
Wall time: 2min 1s


[(4, 16, 'b', 1, 0.536012562041148), (5, 16, 'b', 1, 0.536012562041148)]

In [142]:
%%time
from torch.utils.data import TensorDataset, DataLoader
from itertools import islice

def train_model(Xs, ys, model, opt, sched, loss, batch=8, epoch=1, device='cpu',autotype=None):
    """Train ``model`` in place with mini-batches in fixed order and return it in eval mode.

    Batches are taken in dataset order (``shuffle=False``), so two runs that
    start from identical models see identical batches.

    Args:
        Xs: feature tensor; first dimension indexes samples.
        ys: target tensor with the same first dimension as ``Xs``.
        model: ``torch.nn.Module`` to train (already on ``device``).
        opt: optimizer over ``model``'s parameters.
        sched: optional scheduler, stepped once per epoch; falsy to disable.
        loss: callable ``loss(prediction, target)`` returning a scalar tensor.
        batch: mini-batch size; a trailing incomplete batch is dropped.
        epoch: number of passes over the data.
        device: device string (``'cpu'``, ``'cuda'``, ``'cuda:0'``, ...) or
            ``torch.device`` that each batch is moved to.
        autotype: optional dtype; when given the forward pass and the loss
            run under ``torch.autocast`` with that dtype.

    Returns:
        The same ``model`` object, switched to eval mode.
    """
    # autocast wants the bare device *type* ('cuda'), not 'cuda:0' or a
    # torch.device object; the pin_memory test needs the same normalisation.
    device_type = torch.device(device).type

    loader = DataLoader(TensorDataset(Xs,ys),batch_size=batch,pin_memory=(device_type!='cpu'),drop_last=True,shuffle=False)

    # This function hands back model.eval(); make sure a model that is passed
    # in again still trains with BatchNorm/Dropout in training mode.
    model.train()

    for _ in range(epoch):
        for X,y in loader:
            opt.zero_grad()
            X,y = X.to(device),y.to(device)
            if not autotype:
                l = loss(model(X),y)
            else:
                with torch.autocast(device_type=device_type,dtype=autotype):
                    l = loss(model(X),y)
            l.backward()
            opt.step()
        if sched: sched.step()
    return model.eval()

def local(device):
    """Three-fold check of the local ``COCOB2`` optimizer from a fixed initial model.

    Reads the module-level ``df`` and the module-level model ``M``.  Only the
    first three participants (in sorted order) are held out.  Every fold
    starts from a deep copy of ``M`` and is trained for one epoch with batch
    size 16 using ``COCOB2``, so the ``d, w, n, e`` values from the product
    only label the output rows; they do not configure the model or the
    training.  Scores are the sigmoid of the model's logits, pooled over the
    held-out folds into one ROC AUC per row.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, epochs, auc)`` tuples.
    """
    # `copy` was previously picked up from a later cell's global import.
    import copy
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    sigmoid = torch.nn.Sigmoid()

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: only the first three participants are used as test folds.
    participants = sorted(set(df.participant_id))[:3]

    outs = []
    for d,w,n,e in product([4,5],[16],['b'],[1]):
        scores,labels=[],[]
        for _ in range(1):
            for pid in participants:
                # One comparison per fold; its negation selects the training rows.
                is_tst = df.participant_id == pid
                X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
                X_tst,y_tst = X_all[is_tst],y_all[is_tst]

                #model = ResNet(13,1,depth=d,width=w,norm=n).to(device)
                model = copy.deepcopy(M).to(device)
                loss  = torch.nn.BCEWithLogitsLoss()

                #opt   = parameterfree.COCOB(model.parameters())
                opt   = COCOB2(model.parameters())

                model = train_model(X_trn,y_trn,model,opt,None,loss,16,1,device)
                #model = train_model(X_trn,y_trn,model,opt,None,loss,16,e,device,autotype=torch.bfloat16)

                with torch.no_grad():
                    # flatten() rather than squeeze(): for a one-row fold
                    # squeeze() gives a 0-d tensor whose tolist() is a bare
                    # float, which list.extend() rejects.
                    scores.extend(sigmoid(model(X_tst.to(device))).flatten().tolist())
                    labels.extend(y_tst.flatten().tolist())
        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
local('cpu')

-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-

[(4, 16, 'b', 1, 0.478448275862069), (5, 16, 'b', 1, 0.478448275862069)]

In [105]:
# Micro-benchmark: out-of-place add vs. sub on two small integer tensors (100000 calls each).
t1 = torch.tensor([1,2])
t2 = torch.tensor([3,4])

import timeit

print(timeit.timeit(lambda:t1.add(t2),number=100000))
print(timeit.timeit(lambda:t1.sub(t2),number=100000))

# Element-wise maximum of the two tensors; displayed as the cell output.
torch.maximum(t1,t2)
#t1.clamp(min=t2)

0.27971268631517887
0.2534223939292133


tensor([3, 4])

In [None]:
# Reference model shared by the optimizer-comparison cells: the `local` functions that call
# `copy.deepcopy(M)` read both `copy` and `M` from the kernel's globals, so this cell must run first.
import copy
M = ResNet(13,1,depth=4,width=16,norm='b')
# Displays a copy of the model as the cell output.
copy.deepcopy(M)


In [19]:
%%time
def local(device):
    """Quick single-fold run of ``ResNet2`` trained with ``BCELoss``.

    Reads the module-level ``df``.  Features are every column from index 7 on;
    the binary label is 1 where ``experience_id != 1`` and ``phase_id == 1``.
    Only the first participant (in sorted order) is held out, once.  The
    model's outputs are used directly as scores.
    NOTE(review): ``BCELoss`` expects probabilities, so ``ResNet2`` presumably
    ends in a sigmoid -- it is not defined in this part of the notebook; confirm.

    Args:
        device: torch device the models are trained and evaluated on.

    Returns:
        List of ``(depth, width, norm, epochs, auc)`` tuples.
    """
    import parameterfree
    from itertools import product
    from sklearn.metrics import roc_auc_score

    X_all = torch.tensor(df.iloc[:,7:].to_numpy()).float()
    y_all = torch.tensor(((df['experience_id'] != 1) & (df['phase_id'] == 1)).astype(int).to_numpy())[:,None].float()

    # Loop-invariant: only the first participant is used as a test fold.
    participants = sorted(set(df.participant_id))[:1]

    outs = []
    for d,w,n,e in product([4,5],[16],['b'],[5]):
        scores,labels=[],[]
        for _ in range(1):
            for pid in participants:
                # One comparison per fold; its negation selects the training rows.
                is_tst = df.participant_id == pid
                X_trn,y_trn = X_all[~is_tst],y_all[~is_tst]
                X_tst,y_tst = X_all[is_tst],y_all[is_tst]

                # Input width follows the feature matrix instead of a hard-coded 13.
                model = ResNet2(X_all.shape[1],1,depth=d,width=w,norm=n).to(device)
                loss  = torch.nn.BCELoss()
                opt   = parameterfree.COCOB(model.parameters())

                model = train_model(X_trn,y_trn,model,opt,None,loss,24,e,device)
                with torch.no_grad():
                    # Move the held-out rows to the model's device (the other
                    # sweeps do; this one would fail off-CPU).  flatten()
                    # rather than squeeze() keeps a one-row fold a list.
                    scores.extend(model(X_tst.to(device)).flatten().tolist())
                    labels.extend(y_tst.flatten().tolist())
        outs.append((d,w,n,e,roc_auc_score(labels,scores)))
    return outs
outs3 = local('cpu')

CPU times: user 4min, sys: 163 ms, total: 4min
Wall time: 12.7 s


In [39]:
from torch.utils.data import TensorDataset, DataLoader
def train_model(Xs, ys, model, opt, sched, loss, batch=8, epoch=1, device='cpu'):
    """Run forward/backward passes over shuffled mini-batches WITHOUT updating the model.

    The optimizer calls are commented out below, so ``opt`` is never used:
    parameters stay at their initial values and gradients accumulate across
    batches.  ``device`` is accepted but not used either; batches are fed to
    the model as they come from the loader.

    Args:
        Xs: feature tensor; first dimension indexes samples.
        ys: target tensor with the same first dimension as ``Xs``.
        model: ``torch.nn.Module`` to run.
        opt: unused.
        sched: optional scheduler, stepped once per epoch; falsy to disable.
        loss: callable ``loss(prediction, target)`` returning a scalar tensor.
        batch: mini-batch size; a trailing incomplete batch is dropped.
        epoch: number of passes over the data.
        device: unused.

    Returns:
        The same ``model`` object, switched to eval mode.
    """
    batches = DataLoader(TensorDataset(Xs,ys),batch_size=batch,shuffle=True,drop_last=True)
    for _epoch in range(epoch):
        for features,targets in batches:
            #opt.zero_grad()
            batch_loss = loss(model(features),targets)
            batch_loss.backward()
            #opt.step()
        if sched:
            sched.step()
    return model.eval()

In [291]:
# Copyright (c) Francesco Orabona.
# All rights reserved.
# 
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch.optim.optimizer import Optimizer

class COCOB(Optimizer):
    r"""Implements COCOB algorithm.
    It has been proposed in `Training Deep Networks without Learning Rates Through Coin Betting`_.
    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        alpha (float, optional): It was proposed to increase the stability in the first iterations,
            similarly and independently to the learning rate warm-up. The number roughly denotes the
            number of rounds of warm-up (default 100)
        eps (float, optional): initial value of the per-coordinate gradient scale ``L``,
            which keeps the denominator non-zero (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        i (int or None, optional): debugging aid. On every step the gradient of the
            ``i``-th parameter that has a gradient (1-based, counted across all
            parameter groups) is printed; pass ``None`` to disable (default: 4)
    .. _Training Deep Networks without Learning Rates Through Coin Betting:
        https://arxiv.org/abs/1705.07795
    """

    def __init__(self, params, alpha: float = 100, eps: float = 1e-8, weight_decay: float = 0,i=4):
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= alpha:
            raise ValueError("Invalid alpha value: {}".format(alpha))
        if not 0.0 <= weight_decay:
            raise ValueError(
                "Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(weight_decay=weight_decay)
        self._alpha = alpha
        self._eps = eps
        # index of the parameter whose gradient is printed (None: never)
        self._i = i
        # number of step() calls so far
        self._s = 0

        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            The loss returned by ``closure``, or ``None`` when no closure is given.
        """
        self._s += 1
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # 1-based running index over parameters that have a gradient.
        i = 0
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                i+=1
                grad = p.grad
                if grad.is_sparse:
                    raise RuntimeError('COCOB does not support sparse gradients')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    # Sum of the negative gradients
                    state['sum_negative_gradients'] = torch.zeros_like(p).detach()
                    # Sum of the absolute values of the stochastic subgradients
                    state['grad_norm_sum'] = torch.zeros_like(p).detach()
                    # Maximum observed scale
                    state['L'] = self._eps*torch.ones_like(p).detach()
                    # Reward/wealth of the algorithm for each coordinate
                    state['reward'] = torch.zeros_like(p).detach()
                    # We need to save the initial point because this is a FTRL-based algorithm
                    state['x0'] = torch.clone(p.data).detach()

                sum_negative_gradients, grad_norm_sum, L, reward, x0 = (
                    state['sum_negative_gradients'],
                    state['grad_norm_sum'],
                    state['L'],
                    state['reward'],
                    state['x0'],
                )

                if group['weight_decay'] != 0:
                    grad = grad.add(p, alpha=group['weight_decay'])

                abs_grad = torch.abs(grad)

                # update maximum range of the gradients
                torch.max(L, abs_grad, out=L)
                # update dual vector
                sum_negative_gradients.sub_(grad)
                # update sum of the absolute values of the gradients
                grad_norm_sum.add_(abs_grad)
                # update the wealth
                reward.addcmul_(grad, p.data.sub(x0), value=-1)
                # reset the wealth to zero in case we lost all
                torch.maximum(reward, torch.zeros_like(reward), out=reward)
                # calculate denominator
                den = torch.maximum(grad_norm_sum.add(L), L.mul(self._alpha)).mul(L)
                # update model parameters
                p.data.copy_(reward.add(L).mul(sum_negative_gradients).div(den).add(x0))

                if i == self._i:
                    print(grad)

        # Return only after every parameter group has been processed. The
        # previous `return p,state` inside the group loop stopped after the
        # first group and never returned the loss.
        return loss

In [143]:
# Copyright (c) Francesco Orabona.
# All rights reserved.
# 
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import torch
from torch.optim.optimizer import Optimizer

class COCOB2(Optimizer):
    r"""COCOB optimizer: learning-rate-free training through coin betting.

    Compact, per-coordinate implementation of the algorithm proposed in
    `Training Deep Networks without Learning Rates Through Coin Betting`_.

    Arguments:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        alpha (float, optional): the update denominator is never smaller than
            ``alpha * L**2``, which damps the first iterations much like a
            learning-rate warm-up; the value roughly denotes the number of
            warm-up rounds (default: 100)
        eps (float, optional): initial value of ``L``, the running maximum of
            the absolute gradient; a positive value keeps the denominator
            away from zero (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

    The state kept for each parameter is a list of five tensors, in order:
    the sum of negated gradients, the sum of absolute gradients, ``L``, the
    accumulated reward (clamped at zero) and the parameter's initial value.

    .. _Training Deep Networks without Learning Rates Through Coin Betting:
        https://arxiv.org/abs/1705.07795
    """

    def __init__(self, params, alpha: float = 100, eps: float = 1e-8, weight_decay: float = 0):
        # Validate in the same order as the arguments are reported: eps, alpha, weight_decay.
        checks = (
            (eps, "Invalid epsilon value: {}"),
            (alpha, "Invalid alpha value: {}"),
            (weight_decay, "Invalid weight_decay value: {}"),
        )
        for value, template in checks:
            if not 0.0 <= value:
                raise ValueError(template.format(value))

        # alpha and eps are optimizer-wide; only weight_decay is a per-group option.
        self._alpha = alpha
        self._eps = eps

        super().__init__(params, {'weight_decay': weight_decay})

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one COCOB update to every parameter that has a gradient.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss. It is run with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter's gradient is sparse.
        """

        loss = None
        if closure:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            decay = group['weight_decay']

            for p in group['params']:

                # Parameters that did not take part in the backward pass are left untouched.
                if p.grad is None:
                    continue

                grad = p.grad
                data = p.data

                if grad.is_sparse:
                    raise RuntimeError('COCOB does not support sparse gradients')

                # L2 penalty: fold decay * p into the gradient (out of place, p.grad is not modified).
                if decay:
                    grad = grad.add(p, alpha=decay)

                slots = self.state[p]
                if not slots:
                    # First update for this parameter: build its five state tensors.
                    slots = [
                        torch.zeros_like(p).detach(),               # sum of negated gradients
                        torch.zeros_like(p).detach(),               # sum of absolute gradients
                        torch.ones_like(p).detach()*self._eps,      # L, running max of |grad|
                        torch.zeros_like(p).detach(),               # reward
                        torch.clone(p.data).detach(),               # initial parameter value
                    ]
                    self.state[p] = slots

                neg_grad_sum, abs_grad_sum, scale, wealth, origin = slots

                magnitude = torch.abs(grad)

                # All state tensors are updated in place so the list in self.state stays current.
                scale.clamp_(min=magnitude)
                neg_grad_sum -= grad
                abs_grad_sum += magnitude

                # reward += grad * (x0 - x), then keep it non-negative.
                displacement = origin - data
                wealth.addcmul_(grad, displacement)
                wealth.clamp_(min=0)

                # Denominator: max(sum|g| + L, alpha * L) * L.
                denom = torch.maximum(abs_grad_sum + scale, scale * self._alpha) * scale

                # New parameter value is the initial value plus the current bet.
                bet = (wealth + scale) * neg_grad_sum / denom
                data.copy_(bet + origin)

        return loss

In [53]:
# Scratch check: adding a Python scalar to a float tensor is applied to every element.
torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32) + 1

tensor([[2., 3., 4.],
        [5., 6., 7.]])

In [134]:
# Sanity check: parameter gradients are None before backward() and, once
# populated, live on the same device as the model.
# Fall back to the CPU so the cell still runs on machines without a CUDA device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Linear(3,1).to(device)

# No backward pass has run yet, so every parameter's .grad is still None.
for p in model.parameters():
    print(p.grad)

# backward() is called without an explicit gradient, which PyTorch only
# permits when the output holds a single element.
model(torch.tensor([[1,2,3]],device=device).float()).backward()

for p in model.parameters():
    print(p.grad)

# get_device() reports the CUDA device index of each gradient (-1 for CPU tensors).
for p in model.parameters():
    print(p.grad.get_device())

None
None
None
None
tensor([-0.5695,  0.0000,  0.2654], device='cuda:0')
tensor([0.4650, 0.2578, 0.2167], device='cuda:0')
tensor([[-1.2247,  0.0000,  1.2247]], device='cuda:0')
tensor([1.], device='cuda:0')
0
0
0
0
