In [2]:
import torch
import torch.nn as nn
import torchvision
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
import matplotlib.pyplot as plt

device = torch.device("cuda:1" if torch.cuda.is_available else "cpu")

In [3]:
input_size = 784
hidden1_size = 1024
hidden2_size = 1024
hidden3_size = 1024
output_size = 10

initial_enable = True

num_classes = 10
batch_size = 128
num_epochs = 200
batch_size_for_train = 128
learning_rate = 0.0001

In [4]:
train_dataset = dsets.MNIST(root='../dataset',
                            train=True,
                            download=True,
                            transform=transforms.Compose([transforms.ToTensor(),
                                                         transforms.Normalize((0.1307,),(0.3081,))])
                           )

test_dataset = dsets.MNIST(root='../dataset',
                           train=False,
                           transform=transforms.Compose([transforms.ToTensor(),
                                                          transforms.Normalize((0.1307,),(0.3081,))]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [5]:
class SPLDataset(torch.utils.data.Dataset):
    def __init__(self, original_dataset, initial_prob):
        self.original_dataset = original_dataset
        self.initial_prob = initial_prob
        self.distrib = torch.distributions.bernoulli.Bernoulli(probs=self.initial_prob)
        self.mask_tensor = self.distrib.sample(sample_shape=torch.Size([self.__len__()]))
#        self.mask_tensor.requires_grad_(True)
        
    def __getitem__(self, index):
        data, target = self.original_dataset[index]
        not_masked = self.mask_tensor[index]
        
        return data, target, not_masked
    
    def __len__(self):
        return self.original_dataset.__len__()
    
    def m_len(self):
        return self.mask_tensor.sum(dim=0)

In [6]:
train_dataset_spl = SPLDataset(train_dataset, initial_prob=0.01)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset_spl,
                                           batch_size=batch_size_for_train,
                                           shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                         batch_size=batch_size,
                                         shuffle=False)

In [8]:
print(torch.__version__)
# https://github.com/ufoym/imbalanced-dataset-sampler
sampler = torch.utils.data.RandomSampler(train_dataset)


0.4.1


In [9]:
class splMLP(nn.Module):
    def __init__(self):
        super(splMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.bn1 = nn.BatchNorm1d(hidden1_size)
        self.relu1 = nn.ReLU(hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.bn2 = nn.BatchNorm1d(hidden2_size)
        self.relu2 = nn.ReLU(hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, hidden3_size)
        self.bn3 = nn.BatchNorm1d(hidden3_size)
        self.relu3 = nn.ReLU(hidden3_size)
        self.fc_out = nn.Linear(hidden3_size, output_size)
        
    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.bn3(out)
        out = self.relu3(out)
        out = self.fc_out(out)

        
        return out

In [10]:
net = splMLP()
net.to(device)

splMLP(
  (fc1): Linear(in_features=784, out_features=1024, bias=True)
  (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): ReLU(inplace)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (bn2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU(inplace)
  (fc3): Linear(in_features=1024, out_features=1024, bias=True)
  (bn3): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): ReLU(inplace)
  (fc_out): Linear(in_features=1024, out_features=10, bias=True)
)

In [11]:
def cal_entropy(logits):
    softmax_prob = nn.Softmax(dim=1)
    probs = softmax_prob(logits)
    log_probs = torch.log2(probs)
    sha_entropy = torch.mul(-probs, log_probs).sum(dim=1)
    
    avg_sha_entropy = sha_entropy.mean(dim=0)
    var_sha_entropy = sha_entropy.var(dim=0)
    
    return avg_sha_entropy, var_sha_entropy

In [13]:
def model_save(model):
    model_saved_dir = './saved_models/'
    model_name = model.__class__.__name__
    model_path = model_saved_dir + model_name + '.pkl'
    
    torch.save(model.state_dict(), model_path)
    print("Best model saved!")

In [14]:
#labels_distrib = {str(c):[] for c in range(10)}
labels_distrib = dict(zip([str(c) for c in range(10)] ,[[0 for _ in range(num_epochs)] for _ in range(10)]))

def get_labels_distrib(mask_tensor, labels, epoch):
    mask_tensor_temp = mask_tensor.to(device=device, dtype=torch.uint8)
    selected_labels = labels[mask_tensor_temp].data.cpu().numpy()

    for x in selected_labels:
        labels_distrib[str(x)][epoch] += 1

In [15]:
k = 1

In [16]:
def train(epoch):
    global k
    net.train()
    correct = 0
    total = 0
    total_loss = 0
    output_entropy = 0
    mask_k = 1. / k
    datalen = 0
    
    grad_abs = 0
    loss_abs = 0
    
    for i, (inputs, labels, mask_tensor) in enumerate(train_loader):
        inputs = inputs.view(-1,784)
        inputs = inputs.to(device=device)
        labels = labels.to(device=device)
        mask_tensor = mask_tensor.to(device=device)
        
        logits = net(inputs) # logits = outputs
        expected_logits, expected_labels = torch.max(logits, 1) # argmax(softmax) = expected
        avg_outputs_entropy, var_outputs_entropy = cal_entropy(logits)
        
        total += labels.size(0)
        correct += (expected_labels==labels).sum()
        
        original_loss = criterion(logits, labels)
        if (epoch+1) == 1 and initial_enable:
            pass
        else:
            mask_tensor_update = (original_loss > mask_k).to(device=device, dtype=torch.float32)
            mask_tensor = mask_tensor_update
        mask_loss = (mask_tensor.detach()*original_loss).mean(dim=0)
        get_labels_distrib(mask_tensor, labels, epoch)
        reg_loss = (-mask_k*mask_tensor).mean(dim=0)
        spl_loss = mask_loss + reg_loss
        total_loss += mask_loss.data
        datalen += mask_tensor.sum(dim=0)
        
        optimizer.zero_grad()
        spl_loss.backward()
#        grad_abs += logits.grad.detach().abs().sum()
#        loss_abs += mask_loss.detach().abs().sum()
#        print(grad_abs, loss_abs, net.fc1.weight.grad.detach().abs().sum())
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print("<Normal Phase> Epoch [%d/%d], Step [%d/%d], Loss: %.8f"
                  %(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, mask_loss.data))
            
    correct = torch.tensor(correct, dtype=torch.float32, device=device)
    total = torch.tensor(total, dtype=torch.float32, device=device)
    accuracy = 100.*correct/total

    print("<Normal Phase> totally train at Epoch [%d/%d], Loss: %.6f, Train Acc.: %.4f" 
          %(epoch+1, num_epochs, total_loss.data, accuracy))
    print("**datalen=%d, k=%.6f" %(datalen, k))
    print("**datalen=%d" %(train_dataset_spl.m_len()))
    print("grad_abs, loss_abs, del_w_abs:")
    
    
    k = k * 1.05
    
    return accuracy.item(), total_loss, (avg_outputs_entropy.item(), var_outputs_entropy.item()), datalen

In [26]:
def test(epoch):
    net.eval()
    correct = 0
    total = 0
    total_loss = 0

    for inputs, labels in test_loader:
        inputs = inputs.view(-1,784)
        inputs = inputs.to(device)
        labels = labels.to(device)

        logits = net(inputs)
        expected_logits, expected_labels = torch.max(logits, 1)
        avg_outputs_entropy, var_outputs_entropy = cal_entropy(logits)

        total += labels.size(0)
        correct += (expected_labels == labels).sum()
        
        loss = criterion(logits, labels)
        loss = loss.mean(dim=0)
        total_loss += loss.data

    correct = torch.tensor(correct, dtype=torch.float32, device=device)
    total = torch.tensor(total, dtype=torch.float32, device=device)
    accuracy = 100.*correct/total
    
    print("<Test Phase> Accuacy of the network on the 10,000 test images:%.4f %%" % (accuracy))
        
    return accuracy.item(), total_loss, (avg_outputs_entropy.item(), var_outputs_entropy.item())

In [27]:
criterion = nn.CrossEntropyLoss(reduce=False)
#optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)



In [28]:
file_trace = dict()

train_trace = dict()
train_trace['loss']=[]
train_trace['acc']=[]
train_trace['logit']=[]
train_trace['output_entropy']=dict()
train_trace['output_entropy']['mean']=[]
train_trace['output_entropy']['var']=[]
train_trace['datalen_total']=[]
train_trace['datalen_class']=labels_distrib

validate_trace = dict()
validate_trace['datalen']=[]

test_trace = dict()
test_trace['loss']=[]
test_trace['acc']=[]
test_trace['logit']=[]
test_trace['output_entropy']=dict()
test_trace['output_entropy']['mean']=[]
test_trace['output_entropy']['var']=[]

file_trace['train']=train_trace
file_trace['validate']=validate_trace
file_trace['test']=test_trace

In [29]:
best_test_acc = 0

for epoch in range(num_epochs):
    train_acc, train_loss, train_output_entropy, datalen = train(epoch)
    test_acc, test_loss, test_output_entropy = test(epoch)
    
    if best_test_acc < test_acc:
        model_save(net)
        best_test_acc = test_acc
    
    train_trace['acc'].append(train_acc)
    train_trace['loss'].append(train_loss)
    for idx, key in enumerate(train_trace['output_entropy'].keys()):
        train_trace['output_entropy'][key].append(train_output_entropy[idx])
    train_trace['datalen_total'].append(datalen)
    
    test_trace['acc'].append(test_acc)
    test_trace['loss'].append(test_loss)
    for idx, key in enumerate(test_trace['output_entropy'].keys()):
        test_trace['output_entropy'][key].append(test_output_entropy[idx])
    
print("\n","=> Best saved model test acc.: %.4f %%" %(best_test_acc))
train_trace['datalen_class'] = labels_distrib


<Normal Phase> Epoch [1/200], Step [100/468], Loss: 0.00382417
<Normal Phase> Epoch [1/200], Step [200/468], Loss: 0.00212226
<Normal Phase> Epoch [1/200], Step [300/468], Loss: 0.00685871
<Normal Phase> Epoch [1/200], Step [400/468], Loss: 0.00000000




<Normal Phase> totally train at Epoch [1/200], Loss: 4.609850, Train Acc.: 75.1667
**datalen=620, k=1.000000
**datalen=620
grad_abs, loss_abs, del_w_abs:




<Test Phase> Accuacy of the network on the 10,000 test images:87.7100 %
Best model saved!
<Normal Phase> Epoch [2/200], Step [100/468], Loss: 0.08374874
<Normal Phase> Epoch [2/200], Step [200/468], Loss: 0.15255174
<Normal Phase> Epoch [2/200], Step [300/468], Loss: 0.10310934
<Normal Phase> Epoch [2/200], Step [400/468], Loss: 0.08227003
<Normal Phase> totally train at Epoch [2/200], Loss: 47.950111, Train Acc.: 95.0067
**datalen=4155, k=1.050000
**datalen=620
grad_abs, loss_abs, del_w_abs:
<Test Phase> Accuacy of the network on the 10,000 test images:96.7200 %
Best model saved!
<Normal Phase> Epoch [3/200], Step [100/468], Loss: 0.02379362
<Normal Phase> Epoch [3/200], Step [200/468], Loss: 0.00000000
<Normal Phase> Epoch [3/200], Step [300/468], Loss: 0.01671714
<Normal Phase> Epoch [3/200], Step [400/468], Loss: 0.04439760
<Normal Phase> totally train at Epoch [3/200], Loss: 17.698580, Train Acc.: 97.7900
**datalen=1802, k=1.102500
**datalen=620
grad_abs, loss_abs, del_w_abs:
<Tes

In [30]:
import pickle

saved_dir = './results/'
net_name = net.__class__.__name__
file_name = saved_dir + net_name +'2'+ '.pkl'

with open(file_name, 'wb') as f:
    pickle.dump(file_trace, f)

In [30]:
i = range(10)
print(i[2])

2


In [27]:
i = train_loader
for inputs, labels, mask_tensor in i:
    print(inputs)

tensor([1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
        0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
        1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1.,
        1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0.,
        1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1.,
        0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1.,
        0., 1.], grad_fn=<StackBackward>)
tensor([1., 1., 1., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0.,
        0., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 1.,
        0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0.,
        1., 0., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
        1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.

KeyboardInterrupt: 

In [64]:
x = train_dataset


In [68]:
x.__getitem__(0)

(tensor([[[-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
          [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
           -0.4242, -0.4242, -0.424

In [19]:
b = torch.distributions.bernoulli.Bernoulli(probs=0.5)
sample = b.sample(sample_shape=torch.Size([5]))
sample.requires_grad_(True)
print(sample, sample[0])

tensor([1., 1., 0., 1., 1.], requires_grad=True) tensor(1., grad_fn=<SelectBackward>)


In [32]:
a = torch.tensor([1.,2.], requires_grad=True)
print(a.grad_fn)

None


In [24]:
a = torch.tensor([1.,-5.,3.])
b = torch.tensor([0,3,1])
c = (a > b).to(device=device, dtype=torch.float32)
print(c.requires_grad)

RuntimeError: Expected object of scalar type Float but got scalar type Long for argument #2 'other'

In [25]:
print(a.mean(dim=0))

tensor(-0.3333)


In [103]:
test_acc, test_loss, test_output_entropy = test(6)


<Test Phase> Accuacy of the network on the 10,000 test images:29.3300 %




In [117]:
labels_distrib = {str(c):[] for c in range(10)}

In [128]:
labels_distrib = dict(zip([str(c) for c in range(10)] ,[0 for _ in range(10)]))

In [129]:
labels_distrib

{'0': 0,
 '1': 0,
 '2': 0,
 '3': 0,
 '4': 0,
 '5': 0,
 '6': 0,
 '7': 0,
 '8': 0,
 '9': 0}

In [132]:
a = torch.tensor([2,3,4,5,6])
b = torch.tensor([0,0,1,0,1], dtype=torch.uint8)
t = a[b].data.cpu().numpy()
#for x in t:
#    labels_distrib[str(x)] += 1
labels_distrib[str(x) for x in t] += 1

print(labels_distrib)

SyntaxError: invalid syntax (<ipython-input-132-35607bbfd757>, line 6)

In [123]:
str(x) for x in range(1)

SyntaxError: invalid syntax (<ipython-input-123-06d9756bcd0d>, line 1)

In [134]:
labels_distrib = dict(zip([str(c) for c in range(10)] ,[[0 for _ in range(num_epochs)] for _ in range(10)]))
print(labels_distrib)

{'0': [0, 0, 0, 0, 0], '1': [0, 0, 0, 0, 0], '2': [0, 0, 0, 0, 0], '3': [0, 0, 0, 0, 0], '4': [0, 0, 0, 0, 0], '5': [0, 0, 0, 0, 0], '6': [0, 0, 0, 0, 0], '7': [0, 0, 0, 0, 0], '8': [0, 0, 0, 0, 0], '9': [0, 0, 0, 0, 0]}


In [330]:
for i in range(num_epochs):
    print(labels_distrib['0'][i])

58
5316
5700
5758
5790
5786
5795
5806
5823
5838
5861
5822
5837
5862
5860
5864
5865
5874
5860
5869
5865
5871
5868
5867
5873
5883
5867
5887
5876
5879
5886
5881
5886
5880
5899
5898
5884
5896
5900
5894
5882
5895
5891
5869
5897
5897
5893
5904
5897
5891
5897
5893
5901
5897
5901
5899
5906
5905
5904
5903
5904
5905
5901
5903
5902
5900
5908
5909
5910
5909
5909
5908
5908
5909
5911
5901
5904
5910
5913
5909
5910
5909
5909
5912
5904
5907
5916
5901
5913
5901
5909
5897
5908
5912
5909
5905
5913
5913
5907
5915
5913
5912
5913
5914
5916
5912
5916
5915
5913
5904
5915
5907
5916
5915
5912
5915
5916
5916
5914
5901
5916
5909
5915
5915
5914
5916
5917
5915
5915
5916
5917
5913
5913
5917
5917
5917
5917
5918
5918
5917
5914
5918
5909
5919
5918
5918
5915
5909
5918
5919
5916
5919
5920
5919
5919
5916
5918
5918
5918
5916
5918
5919
5917
5919
5919
5919
5918
5917
5918
5917
5918
5918
5919
5918
5920
5919
5918
5920
5920
5920
5920
5921
5920
5921
5920
5921
5920
5921
5921
5921
5920
5920
5921
5921
5921
5921
5921
5921
5921
5921


In [34]:
l = [1,2,3,4,5]

for i in l:
    i=0
    print(i)
    
    
print(l)

0
0
0
0
0
[1, 2, 3, 4, 5]
