In [5]:
from torchvision import utils
from basic_fcn import *
from dataloader import *
from utils import *
import torchvision
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import time
import math
from tqdm import tqdm
import gc
import os
import pickle

In [6]:
def init_weights(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
        torch.nn.init.xavier_uniform_(m.weight.data)
        torch.nn.init.xavier_uniform_(m.bias.data.view(m.bias.data.shape[0],1))
        #a = math.sqrt(3) * math.sqrt(2/m.bias.data.shape[0])
        #torch.nn.init._no_grad_uniform_(m.bias.data, -a, a)
        
        


In [7]:

    
def train(model, criterion, epochs, train_loader, val_loader, test_loader, use_gpu, name):
    
    #Create non-existing logfiles
    logname = 'logfile.txt'
    logname_summary = 'logfile_summary.txt'    
    
    if os.path.exists('logfile.txt') == True:
        i = 1
        logname = 'logfile' + str(i) + '.txt'
        while os.path.exists('logfile' + str(i) + '.txt'):
            i+=1
            logname = 'logfile' + str(i) + '.txt'
        logname_summary = 'logfile' + str(i) + '_summary.txt'    

    print('Loading results to logfile: ' + logname)
    with open(logname, "a") as file:
        file.write("Lofile DATA: Validation Loss and Accuracy\n") 
    
    print('Loading Summary to : ' + logname_summary) 
    pickle_file = logname_summary[::-4] +'.pkl'
    print('Loading Variables to : ' + pickle_file) 
    
    
    optimizer = optim.Adam(fcn_model.parameters(), lr=5e-3)
    if use_gpu:
        device = torch.device("cuda:0")
        model = torch.nn.DataParallel(model)
        model.to(device)
        
        
    
    val_loss_set = []
    val_acc_set = []
    val_iou_set = []
    
    
    training_loss = []
    
    # Early Stop criteria
    minLoss = 1e6
    minLossIdx = 0
    earliestStopEpoch = 10
    earlyStopDelta = 5
    for epoch in range(epochs):
        ts = time.time()
        
        #print(np.array(val_loss).shape)
        # early-stopping 
#         if epoch > 11:
#             if val_loss[-1] < val_loss[-10]:
#                 open('save_param', 'w').close()
#                 torch.save(fcn_model.state_dict(), 'save_param')
                
                  
        for iter, (inputs, tar, labels) in tqdm(enumerate(train_loader)):
            #print("\n**********************************************\nIter")
            #checkM()
            optimizer.zero_grad()
            del tar
            
            if use_gpu:
                inputs = inputs.to(device)# Move your inputs onto the gpu
                labels = labels.to(device) # Move your labels onto the gpu
            
                
            outputs = model(inputs)
            del inputs
            loss = criterion(outputs, Variable(labels.long()))
            del labels
            del outputs
            #print("\n**********************************************\nPre back")
            #checkM()
            loss.backward()
            loss = loss#.item()
            optimizer.step()

            if iter % 10 == 0:
                print("epoch{}, iter{}, loss: {}".format(epoch, iter, loss))
                #if iter == 50:
                #    break

                    
                    
            #print("\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n")
        
        # calculate val loss each epoch
        val_loss, val_acc, val_iou = val(model, val_loader, criterion, use_gpu)
        val_loss_set.append(val_loss)
        val_acc_set.append(val_acc)
        val_iou_set.append(val_iou)
        
        print("epoch {}, time {}, train loss {}, val loss {}, val acc {}, val iou {}".format(epoch, time.time() - ts,
                                                                                                loss, val_loss,
                                                                                                val_acc,
                                                                                                val_iou))        
        training_loss.append(loss.item())
        
        with open(logname, "a") as file:
            file.write("writing!\n")
            file.write("Finish epoch {}, time elapsed {}".format(epoch, time.time() - ts))
            file.write("\n training Loss:   " + str(loss.item()))
            file.write("\n Validation Loss: " + str(val_loss_set[-1]))
            file.write("\n Validation acc:  " + str(val_acc_set[-1]))
            file.write("\n Validation iou:  " + str(val_iou_set[-1]) + "\n ")
        
                                                                                                
                                                                                                
        
        # Early stopping
        if val_loss < minLoss:
            # Store new best
            torch.save(model, name)
            minLoss = val_loss#.item()
            minLossIdx = epoch
            
        # If passed min threshold, and no new min has been reached for delta epochs
        elif epoch > earliestStopEpoch and (epoch - minLossIdx) > earlyStopDelta:
            print("Stopping early at {}".format(minLossIdx))
            
        # TODO what is this for?
        #model.train()
        with open(logname_summary, "w") as file:
            file.write("Summary!\n")
            file.write("Stopped early at {}".format(minLossIdx))
            file.write("\n training Loss:   " + str(training_loss))        
            file.write("\n Validation Loss: " + str(val_loss_set))
            file.write("\n Validation acc:  " + str(val_acc_set))
            file.write("\n Validation iou:  " + str(val_iou_set) + "\n ")
 
        with open(pickle_file, 'wb') as f:  # Python 3: open(..., 'wb')
            pickle.dump([training_loss, val_loss_set, val_acc_set,val_iou_set], f)
        
        

        
    return val_loss_set, val_acc_set, val_iou_set


def val(model, val_loader, criterion, use_gpu):
    
    # set to evaluation mode 
    model.eval()

    softmax = nn.Softmax(dim = 1)
    
    loss = []
    pred = []
    acc = []
    
    IOU_init = False
    if use_gpu:
        device = torch.device("cuda:0")
        
        #model.to(device)
        
    for iter, (X, tar, Y) in tqdm(enumerate(val_loader)):
        
        if not IOU_init:
            IOU_init = True
            IOU = np.zeros((1,19))
            
        if use_gpu:
            inputs = X.to(device)
            labels = Y.to(device)
            
        else:
            inputs, labels = X, Y

            
        with torch.no_grad():   
            outputs = model(inputs)    
            loss.append(criterion(outputs, labels.long()).item())
            prediction = softmax(outputs) 
            acc.append(pixel_acc(prediction, labels))
            IOU = IOU + np.array(iou(prediction, labels))
        
    
    acc = sum(acc)/len(acc)
    avg_loss = sum(loss)/len(loss) 
    IOU = IOU/iter  
    
    return avg_loss, acc, IOU      
       
    
    
    
def test(model, use_gpu):
    
    softmax = nn.Softmax(dim = 1)
    
    pred = []
    acc = []
    if use_gpu:
        device = torch.device("cuda:0")
        
        model.to(device)
    
    IOU_init = False
    for iter, (X, tar, Y) in enumerate(test_loader):
        
        if not IOU_init:
            IOU_init = True
            IOU = np.zeros((1,tar.shape[1]))
        
        if use_gpu:
            inputs = X.to(device)
            labels = Y.to(device)
        else:
            inputs, labels = X, Y
                    
        
        outputs = fcn_model(inputs)  
        
        prediction = softmax(outputs)
        acc.append(pixel_acc(prediction, labels))
        IOU = IOU + np.array(iou(prediction, Y))
        
    acc = sum(acc)/len(acc)        
    IOU = IOU/iter

    #Complete this function - Calculate accuracy and IoU 
    # Make sure to include a softmax after the output from your model
    
    return acc, IOU
    

In [8]:
def checkM():
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
                print(type(obj), obj.size())
        except:
            pass

if __name__ == "__main__":
    train_dataset = CityScapesDataset(csv_file='train.csv')
    val_dataset = CityScapesDataset(csv_file='val.csv')
    test_dataset = CityScapesDataset(csv_file='test.csv')
    train_loader = DataLoader(dataset=train_dataset,
                          batch_size=3,
                          num_workers=8,
                          shuffle=True)
    val_loader = DataLoader(dataset=val_dataset,
                          batch_size=3,
                          num_workers=8,
                          shuffle=True)
    test_loader = DataLoader(dataset=test_dataset,
                          batch_size=4,
                          num_workers=4,
                          shuffle=True)
    
    
    epochs     = 100
    criterion = torch.nn.CrossEntropyLoss()
    # Fix magic number
    fcn_model = FCN(n_class=34)
    fcn_model.apply(init_weights)
    
    
    epochs     = 100
    use_gpu = torch.cuda.is_available()
#     if use_gpu:
#         device = torch.device("cuda:0")
#         fcn_model = torch.nn.DataParallel(fcn_model)
#         fcn_model.to(device)
#     val(fcn_model, val_loader, criterion, use_gpu)
    train(fcn_model, criterion, epochs, train_loader, val_loader, test_loader, use_gpu, "FCN")
    
    
    fcn_model.load_state_dict(torch.load('./save_param'))
    

Loading results to logfile: logfile4.txt
Loading Summary to : logfile4_summary.txt



0it [00:00, ?it/s][A
1it [00:10, 10.16s/it][A

epoch0, iter0, loss: 3.9489307403564453



2it [00:10,  7.18s/it][A
3it [00:12,  5.54s/it][A
4it [00:12,  4.11s/it][A
5it [00:14,  3.33s/it][A
6it [00:15,  2.54s/it][A
7it [00:16,  2.16s/it][A
8it [00:17,  1.73s/it][A
9it [00:20,  2.20s/it][A
10it [00:21,  1.76s/it][A
11it [00:22,  1.76s/it][A

epoch0, iter10, loss: 2.8530445098876953



12it [00:23,  1.29s/it][A
13it [00:24,  1.21s/it][A
14it [00:24,  1.06s/it][A
15it [00:25,  1.09s/it][A
16it [00:26,  1.02it/s][A
17it [00:28,  1.19s/it][A
18it [00:29,  1.04s/it][A
19it [00:30,  1.13s/it][A
20it [00:31,  1.02s/it][A
21it [00:32,  1.17s/it][A

epoch0, iter20, loss: 2.3090107440948486



22it [00:32,  1.13it/s][A
23it [00:33,  1.10it/s][A
24it [00:34,  1.18it/s][A
25it [00:35,  1.05it/s][A
26it [00:36,  1.17it/s][A
27it [00:39,  1.56s/it][A
28it [00:40,  1.32s/it][A
29it [00:41,  1.26s/it][A
30it [00:42,  1.07s/it][A
31it [00:43,  1.23s/it][A

epoch0, iter30, loss: 2.0575063228607178



32it [00:43,  1.06it/s][A
33it [00:45,  1.05s/it][A
34it [00:46,  1.05it/s][A
35it [00:47,  1.18s/it][A
36it [00:48,  1.06s/it][A
37it [00:49,  1.11s/it][A
38it [00:50,  1.00s/it][A
39it [00:51,  1.03s/it][A
40it [00:52,  1.07it/s][A
41it [00:53,  1.13s/it][A

epoch0, iter40, loss: 1.7211891412734985



42it [00:54,  1.17it/s][A
43it [00:57,  1.50s/it][A
44it [00:57,  1.26s/it][A
45it [00:58,  1.21s/it][A
46it [00:59,  1.03s/it][A
47it [01:00,  1.05s/it][A
48it [01:01,  1.08it/s][A
49it [01:02,  1.06it/s][A
50it [01:03,  1.11it/s][A

epoch0, iter50, loss: 1.6921566724777222




0it [00:00, ?it/s][A[A

1it [00:12, 12.49s/it][A[A

2it [00:15,  9.65s/it][A[A

3it [00:17,  7.32s/it][A[A

4it [00:18,  5.33s/it][A[A

5it [00:19,  4.25s/it][A[A

6it [00:20,  3.15s/it][A[A

7it [00:21,  2.65s/it][A[A

8it [00:22,  2.01s/it][A[A

9it [00:23,  1.86s/it][A[A

10it [00:27,  2.38s/it][A[A

11it [00:28,  2.06s/it][A[A

12it [00:29,  1.58s/it][A[A

13it [00:30,  1.47s/it][A[A

14it [00:31,  1.18s/it][A[A

15it [00:32,  1.25s/it][A[A

16it [00:32,  1.05s/it][A[A

17it [00:34,  1.22s/it][A[A

18it [00:36,  1.53s/it][A[A

19it [00:38,  1.47s/it][A[A

20it [00:38,  1.17s/it][A[A

21it [00:40,  1.22s/it][A[A

22it [00:40,  1.00s/it][A[A

23it [00:41,  1.12s/it][A[A

24it [00:42,  1.07it/s][A[A

25it [00:43,  1.11s/it][A[A

26it [00:45,  1.31s/it][A[A

27it [00:46,  1.25s/it][A[A

28it [00:47,  1.02s/it][A[A

29it [00:48,  1.13s/it][A[A

30it [00:49,  1.09it/s][A[A

31it [00:50,  1.04s/it][A[A

32it [00:50,  1.15it

epoch 0, time 250.9032440185547, train loss 1.6921566724777222, val loss 2.347329081889398, val acc 39.16963407855313, val iou [[0.40907043 0.         0.09092858        nan        nan 0.
         nan 0.         0.11539933        nan 0.38239661        nan
         nan 0.                nan        nan        nan        nan
         nan]]


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


0it [00:00, ?it/s][A[A

1it [00:09,  9.24s/it][A[A

epoch1, iter0, loss: 2.0478007793426514




2it [00:09,  6.54s/it][A[A

3it [00:10,  4.97s/it][A[A

4it [00:11,  3.68s/it][A[A

5it [00:12,  2.94s/it][A[A

6it [00:13,  2.24s/it][A[A

7it [00:14,  1.96s/it][A[A

8it [00:15,  1.58s/it][A[A

9it [00:19,  2.24s/it][A[A

10it [00:19,  1.78s/it][A[A

11it [00:22,  1.93s/it][A[A

epoch1, iter10, loss: 2.751924514770508




12it [00:22,  1.43s/it][A[A

13it [00:23,  1.32s/it][A[A

14it [00:24,  1.13s/it][A[A

15it [00:25,  1.13s/it][A[A

16it [00:25,  1.00it/s][A[A

17it [00:28,  1.54s/it][A[A

18it [00:29,  1.27s/it][A[A

19it [00:30,  1.18s/it][A[A

20it [00:30,  1.00s/it][A[A

21it [00:32,  1.12s/it][A[A



epoch1, iter20, loss: 2.2730140686035156


22it [00:32,  1.18it/s][A[A

23it [00:33,  1.11it/s][A[A

24it [00:34,  1.19it/s][A[A

25it [00:37,  1.67s/it][A[A

26it [00:38,  1.35s/it][A[A

27it [00:38,  1.12s/it][A[A

28it [00:39,  1.03it/s][A[A

29it [00:40,  1.09it/s][A[A

30it [00:41,  1.18it/s][A[A

31it [00:42,  1.07s/it][A[A

epoch1, iter30, loss: 2.2935829162597656




32it [00:42,  1.20it/s][A[A

33it [00:44,  1.02s/it][A[A

34it [00:45,  1.09it/s][A[A

35it [00:46,  1.03it/s][A[A

36it [00:46,  1.16it/s][A[A

37it [00:47,  1.11it/s][A[A

38it [00:48,  1.19it/s][A[A

39it [00:49,  1.03it/s][A[A

40it [00:50,  1.16it/s][A[A

41it [00:54,  1.79s/it][A[A

42it [00:54,  1.30s/it][A[A

epoch1, iter40, loss: 2.650067090988159




43it [00:55,  1.15s/it][A[A

44it [00:55,  1.01s/it][A[A

45it [00:56,  1.00s/it][A[A

46it [00:57,  1.15it/s][A[A

47it [00:58,  1.07it/s][A[A

48it [00:59,  1.16it/s][A[A

49it [01:02,  1.60s/it][A[A

50it [01:03,  1.30s/it][A[A

epoch1, iter50, loss: 2.0041139125823975





0it [00:00, ?it/s][A[A[A


1it [00:14, 14.72s/it][A[A[A


2it [00:15, 10.48s/it][A[A[A


3it [00:17,  7.85s/it][A[A[A


4it [00:17,  5.67s/it][A[A[A


5it [00:19,  4.45s/it][A[A[A


6it [00:19,  3.24s/it][A[A[A


7it [00:20,  2.65s/it][A[A[A


8it [00:21,  2.01s/it][A[A[A


9it [00:24,  2.45s/it][A[A[A


10it [00:25,  1.87s/it][A[A[A


11it [00:27,  1.88s/it][A[A[A


12it [00:27,  1.44s/it][A[A[A


13it [00:28,  1.37s/it][A[A[A


14it [00:29,  1.10s/it][A[A[A


15it [00:30,  1.22s/it][A[A[A


16it [00:31,  1.01s/it][A[A[A


17it [00:32,  1.18s/it][A[A[A


18it [00:33,  1.02it/s][A[A[A


19it [00:35,  1.20s/it][A[A[A


20it [00:35,  1.01it/s][A[A[A


21it [00:37,  1.17s/it][A[A[A


22it [00:37,  1.03it/s][A[A[A


23it [00:39,  1.12s/it][A[A[A


24it [00:39,  1.06it/s][A[A[A


25it [00:41,  1.21s/it][A[A[A


26it [00:42,  1.02s/it][A[A[A


27it [00:43,  1.19s/it][A[A[A


28it [00:44,  1.02it/s][A[A[

epoch 1, time 246.26465916633606, train loss 2.0041139125823975, val loss 2.01199221967937, val acc 37.64487854734866, val iou [[0.41066587 0.         0.14479832        nan        nan 0.
         nan 0.         0.05135894        nan 0.41228797        nan
         nan 0.                nan        nan        nan        nan
         nan]]





0it [00:00, ?it/s][A[A[A


1it [00:11, 11.43s/it][A[A[A

epoch2, iter0, loss: 2.1532022953033447





2it [00:12,  8.45s/it][A[A[A


3it [00:14,  6.26s/it][A[A[A


4it [00:14,  4.58s/it][A[A[A


5it [00:15,  3.49s/it][A[A[A


6it [00:16,  2.65s/it][A[A[A


7it [00:17,  2.15s/it][A[A[A


8it [00:18,  1.71s/it][A[A[A


9it [00:22,  2.61s/it][A[A[A


10it [00:23,  2.01s/it][A[A[A


11it [00:24,  1.77s/it][A[A[A

epoch2, iter10, loss: 1.9768682718276978





12it [00:24,  1.31s/it][A[A[A


13it [00:25,  1.20s/it][A[A[A


14it [00:26,  1.03s/it][A[A[A


15it [00:27,  1.05s/it][A[A[A


16it [00:28,  1.06it/s][A[A[A


17it [00:31,  1.73s/it][A[A[A


18it [00:32,  1.38s/it][A[A[A


19it [00:33,  1.28s/it][A[A[A


20it [00:33,  1.06s/it][A[A[A


21it [00:35,  1.14s/it][A[A[A


22it [00:35,  1.19it/s][A[A[A

epoch2, iter20, loss: 1.8629802465438843





23it [00:36,  1.13it/s][A[A[A


24it [00:37,  1.13it/s][A[A[A


25it [00:41,  1.82s/it][A[A[A


26it [00:41,  1.47s/it][A[A[A


27it [00:42,  1.36s/it][A[A[A


28it [00:43,  1.16s/it][A[A[A


29it [00:44,  1.01s/it][A[A[A


30it [00:44,  1.13it/s][A[A[A


31it [00:46,  1.08s/it][A[A[A

epoch2, iter30, loss: 2.069167137145996





32it [00:46,  1.19it/s][A[A[A


33it [00:48,  1.07s/it][A[A[A


34it [00:49,  1.04it/s][A[A[A


35it [00:50,  1.06s/it][A[A[A


36it [00:50,  1.08it/s][A[A[A


37it [00:51,  1.06it/s][A[A[A


38it [00:52,  1.15it/s][A[A[A


39it [00:54,  1.04s/it][A[A[A


40it [00:54,  1.08it/s][A[A[A


41it [00:56,  1.31s/it][A[A[A


42it [00:57,  1.02it/s]

epoch2, iter40, loss: 1.786535382270813


[A[A[A


43it [00:58,  1.04it/s][A[A[A


44it [00:58,  1.17it/s][A[A[A


45it [00:59,  1.05it/s][A[A[A


46it [01:00,  1.17it/s][A[A[A


47it [01:01,  1.01s/it][A[A[A


48it [01:02,  1.09it/s][A[A[A


49it [01:06,  1.77s/it][A[A[A


50it [01:06,  1.42s/it][A[A[A

epoch2, iter50, loss: 2.0873043537139893






0it [00:00, ?it/s][A[A[A[A

RuntimeError: DataLoader worker (pid 346) is killed by signal: Killed. 


