In [49]:
!gdown --id '1aerVHZJo5GTRU-06PICcCvl39Y-VR3rz' --output 2021VRDL_HW1_datasets.zip

!apt-get install unzi
!unzip -q '2021VRDL_HW1_datasets.zip' -d 2021VRDL_HW1_datasets

Downloading...
From: https://drive.google.com/uc?id=1aerVHZJo5GTRU-06PICcCvl39Y-VR3rz
To: /content/2021VRDL_HW1_datasets/2021VRDL_HW1_datasets.zip
100% 678M/678M [00:04<00:00, 136MB/s]
Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package unzi


In [None]:
import os
os.chdir("/content/2021VRDL_HW1_datasets/")

!unzip 'training_images.zip' -d training_images
!unzip 'testing_images.zip' -d testing_images

In [None]:
# Import necessary packages.
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision import utils
import matplotlib.pyplot as plt
from PIL import Image
import torchvision
import cv2
import torchvision.models as models
from tqdm.auto import tqdm

from PIL import Image
# "ConcatDataset" and "Subset" are possibly useful when doing semi-supervised learning.
from torch.utils.data import ConcatDataset, Subset
from torchvision.datasets import DatasetFolder

In [None]:
# data augmentation
#ref:https://chih-sheng-huang821.medium.com/03-pytorch-dataaug-a712a7a7f55e
#  https://pytorch.org/vision/stable/transforms.html
padding = (40, 40, 40, 40)
transform_set = [ 
      transforms.Pad(padding, padding_mode="edge"),
      transforms.RandomPerspective(distortion_scale=0.6, p=1.0),
      transforms.RandomRotation(30),
      transforms.RandomHorizontalFlip(p=0.9),
      transforms.RandomVerticalFlip(p=0.9),)
]
train_tfm = transforms.Compose([
    # Resize the image into a fixed shape (height = width = 128)
    transforms.Resize((224, 224)),
    transforms.RandomApply(transform_set, p=0.5),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    
])

test_tfm = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [None]:
#ref:https://www.cnblogs.com/denny402/p/7512516.html
def default_loader(path):
    return Image.open(path).convert('RGB')
train_path = './training_images/'
test_path = './testing_images/'
class TrainDataset(Dataset):
    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        fh = open(txt, 'r')
        imgs = []
        for line in fh.readlines():
            line = line.strip('\n')
            line = line.rstrip()
            words = line.split()
            imgs.append((words[0], int(str(words[1])[0:3])))
        
        self.imgs = imgs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        fn, label = self.imgs[index]
        fn = './training_images/' + fn
        img = self.loader(fn)
        if self.transform is not None:
          img = self.transform(img)
        return img, label

    def __len__(self):
        return len(self.imgs)

train_data=TrainDataset(txt='training_labels.txt', transform=train_tfm)

class TestDataset(Dataset):
    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        fh = open(txt, 'r')
        imgs = []
        for line in fh.readlines():
            line = line.strip('\n')
            line = line.rstrip()
            words = line.split()
            #print(words[2:10])
            imgs.append((str(words)[2:10]))
        
        self.imgs = imgs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __getitem__(self, index):
        fn = self.imgs[index]
        fn='./testing_images/' + fn
        img = self.loader(fn)
        if self.transform is not None:
            img = self.transform(img)
        return img

    def __len__(self):
        return len(self.imgs)
    
test_data=TestDataset(txt='testing_img_order.txt', transform=test_tfm)


#print(len(train_data))
train_size = len(train_data)
training_size = int(0.85 * train_size)
#print(training_size)
valid_size = train_size - training_size
#print(valid_size)

train_data, valid_data = torch.utils.data.random_split(train_data, [training_size, valid_size])
train_loader = DataLoader(train_data, batch_size=100, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=100, shuffle=True)
test_loader = DataLoader(test_data, batch_size=100, shuffle=False)
#print(len(data_loader))


def show_batch(imgs):
    grid = utils.make_grid(imgs)
    plt.imshow(grid.numpy().transpose((1, 2, 0)))
    plt.title('Batch from dataloader')


for i, (batch_x, batch_y) in enumerate(train_loader):
    if(i < 4):
        print(i, batch_x.size(), batch_y.size())
        #print(type(batch_x))
        show_batch(batch_x)
        plt.axis('off')
        plt.show()
        


In [None]:
#ref:https://pytorch-tutorial.readthedocs.io/en/latest/tutorial/chapter04_advanced/4_1_fine-tuning/
model=torchvision.models.resnext50_32x4d(pretrained=True,progress=True) 

device = "cuda" if torch.cuda.is_available() else "cpu"

for param in model.parameters():
    param.requires_grad = False

#print(model.fc)
num_fc_ftr = model.fc.in_features 
model.fc = nn.Linear(num_fc_ftr, 201) 
model=model.to(device)
#print(model) 


In [None]:
# "cuda" only when GPUs are available.
device = "cuda" if torch.cuda.is_available() else "cpu"


# For the classification task, we use cross-entropy as the measurement of performance.
criterion = nn.CrossEntropyLoss()

# Initialize optimizer, you may fine-tune some hyperparameters such as learning rate on your own.
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003, weight_decay=1e-5)

# The number of training epochs.
n_epochs = 10

# Whether to do semi-supervised learning.
do_semi = False

for epoch in range(n_epochs):
    if do_semi:
        # Obtain pseudo-labels for unlabeled data using trained model.
        pseudo_set = get_pseudo_labels(unlabeled_set, model)

        concat_dataset = ConcatDataset([train_set, pseudo_set])
        train_loader = DataLoader(concat_dataset, batch_size=batch_size, shuffle=True, num_workers=8, pin_memory=True)

    # ---------- Training ----------
    # Make sure the model is in train mode before training.
    model.train()

    # These are used to record information in training.
    train_loss = []
    train_accs = []

    # Iterate the training set by batches.
    for batch in tqdm(train_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch

        # Forward the data. (Make sure data and model are on the same device.)
        logits = model(imgs.to(device))

        # Calculate the cross-entropy loss.
        # We don't need to apply softmax before computing cross-entropy as it is done automatically.
        loss = criterion(logits, labels.to(device))

        # Gradients stored in the parameters in the previous step should be cleared out first.
        optimizer.zero_grad()

        # Compute the gradients for parameters.
        loss.backward()

        # Clip the gradient norms for stable training.
        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), max_norm=10)

        # Update the parameters with computed gradients.
        optimizer.step()

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

        # Record the loss and accuracy.
        train_loss.append(loss.item())
        train_accs.append(acc)

    # The average loss and accuracy of the training set is the average of the recorded values.
    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    # Print the information.
    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

    # ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []

    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        # A batch consists of image data and corresponding labels.
        imgs, labels = batch

        # We don't need gradient in validation.
        # Using torch.no_grad() accelerates the forward process.
        with torch.no_grad():
          logits = model(imgs.to(device))

        # We can still compute the loss (but not the gradient).
        loss = criterion(logits, labels.to(device))

        # Compute the accuracy for current batch.
        acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

        # Record the loss and accuracy.
        valid_loss.append(loss.item())
        valid_accs.append(acc)

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)

    # Print the information.
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 001/010 ] loss = 5.38072, acc = 0.01154


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 001/010 ] loss = 5.24226, acc = 0.00200


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 002/010 ] loss = 5.02384, acc = 0.06385


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 002/010 ] loss = 5.02975, acc = 0.04400


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 003/010 ] loss = 4.76969, acc = 0.12923


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 003/010 ] loss = 4.87320, acc = 0.08800


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 004/010 ] loss = 4.53123, acc = 0.22462


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 004/010 ] loss = 4.73334, acc = 0.09800


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 005/010 ] loss = 4.31832, acc = 0.26692


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 005/010 ] loss = 4.58799, acc = 0.15000


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 006/010 ] loss = 4.12812, acc = 0.33615


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 006/010 ] loss = 4.44097, acc = 0.18200


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 007/010 ] loss = 3.95909, acc = 0.35808


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 007/010 ] loss = 4.33669, acc = 0.16600


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 008/010 ] loss = 3.79918, acc = 0.39923


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 008/010 ] loss = 4.18743, acc = 0.20000


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 009/010 ] loss = 3.63138, acc = 0.44154


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 009/010 ] loss = 4.13668, acc = 0.18000


  0%|          | 0/26 [00:00<?, ?it/s]

[ Train | 010/010 ] loss = 3.49414, acc = 0.45308


  0%|          | 0/5 [00:00<?, ?it/s]

[ Valid | 010/010 ] loss = 3.99993, acc = 0.21800


In [None]:
# Make sure the model is in eval mode.
# Some modules like Dropout or BatchNorm affect if the model is in training mode.
#ref:https://colab.research.google.com/drive/1eEZbnYmp_vBt59c5_LLBkRwI5gLRXah9#scrollTo=PHaFE-8oQtkC
model.eval()

# Initialize a list to store the predictions.
predictions = []

# Iterate the testing set by batches.
for batch in tqdm(test_loader):
    imgs = batch

    with torch.no_grad():
        logits = model(imgs.to(device))

    # Take the class with greatest logit as prediction and record it.
    predictions.extend(logits.argmax(dim=-1).cpu().numpy().tolist())

  0%|          | 0/31 [00:00<?, ?it/s]

In [None]:
# Save predictions into the file.
#ref:https://colab.research.google.com/drive/1eEZbnYmp_vBt59c5_LLBkRwI5gLRXah9#scrollTo=PHaFE-8oQtkC
with open("predict.txt", "w") as f:
    for i in range(len(predictions)):
        predictions[i] = int(predictions[i])
    for i in range(len(predictions)):
        if predictions[i] < 100 and predictions[i] >= 10:
            predictions[i] = "0"+str(predictions[i])
        elif predictions[i] < 10 and predictions[i] > 0:
            predictions[i] = "00"+str(predictions[i])
        f.write(str(predictions[i])+"\n")

In [None]:
with open('testing_img_order.txt') as f:
    test_images = [x.strip() for x in f.readlines()]  # all the testing images
classes = dict() 
with open('classes.txt') as f: 
    for line in f.readlines():
        line = line.strip('\n')
        words = line.split('.')
        classes.update({words[0]: words[1]})
test_class = []
with open('predict.txt', 'r') as f:
    for line in f.readlines():
        line = line.strip('\n')
        words = line.split()
        for i in range(len(words)):
            if words[i] in classes:
                test_class.append(words[i]+"."+str(classes.get(words[i])))
print(test_images[1]+" "+test_class[1])
with open('answer.txt', mode='w') as f:
    for i in range(len(test_images)):
        f.write(test_images[i]+" "+test_class[i]+"\n")
f.close()

1704.jpg 121.Grasshopper_Sparrow
