In [1]:
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from PIL import Image
import json
import os
import torchvision
import torchvision.transforms as transforms
from math import ceil
from tqdm import tqdm
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torchinfo import summary

## Dataset and Dataloader

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 32
epochs = 25
lr = 0.0005
split_size = 0.9

path_to_anns = "D:\\Studies\\ucenje\\keypoint_flickr\\data\\annotations\\all_data.json"
path_to_img = "D:\\Studies\\ucenje\\keypoint_flickr\\data\\images"

In [3]:
class FlickrDatasetRegression(Dataset):
    def __init__(self, path_to_anns, path_to_imgs, transform=None):
        with open(path_to_anns, 'r') as f:
            self.anns = json.loads(f.read())
        self.image_path = path_to_imgs
        self.transform = transform

    def __len__(self):
        return len(self.anns)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_path, self.anns[str(idx)]["file_name"])
        image = Image.open(img_path).convert("RGB")
        orig_width, orig_height = image.size
        image.thumbnail((224, 224))
        image = np.array(image, dtype=np.uint8)
        keypoints = np.array(self.anns[str(idx)]["face_landmarks"])
        keypoints = keypoints * [224 / orig_width, 224 / orig_height]
        keypoints = keypoints.flatten()

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(keypoints), idx

dataset = FlickrDatasetRegression(path_to_anns,path_to_img,
                        transform=transforms.ToTensor())
train_len = ceil(len(dataset) * split_size)
val_len = ceil(len(dataset) * (1-split_size))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, (train_len, val_len))
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

## Create Model

In [4]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv = nn.Conv2d(3, 16, 9)
        self.pool = nn.MaxPool2d(2,2)
        self.norm = nn.BatchNorm2d(16)
        self.linear1 = nn.LazyLinear(2048)
        self.linear2 = nn.Linear(2048, 136)
    def forward(self, x):
        x = self.norm(self.pool(F.relu(self.conv(x))))
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x


class SimpleCNN2(nn.Module):
    def __init__(self):
        super(SimpleCNN2, self).__init__()
        self.pool = nn.MaxPool2d(2,2)
        self.conv1 = nn.Conv2d(3, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 2)
        self.conv3 = nn.Conv2d(64, 128, 2)
        self.norm1 = nn.BatchNorm2d(32)
        self.norm2 = nn.BatchNorm2d(64)
        self.linear1 = nn.LazyLinear(1000)
        self.linear2 = nn.Linear(1000, 256)
        self.linear3 = nn.Linear(256, 136)

    def forward(self, x):
        x = self.norm1(self.pool(F.relu(self.conv1(x))))
        x = self.norm2(self.pool(F.relu(self.conv2(x))))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.reshape(x.shape[0], -1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x


class ClassificationCNN(nn.Module):
    def __init__(self):
        super(ClassificationCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, 3)
        self.conv2 = nn.Conv2d(16, 32, 3)
        self.conv3 = nn.Conv2d(32, 64, 5)
        self.batch1 = nn.BatchNorm2d(16)
        self.batch2 = nn.BatchNorm2d(32)
        self.batch3 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2,2)
        self.conv_trans1 = nn.ConvTranspose2d(64, 32, 5)  # 30
        self.conv_trans2 = nn.ConvTranspose2d(32, 16, 3)  # 58
        self.conv_trans3 = nn.ConvTranspose2d(16, 68, 3)  # 114

    def forward(self, x):
        x = self.batch1(self.pool(F.relu(self.conv1(x))))
        x = self.batch2(self.pool(F.relu(self.conv2(x))))
        x = self.batch3(self.pool(F.relu(self.conv3(x))))
        x = F.relu(self.conv_trans1(x))
        x = F.relu(self.conv_trans2(x))
        x = F.relu(self.conv_trans3(x))
        return x

 Train loop for simple CNN

In [None]:
torch.cuda.empty_cache()
model = SimpleCNN().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_loss = []

for epoch in range(epochs):
    train_running_loss = 0.0
    for i, data in tqdm(enumerate(train_loader), total=(len(train_loader))):
        images, keypoints = data[0].to(device, dtype=torch.float32), data[1].to(device, dtype=torch.float32)
        preds = model(images)
        loss = criterion(preds, keypoints)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_running_loss += loss.item()
    train_loss.append(train_running_loss / (ceil(len(train_dataset) / batch_size)))

    #model.eval()

    #with torch.no_grad():
    #    val_running_loss = 0.0
    #    for i, data in tqdm(enumerate(val_loader), total=(len(val_loader))):
    #        image, keypoints = data[0].to(device), data[1].to(device)
    #        preds = model(image)
    #        loss = criterion(preds, keypoints)
    #        val_running_loss += loss.item()
    #    val_loss.append(val_running_loss / ceil(len(val_dataset) / batch_size))
    print(f'Epoch: {epoch + 1}')
    print(f'Train_loss at epoch {epoch + 1}: {train_loss[-1]}')


## Plot loss

In [None]:
plt.figure(figsize=(15,7))
plt.plot(train_loss, label="train")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()

## Find top 10 best and worst performing images

In [None]:
model.eval()

val_losses = []
with torch.no_grad():
    for i, data in tqdm(enumerate(val_loader), total=(len(val_loader))):
        image, keypoints, index = data[0].to(device), data[1].to(device), data[2]
        pred = model(image)
        loss = criterion(pred, keypoints)
        val_losses.append((index, loss.item())) # (index, loss)


In [None]:
def get_top_images_dataloader(losses, dataset):
    sorted_loss = sorted(losses, key = lambda x: x[1])
    dataset_to_plot = sorted_loss[:10] + sorted_loss[-10:]
    indexes_to_plot = [x[0].item() for x in dataset_to_plot]
    new_data = [dataset[index] for index in indexes_to_plot]
    dataloader_for_plots = DataLoader(new_data, batch_size=1, shuffle=False)
    return dataloader_for_plots

In [None]:
dataloader_for_plots = get_top_images_dataloader(val_losses, dataset)

In [14]:

model.eval()
predictions = []
with torch.no_grad():
    for i, data in enumerate(dataloader_for_plots):
        images, keypoints, index = data[0].to(device), data[1].to(device), data[2]
        pred = model(image)
        predictions.append((index, pred))

SyntaxError: '(' was never closed (1144211643.py, line 6)

In [None]:
def plot_img(predictions): # predictions list of tuple (index, predicted_keypoints)
    transform = transforms.ToPILImage() # transform tensor back to PILImage

    images, original_keypoints_list, predicted_keypoints_list = [], [], []

    for i, data in enumerate(predictions):
        image = dataset[data[0].item()][0]
        image = transform(image.cpu().detach())  # Images to PIL, keypoints to numpy array
        images.append(image)
        original_keypoints = dataset[data[0].item()][1]
        original_keypoints = original_keypoints.cpu().detach().numpy().reshape(-1, 2)
        original_keypoints_list.append(original_keypoints)
        predicted_keypoints = data[1].cpu().detach().numpy().reshape(-1, 2)
        predicted_keypoints_list.append(predicted_keypoints)

    original_keypoints_list = np.array(original_keypoints_list)
    predicted_keypoints_list = np.array(predicted_keypoints_list)

    plt.figure(figsize=(25,60))

    for i in range(len(images) * 2):
        plt.subplot(20, 2, i + 1)
        plt.imshow(images[i // 2])
        if i % 2 == 0:
            for p in range(original_keypoints_list[i//2].shape[0]):
                plt.plot(original_keypoints_list[i//2][p, 0], original_keypoints_list[i//2][p, 1], 'g.')
        else:
            for p in range(predicted_keypoints_list[i//2].shape[0]):
                plt.plot(predicted_keypoints_list[i//2][p, 0], predicted_keypoints_list[i//2][p, 1], 'g.')
    plt.plot()

In [None]:
plot_img(predictions)

In [12]:
class FlickrDatasetClassification(Dataset):
    def __init__(self, path_to_anns, path_to_imgs, transform=None):
        with open(path_to_anns, 'r') as f:
            self.anns = json.loads(f.read())
        self.image_path = path_to_imgs
        self.transform = transform

    def __len__(self):
        return len(self.anns)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_path, self.anns[str(idx)]["file_name"])
        image = Image.open(img_path).convert("RGB")
        orig_width, orig_height = image.size
        image.thumbnail((224, 224))
        image = np.array(image, dtype=np.uint8)
        keypoints = np.array(self.anns[str(idx)]["face_landmarks"])
        keypoints = keypoints * [224 / orig_width, 224 / orig_height]
        keypoints = keypoints.astype('uint8')
        print(keypoints)
        new_keypoints = torch.zeros(68, 224, 224)
        for i in range(len(new_keypoints)):
            new_keypoints[i][keypoints[i][0]][keypoints[i][1]] = 1
        if self.transform:
            image = self.transform(image)

        return image, new_keypoints, idx

dataset_classification = FlickrDatasetClassification(path_to_anns,path_to_img,
                        transform=transforms.ToTensor())
train_len = ceil(len(dataset_classification) * split_size)
val_len = ceil(len(dataset_classification) * (1-split_size))
train_dataset_classification, val_dataset_classification = torch.utils.data.random_split(dataset_classification, (train_len, val_len))
train_loader_classification = DataLoader(train_dataset_classification, batch_size=batch_size, shuffle=True)
val_loader_classification = DataLoader(val_dataset_classification, batch_size=batch_size, shuffle=False)

In [13]:
model = ClassificationCNN().to(device)
summary(model, input_size=(batch_size, 3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
ClassificationCNN                        [32, 68, 33, 33]          --
├─Conv2d: 1-1                            [32, 16, 222, 222]        448
├─MaxPool2d: 1-2                         [32, 16, 111, 111]        --
├─BatchNorm2d: 1-3                       [32, 16, 111, 111]        32
├─Conv2d: 1-4                            [32, 32, 109, 109]        4,640
├─MaxPool2d: 1-5                         [32, 32, 54, 54]          --
├─BatchNorm2d: 1-6                       [32, 32, 54, 54]          64
├─Conv2d: 1-7                            [32, 64, 50, 50]          51,264
├─MaxPool2d: 1-8                         [32, 64, 25, 25]          --
├─BatchNorm2d: 1-9                       [32, 64, 25, 25]          128
├─ConvTranspose2d: 1-10                  [32, 32, 29, 29]          51,232
├─ConvTranspose2d: 1-11                  [32, 16, 31, 31]          4,624
├─ConvTranspose2d: 1-12                  [32, 68, 33, 33]          9,

In [11]:
torch.cuda.empty_cache()
#model = ClassificationCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

train_loss = []

for epoch in range(epochs):
    train_running_loss = 0.0
    for i, data in tqdm(enumerate(train_loader_classification), total=(len(train_loader_classification))):
        images, keypoints = data[0].to(device, dtype=torch.float32), data[1].to(device, dtype=torch.float32)
        preds = model(images)
        print(preds.size())
        print(keypoints.size())
        loss = criterion(preds, keypoints)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_running_loss += loss.item()
    train_loss.append(train_running_loss / (ceil(len(train_dataset) / batch_size)))

    #model.eval()

    #with torch.no_grad():
    #    val_running_loss = 0.0
    #    for i, data in tqdm(enumerate(val_loader), total=(len(val_loader))):
    #        image, keypoints = data[0].to(device), data[1].to(device)
    #        preds = model(image)
    #        loss = criterion(preds, keypoints)
    #        val_running_loss += loss.item()
    #    val_loss.append(val_running_loss / ceil(len(val_dataset) / batch_size))
    print(f'Epoch: {epoch + 1}')
    print(f'Train_loss at epoch {epoch + 1}: {train_loss[-1]}')


  0%|          | 0/141 [00:00<?, ?it/s]

[[ 59 113]
 [ 60 127]
 [ 62 140]
 [ 65 153]
 [ 71 166]
 [ 80 176]
 [ 90 185]
 [101 193]
 [113 196]
 [127 193]
 [143 186]
 [158 179]
 [171 168]
 [178 153]
 [181 137]
 [181 121]
 [182 105]
 [ 64 103]
 [ 69  97]
 [ 78  95]
 [ 87  96]
 [ 96  98]
 [120  96]
 [130  93]
 [141  91]
 [152  91]
 [162  95]
 [108 107]
 [108 115]
 [108 123]
 [108 131]
 [ 99 138]
 [104 140]
 [110 141]
 [116 139]
 [123 137]
 [ 76 110]
 [ 82 107]
 [ 89 106]
 [ 95 110]
 [ 89 111]
 [ 82 111]
 [128 107]
 [134 103]
 [141 102]
 [148 105]
 [141 107]
 [135 107]
 [ 87 153]
 [ 96 152]
 [105 150]
 [112 151]
 [119 149]
 [131 149]
 [144 150]
 [133 160]
 [121 164]
 [113 165]
 [106 165]
 [ 97 162]
 [ 91 154]
 [105 153]
 [112 153]
 [120 152]
 [141 151]
 [121 159]
 [113 159]
 [106 159]]
[[ 46 104]
 [ 48 120]
 [ 50 137]
 [ 52 152]
 [ 56 168]
 [ 64 182]
 [ 76 192]
 [ 90 201]
 [106 204]
 [122 200]
 [134 191]
 [146 181]
 [154 168]
 [157 154]
 [159 139]
 [160 126]
 [161 112]
 [ 63  90]
 [ 70  83]
 [ 81  79]
 [ 92  80]
 [100  87]
 [129  89

  0%|          | 0/141 [00:00<?, ?it/s]

[[ 49 107]
 [ 51 123]
 [ 55 137]
 [ 60 151]
 [ 68 164]
 [ 79 174]
 [ 92 183]
 [107 190]
 [121 190]
 [134 186]
 [146 175]
 [156 163]
 [163 149]
 [167 135]
 [168 120]
 [167 105]
 [166  90]
 [ 63 103]
 [ 71  98]
 [ 80  95]
 [ 91  95]
 [100  98]
 [121  95]
 [129  89]
 [138  86]
 [147  85]
 [155  89]
 [112 106]
 [113 117]
 [114 127]
 [116 138]
 [106 144]
 [111 145]
 [117 145]
 [122 143]
 [127 140]
 [ 76 110]
 [ 82 108]
 [ 89 107]
 [ 96 109]
 [ 90 111]
 [ 83 113]
 [126 105]
 [132 101]
 [138 100]
 [144 101]
 [140 104]
 [133 106]
 [100 163]
 [107 159]
 [113 156]
 [118 157]
 [123 155]
 [129 155]
 [136 156]
 [132 162]
 [126 166]
 [121 167]
 [115 168]
 [108 166]
 [103 162]
 [114 160]
 [119 160]
 [124 158]
 [133 157]
 [125 158]
 [119 160]
 [114 160]]
[[ 40  96]
 [ 40 114]
 [ 40 132]
 [ 45 148]
 [ 54 163]
 [ 65 174]
 [ 78 184]
 [ 93 190]
 [107 192]
 [117 189]
 [124 180]
 [130 169]
 [136 157]
 [141 144]
 [147 130]
 [150 117]
 [150 102]
 [ 64  92]
 [ 75  88]
 [ 86  87]
 [ 98  89]
 [110  92]
 [127  91




IndexError: index 225 is out of bounds for dimension 0 with size 224

In [None]:
plt.figure(figsize=(15,7))
plt.plot(train_loss, label="train")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()