## Implementing CNN-RNN multi-label classification

In [419]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import os
import pickle
from torch.utils.data import DataLoader
import sys
from CNN_RNN.model_attention import EncoderCNN, DecoderRNN
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

In [420]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [421]:
with open('../data/filenames.pkl', 'rb') as infile:
    filenames = pickle.load(infile)
    
with open('../data/labels.pkl', 'rb') as infile2:
    labels = pickle.load(infile2)
    
df = pd.concat([pd.Series(filenames, name='filenames'), pd.Series(labels, name='labels')], axis=1)
df = shuffle(df, random_state=42)

labels_dict = {0:'people',1:'objects',2:'places',3:'architecture',4:'abstraction',5:'society',\
          6:'nature',7:'emotions, concepts and ideas',8:'interiors',9:'work and occupations', \
          10:'symbols & personifications',11:'religion and belief',12:'leisure and pastimes',\
          13:'history',14:'literature and fiction',15:'group/movement'}

labels_names = [[]]*len(df.labels)
for k in range(len(df.labels)):
    labels_names[k] = []
    for i in range(len(df.labels[k])):
        if df.labels[k][i] == 1:
            labels_names[k].append(labels_dict[i])
            
print(len(labels_names))

24999


In [422]:
out = [[filenames[i].split('\\')[-1],labels_names[i],labels[i]] for i in range(len(labels_names))] 
import pickle
from pathlib import Path
with open('../data/img_tag.pkl', 'wb') as outF:
    pickle.dump(out, outF)
    #for line in out:
     #   json.dump([line[0],line[1]], outF)
      #  outF.write("\n")

In [423]:
with open('../data/img_tag.pkl', 'rb') as inF:
    out = pickle.load(inF)

In [382]:
# Image preprocessing, normalization for the pretrained resnet
transform = transforms.Compose([ 
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])

In [413]:
import torch
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, root, pickle, transform, batch_size):
        'Initialization'
        self.root = root
        self.batch_size = batch_size
        self.transform = transform
        self.labels = [item[-1] for item in pickle]
        self.list_IDs = [item[0] for item in pickle]
    
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)
    
    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.list_IDs[index]
        img = os.path.join('..','..','data_tate', ID)
        img = Image.open(img) # use pillow to open a file
        img = img.resize((224, 224)) # resize the file to 256x256
        img = img.convert('RGB') #convert image to RGB channel
        if self.transform is not None:
            img = self.transform(img)
        # Load data and get label
        img = np.asarray(img).transpose(-1, 0, 1) # we have to change the dimensions from width x height x channel (WHC) to channel x width x height (CWH)
        X = torch.from_numpy(np.asarray(img)) # create the image tensor
        y = torch.from_numpy(np.asarray(self.labels[index]))
        length = torch.from_numpy(np.asarray([self.batch_size]*(len(self.list_IDs))))
        return X, y, length[index]

In [414]:
def get_loader(root, pickle, batch_size, shuffle, num_workers, transform):
    """Returns torch.utils.data.DataLoader for custom dataset."""

    tate = Dataset(root=root, pickle=pickle, transform=transform, batch_size=batch_size)
    # Data loader for dataset
    data_loader = torch.utils.data.DataLoader(dataset=tate, batch_size=batch_size,
                                              shuffle=shuffle, num_workers=num_workers,
                                             )
    return data_loader

In [415]:
print("build data loader ...")
# Build data loader
#should have a length argument, should be split into images, captions, lengths
data_loader = get_loader(os.path.join('..','..','data_tate'), \
                         pickle=out, batch_size=64, shuffle=True, num_workers=0, transform=transform) 

build data loader ...


In [416]:
print("build the models ...")
# Build the models
encoder = EncoderCNN(512).to(device)
decoder = DecoderRNN(512, 1024, len(vocab), 1).to(device)

build the models ...


In [417]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters())# + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=0.001)

In [418]:
# Train the models
total_step = len(data_loader)
for epoch in range(5):
    for i, (captions, images, length) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, length.cpu().numpy(), batch_first=True)[0]
            
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, length.cpu().numpy())
            outputs = pack_padded_sequence(outputs, length.cpu().numpy(), batch_first=True)[0]
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, 5, i, total_step, loss.item(), np.exp(loss.item()))) 
                
            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    '.', 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    '.', 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))

RuntimeError: Expected 4-dimensional input for 4-dimensional weight 64 3 3 3, but got 2-dimensional input of size [64, 16] instead