In [1]:
from __future__ import division
from __future__ import print_function

import argparse
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import numpy as np
import torch
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
import sys
sys.path.append("/home/marta/jku/SBNet/ssnet_fop")

import pandas as pd
from scipy import random
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score
# import matplotlib.pyplot as plt
import torch.nn.functional as F
import torch.nn as nn

from tqdm import tqdm
from retrieval_model import FOP


In [2]:
torch.cuda.is_available()

True

In [3]:
data_folder = '/home/marta/jku/LLaVA/mmimdb'

In [4]:
texts_folder = os.path.join(data_folder, 'llava_encoded_texts')

train_text_df = os.path.join(texts_folder, 'llava_plot_first_latent_train.csv')
test_text_df = os.path.join(texts_folder, 'llava_plot_first_latent_test.csv')

images_folder = os.path.join(data_folder, 'llava_encoded_images')

train_image_df = os.path.join(images_folder, 'llava_images_latent_train.csv')
test_image_df = os.path.join(images_folder, 'llava_images_latent_test.csv')

labels = ['action', 'adult', 'adventure', 'animation', 'biography', 'comedy',
       'crime', 'documentary', 'drama', 'family', 'fantasy', 'film-noir',
       'history', 'horror', 'music', 'musical', 'mystery', 'news',
       'reality-tv', 'romance', 'sci-fi', 'short', 'sport', 'talk-show',
       'thriller', 'war', 'western']

In [5]:
def read_data(FLAGS):

    print('Split Type: %s'%(FLAGS.split_type))

    if FLAGS.split_type == 'text_only':
        print('Reading Text Train')
        train_file_text = train_text_df
        train_data = pd.read_csv(train_file_text, index_col='item_id')
        train_label = train_data[labels]
        train_data = train_data.drop(columns=labels)
        train_data = np.asarray(train_data)
        # Shuffle the data also if only one modality is used
        # combined = list(zip(train_data, train_label))
        # random.shuffle(combined)
        # train_data, train_label = zip(*combined)

        return train_data, train_label

    elif FLAGS.split_type == 'image_only':
        print('Reading Image Train')
        train_file_image = train_image_df
        train_data = pd.read_csv(train_file_image, index_col='item_id')
        train_label = train_data[labels]
        train_data = train_data.drop(columns=labels)
        train_data = np.asarray(train_data)
        # Shuffle the data also if only one modality is used
        # combined = list(zip(train_data, train_label))
        # random.shuffle(combined)
        # train_data, train_label = zip(*combined)

        return train_data, train_label

    train_data = []
    train_label = []

    train_file_face = '/share/hel/datasets/voxceleb/sbnet_feats/data/face/facenetfaceTrain.csv'
    train_file_voice = '/share/hel/datasets/voxceleb/sbnet_feats/data/voice/voiceTrain.csv'

    print('Reading Train Faces')
    img_train = pd.read_csv(train_file_face, header=None)
    train_tmp = img_train[512]
    img_train = np.asarray(img_train)
    img_train = img_train[:, :-1]

    train_tmp = np.asarray(train_tmp)
    train_tmp = train_tmp.reshape((train_tmp.shape[0], 1))
    print('Reading Train Voices')
    voice_train = pd.read_csv(train_file_voice, header=None)
    voice_train = np.asarray(voice_train)
    voice_train = voice_train[:, :-1]

    combined = list(zip(img_train, voice_train, train_tmp))
    # todo marta: why do we need to shuffle here?
    random.shuffle(combined)
    img_train, voice_train, train_tmp = zip(*combined)

    if FLAGS.split_type == 'random':
        # todo marta: aren't we doubling the dataset, like this?
        train_data = np.vstack((img_train, voice_train))
        train_label = np.vstack((train_tmp, train_tmp))
        combined = list(zip(train_data, train_label))
        random.shuffle(combined)
        train_data, train_label = zip(*combined)
        train_data = np.asarray(train_data).astype(np.float)
        train_label = np.asarray(train_label)

    elif FLAGS.split_type == 'vfvf':
        for i in range(len(voice_train)):
            train_data.append(voice_train[i])
            train_data.append(img_train[i])
            train_label.append(train_tmp[i])
            train_label.append(train_tmp[i])

    elif FLAGS.split_type == 'fvfv':
        for i in range(len(voice_train)):
            train_data.append(img_train[i])
            train_data.append(voice_train[i])
            train_label.append(train_tmp[i])
            train_label.append(train_tmp[i])

    elif FLAGS.split_type == 'hefhev':
        train_data = np.vstack((img_train, voice_train))
        train_label = np.vstack((train_tmp, train_tmp))

    elif FLAGS.split_type == 'hevhef':
        train_data = np.vstack((voice_train, img_train))
        train_label = np.vstack((train_tmp, train_tmp))

    else:
        print('Invalid Split Type')

    le = preprocessing.LabelEncoder()
    le.fit(train_label)
    train_label = le.transform(train_label)

    # print("Train file length", len(img_train))
    # print('Shuffling\n')

    train_data = np.asarray(train_data).astype(np.float)
    train_label = np.asarray(train_label)

    return train_data, train_label

def get_batch(batch_index, batch_size, labels, f_lst):
    start_ind = batch_index * batch_size
    end_ind = (batch_index + 1) * batch_size
    return np.asarray(f_lst[start_ind:end_ind]), np.asarray(labels[start_ind:end_ind])

def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform(m.weight)
        m.bias.data.fill_(0.01)

def main(train_data, train_label):
    """
    train_data.shape = (num_train_instances, embedding_size)
    train_label.shape = (num_train_instances, num_labels)
    """
    n_class = train_label.shape[1]
    train_label_np = np.asarray(train_label)
    model = FOP(FLAGS, train_data.shape[1], n_class)
    model.apply(init_weights)
    
    # ce_loss = nn.CrossEntropyLoss().cuda()
    bce_logits_loss = nn.BCEWithLogitsLoss().cuda()
    # We do not necessarily want orthogonal projection loss imo
    # opl_loss = OrthogonalProjectionLoss().cuda()
    # TODO adapt code to remove opl
    opl_loss = None
    
    if FLAGS.cuda:
        model.cuda()
        # ce_loss.cuda()    
        bce_logits_loss.cuda()
        if opl_loss:
            opl_loss.cuda()
        cudnn.benchmark = True
    
    optimizer = optim.Adam(model.parameters(), lr=FLAGS.lr, weight_decay=0.01)

    n_parameters = sum([p.data.nelement() for p in model.parameters()])
    print('  + Number of params: {}'.format(n_parameters))
    
    
    for alpha in FLAGS.alpha_list:
        epoch = 1
        # todo check: why len and not train_label[0] ? 
        num_of_batches = (len(train_label) // FLAGS.batch_size)
        loss_plot = []
        train_precision_list = []
        train_recall_list = []
        train_f1_list = []
        
        loss_per_epoch = 0.
        # todo adapt code to remove s_fac and d_fac
        s_fac_per_epoch = 0.
        d_fac_per_epoch = 0.
        txt_dir = 'output'
        save_dir = 'fc2_%s_%s_alpha_%0.2f'%(FLAGS.split_type, FLAGS.save_dir, alpha)
        txt = '%s/ce_opl_%03d_%0.2f.txt'%(txt_dir, FLAGS.max_num_epoch, alpha)
        
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
        if not os.path.exists(txt_dir):
            os.makedirs(txt_dir)
        
        with open(txt,'w+') as f:
            f.write('EPOCH\tLOSS\tprecision\trecall\tS_FAC\tD_FAC\n')
        
        save_best = 'best_%s'%(save_dir)
        
        if not os.path.exists(save_best):
            os.mkdir(save_best)
        with open(txt,'a+') as f:
            while (epoch < FLAGS.max_num_epoch):
                print('%s\tEpoch %03d'%(FLAGS.split_type, epoch))
                for idx in tqdm(range(num_of_batches)):
                    train_batch, batch_labels = get_batch(idx, FLAGS.batch_size, train_label, train_data)
                    # voice_feats, _ = get_batch(idx, FLAGS.batch_size, train_label, voice_train)
                    loss_tmp, loss_opl, loss_soft, s_fac, d_fac = train(train_batch, 
                                                                 batch_labels, 
                                                                 model, optimizer, bce_logits_loss, opl_loss, alpha)
                    loss_per_epoch += loss_tmp
                    s_fac_per_epoch += s_fac
                    d_fac_per_epoch += d_fac
                
                # todo check: should we really divide by the number of batches? this leads to an incorrect loss 
                #  if the last batch is smaller.
                loss_per_epoch /= num_of_batches
                s_fac_per_epoch /= num_of_batches
                d_fac_per_epoch /= num_of_batches
                
                loss_plot.append(loss_per_epoch)
                # print(train_data, train_label)
                precision, recall, f1 = eval(train_data, train_label_np, model)
                train_precision_list.append(precision)
                train_recall_list.append(recall)
                train_f1_list.append(f1)
                
                # ToDo
                # precision, recall, f1 = eval(, , model)
                # test_precision_list += [precision]
                # test_recall_list += [recall]
                # test_f1_list += [f1]
                
                save_checkpoint({
                   'epoch': epoch,
                   'state_dict': model.state_dict()}, save_dir, 'checkpoint_%04d_%0.3f.pth.tar'%(epoch, f1*100))

                # print('==> Epoch: %d/%d Loss: %0.2f Alpha:%0.2f, Min_train_f1: %0.2f'%(epoch, FLAGS.max_num_epoch, loss_per_epoch, alpha, min(train_f1_list)))
#                 if eer <= min(eer_list):
#                     min_eer = eer
#                     max_auc = auc
#                     save_checkpoint({
#                     'epoch': epoch,
#                     'state_dict': model.state_dict()}, save_best, 'checkpoint.pth.tar')
                # ToDo 
                # eer, auc = 0., 0.
                # f.write('%04d\t%0.4f\t%0.2f\t%0.2f\t%0.2f\t%0.2f\n'%(epoch, loss_per_epoch, eer, auc, s_fac_per_epoch, d_fac_per_epoch))
                # loss_per_epoch = 0
                # s_fac_per_epoch = 0
                # d_fac_per_epoch = 0
                # epoch += 1
        
        return loss_plot, train_f1_list                
#         return loss_plot, min_eer, max_auc


def eval(train_batch, labels, model):
    model.eval()
    with torch.no_grad():
        train_batch = torch.from_numpy(train_batch).float()
        
        if FLAGS.cuda:
            train_batch = train_batch.cuda()
        train_batch = Variable(train_batch)
        comb = model.train_forward(train_batch)
        
        predictions = torch.sigmoid(comb[1]).cpu().numpy()
        predictions = (predictions > 0.5).astype(int).reshape(-1)
        
        precision = precision_score(labels, predictions, average='samples')
        recall = recall_score(labels, predictions, average='samples')
        f1 = f1_score(labels, predictions, average='samples')
        
        return precision, recall, f1
    
    
def train(train_batch, labels, model, optimizer, bce_logits_loss, opl_loss, alpha):
    
    average_loss = RunningAverage()
    soft_losses = RunningAverage()
    if opl_loss:
        opl_losses = RunningAverage()

    model.train()
    # face_feats = torch.from_numpy(face_feats).float()
    train_batch = torch.from_numpy(train_batch).float()
    labels = torch.from_numpy(labels).float()
    
    if FLAGS.cuda:
        train_batch, labels = train_batch.cuda(), labels.cuda()

    train_batch, labels = Variable(train_batch), Variable(labels)
    comb = model.train_forward(train_batch)
    
    # loss_soft = ce_loss(comb[1], labels)
    loss_soft = bce_logits_loss(comb[1], labels)    
    
    if opl_loss:
        loss_opl, s_fac, d_fac = opl_loss(comb[0], labels)
        loss = loss_soft + alpha * loss_opl
    else: 
        loss = loss_soft
        s_fac, d_fac = 0., 0.
        opl_losses = 0.

    optimizer.zero_grad()
    
    loss.backward()
    average_loss.update(loss.item())
    if opl_loss:
        opl_losses.update(loss_opl.item())
    soft_losses.update(loss_soft.item())
    
    optimizer.step()
    if opl_loss:
        return average_loss.avg(), opl_losses.avg(), soft_losses.avg(), s_fac, d_fac
    else:
        return average_loss.avg(), opl_losses, soft_losses.avg(), s_fac, d_fac

class RunningAverage(object):
    def __init__(self):
        self.value_sum = 0.
        self.num_items = 0. 

    def update(self, val):
        self.value_sum += val 
        self.num_items += 1

    def avg(self):
        average = 0.
        if self.num_items > 0:
            average = self.value_sum / self.num_items

        return average
 
def save_checkpoint(state, directory, filename):
    filename = os.path.join(directory, filename)
    torch.save(state, filename)
    

In [6]:
global FLAGS

In [7]:
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=1, metavar='S', help='Random Seed')
parser.add_argument('--cuda', action='store_true', default=True, help='CUDA Training')
parser.add_argument('--save_dir', type=str, default='model', help='Directory for saving checkpoints.')
parser.add_argument('--lr', type=float, default=1e-2, metavar='LR',
                    help='learning rate (default: 1e-4)') 
parser.add_argument('--batch_size', type=int, default=128, help='Batch size for training.')
parser.add_argument('--max_num_epoch', type=int, default=100, help='Max number of epochs to train, number')
parser.add_argument('--alpha_list', type=list, default=[1], help='Alpha Values List')
parser.add_argument('--dim_embed', type=int, default=64,
                    help='Embedding Size')
parser.add_argument('--split_type', type=str, default='image_only', help='split_type')

_StoreAction(option_strings=['--split_type'], dest='split_type', nargs=None, const=None, default='image_only', type=<class 'str'>, choices=None, help='split_type', metavar=None)

In [8]:
FLAGS, unparsed = parser.parse_known_args()

In [9]:
train_data, train_label = read_data(FLAGS)

Split Type: image_only
Reading Image Train


In [10]:
print('Split Type: %s'%(FLAGS.split_type))

Split Type: image_only


In [11]:
losses, f1_scores = main(train_data, train_label)



  + Number of params: 464987
image_only	Epoch 001


100%|██████████| 121/121 [00:02<00:00, 46.71it/s]


ValueError: Found input variables with inconsistent numbers of samples: [15552, 419904]