# CCA on MS COCO dataset

In [None]:
%load_ext autoreload
%autoreload 2

import time
import os.path as op
import os

import numpy as np
from tqdm import tqdm
from keras.applications import vgg19
from keras.optimizers import SGD
from sklearn.manifold import TSNE
from sklearn.cross_decomposition import CCA
import pandas as pd
from pycocotools.coco import COCO
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from image_processing import load_images, categories, ann_file
from vgg import compute_nn_features_ids
from text_processing import create_caption_dataframe
from word2vec import compute_textual_features
from tools import intersect_sort, split_ids, get_all_sorted_ids

### Load Visual features and Textual features

In [None]:
# Build the pycocotools COCO handle from `ann_file` (imported from
# image_processing); the cells below query it for image and category ids.
coco = COCO(ann_file)

## Visual features
Compute visual features part by part

In [None]:
# SGD optimizer with Nesterov momentum; it is only used to compile the
# network below.
sgd = SGD(
    lr=0.1,
    decay=1e-6,
    momentum=0.9,
    nesterov=True,
)

# VGG19 built with the Keras default arguments. `net` is later passed to
# compute_nn_features_ids to produce the visual embeddings.
net = vgg19.VGG19()
net.compile(loss='categorical_crossentropy', optimizer=sgd)

In [None]:
# Number of chunks the image ids are split into; features are computed and
# stored on disk one chunk at a time.
n_parts = 100

# Folder receiving one embeddings file and one ids file per chunk.
# It is created on first use.
visual_embeddings_folder = 'data/visual_embeddings'
if not op.exists(visual_embeddings_folder):
    os.makedirs(visual_embeddings_folder)

In [None]:
# Compute the visual features chunk by chunk, skipping chunks whose results
# are already on disk so that an interrupted run can be resumed.
# NOTE(review): only chunks n_parts//2 .. n_parts-1 are processed here while
# the merge cell below requires every chunk -- presumably the first half was
# produced by a separate run; confirm.
for part in range(n_parts//2, n_parts):
    path_embeddings = op.join(visual_embeddings_folder, 'V_{}.npy'.format(part+1))
    path_ids = op.join(visual_embeddings_folder, 'processed_ids_{}.npy'.format(part+1))
    # Recompute when EITHER file is missing. A run interrupted between the two
    # saves leaves only one of them behind; requiring both to be absent would
    # skip that chunk forever and make the merge step fail.
    if not (op.exists(path_embeddings) and op.exists(path_ids)):
        print('Part: {}'.format(part+1))
        ids = split_ids(part, n_parts, coco)
        V, processed_ids = compute_nn_features_ids(ids, net, coco)

        # Save embeddings and ids
        np.save(path_embeddings, V)
        np.save(path_ids, processed_ids)

Load and merge computed features

In [None]:
# Gather the per-chunk arrays; every chunk must be present on disk.
V = []
processed_ids = []
for part in tqdm(range(n_parts)):
    part_no = part + 1  # file names are numbered from 1
    path_embeddings = op.join(visual_embeddings_folder, 'V_{}.npy'.format(part_no))
    path_ids = op.join(visual_embeddings_folder, 'processed_ids_{}.npy'.format(part_no))

    # Abort as soon as one file of the pair is missing.
    if not (op.exists(path_embeddings) and op.exists(path_ids)):
        raise IOError('Files for part {} not found'.format(part_no))

    V.append(np.load(path_embeddings))
    processed_ids.append(np.load(path_ids))

# Merge the chunks into a single id array and a single feature matrix.
visual_img_ids = np.hstack(processed_ids)
V = np.vstack(V)

# Cache the merged arrays. np.save appends '.npy' when the name lacks it, so
# the ids end up in 'data/visual_img_ids.npy'.
np.save('data/V.npy', V)
np.save('data/visual_img_ids', visual_img_ids)

In [None]:
# Reload the merged feature matrix and its image ids from the cached files.
V, visual_img_ids = np.load('data/V.npy'), np.load('data/visual_img_ids.npy')

## Textual features

In [None]:
# Build the caption table, then compute the textual features from it.
# NOTE(review): presumably T holds one row per row of df_caption, in index
# order -- confirm against compute_textual_features.
df_caption = create_caption_dataframe()
T = compute_textual_features(df_caption, overwrite=False)
# The DataFrame index is used as the image id of each textual feature row.
textual_img_ids = df_caption.index.values

## Semantic features

In [None]:
# Build a 0/1 indicator table: one row per image id, one column per category.

# Get all categories (sorted and no duplicates)
cat_ids = sorted(set(coco.getCatIds()))

# Start with every cell at 0.
img_ids = get_all_sorted_ids(coco)
df = pd.DataFrame(0, index=img_ids, columns=cat_ids)

# For each category, set its column to one on the rows of the images that
# coco.getImgIds returns for it. The per-category ids get their own name so
# the full `img_ids` list used as the index above is not overwritten.
for cat_id in tqdm(cat_ids):
    cat_img_ids = coco.getImgIds(catIds=cat_id)
    df.loc[cat_img_ids, cat_id] = 1

Display an image together with its category names

In [None]:
from image_processing import plot_image_by_id

# Image to inspect.
img_id = 30

# Print category names
# The columns holding a 1 on this image's row are its category ids.
is_present = (df.loc[img_id] == 1).values
cat_ids = df.columns[is_present].values
cats = coco.loadCats(cat_ids.tolist())
print([cat['name'] for cat in cats])

# Show the image itself.
plot_image_by_id(img_id, coco)

In [None]:
# The row labels of the indicator table are the image ids of the semantic
# features.
semantic_img_ids = df.index.values
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was
# removed in pandas 1.0; it yields the same ndarray.
C = df.values

### CCA

In [None]:
# Take only the features corresponding to common ids and sort by id
V, visual_img_ids, T, textual_img_ids = intersect_sort(V, visual_img_ids, T, textual_img_ids)
V, visual_img_ids, C, semantic_img_ids = intersect_sort(V, visual_img_ids, C, semantic_img_ids)
T, textual_img_ids, C, semantic_img_ids = intersect_sort(T, textual_img_ids, C, semantic_img_ids)
assert len(V) == len(T) and len(T) == len(C)
assert len(visual_img_ids) == len(textual_img_ids) and len(textual_img_ids) == len(semantic_img_ids)
assert len(V) == len(visual_img_ids)
print('{} images in common kept'.format(len(V)))

In [None]:
# Fit a CCA between the visual features V and the textual features T, then
# project both into the shared latent space of dimension d.
'''Quoted form the CCA paper:
We search a range from 16 to 1,024, doubling the
dimensionality each time, and the resulting values typically
fall around 128-256 on all our datasets.'''
d = 128 # Dimension of the final joint latent space
# scale=False: the inputs are not rescaled to unit variance before fitting.
cca = CCA(n_components=d, scale=False)
print('Fitting CCA ...')
# Time the fit; the elapsed seconds are printed once it finishes.
tic = time.time()
cca.fit(V,T)
print(time.time() - tic)

# New basis projection matrices
# NOTE(review): scikit-learn's own transform() centres the data and projects
# with x_rotations_ / y_rotations_, not x_weights_ / y_weights_. Confirm that
# the raw weights applied to uncentred data are what is intended here.
W1 = cca.x_weights_
W2 = cca.y_weights_

# Compute features in the new latent space
V_latent = np.dot(V,W1)
T_latent = np.dot(T,W2)

Timing note: 35878 seconds were needed for 195 components out of 256,
i.e. about 180 seconds per component (500 iterations per component).

In [None]:
#np.save('W1.npy', W1)
#np.save('W2.npy', W2)

In [None]:
# Read the two projection matrices back from disk and project each modality
# into the joint latent space.
W1, W2 = np.load('W1.npy'), np.load('W2.npy')
V_latent, T_latent = np.dot(V, W1), np.dot(T, W2)

### Plot latent space with t-SNE

In [None]:
# Run t-SNE (default settings) on the visual and textual latent features
# together; the visual rows come first, followed by the textual rows.
tsne = TSNE()
tic = time.time()
stacked_latent = np.vstack((V_latent, T_latent))
embeddings = tsne.fit_transform(stacked_latent)
# Elapsed seconds for stacking and embedding.
print(time.time() - tic)

In [None]:
# Get the categories of each image
cat_ids = coco.getCatIds(catNms=categories)

df = pd.DataFrame(index=visual_img_ids, columns=['cat_id'])
for cat_id in cat_ids:
    img_ids = coco.getImgIds(catIds=cat_id)
    for img_id in img_ids:
        df.loc[img_id] = cat_id
df = df.loc[visual_img_ids]

In [None]:
# Plot the t-SNE embeddings by category (color) and visula/textual (shape)
Vx = embeddings[:V.shape[0],0]
Vy = embeddings[:V.shape[0],1]
Tx = embeddings[V.shape[0]:,0]
Ty = embeddings[V.shape[0]:,1]

plt.figure(figsize=(12,8))
colors = ['r', 'b', 'g']
for cat_id, color in zip(cat_ids, colors):
    idx = (df['cat_id'] == cat_id).as_matrix()
    plt.plot(Vx[idx], Vy[idx], color+'o', markersize=3,
             label='Category {} - Visual'.format(cat_id))
    plt.plot(Tx[idx], Ty[idx], color+'^', markersize=4,
             label='Category {} - Textual'.format(cat_id))
plt.legend()
plt.title('Latent space - 3 MS COCO categories')
plt.show()