# CCA on MS COCO dataset

In [None]:
%load_ext autoreload
%autoreload 2

import time

import numpy as np
from tqdm import tqdm
from keras.applications import vgg19
from keras.optimizers import SGD
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cross_decomposition import CCA
import pandas as pd
from pycocotools.coco import COCO

from image_processing import load_images, categories, ann_file
from vgg import compute_nn_features
from text_processing import create_caption_dataframe
from word2vec import compute_textual_features
from tools import intersect_sort

### Load Visual features and Textual features

In [None]:
# Open the COCO annotation index; reused below for image and category queries.
coco = COCO(annotation_file=ann_file)

In [None]:
# Load the images of the selected categories together with their image ids.
# (Array layout is decided by image_processing.load_images — not visible here.)
X_visual, visual_img_ids = load_images(categories, coco=coco)
# Uncomment to cache both arrays on disk so the np.load cell can be used instead:
#np.save('X_visual.npy', X_visual)
#np.save('visual_img_ids.npy', visual_img_ids)

In [None]:
# Alternative to loading from COCO: restore the arrays from the on-disk cache.
X_visual, visual_img_ids = np.load('X_visual.npy'), np.load('visual_img_ids.npy')

In [None]:
# Instantiate VGG19 with its default constructor arguments and compile it
# with an SGD optimizer (Nesterov momentum) and categorical cross-entropy loss.
sgd = SGD(
    lr=0.1,
    momentum=0.9,
    decay=1e-6,
    nesterov=True,
)
net = vgg19.VGG19()
net.compile(loss='categorical_crossentropy', optimizer=sgd)

Compute visual features batch by batch

In [None]:
# Run the network over X_visual in fixed-size batches and collect one feature
# row per image in V.
# Stepping the range by batch_size (instead of iterating N // 10 + 1 times)
# avoids a trailing zero-length batch when the number of images is an exact
# multiple of the batch size, and skips the loop entirely for empty input.
batch_size = 10
n_images = X_visual.shape[0]
# 4096 columns are allocated per image; assumes compute_nn_features returns
# an array of shape (batch, 4096) for layer=2 — TODO confirm against vgg.py.
V = np.zeros((n_images, 4096))
for start_index in tqdm(range(0, n_images, batch_size)):
    # Clamp the final batch so it never runs past the last image.
    end_index = min(start_index + batch_size, n_images)
    X_temp = X_visual[start_index:end_index]
    V_temp = compute_nn_features(X_temp, net, layer=2)
    V[start_index:end_index, :] = V_temp

In [None]:
#np.save('V.npy', V)

In [None]:
# Restore previously computed visual features from disk instead of recomputing them.
V = np.load('V.npy')

Retrieve textual features

In [None]:
# Build the caption table and derive the textual feature matrix T from it.
# NOTE(review): statement order matters if compute_textual_features modifies
# df_caption in place — not visible from here.
df_caption = create_caption_dataframe()
T = compute_textual_features(df_caption)
# The DataFrame index is taken as the id of each row of T
# (assumes the index holds COCO image ids — TODO confirm).
textual_img_ids = df_caption.index.values

### CCA

In [None]:
# Take only the features corresponding to common ids and sort by id, so that
# (presumably) row i of V and row i of T describe the same image — verify
# against tools.intersect_sort.
V, visual_img_ids, T, textual_img_ids = intersect_sort(V, visual_img_ids, T, textual_img_ids)

In [None]:
# Quoted from the CCA paper:
#   "We search a range from 16 to 1,024, doubling the dimensionality each
#   time, and the resulting values typically fall around 128-256 on all our
#   datasets."
d = 128  # Dimension of the final joint latent space

# Fit CCA between the visual (V) and textual (T) features, without rescaling.
cca = CCA(n_components=d, scale=False).fit(V, T)

# New basis projection matrices, one per modality.
# NOTE(review): scikit-learn documents `x_rotations_` / `y_rotations_` as the
# matrices used by `transform` (which also centers the data); confirm that the
# raw `x_weights_` / `y_weights_` on uncentered features are what is intended.
W1, W2 = cca.x_weights_, cca.y_weights_

# Compute features in the new latent space
V_latent = np.dot(V, W1)
T_latent = np.dot(T, W2)

In [None]:
#np.save('W1.npy', W1)
#np.save('W2.npy', W2)

### Plot latent space with t-SNE

In [None]:
# Embed the visual and textual latent vectors together with t-SNE (default
# settings) and report the wall-clock time in seconds.
# Visual rows come first in the stacked matrix, textual rows second.
tsne = TSNE()
tic = time.time()
stacked_latent = np.vstack((V_latent, T_latent))
embeddings = tsne.fit_transform(stacked_latent)
elapsed = time.time() - tic
print(elapsed)

In [None]:
# Get the category of each image.
cat_ids = coco.getCatIds(catNms=categories)

# Build the image -> category lookup in a plain dict first. Assigning
# df.loc[img_id] for ids that are not in the index grows the frame one row at
# a time, which is needlessly slow for every COCO image outside visual_img_ids.
cat_by_img = {}
for cat_id in cat_ids:
    for img_id in coco.getImgIds(catIds=cat_id):
        # An image belonging to several categories keeps the last one visited.
        cat_by_img[img_id] = cat_id

# One row per entry of visual_img_ids, in that order; ids that belong to none
# of the requested categories get NaN. dtype=object keeps the column an
# object column, as a frame created without data would be.
df = pd.Series(cat_by_img, name='cat_id', dtype=object).reindex(visual_img_ids).to_frame()

In [None]:
# Plot the t-SNE embeddings by category (color) and visual/textual (shape).
# The first V.shape[0] rows of `embeddings` are the visual points and the
# remaining rows are the textual points (same order as the stacked input).
n_visual = V.shape[0]
Vx = embeddings[:n_visual, 0]
Vy = embeddings[:n_visual, 1]
Tx = embeddings[n_visual:, 0]
Ty = embeddings[n_visual:, 1]

# zip stops at the shorter sequence, so at most three categories are drawn.
colors = ['r', 'b', 'g']
for cat_id, color in zip(cat_ids, colors):
    # `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0;
    # it yields the same boolean mask as a NumPy array.
    idx = (df['cat_id'] == cat_id).values
    plt.plot(Vx[idx], Vy[idx], color+'o', markersize=4)  # visual: circles
    plt.plot(Tx[idx], Ty[idx], color+'^')  # textual: triangles

plt.show()