In [1]:
import argparse
import logging

import numpy as np
import torch.autograd
import torch.cuda
import torch.nn as nn
import torch.optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import random

import copy
from datasets.maps_alt import MAPSDataset

#from cnn_ws.transformations.homography_augmentation import HomographyAugmentation
from cnn_ws.losses.cosine_loss import CosineLoss

from cnn_ws.models.myphocnet import PHOCNet
from cnn_ws.evaluation.retrieval import map_from_feature_matrix, map_from_query_test_feature_matrices
from torch.utils.data.dataloader import _DataLoaderIter as DataLoaderIter
from torch.utils.data.sampler import WeightedRandomSampler

from cnn_ws.utils.save_load import my_torch_save, my_torch_load

#import matplotlib.pyplot as plt

In [2]:
# Select the compute device: GPU 0 when CUDA is available, otherwise CPU mode
# (signalled to the rest of the notebook by gpu_id = None).
# basicConfig() does nothing if logging is already configured; it only makes
# sure the warning below is actually displayed on a fresh kernel.
logging.basicConfig()
logger = logging.getLogger(__name__)
if not torch.cuda.is_available():
    logger.warning('Could not find CUDA environment, using CPU mode')
    gpu_id = None
else:
    gpu_id = [0]

In [3]:
# Load the trained network from disk and prepare it for inference.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
if gpu_id is None:
    # CPU mode: remap any CUDA tensors stored in the checkpoint onto the CPU.
    model_ = torch.load('PHOCNet_a1.pt', map_location=lambda storage, loc: storage)
else:
    model_ = torch.load('PHOCNet_a1.pt')
# The original code read `model_.module`, i.e. the checkpoint is expected to hold a
# wrapper object (presumably nn.DataParallel -- confirm). Fall back to the loaded
# object itself when no `.module` attribute is present.
cnn = getattr(model_, 'module', model_)
if gpu_id is not None:
    if len(gpu_id) > 1:
        cnn = nn.DataParallel(cnn, device_ids=gpu_id)
        cnn.cuda()
    else:
        cnn.cuda(gpu_id[0])
# eval() puts the module and all of its sub-modules into inference mode; assigning
# `cnn.training = False` only changed the flag on the top-level module.
cnn.eval()

In [4]:
# Read the validation map names (one per line) from the split file.
# The `with` block closes the file even if reading fails; line endings (LF or CRLF)
# are stripped and blank lines are skipped so they never become empty map names.
with open('../splits/val_files.txt', 'r') as f:
    A = [line.rstrip('\r\n') for line in f if line.strip()]

In [5]:
def load_and_transform(map_name):
    """Load the detected word regions and their labels for one map.

    Reads ``detected_regions/<map_name>.npy`` and ``detected_labels/<map_name>.npy``
    from the detection output directory, reorders the image axes with the
    permutation (0, 3, 1, 2) and prints both array shapes.

    Args:
        map_name: base name of the map (file name without the ``.npy`` suffix).

    Returns:
        Tuple ``(images, words)``: the axis-permuted image array and the label
        array exactly as loaded.
    """
    detection_root = '../../../detection_outputs_ready_for_test/'
    region_file = detection_root + 'detected_regions/' + map_name + '.npy'
    label_file = detection_root + 'detected_labels/' + map_name + '.npy'

    images = np.load(region_file)
    words = np.load(label_file)
    # Axis 3 moves to position 1 (presumably channels-last -> channels-first; the
    # recorded run output shows shapes such as (536, 3, 135, 487)).
    images = images.transpose((0, 3, 1, 2))

    print('Images Shape  ' + str(images.shape))
    print('Words Shape  ' + str(words.shape))
    return images, words

In [6]:
def gen_img_phoc_embs(cnn, images):
    """Run every image through the network and collect the sigmoid outputs.

    Each image is converted to float32, scaled to [0, 1] and inverted
    (``1 - pixel / 255``), given a leading batch axis of size 1 and fed to ``cnn``.
    The sigmoid of the network output is flattened into a 1-D numpy array.

    The notebook-level ``gpu_id`` selects the device: when it is ``None`` (CPU
    mode) the input stays on the CPU, otherwise it is moved to ``gpu_id[0]``.

    Args:
        cnn: callable network taking a batch tensor and returning a tensor.
        images: indexable collection of numpy image arrays.

    Returns:
        List with one flattened numpy array per input image.
    """
    # Resolve the target device once instead of indexing gpu_id on every iteration;
    # indexing unconditionally raised a TypeError when gpu_id was None.
    device_index = gpu_id[0] if gpu_id is not None else None

    outputs = []
    for i in tqdm(range(len(images)), ascii=True, desc='Converting Images to Embeddings'):
        word_img = 1 - images[i].astype(np.float32) / 255.0
        # Add the batch dimension expected by the network.
        word_img = word_img.reshape((1,) + word_img.shape)
        word_img = torch.from_numpy(word_img).float()
        if device_index is not None:
            word_img = word_img.cuda(device_index)
        word_img = torch.autograd.Variable(word_img)
        output = torch.sigmoid(cnn(word_img))
        outputs.append(output.data.cpu().numpy().flatten())
    return outputs

In [7]:
from cnn_ws.string_embeddings.phoc import build_phoc_descriptor

# function to create word variations
# word_var is a dictionary that contains all variations as key and 0,1,-1 as value
# 0 denotes the root word, -1 denotes var = root_word[:-1], +1 denotes var = root_word[1:]
# root_word_var is a dict that stores original_word => all_variations
# enable_conf: boolean flag that controls whether the confusion logic is used.
# When enabled, a word that is both a root word and a variation of another root word
# (e.g. when the root words are "rand" and "grand") keeps its +1/-1 direction, so it is
# still marked to be extended, and it is also stored in the confusion set.
def create_word_variations(words, enable_conf=False):
    """Build the variation tables for a collection of root words.

    For every word longer than two characters, four case forms are considered
    (as given, lower, upper, capitalized). Each form contributes two variations:
    the form without its first character (direction +1) and the form without its
    last character (direction -1).

    Args:
        words: iterable of root word strings; it is traversed twice.
        enable_conf: when True, a root form that is already registered as a
            variation with a non-zero direction keeps that direction and is
            recorded as a confusion word; otherwise every root form gets
            direction 0.

    Returns:
        Tuple ``(word_var, root_word_var, conf_words)``:
        ``word_var`` maps every root form / variation to 0, +1 or -1,
        ``root_word_var`` maps each root word to the set of its forms and
        variations, and ``conf_words`` is the set of confusion words.
    """
    def _eligible(candidates):
        # Words of one or two characters are ignored in both passes.
        return (w for w in candidates if len(w) > 2)

    def _case_forms(word):
        # The spelling as given, followed by its lower/upper/capitalized forms.
        return [word, word.lower(), word.upper(), word.capitalize()]

    word_var = {}
    root_word_var = {}

    # Pass 1: register the +1 / -1 variations of every case form.
    for root in _eligible(words):
        forms = set()
        for form in _case_forms(root):
            head_dropped, tail_dropped = form[1:], form[:-1]
            word_var[head_dropped] = 1
            word_var[tail_dropped] = -1
            forms.update((form, head_dropped, tail_dropped))
        root_word_var[root] = forms

    # Pass 2: give root forms direction 0, unless confusion tracking is on and
    # the form already carries a non-zero direction from pass 1.
    conf_words = set()
    for root in _eligible(words):
        for form in _case_forms(root):
            already_a_variation = word_var.get(form, 0) != 0
            if enable_conf and already_a_variation:
                conf_words.add(form)
            else:
                word_var[form] = 0

    return word_var, root_word_var, conf_words

def gen_text_phoc_embs(words):
    """Build PHOC descriptors for the given words and for their variations.

    The variations come from ``create_word_variations(words, enable_conf=True)``.
    Both descriptor sets are built with the same settings: unigrams '&', A-Z,
    a-z, 0-9, unigram levels (1, 2, 4, 8) and no bigrams.

    Args:
        words: collection of word strings.

    Returns:
        Tuple ``(embedding, embedding_var, word_var_strings, word_var_dir,
        root_word_var, conf_words)`` where ``embedding`` is the descriptor result
        for ``words``, ``embedding_var`` the result for the variation strings,
        ``word_var_strings`` the list of variation strings (keys of
        ``word_var_dir``), and the remaining three values are the outputs of
        ``create_word_variations``.
    """
    # '&', then upper-case letters, lower-case letters and digits, in that order.
    unigrams = list('&'
                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                    'abcdefghijklmnopqrstuvwxyz'
                    '0123456789')
    # Settings shared by both descriptor builds.
    phoc_settings = dict(phoc_unigrams=unigrams,
                         bigram_levels=None,
                         phoc_bigrams=None,
                         unigram_levels=(1, 2, 4, 8))

    word_var_dir, root_word_var, conf_words = create_word_variations(words, enable_conf=True)

    embedding = build_phoc_descriptor(words=words, **phoc_settings)

    word_var_strings = list(word_var_dir.keys())
    embedding_var = build_phoc_descriptor(words=word_var_strings, **phoc_settings)

    return embedding, embedding_var, word_var_strings, word_var_dir, root_word_var, conf_words
    

In [8]:
# the new report matches method that handles variations
from scipy.spatial.distance import cdist, pdist, squareform
def report_matches_with_variations(outputs, embedding_var, matching, word_strings, 
                                   word_var_strings, word_var_dir, root_word_var, k, length):
    """Match image embeddings against word-variation embeddings and count hits.

    Only images whose ground-truth word is longer than ``length`` and variations
    longer than ``length - 1`` take part. For each remaining image the ``k``
    nearest variations (by ``matching`` distance) are retrieved. An image counts
    as correct when its ground-truth word is among the retrieved strings, or when
    any retrieved string belongs to ``root_word_var[ground_truth]``.

    Args:
        outputs: per-image embedding vectors, aligned with ``word_strings``.
        embedding_var: per-variation embedding vectors, aligned with
            ``word_var_strings``.
        matching: metric name passed to ``scipy.spatial.distance.cdist``.
        word_strings: ground-truth word for each image.
        word_var_strings: iterable of variation strings.
        word_var_dir: dict mapping a variation string to its direction (0, +1, -1).
        root_word_var: dict mapping a root word to the set of its accepted
            variations; words without an entry are treated as having none.
        k: number of nearest variations retrieved per image.
        length: length threshold described above.

    Returns:
        Tuple ``(count, matched_words, qualified_ids, img_dir, words_len, outputs,
        word_strings)``: number of correct images, the retrieved strings per image,
        the indices of the images that passed the filter, the flat list of
        directions and of ``len(variation) + abs(direction)`` for every retrieved
        string, and the filtered embeddings and ground-truth words. When no image
        or no variation passes the filter, ``count`` is 0 and the per-match lists
        are empty.
    """
    # Accept any iterable of variation strings (e.g. a dict key view), not only lists.
    word_var_strings = list(word_var_strings)

    # Keep only the images whose ground-truth word is long enough.
    qualified_ids = [idx for idx, word in enumerate(word_strings) if len(word) > length]
    outputs = list(np.array(outputs)[qualified_ids])
    word_strings = list(np.array(word_strings)[qualified_ids])

    # Same filter for the variations, which may be one character shorter.
    qualified_ids_vars = [idx for idx, var in enumerate(word_var_strings)
                          if len(var) > (length - 1)]
    embedding_var = list(np.array(embedding_var)[qualified_ids_vars])
    word_var_strings = list(np.array(word_var_strings)[qualified_ids_vars])

    # Nothing to compare: cdist would raise on an empty operand.
    if not outputs or not embedding_var:
        return (0, [[] for _ in word_strings], qualified_ids, [], [], outputs, word_strings)

    dist_mat = cdist(XA=outputs, XB=embedding_var, metric=matching)
    top_k = np.argsort(dist_mat, axis=1)[:, :k]

    matched_words = []
    img_dir = []
    words_len = []
    for row in top_k:
        matched = []
        for j in row:
            var = word_var_strings[j]
            direction = word_var_dir[var]
            matched.append(var)
            # A +1/-1 variation is one character shorter than its root word.
            words_len.append(len(var) + abs(direction))
            img_dir.append(direction)
        matched_words.append(matched)

    count = 0
    for truth, matched in zip(word_strings, matched_words):
        # .get avoids a KeyError for ground-truth words without a variation entry.
        accepted = root_word_var.get(truth, ())
        if truth in matched or any(var in accepted for var in matched):
            count += 1

    return (count, matched_words, qualified_ids, img_dir, words_len, outputs, word_strings)

In [9]:
# the old original report matches method
from scipy.spatial.distance import cdist, pdist, squareform
def report_matches(outputs, embedding, matching, word_strings, k, length):
    """Match image embeddings against the embeddings of the ground-truth words.

    Entries whose word is not longer than ``length`` are dropped from all three
    aligned inputs. Each remaining image retrieves its ``k`` nearest word
    embeddings (by ``matching`` distance) and counts as correct when its own word
    is among the retrieved words.

    Args:
        outputs: per-image embedding vectors.
        embedding: per-word embedding vectors, aligned with ``outputs``.
        matching: metric name passed to ``scipy.spatial.distance.cdist``.
        word_strings: ground-truth word for each image.
        k: number of nearest words retrieved per image.
        length: words must be strictly longer than this to take part.

    Returns:
        Tuple ``(count, matched_words, outputs, embedding, word_strings,
        qualified_ids)`` with the number of correct images, the retrieved words
        per image, the three filtered inputs and the indices that were kept.
    """
    qualified_ids = [idx for idx, word in enumerate(word_strings) if len(word) > length]

    def _keep_qualified(values):
        # Select the qualified rows via numpy fancy indexing, returned as a list.
        return list(np.array(values)[qualified_ids])

    outputs = _keep_qualified(outputs)
    embedding = _keep_qualified(embedding)
    word_strings = _keep_qualified(word_strings)

    # Rank all word embeddings for every image and keep the k closest.
    distances = cdist(XA=outputs, XB=embedding, metric=matching)
    top_k = np.argsort(distances, axis=1)[:, :k]

    matched_words = [[word_strings[j] for j in row] for row in top_k]

    count = sum(1 for i in range(len(word_strings))
                if word_strings[i] in matched_words[i])

    return (count, matched_words, outputs, embedding, word_strings, qualified_ids)

In [10]:
# given the image name, this driver function computes the following
# 1. loads the words and images and transforms them based on image name
# 2. generates embeddings for images using the cnn model
# 3. gets the original and variation embeddings
# 4. generate report with word variations (prints accuracy)
# 5. generate report original (prints accuracy)
# 6. returns the image_dir_info that needs to be saved as numpy files
def image_ext_with_word_var(map_name, cnn, global_stats):
    """Evaluate one map with and without word variations.

    Loads the images and labels of ``map_name``, embeds the images with ``cnn``,
    builds the text embeddings, prints the lower-cased confusion words and both
    accuracies, and adds the per-map counts to ``global_stats`` in place.

    Args:
        map_name: base name of the map to process.
        cnn: network used to embed the images.
        global_stats: dict with the keys 'correct_original', 'correct_word_var'
            and 'total'; it is updated by this call.

    Returns:
        Tuple ``(img_dir_info, words, conf_set)``: an array built from the matched
        words, directions and extended word lengths of the variation report, the
        filtered ground-truth words of that report, and the confusion set.
    """
    images, words = load_and_transform(map_name)
    img_phoc_embs = gen_img_phoc_embs(cnn, images)
    (embedding, embedding_var, word_var_strings,
     word_var_dir, root_word_var, conf_set) = gen_text_phoc_embs(words)
    print(set([s.lower() for s in conf_set]))

    # Baseline: nearest ground-truth word only (k=1, words longer than 2 characters).
    original_report = report_matches(img_phoc_embs, embedding, 'cosine', words, 1, 2)
    orig_correct, orig_words = original_report[0], original_report[4]
    global_stats['correct_original'] += orig_correct
    print('Original Accuracy  ' + str(orig_correct / float(len(orig_words))))

    # Same setting, but matching against the word variations.
    var_correct, matched_words, _, img_dir, words_len, _, kept_words = \
        report_matches_with_variations(img_phoc_embs, embedding_var, 'cosine', words,
                                       word_var_strings, word_var_dir, root_word_var, 1, 2)
    global_stats['correct_word_var'] += var_correct
    print('Accuracy With Word Variations  ' + str(var_correct / float(len(words_len))))
    global_stats['total'] += len(words_len)

    img_dir_info = np.array([matched_words, img_dir, words_len])
    return img_dir_info, kept_words, conf_set
    

In [11]:
# Evaluate every validation map, save its per-image results and report the
# accuracies accumulated over all maps.
global_stats = {'correct_original': 0, 'correct_word_var': 0, 'total': 0}
out_prefix = '../../../images_to_extend/'
for i, map_name in enumerate(A):
    print(map_name)
    img_dir_info, words, conf_words = image_ext_with_word_var(map_name, cnn, global_stats)
    # One .npy file per result kind, written in this order.
    for tag, payload in (('image_dir_', img_dir_info),
                         ('image_labels_', words),
                         ('word_confusions_', conf_words)):
        np.save(out_prefix + tag + map_name + '.npy', payload)
total = float(global_stats['total'])
print('Accuracy Original ' + str(global_stats['correct_original'] / total))
print('Accuracy With Word Variations ' + str(global_stats['correct_word_var'] / total))

Converting Images to Embeddings:   0%|          | 0/536 [00:00<?, ?it/s]

D0090-5242001
Images Shape  (536, 3, 135, 487)
Words Shape  (536,)


Converting Images to Embeddings: 100%|##########| 536/536 [00:09<00:00, 58.41it/s]
100%|██████████| 536/536 [00:00<00:00, 3677.02it/s]
100%|██████████| 1871/1871 [00:00<00:00, 5952.12it/s]


set(['and', 'wayne'])
Original Accuracy  0.373134328358
Accuracy With Word Variations  0.383795309168
D0117-5755018


Converting Images to Embeddings:   0%|          | 7/3131 [00:00<00:48, 64.34it/s]

Images Shape  (3131, 3, 135, 487)
Words Shape  (3131,)


Converting Images to Embeddings: 100%|##########| 3131/3131 [00:52<00:00, 59.41it/s]
100%|██████████| 3131/3131 [00:00<00:00, 6938.73it/s]
100%|██████████| 4941/4941 [00:01<00:00, 4621.03it/s]


set(['rand', 'for', 'linton', 'trail', 'state', 'indian', 'green', 'road', 'highway'])
Original Accuracy  0.496530454896
Accuracy With Word Variations  0.50501156515
D0117-5755024


Converting Images to Embeddings:   0%|          | 6/3309 [00:00<00:57, 57.75it/s]

Images Shape  (3309, 3, 135, 487)
Words Shape  (3309,)


Converting Images to Embeddings: 100%|##########| 3309/3309 [00:58<00:00, 56.49it/s]
100%|██████████| 3309/3309 [00:00<00:00, 8627.59it/s]
100%|██████████| 5585/5585 [00:01<00:00, 5063.34it/s]


set(['rand', 'for', 'route', 'lake', 'trail', 'state', 'road', 'orth', 'highway'])
Original Accuracy  0.453776041667
Accuracy With Word Variations  0.453776041667
D0117-5755025


Converting Images to Embeddings:   0%|          | 6/2197 [00:00<00:39, 55.19it/s]

Images Shape  (2197, 3, 135, 487)
Words Shape  (2197,)


Converting Images to Embeddings: 100%|##########| 2197/2197 [00:38<00:00, 56.73it/s]
100%|██████████| 2197/2197 [00:00<00:00, 6444.65it/s]
100%|██████████| 3865/3865 [00:00<00:00, 5164.03it/s]


set(['rand', 'lincoln', 'alton', 'state', 'road', 'bluff', 'mill', 'highway'])
Original Accuracy  0.51061452514
Accuracy With Word Variations  0.513966480447
D0117-5755033


Converting Images to Embeddings:   0%|          | 5/2276 [00:00<00:47, 48.16it/s]

Images Shape  (2276, 3, 135, 487)
Words Shape  (2276,)


Converting Images to Embeddings: 100%|##########| 2276/2276 [00:40<00:00, 55.83it/s]
100%|██████████| 2276/2276 [00:00<00:00, 6559.70it/s]
100%|██████████| 4463/4463 [00:00<00:00, 5626.63it/s]


set(['rand', 'rush', 'for', 'camp', 'lake', 'trail', 'state', 'road', 'rock', 'ind', 'ear', 'highway', 'victor'])
Original Accuracy  0.451358457493
Accuracy With Word Variations  0.460122699387
Accuracy Original 0.466092169352
Accuracy With Word Variations 0.471524915699


In [None]:
# # image plot using the variations
# import matplotlib.pyplot as plt

# count, matched_words, qualified_ids, img_dirs, words_len, new_outputs, new_word_strings, \
# = report_matches_with_variations(outputs, embedding_var,'cosine', word_strings,word_var_strings,1, 2)

# img_dir_info = np.array([qualified_ids, img_dirs, words_len, new_word_strings])

# print "the accuracy is: "+str(count/float(len(new_word_strings)))

# _len = min(100, len(matched_words))
# new_images = images[qualified_ids]
# for i in range(_len):
#     print "************************************************************************"
#     print "************************************************************************"
#     print "Original image:"
#     q = np.transpose(new_images[i],(1,2,0))
#     plt.imshow(q)
#     plt.show()
#     print "the matched words are (inorder): "+str(matched_words[i])
#     print "the gound truth is:" + str(new_word_strings[i])
#     print "------------------------------------------------------------------------"
#     print "------------------------------------------------------------------------"
# pass

In [None]:
# # image plots using original without variation method for comparisons
# import matplotlib.pyplot as plt

# count, matched_words, new_outputs, new_embedding, new_word_strings, \
#     qualified_ids = report_matches(outputs, embedding, 'cosine', word_strings, 1, 2)

# print "the accuracy is: "+str(count/float(len(new_word_strings)))

# _len = min(100, len(matched_words))
# new_images = images[qualified_ids]
# for i in range(_len):
#     print "************************************************************************"
#     print "************************************************************************"
#     print "Original image:"
#     q = np.transpose(new_images[i],(1,2,0))
#     plt.imshow(q)
#     plt.show()
#     print "the matched words are (inorder): "+str(matched_words[i])
#     print "the gound truth is:" + str(new_word_strings[i])
#     print "------------------------------------------------------------------------"
#     print "------------------------------------------------------------------------"
# pass