In [21]:
import argparse
import logging

import numpy as np
import torch.autograd
import torch.cuda
import torch.nn as nn
import torch.optim
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook as tqdm
import random

import copy
from datasets.maps_alt import MAPSDataset

#from cnn_ws.transformations.homography_augmentation import HomographyAugmentation
from cnn_ws.losses.cosine_loss import CosineLoss

from cnn_ws.models.myphocnet import PHOCNet
from cnn_ws.evaluation.retrieval import map_from_feature_matrix, map_from_query_test_feature_matrices
from torch.utils.data.dataloader import _DataLoaderIter as DataLoaderIter
from torch.utils.data.sampler import WeightedRandomSampler

from cnn_ws.utils.save_load import my_torch_save, my_torch_load

#import matplotlib.pyplot as plt

In [22]:
# Length threshold shared by the whole notebook: words with
# len(word) <= word_filter_len are skipped when word variations are built and
# are dropped from the ground truth before accuracy is computed.
word_filter_len = 1 # only words above this length are considered valid

In [23]:
# `logger` was never created in this notebook, so the CPU fallback below used
# to fail with a NameError instead of emitting the warning.
logger = logging.getLogger(__name__)

# gpu_id is None in CPU mode, otherwise the list of CUDA device indices to use.
if torch.cuda.is_available():
    gpu_id = [0]
else:
    logger.warning('Could not find CUDA environment, using CPU mode')
    gpu_id = None

In [24]:
# Load the trained PHOCNet checkpoint. In CPU mode (gpu_id is None) any CUDA
# tensors stored in the checkpoint are remapped onto the CPU so that loading
# does not require a GPU.
if gpu_id is None:
    model_ = torch.load('models/PHOCNet_SDMD.pt', map_location=lambda storage, loc: storage)
else:
    model_ = torch.load('models/PHOCNet_SDMD.pt')
# The checkpoint is presumably a wrapper (e.g. nn.DataParallel) exposing the
# real network as `.module`; fall back to the loaded object when it is not.
cnn = getattr(model_, 'module', model_)
if gpu_id is not None:
    if len(gpu_id) > 1:
        cnn = nn.DataParallel(cnn, device_ids=gpu_id)
        cnn.cuda()
    else:
        cnn.cuda(gpu_id[0])
# eval() switches the module AND all of its submodules to inference mode;
# assigning `cnn.training = False` only changed the flag on the top-level
# module and left the children in whatever mode they were saved in.
cnn.eval()

In [25]:
# find the file names
# Read one map name per line. The context manager guarantees the file is
# closed even if reading raises.
with open('../splits/train_files.txt', 'rb') as f:
    A = [x.rstrip('\n') for x in f.readlines()]
# train maps to remove
A.remove('D0042-1070013')
# test maps to remove
# A.remove('D5005-5028102')

In [26]:
from strlocale import BasicLocale

def clean_words(words):
    """Normalise every entry of ``words`` in place and return the same object.

    Each word is passed through ``BasicLocale().represent`` and the result is
    encoded to ASCII with un-encodable characters dropped. When that
    conversion raises, the word is written back unchanged (best effort).

    :param words: index-assignable sequence of strings; it is modified in
        place (callers in this notebook pass numpy arrays).
    :return: the same ``words`` object that was passed in.
    """
    lc = BasicLocale()
    for i, w in enumerate(words):
        try:
            words[i] = lc.represent(w).encode('ascii',errors='ignore')
        except Exception:
            # Best effort: keep the original word when it cannot be converted.
            # Exception (not a bare except) is caught so that
            # KeyboardInterrupt / SystemExit can still stop this long loop.
            words[i] = w
    return words

def load_and_transform(map_name):
    images = np.load('../../../detection_outputs_ready_for_test/detected_regions/'+map_name+'.npy')
    words = np.load('../../../detection_outputs_ready_for_test/detected_labels/'+map_name+'.npy')
    images = np.transpose(images, (0,3,1,2))
    words = clean_words(words)
    print 'Images Shape ', images.shape
    print 'Words Shape ', words.shape
    return images, words

def load_and_clean_gis_data():
    with open('../../../GIS_data/GIS_combined.txt') as f:
        gis_data = np.array(f.read().splitlines())
    gis_data = clean_words(gis_data)
    print 'GIS Data', gis_data.shape
    return gis_data

In [27]:
def gen_img_phoc_embs(cnn, images):
    """Run every word image through ``cnn`` and collect the sigmoid outputs.

    Each image is converted to float32, scaled and inverted
    (``1 - pixel / 255``), given a leading axis of size 1, fed to ``cnn``, and
    the sigmoid of the network output is flattened into a 1-D numpy array.

    Reads the module-level ``gpu_id``: when it is None (CPU mode) the input
    stays on the CPU; otherwise it is moved to device ``gpu_id[0]``.

    :param cnn: the network to evaluate.
    :param images: indexable collection of numpy image arrays.
    :return: list with one flattened numpy array per image.
    """
    outputs = []
    for i in tqdm(range(len(images)), ascii=True, desc='Converting Images to Embeddings'):
        word_img = images[i]
        word_img = 1 - word_img.astype(np.float32) / 255.0
        word_img = word_img.reshape((1,) + word_img.shape)
        word_img = torch.from_numpy(word_img).float()
        # gpu_id is None when no CUDA device was found; indexing it
        # unconditionally raised a TypeError in CPU mode.
        if gpu_id is not None:
            word_img = word_img.cuda(gpu_id[0])
        word_img = torch.autograd.Variable(word_img)
        output = torch.sigmoid(cnn(word_img))
        output = output.data.cpu().numpy().flatten()
        outputs.append(output)
    return outputs

In [28]:
from cnn_ws.string_embeddings.phoc import build_phoc_descriptor

# Function to create word variations.
# word_var is a dictionary that maps every variation to a direction: 0, +1 or -1.
# 0 denotes a root word, -1 denotes var = root_word[:-1], +1 denotes var = root_word[1:].
# root_word_var is a dict that stores original_word => set of all its variations.
# enable_conf: boolean flag that controls whether the confusion logic is used.
# When enabled, a word that is a root word as well as a variation of another root
# word (this happens if the root words are e.g. "rand" and "grand") keeps its
# +1/-1 direction, i.e. it stays marked as to be extended, and is also stored
# in the confusion set.
def create_word_variations(words, enable_conf=False):
    """Build the variation tables for a list of root words.

    For every root word longer than ``word_filter_len`` four case forms are
    considered (as given, lower, upper, capitalized). For each form, the form
    with its first character removed gets direction +1 and the form with its
    last character removed gets direction -1. Afterwards every case form of a
    root word gets direction 0, except that with ``enable_conf`` a form which
    already carries a non-zero direction keeps it and is recorded in the
    confusion set instead.

    :param words: iterable of root words (iterated twice).
    :param enable_conf: whether the confusion logic is active.
    :return: ``(word_var, root_word_var, conf_words)`` - the variation ->
        direction dict, the root word -> set-of-variations dict, and the set
        of confused words.
    """
    def case_forms(word):
        # the four spellings derived from one root word, in a fixed order
        return [word, word.lower(), word.upper(), word.capitalize()]

    word_var = {}
    root_word_var = {}
    # pass 1: register the truncated variations with their +1 / -1 direction
    for root in words:
        if len(root) > word_filter_len:
            variations = set()
            for form in case_forms(root):
                head_dropped = form[1:]
                tail_dropped = form[:-1]
                word_var[head_dropped] = 1
                word_var[tail_dropped] = -1
                variations.update((form, head_dropped, tail_dropped))
            root_word_var[root] = variations

    # pass 2: give root words direction 0, unless the confusion logic keeps
    # the direction they already received as someone else's variation
    conf_words = set()
    for root in words:
        if len(root) > word_filter_len:
            for form in case_forms(root):
                keeps_direction = enable_conf and word_var.get(form, 0) != 0
                if keeps_direction:
                    conf_words.add(form)
                else:
                    word_var[form] = 0
    return word_var, root_word_var, conf_words

def gen_text_phoc_embs(words):
    """Build PHOC descriptors for ``words`` and for all of their variations.

    The unigram alphabet is '&', A-Z, a-z and 0-9 (in that order), bigrams
    are disabled, and the unigram levels are (1, 2, 4, 8). Variations come
    from create_word_variations with the confusion logic enabled.

    :param words: the root words.
    :return: ``(embedding, embedding_var, word_var_strings, word_var_dir,
        root_word_var, conf_words)`` where ``embedding`` describes ``words``
        and ``embedding_var`` describes ``word_var_strings`` (the keys of
        ``word_var_dir``).
    """
    unigrams = ['&']
    for first, last in (('A', 'Z'), ('a', 'z'), ('0', '9')):
        unigrams.extend(chr(code) for code in range(ord(first), ord(last) + 1))

    # descriptor settings shared by both build_phoc_descriptor calls
    phoc_settings = dict(phoc_unigrams=unigrams,
                         bigram_levels=None,
                         phoc_bigrams=None,
                         unigram_levels=(1, 2, 4, 8))

    word_var_dir, root_word_var, conf_words = create_word_variations(words, enable_conf=True)

    embedding = build_phoc_descriptor(words=words, **phoc_settings)

    word_var_strings = word_var_dir.keys()
    embedding_var = build_phoc_descriptor(words=word_var_strings, **phoc_settings)

    return (embedding, embedding_var, word_var_strings, word_var_dir, root_word_var, conf_words)
    

In [29]:
# the new report matches method that handles variations
from scipy.spatial.distance import cdist, pdist, squareform
def report_matches_with_variations(outputs, embedding, matching, ground_truth, 
                                   words, word_var_dir, root_word_var, k, length):
    """Match image embeddings against variation embeddings and count hits.

    Only queries whose ground-truth label is longer than ``length`` are kept.
    For each kept query the ``k`` nearest rows of ``embedding`` (distance
    metric ``matching``) are looked up in ``words``. A query counts as correct
    when its label equals one of the matched words ignoring case, or when one
    of the matched words is a registered variation of the label in
    ``root_word_var``.

    :param outputs: one embedding per query image.
    :param embedding: one embedding per entry of ``words``.
    :param matching: metric name passed to ``cdist``.
    :param ground_truth: label per query.
    :param words: strings corresponding to the rows of ``embedding``.
    :param word_var_dir: dict mapping each entry of ``words`` to its direction.
    :param root_word_var: dict mapping a root word to its set of variations.
    :param k: number of nearest neighbours to retrieve per query.
    :param length: labels with ``len(label) <= length`` are dropped.
    :return: ``(count, matched_words, qualified_ids, img_dir, words_len,
        outputs, ground_truth)``; ``img_dir`` and ``words_len`` are flat lists
        with one entry per retrieved word, and ``outputs`` / ``ground_truth``
        are the filtered lists.
    """
    # keep only the queries whose label is long enough
    qualified_ids = [idx for idx, label in enumerate(ground_truth) if len(label) > length]
    outputs = list(np.array(outputs)[qualified_ids])
    ground_truth = list(np.array(ground_truth)[qualified_ids])

    # nearest neighbours of every query among the candidate embeddings
    dist_mat = cdist(XA=outputs, XB=embedding, metric=matching)
    top_k = np.argsort(dist_mat, axis=1)[:, :k]

    matched_words = []
    img_dir = []
    words_len = []
    for row in top_k:
        candidates = [words[j] for j in row]
        for cand in candidates:
            direction = word_var_dir[cand]
            # a +1/-1 variation is one character shorter than its root word
            words_len.append(len(cand) + abs(direction))
            img_dir.append(direction)
        matched_words.append(candidates)

    # a hit is either a case-insensitive exact match or a known variation
    count = 0
    for label, candidates in zip(ground_truth, matched_words):
        if label.lower() in [mw.lower() for mw in candidates]:
            count += 1
        elif label in root_word_var and any(w in root_word_var[label] for w in candidates):
            count += 1

    return (count, matched_words, qualified_ids, img_dir, words_len, outputs, ground_truth)

In [30]:
# the old original report matches method
from scipy.spatial.distance import cdist, pdist, squareform
def report_matches(outputs, embedding, matching, words, ground_truth, k, length):
    """Match image embeddings against word embeddings and count exact hits.

    Only queries whose ground-truth label is longer than ``length`` are kept.
    For each kept query the ``k`` nearest rows of ``embedding`` (distance
    metric ``matching``) are looked up in ``words``; the query is correct when
    its label equals one of them ignoring case.

    :param outputs: one embedding per query image.
    :param embedding: one embedding per entry of ``words``.
    :param matching: metric name passed to ``cdist``.
    :param words: strings corresponding to the rows of ``embedding``.
    :param ground_truth: label per query.
    :param k: number of nearest neighbours to retrieve per query.
    :param length: labels with ``len(label) <= length`` are dropped.
    :return: ``(count, matched_words, outputs, embedding, ground_truth,
        qualified_ids)`` with ``outputs`` / ``ground_truth`` filtered.
    """
    # keep only the queries whose label is long enough
    qualified_ids = [idx for idx, label in enumerate(ground_truth) if len(label) > length]
    outputs = list(np.array(outputs)[qualified_ids])
    ground_truth = list(np.array(ground_truth)[qualified_ids])

    # nearest neighbours of every query among the candidate embeddings
    dist_mat = cdist(XA=outputs, XB=embedding, metric=matching)
    top_k = np.argsort(dist_mat, axis=1)[:, :k]
    matched_words = [[words[j] for j in row] for row in top_k]

    # case-insensitive exact matches only
    count = 0
    for label, candidates in zip(ground_truth, matched_words):
        if label.lower() in [mw.lower() for mw in candidates]:
            count += 1

    return (count, matched_words, outputs, embedding, ground_truth, qualified_ids)

In [31]:
# given the image name, this driver function computes the following
# 1. loads the words and images and transforms them based on image name
# 2. generates embeddings for images using the cnn model
# 3. gets the original and variation embeddings
# 4. generate report with word variations (prints accuracy)
# 5. generate report original (prints accuracy)
# 6. returns the image_dir_info that needs to be saved as numpy files
def image_ext_with_word_var(map_name, cnn, gis_data, text_phoc_info, global_stats):
    images, words = load_and_transform(map_name)
    img_phoc_embs = gen_img_phoc_embs(cnn, images)
    embedding, embedding_var, word_var_strings, word_var_dir, root_word_var, conf_set = text_phoc_info
    original_report = report_matches(img_phoc_embs, embedding, 'cosine', gis_data, words, 1, word_filter_len)
    global_stats['correct_original'] += original_report[0]
    print 'Original Accuracy ', str(original_report[0]/float(len(original_report[4])))
    word_var_report = report_matches_with_variations(img_phoc_embs, embedding_var,'cosine', words, \
                                                     word_var_strings, word_var_dir, root_word_var, 1, word_filter_len)
    global_stats['correct_word_var'] += word_var_report[0]
    print 'Accuracy With Word Variations ', str(word_var_report[0]/float(len(word_var_report[4])))
    global_stats['total'] += len(word_var_report[4])
    img_dir_info = np.array([word_var_report[2], word_var_report[3], word_var_report[4]])
    return img_dir_info, word_var_report[6], conf_set
    

In [32]:
# Load and clean the GIS word list once, then build the PHOC embeddings for
# the words and for all of their variations. Both results are reused for
# every map in the main loop. In the recorded run below the two embedding
# passes took roughly 2 and 5 minutes.
gis_data = load_and_clean_gis_data()
text_phoc_info = gen_text_phoc_embs(gis_data)

GIS Data (477196,)


100%|██████████| 477196/477196 [01:47<00:00, 4445.50it/s]
100%|██████████| 1290899/1290899 [04:41<00:00, 4582.73it/s]


In [None]:
# Running totals over all maps; image_ext_with_word_var updates this dict in
# place ('total' counts the scored queries, the other two count the hits).
global_stats = {'correct_original':0, 'correct_word_var':0, 'total':0}

# Evaluate every map listed in A; per-map accuracies are printed inside
# image_ext_with_word_var.
for i in tqdm(range(len(A)), ascii=True, desc = 'Main Iteration'):
    print A[i]
    img_dir_info, words, conf_words = image_ext_with_word_var(A[i], cnn, gis_data, text_phoc_info, global_stats)
    # Saving of the per-map results is currently disabled:
    # np.save('../../../images_to_extend/image_dir_'+A[i]+'.npy', img_dir_info)
    # np.save('../../../images_to_extend/image_labels_'+A[i]+'.npy', words)
# Overall accuracies across all maps (raises ZeroDivisionError if no query was scored).
print 'Accuracy Original', global_stats['correct_original']/float(global_stats['total'])
print 'Accuracy With Word Variations', global_stats['correct_word_var']/float(global_stats['total'])

Converting Images to Embeddings:   1%|1         | 5/371 [00:00<00:07, 49.71it/s]

D0006-0285025
Images Shape  (371, 3, 135, 487)
Words Shape  (371,)


Converting Images to Embeddings: 100%|##########| 371/371 [00:06<00:00, 61.60it/s]


328
Original Accuracy  0.19512195122
Accuracy With Word Variations  0.216463414634
D0017-1592006


Converting Images to Embeddings:   1%|          | 6/716 [00:00<00:12, 55.01it/s]

Images Shape  (716, 3, 135, 487)
Words Shape  (716,)


Converting Images to Embeddings: 100%|##########| 716/716 [00:11<00:00, 63.01it/s]


705
Original Accuracy  0.222695035461
Accuracy With Word Variations  0.258156028369
D0041-5370006


  if c in self._diacrit_dict: # Replace simple diacritic
  return [c if (c not in self.__filtered) else u'' for c in chars]
Converting Images to Embeddings:   1%|1         | 6/514 [00:00<00:09, 54.63it/s]

Images Shape  (514, 3, 135, 487)
Words Shape  (514,)


Converting Images to Embeddings: 100%|##########| 514/514 [00:08<00:00, 63.28it/s]


492
Original Accuracy  0.115853658537


Converting Images to Embeddings:   2%|1         | 6/398 [00:00<00:07, 55.57it/s]

Accuracy With Word Variations  0.123983739837
D0041-5370026
Images Shape  (398, 3, 135, 487)
Words Shape  (398,)


Converting Images to Embeddings: 100%|##########| 398/398 [00:06<00:00, 63.98it/s]


365
Original Accuracy  0.13698630137


Converting Images to Embeddings:   0%|          | 0/1188 [00:00<?, ?it/s]

Accuracy With Word Variations  0.205479452055
D0042-1070001
Images Shape  (1188, 3, 135, 487)
Words Shape  (1188,)


Converting Images to Embeddings: 100%|##########| 1188/1188 [00:18<00:00, 63.80it/s]


875
Original Accuracy  0.225142857143


Converting Images to Embeddings:   0%|          | 0/692 [00:00<?, ?it/s]

Accuracy With Word Variations  0.269714285714
D0042-1070002
Images Shape  (692, 3, 135, 487)
Words Shape  (692,)


Converting Images to Embeddings: 100%|##########| 692/692 [00:10<00:00, 63.70it/s]


631
Original Accuracy  0.198098256735
Accuracy With Word Variations  0.234548335975
D0042-1070003


Converting Images to Embeddings:   1%|          | 6/1090 [00:00<00:20, 54.03it/s]

Images Shape  (1090, 3, 135, 487)
Words Shape  (1090,)


Converting Images to Embeddings: 100%|##########| 1090/1090 [00:17<00:00, 63.85it/s]


869
Original Accuracy  0.262370540852
Accuracy With Word Variations  0.26582278481
D0042-1070004


Converting Images to Embeddings:   0%|          | 5/1189 [00:00<00:25, 46.33it/s]

Images Shape  (1189, 3, 135, 487)
Words Shape  (1189,)


Converting Images to Embeddings: 100%|##########| 1189/1189 [00:18<00:00, 64.01it/s]


1001
Original Accuracy  0.20979020979


In [None]:
# # image plot using the variations
# import matplotlib.pyplot as plt

# count, matched_words, qualified_ids, img_dirs, words_len, new_outputs, new_word_strings, \
# = report_matches_with_variations(outputs, embedding_var,'cosine', word_strings,word_var_strings,1, 2)

# img_dir_info = np.array([qualified_ids, img_dirs, words_len, new_word_strings])

# print "the accuracy is: "+str(count/float(len(new_word_strings)))

# _len = min(100, len(matched_words))
# new_images = images[qualified_ids]
# for i in range(_len):
#     print "************************************************************************"
#     print "************************************************************************"
#     print "Original image:"
#     q = np.transpose(new_images[i],(1,2,0))
#     plt.imshow(q)
#     plt.show()
#     print "the matched words are (inorder): "+str(matched_words[i])
#     print "the gound truth is:" + str(new_word_strings[i])
#     print "------------------------------------------------------------------------"
#     print "------------------------------------------------------------------------"
# pass

In [None]:
# # image plots using original without variation method for comparisons
# import matplotlib.pyplot as plt

# count, matched_words, new_outputs, new_embedding, new_word_strings, \
#     qualified_ids = report_matches(outputs, embedding, 'cosine', word_strings, 1, 2)

# print "the accuracy is: "+str(count/float(len(new_word_strings)))

# _len = min(100, len(matched_words))
# new_images = images[qualified_ids]
# for i in range(_len):
#     print "************************************************************************"
#     print "************************************************************************"
#     print "Original image:"
#     q = np.transpose(new_images[i],(1,2,0))
#     plt.imshow(q)
#     plt.show()
#     print "the matched words are (inorder): "+str(matched_words[i])
#     print "the gound truth is:" + str(new_word_strings[i])
#     print "------------------------------------------------------------------------"
#     print "------------------------------------------------------------------------"
# pass