In [1]:
cfg={
    'main_folder':'/Users/matejkvassay/data/sketch-testing/',
    'dataset_file':'profi-neuralnet-100K.data',
    'dist_estimation_ref_obj_count': 100,
    'dist_estimation_p_vals':[1,2]
}

In [2]:
%matplotlib inline
import logging
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml
from scipy.spatial.distance import minkowski, hamming
from time import time

In [3]:
logger = logging.getLogger()
logger.setLevel('INFO')
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(ch)

In [4]:
def log_pfx(obj, msg, *args):
    logger.info(obj.__class__.__name__+'| '+msg, *args)
    
def save_to(obj, file_path):
    with open(file_path, 'wb') as f:
        cPickle.dump(obj,f)

def load_from(file_path):
    with open(file_path, 'rb') as f:
        return cPickle.load(f)

class IterationLogger(object):
    def __init__(self, message='Started iteration no.: %s\n Time from previous iteration: %ss\n Time from start: %ss',log_by=1000):
        self.log_by=log_by
        self.num_iter=0
        self.msg=message
        self.time_started=None
        self.time_this_iter=None
        
    def next_iter(self):
        if self.time_this_iter is None:
            self.time_this_iter=time()
        if self.time_started is None:
            self.time_started=time()
        self.num_iter+=1
        if self.num_iter%self.log_by==0:
            dur_from_previous=time()-self.time_this_iter
            self.time_this_iter=time()
            dur_from_start=time()-self.time_started
            log_pfx(self, self.msg, str(self.num_iter), str(dur_from_previous), str(dur_from_start))
            

In [5]:
class CaffeVectorsIterator(object):
    def __init__(self, file_path, limit=None):
        log_pfx(self,'Reading Caffe Vector file %s...',file_path)
        self.f = open(file_path, 'rb')
        self.limit=limit
        if self.limit is not None:
            self.returned=0

    def next(self):
        try:
            # (id: int, vector: np array of float32)
            if self.limit is not None:
                if self.returned==self.limit:
                    self.f.close()
                    raise StopIteration
                    
            lineA = self.f.next()
            lineB = self.f.next()
            if self.limit is not None:
                self.returned+=1
                
            return (int(lineA.split(' ')[2]),  np.fromstring(lineB, dtype='f', sep=' '))
        except StopIteration:
            self.f.close()
            raise StopIteration

    def __del__(self):
        if self.f:
            self.f.close()


class CaffeVectorsIterable(object):
    def __init__(self, file_path, limit=None):
        self.file_path = file_path
        self.limit=limit

    def __iter__(self):
        return CaffeVectorsIterator(self.file_path,limit=self.limit)

### LOAD DATA

In [6]:
dataset_path=cfg['main_folder']+cfg['dataset_file']

In [7]:
obj_ids, decaf_vecs=zip(*[obj for obj in CaffeVectorsIterable(dataset_path)])

2017-04-11 13:41:52,013 - root - INFO - CaffeVectorsIterator| Reading Caffe Vector file /Users/matejkvassay/data/sketch-testing/profi-neuralnet-100K.data...


In [8]:
#create numpy arrays and shuffle rows data 
obj_ids=np.array(obj_ids)
decaf_vecs=np.matrix(decaf_vecs)
indices=np.random.permutation(decaf_vecs.shape[0])
obj_ids=obj_ids[indices]
decaf_vecs=decaf_vecs[indices]


In [9]:
decaf_vecs.shape #(vectors count, vector length)

(100000, 4096)

### DATA ANALYSIS

#### Distance distribution estimation

In [28]:
def estimate_minkowski_dist_distr(vec_matrix, ref_obj_cnt, p):
    '''
    p can be either single value or list of p parameters for minkowski distance
    '''
    distances=[list()]*len(p)
    reference_objs=vec_matrix[np.random.choice(vec_matrix.shape[0], size=ref_obj_cnt, replace=False)] #sampling without replacement
    iter_log=IterationLogger(log_by=5000)
    for decaf_vec in decaf_vecs[:10]:
        iter_log.next_iter()
        for ref_obj in reference_objs:
            for i, val_p in enumerate(p):
                dist=minkowski(ref_obj, decaf_vec, p=val_p)
                distances[i].append(dist)
    return tuple(distances)

In [29]:
dist_l1, dist_l2 = estimate_minkowski_dist_distr(decaf_vecs,  cfg['dist_estimation_ref_obj_count'], cfg['dist_estimation_p_vals'])

In [30]:
np.array(dist_l1).shape

(2000,)

In [31]:
dist[0].append(1)

NameError: name 'dist' is not defined

In [22]:
len(dist[0])

NameError: name 'dist' is not defined

In [None]:
np.array(dist_l1).shape

In [None]:
print(len(dist_l2))

In [None]:
dist_l1_ser=pd.Series(dist_l1)
dist_l2_ser=pd.Series(dist_l2)

In [None]:
dist_l1_ser.describe()

In [None]:
dist_l2_ser.describe()

In [None]:
dist_l1_ser.hist(bins=200, figsize=(10,10))

In [None]:
dist_l2_ser.hist(bins=200)