In [1]:
import h5py
import numpy as np

In [2]:
# Open the source HDF5 file read-only.
f1 = h5py.File('glove-hamming-128.h5','r')

In [3]:
# List the names of the top-level entries stored in the file.
list(f1.keys())

[u'test', u'train']

In [4]:
# Display the 'test' dataset handle to check its shape and dtype.
f1[u'test']

<HDF5 dataset "test": shape (2000,), type "<u8">

In [5]:
# Read both datasets completely into memory ([:] selects every element).
train = f1['train'][:]
test = f1['test'][:]

In [6]:
from scipy.spatial.distance import hamming

In [7]:
def gen_distance(base,query,dim):
    """Compute the Hamming distance from every query vector to every base vector.

    Parameters
    ----------
    base : array_like, shape (n_base, width)
        One vector per row.
    query : array_like, shape (n_query, width)
        One vector per row; must have the same width as ``base``.
    dim : int
        Scale applied to the fraction of mismatching components. When ``dim``
        equals the vector width (the usual case) each entry is simply the
        number of components in which the two vectors differ.

    Returns
    -------
    numpy.ndarray, shape (n_query, n_base), dtype float64
        ``results[i, j]`` is ``floor(dim * mismatches(query[i], base[j]) / width)``.

    Raises
    ------
    ValueError
        If the two inputs do not share the same non-zero vector width.
    """
    base = np.asarray(base)
    query = np.asarray(query)
    # A 1-D input is treated as a column of single-component vectors.
    if base.ndim == 1:
        base = base[:, np.newaxis]
    if query.ndim == 1:
        query = query[:, np.newaxis]

    width = base.shape[1]
    if width == 0 or query.shape[1] != width:
        raise ValueError(
            "base and query must share the same non-zero vector width, got "
            "{0} and {1}".format(base.shape[1], query.shape[1]))

    n_query = query.shape[0]
    n_base = base.shape[0]
    results = np.zeros((n_query, n_base))
    for i in range(n_query):
        # Compare one query against all base rows at once and count the
        # differing components per row.
        mismatches = (base != query[i]).sum(axis=1)
        # Scale with integer arithmetic: going through the float fraction
        # mismatches / width and truncating can lose one unit
        # (e.g. int(100 * (29 / 100.)) == 28).
        results[i] = mismatches * dim // width
    return results

In [8]:
def gen_data(data,dim):
    """Unpack 64-bit words into a 0/1 matrix with one row per vector.

    Each output row is built from ``dim // 64`` consecutive words of ``data``.
    Within a word the most significant bit comes first, so column
    ``j*64 + k`` of a row holds bit ``k`` (counted from the top) of the
    row's ``j``-th word.

    Parameters
    ----------
    data : array_like of non-negative integers, 1-D
        Packed words. Trailing words that do not fill a whole row are ignored.
    dim : int
        Number of columns in the output; must be at least 64. If it is not a
        multiple of 64 the columns past the last full word stay zero.

    Returns
    -------
    numpy.ndarray, shape (len(data) // (dim // 64), dim), dtype float64
        Matrix of 0.0 / 1.0 values.

    Raises
    ------
    ValueError
        If ``dim`` is smaller than 64, or ``data`` is not a non-negative
        integer array.
    """
    words_per_row = int(dim / 64)
    if words_per_row < 1:
        raise ValueError("dim must be at least 64, got {0}".format(dim))

    words = np.asarray(data)
    if words.dtype.kind not in 'uib':
        raise ValueError("data must be an integer array")
    if words.dtype.kind == 'i' and words.size and words.min() < 0:
        raise ValueError("data must contain non-negative integers")

    n_rows = words.shape[0] // words_per_row
    n_bits = words_per_row * 64

    # Big-endian byte order puts the most significant byte first, and
    # unpackbits emits the most significant bit of each byte first, so the
    # resulting bit sequence runs from bit 63 down to bit 0 of every word.
    packed = words[:n_rows * words_per_row].astype('>u8')
    bits = np.unpackbits(packed.view(np.uint8)).reshape(n_rows, n_bits)

    results = np.zeros((n_rows, dim))
    results[:, :n_bits] = bits
    return results

In [9]:
# dim is the number of bits per vector; gen_data consumes dim/64 packed
# 64-bit words for each output row.
dim = 128
base = gen_data(train,dim)
query = gen_data(test,dim)

In [10]:
# Sanity check: one row per query vector, dim columns.
query.shape

(1000, 128)

In [11]:
# Hamming distance from every query vector (rows) to every base vector (columns).
distance = gen_distance(base,query,dim)

In [12]:
# Open the output file for the unpacked data in write mode.
f = h5py.File("glove-hamming-128-uncompact.hdf5", "w")

In [13]:
# Store the unpacked base vectors; dtype='i8' writes them as 64-bit integers.
f.create_dataset('Base',dtype='i8',data= base)

<HDF5 dataset "Base": shape (1192505, 128), type "<i8">

In [14]:
# Store the unpacked query vectors; dtype='i8' writes them as 64-bit integers.
f.create_dataset('Query',dtype='i8',data= query)

<HDF5 dataset "Query": shape (1000, 128), type "<i8">

In [15]:
# Store the query-by-base distance matrix as 64-bit integers.
f.create_dataset('Distance',dtype='i8',data= distance)

<HDF5 dataset "Distance": shape (1000, 1192505), type "<i8">

In [17]:
# Close both open handles before reopening the output file: the writer must
# be closed so its datasets are flushed to disk and the path can be opened
# read-only, and rebinding f1 would otherwise leak the handle on the input file.
f.close()
f1.close()
f1 =  h5py.File("glove-hamming-128-uncompact.hdf5", "r")

In [18]:
# Confirm the stored distance matrix has shape (number of queries, number of base vectors).
f1['Distance'].shape

(1000, 1192505)