In [1]:
import numpy as np
import csv
import pickle

from sklearn.datasets import make_biclusters
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

from matplotlib import pyplot as plt

In [2]:
def get_paths_from_file(filepath):
    """Read a CSV file and return every row after the first one.

    The first row is consumed and discarded; each remaining row is returned
    as the list of string fields produced by ``csv.reader``.
    """
    with open(filepath, 'r') as handle:
        rows = csv.reader(handle)
        next(rows)  # discard the first row (it is not an image entry)
        return [record for record in rows]

In [3]:
# Per-class attribute rows: each line becomes a list of string tokens.
# Files are opened with `with` so the handles are closed as soon as they are read.
with open(u'predicate-matrix-binary.txt', "r") as text_file:
    predicate = text_file.readlines()
predicate = [x.strip().strip("'").split(' ') for x in predicate]

# Class names are the second tab-separated field of each line.
with open(u'classes.txt', "r") as text_file:
    classes = text_file.readlines()
classes = [x.strip().split('\t')[1] for x in classes]

# Map class name -> list of attribute tokens (lines are paired by position).
predicate = dict(zip(classes, predicate))

# Attribute names are the second tab-separated field of each line.
with open(u'predicates.txt', "r") as text_file:
    predicates = text_file.readlines()
predicates = [x.strip().split('\t')[1] for x in predicates]

train_data_path = 'image_paths_ZSL_GBU_train.txt'
validation_data_path = 'image_paths_ZSL_GBU_dev.txt'
test_data_path = 'image_paths_ZSL_GBU_test.txt'

# Distinct class labels (second CSV field) seen in each split.
# NOTE(review): list(set(...)) gives no guaranteed order, so anything indexed
# by position in these lists may be ordered differently between kernel sessions.
train_classes = list(set([line[1].strip() for line in get_paths_from_file(train_data_path)]))
dev_classes = list(set([line[1].strip() for line in get_paths_from_file(validation_data_path)]))
test_classes = list(set([line[1].strip() for line in get_paths_from_file(test_data_path)]))

In [5]:
# All rows of the training split listing except its first row.
paths = get_paths_from_file(train_data_path)

In [6]:
# Number of training rows per class label (second field of each row).
# The label is stripped so the keys match `train_classes`, which is built from
# stripped labels; otherwise stray whitespace would cause a KeyError on lookup.
train_count_dict = {}
for row in paths:
    label = row[1].strip()
    train_count_dict[label] = train_count_dict.get(label, 0) + 1

In [7]:
train_count_dict

{'lion': 1019,
 'grizzly+bear': 852,
 'cow': 1338,
 'collie': 1028,
 'chihuahua': 567,
 'siamese+cat': 500,
 'buffalo': 895,
 'squirrel': 1200,
 'otter': 758,
 'polar+bear': 868,
 'pig': 713,
 'killer+whale': 291,
 'mouse': 185,
 'wolf': 589,
 'zebra': 1170,
 'rhinoceros': 696,
 'tiger': 877,
 'antelope': 1046,
 'fox': 664,
 'german+shepherd': 1033,
 'persian+cat': 747,
 'humpback+whale': 709,
 'hippopotamus': 684,
 'elephant': 1038,
 'weasel': 272,
 'skunk': 188,
 'spider+monkey': 291}

In [8]:
# One row per attribute, one column per training class. The shape is taken
# from the loaded metadata instead of the previous hard-coded (85, 27).
train_attribute_mat = np.zeros((len(predicates), len(train_classes)))

In [9]:
# Column `col` holds the attribute vector of one training class, scaled by the
# number of training rows counted for that class.
for col, cls in enumerate(train_classes):
    for row, flag in enumerate(predicate[cls]):
        train_attribute_mat[row, col] = train_count_dict[cls] * float(flag)

In [10]:
# One row per attribute, one column per test class. The shape is taken from
# the loaded metadata instead of the previous hard-coded (85, 10).
test_attribute_mat = np.zeros((len(predicates), len(test_classes)))

In [11]:
# Column `col` holds the (unweighted) attribute vector of one test class.
for col, cls in enumerate(test_classes):
    for row, flag in enumerate(predicate[cls]):
        test_attribute_mat[row, col] = float(flag)

In [15]:
train_attribute_mat[0]

array([   0.,  589.,    0., 1033.,    0.,  291.,  291.,  500.,  709.,
          0.,  877.,  852.,    0., 1338.,  758.,  713.,  895.,    0.,
        567.,    0.,    0.,    0.,  188.,    0., 1170.,  272.,    0.])

In [13]:
# Correlation between the rows of the training attribute matrix.
corr_train = np.corrcoef(train_attribute_mat)
# Undefined (NaN) correlations are treated as "no correlation".
nans = np.isnan(corr_train)
np.putmask(corr_train, nans, 0)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [16]:
corr_train

array([[ 1.        ,  0.34730603, -0.04280213, ..., -0.1188454 ,
        -0.15092876,  0.32049104],
       [ 0.34730603,  1.        ,  0.04564141, ...,  0.03101577,
        -0.18680959,  0.50216271],
       [-0.04280213,  0.04564141,  1.        , ..., -0.03032281,
        -0.14612684,  0.11244662],
       ...,
       [-0.1188454 ,  0.03101577, -0.03032281, ...,  1.        ,
         0.27875138,  0.14924999],
       [-0.15092876, -0.18680959, -0.14612684, ...,  0.27875138,
         1.        , -0.28749983],
       [ 0.32049104,  0.50216271,  0.11244662, ...,  0.14924999,
        -0.28749983,  1.        ]])

In [None]:
# Correlation between the rows of the test attribute matrix.
corr_test = np.corrcoef(test_attribute_mat)
# Undefined (NaN) correlations are treated as "no correlation".
nans = np.isnan(corr_test)
np.putmask(corr_test, nans, 0)

In [None]:
def diff_corr(corr_train, corr_test):
    """Return how far each training correlation exceeds the test correlation.

    The element-wise difference ``corr_train - corr_test`` is multiplied by
    the sign of ``corr_train``, and negative results are clipped to 0. An
    entry is therefore positive only where the training correlation is
    stronger than the test correlation in the training correlation's own
    direction.
    """
    signed_gap = np.sign(corr_train) * (corr_train - corr_test)
    return np.clip(signed_gap, 0, np.inf)

In [None]:
# Entries where the training correlation is stronger than the test correlation.
dis_corr = diff_corr(corr_train, corr_test)
# Small jitter in [0, 0.01), drawn from a locally seeded generator so the
# matrix passed to later cells is identical on every run.
# NOTE(review): presumably added to avoid all-zero rows/columns — confirm.
dis_corr += 0.01*np.random.RandomState(0).rand(len(corr_train), len(corr_train))

In [None]:
# Shift every entry by one scalar drawn from a locally seeded generator, so the
# offset is the same on every run.
# NOTE(review): this cell updates dis_corr in place, so re-running it adds the
# offset again; recompute dis_corr before re-running.
dis_corr += np.random.RandomState(0).rand()

In [None]:
# Re-run this cell to step through attribute pairs from largest dis_corr down:
# the pair reported by the previous run is zeroed before the next maximum is
# located. On the first run `loc` does not exist yet, so nothing is zeroed
# (the unguarded assignment raised NameError on a fresh kernel).
# NOTE: this modifies dis_corr in place.
if 'loc' in globals():
    dis_corr[loc[0], loc[1]] = 0.0
print(dis_corr.max())
loc = np.unravel_index(dis_corr.argmax(), dis_corr.shape)
# Training and test correlations of the selected pair, then the two attribute names.
print(corr_train[loc[0], loc[1]])
print(corr_test[loc[0], loc[1]])
print(predicates[loc[0]])
print(predicates[loc[1]])

In [None]:
test_classes

In [None]:
def find_class(attribute):
    """Split train and test classes by whether they have ``attribute``.

    Looks up the position of ``attribute`` in the global ``predicates`` list
    and checks that position in each class's entry of the global ``predicate``
    mapping. A class counts as positive when the token equals ``'1'``.

    Returns a ``(train, test)`` pair of dicts, each with ``'pos'`` and
    ``'neg'`` lists of class names. Classes found in neither
    ``train_classes`` nor ``test_classes`` are ignored.
    """
    idx = predicates.index(attribute)
    train = {'pos': [], 'neg': []}
    test = {'pos': [], 'neg': []}
    for name, flags in predicate.items():
        if name in train_classes:
            bucket = train
        elif name in test_classes:
            bucket = test
        else:
            continue
        bucket['pos' if flags[idx] == '1' else 'neg'].append(name)
    return train, test

In [None]:
# Split classes by whether they have the 'mountains' attribute, then display
# the test-side split.
train, test = find_class('mountains')
test

In [None]:
# Co-cluster dis_corr into 6 clusters; random_state is fixed on the estimator.
model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(dis_corr)

In [None]:
# Group attribute names by the row cluster label the model assigned to them;
# keys have the form 'g_<label>'.
group_dict = {}
for attr_idx, cluster_id in enumerate(model.row_labels_):
    group_dict.setdefault('g_' + str(cluster_id), []).append(predicates[attr_idx])

In [None]:
# Reorder rows and columns so that members of the same cluster are adjacent.
row_order = np.argsort(model.row_labels_)
col_order = np.argsort(model.column_labels_)
fit_data = dis_corr[row_order][:, col_order]

In [None]:
# Cluster-reordered matrix, shown in grayscale with a colorbar.
plt.matshow(fit_data, cmap='gray')
plt.colorbar()
plt.show()

In [None]:
# Matrix in its original row/column order, shown in grayscale with a colorbar.
plt.matshow(dis_corr, cmap='gray')
plt.colorbar()
plt.show()

In [None]:
# Persist the attribute groups; the `with` block makes sure the file is
# flushed and closed once the dump finishes.
with open('AwA_dis-linear_groups-6.pkl', 'wb') as out_file:
    pickle.dump(group_dict, out_file)

### For CUB: co-clustering

In [None]:
def get_paths_from_file(filepath):
    """Read a CSV file and return every row after the first one.

    The first row is consumed and discarded; each remaining row is returned
    as the list of string fields produced by ``csv.reader``.
    """
    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        # Built-in next() works on Python 2 and 3; reader.next() exists only
        # on Python 2 and raises AttributeError on Python 3.
        next(reader)  # discard the first row (it is not an image entry)
        image_path_with_label = []
        for row in reader:
            image_path_with_label.append(row)
        return image_path_with_label

In [None]:
cd ../../CUB/code

In [None]:
# Class names are the second space-separated field of each line.
# The file is opened with `with` so the handle is closed after reading.
with open(u'../classes.txt', "r") as text_file:
    classes = text_file.readlines()
classes = [x.strip().split(' ')[1] for x in classes]

In [None]:
# Image listings for the three CUB splits.
train_data_path =  u'../train_images_gbu.txt'
test_data_path =  u'../test_images_gbu.txt'
validation_data_path = u'../valid_images_gbu.txt'

# Class splits by position in `classes`: first 100 train, next 50 dev, last 50 test.
# NOTE(review): assumes classes.txt lists the classes in split order — confirm.
train_classes = classes[:100]
dev_classes = classes[100:150]
test_classes = classes[-50:]

In [None]:
# Class-level attribute matrix, sliced by row with the same 100 / 50 / 50
# positions used for the class splits.
# NOTE(review): assumes one row per class in classes.txt order — confirm.
prior_matrix = np.loadtxt("../attributes/class_attribute_labels_continuous.txt")
prior_matrix_tr = prior_matrix[:100, :]
prior_matrix_ts = prior_matrix[-50:, :]
prior_matrix_v = prior_matrix[100:150, :]

In [None]:
# All rows of the CUB training split listing except its first row.
paths = get_paths_from_file(train_data_path)

In [None]:
# Number of training rows per class label (second field of each row).
train_count_dict = {}
for row in paths:
    label = row[1]
    train_count_dict[label] = train_count_dict.get(label, 0) + 1

In [None]:
# Raw attribute definition lines (trailing newlines kept); the handle is closed
# as soon as the lines are read.
with open(u'../attributes/attributes.txt', "r") as text_file:
    predicate = text_file.readlines()

In [None]:
# Class-level attribute matrix, sliced by row into train / test / validation.
prior_matrix = np.loadtxt("../attributes/class_attribute_labels_continuous.txt")
prior_matrix_tr = prior_matrix[:100, :]
prior_matrix_ts = prior_matrix[-50:, :]
prior_matrix_v = prior_matrix[100:150, :]

# Number of listed rows in each split.
nb_train_samples = len(get_paths_from_file(train_data_path))
nb_validation_samples = len(get_paths_from_file(validation_data_path))
nb_test_samples = len(get_paths_from_file(test_data_path))

# list of attributes (raw lines; the handle is closed once they are read)
with open(u'../attributes/attributes.txt', "r") as text_file:
    predicate = text_file.readlines()

# group_to_attribute dictionary
from collections import defaultdict
predicate_groups = defaultdict(list)

for x in predicate:
    # Second space-separated field, the part before '::', minus its first four
    # characters (presumably a 'has_' prefix — confirm against attributes.txt).
    group = x.strip().split(' ')[1].split('::')[0][4:]
    predicate_groups[group].append(x)

predicates = list(predicate)

# Report the size of each attribute group and accumulate the total.
num_attributes = 0
for g in predicate_groups:
    num_attributes += len(predicate_groups[g])
    print(g, len(predicate_groups[g]))

# From here on `predicate` is the object stored in the .npy file, not the list
# of attribute lines. allow_pickle=True is needed to load a pickled object with
# current NumPy.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
predicate = np.load(u'../attributes/image_attribute_labels.npy', allow_pickle=True).item()

In [None]:
# Load the object stored in the .npy file. allow_pickle=True is needed to load
# a pickled object with current NumPy.
# NOTE: unpickling can execute arbitrary code — only load trusted files.
predicate = np.load(u'../attributes/image_attribute_labels.npy', allow_pickle=True).item()

In [None]:
predicate

In [17]:
# Load the pickled object stored in the .npy file.
# NOTE(review): absolute, user-specific path — this only works on the author's
# machine; consider a configurable data directory.
# NOTE: allow_pickle=True can execute arbitrary code — only load trusted files.
a=np.load('/Users/soumava/labels_dict.npy', allow_pickle=True).item()

In [20]:
# Collect the mapping's values into one array
# (presumably one label vector per entry — confirm).
b=np.array(list(a.values()))

In [32]:
# Distinct rows of b (`c`), the index into `c` for every row of b (`indices`),
# and how often each distinct row occurs (`counts`).
c, indices, counts=np.unique(b, axis=0, return_counts=True, return_inverse=True)

In [37]:
c[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1])

In [34]:
counts[619]

2774

In [42]:
np.max(indices)

13475

In [43]:
c[620]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0])

In [44]:
b[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0])