In [1]:
import os
import sys
import numpy as np
from operator import itemgetter
import operator
import scipy.sparse as sp
import random as rnd
import seaborn as sns
from numpy import genfromtxt
import pandas as pd
import matplotlib.pyplot as plt
import struct
from sklearn.svm import LinearSVC
import heapq
from sklearn.preprocessing import normalize
from IPython.core.debugger import set_trace
from sklearn.decomposition import PCA
from multiprocessing import Pool
from scipy import sparse
from tqdm import tqdm
import torch
import nmslib
import time
from scipy.sparse import csr_matrix
np.random.seed(22)

import xclib.evaluation.xc_metrics as xc_metrics
import xclib.data.data_utils as data_utils

In [2]:
import xclib
from tabulate import tabulate
plt.style.use('dark_background')
from io import StringIO

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Remember the stdout object that was active the first time this cell ran.
# The membership guard keeps a re-run of the cell from overwriting the saved
# handle with whatever sys.stdout happens to be at that later point.
if '__sysstdout__' not in locals():
    __sysstdout__ = sys.stdout

In [4]:
def get_text(x, text, X_Xf, sep=' ', K=-1, attr='bold underline', f = lambda ind, val : '%.2f, %d'%(val, ind)):
    """Render the highest-valued entries of row ``x`` of ``X_Xf`` as styled text.

    Args:
        x: Row index into ``X_Xf``.
        text: Indexable collection mapping a column index to its display text.
        X_Xf: Matrix whose rows expose ``nnz``, ``indices`` and ``data``
            (presumably a scipy CSR matrix -- confirm against callers).
        sep: Separator placed between rendered entries.
        K: Number of entries to show; ``-1`` shows every stored entry of the row.
        attr: Space-separated style names forwarded to ``_c``.
        f: Callable ``(column_index, value) -> str`` used for the parenthesised
            annotation that follows each piece of text.

    Returns:
        A string starting with ``'x = <x> : \\n'`` followed by the entries in
        order of decreasing stored value.
    """
    row = X_Xf[x]
    limit = row.nnz if K == -1 else K
    order = np.argsort(-row.data)
    top_inds = row.indices[order][:limit]

    pieces = []
    for i in top_inds:
        label = _c(text[i], attr=attr)
        pieces.append('%s(%s)' % (label, f(i, X_Xf[x, i])))
    return 'x = %d : \n' % x + sep.join(pieces)

class Visualize:
    """Side-by-side textual comparison of several row-by-column score matrices.

    The first matrix passed to the constructor is treated as the base
    (reference) matrix; entries of the other matrices that are also non-zero
    in the base matrix are highlighted by ``getX``.
    """
    mats = {}
    colors = ['green', 'yellow', 'purple', 'blue', 'red']

    def __init__(self, mats, row_text = None, col_text = None):
        """
        Args:
            mats: Mapping ``name -> matrix``; iteration order decides which
                matrix is the base (the first one) and which colour each gets.
            row_text: Indexable collection giving display text per row.
            col_text: Indexable collection giving display text per column.
        """
        # first mat is base mat
        self.base_mat = list(mats.values())[0]
        # Cycle through the palette so that passing more matrices than there
        # are colours no longer raises IndexError.
        palette = self.colors
        self.mats = {
            k : {'mat' : v, 'color' : palette[i % len(palette)], 'base' : i==0}
            for i, (k, v) in enumerate(mats.items())
        }
        self.row_text = row_text
        self.col_text = col_text

    def get_row_text(self, x):
        """Return the display text for row ``x``."""
        return self.row_text[x]

    def get_col_text(self, x):
        """Return the display text for column ``x``."""
        return self.col_text[x]

    def getX(self, x, K=10):
        """Print row ``x``: its raw text, its feature text and the top ``K``
        entries of every registered matrix.

        NOTE(review): the feature-text line reads the names ``Xf`` and
        ``sp_tst_X_Xf`` from the enclosing namespace; they are not defined in
        the visible part of the notebook, so they must exist before this
        method is called.
        """
        print('Raw text \t: %d : %s'%(x, _c(self.get_row_text(x), attr='bold underline')))
        print('Feature text \t: %s'%get_text(x, Xf, sp_tst_X_Xf))
        for name, obj in self.mats.items():
            print(_c('\n' + name + ' : ', attr='bold %s'%(obj['color'])))
            # Column indices of the K largest stored values in this row.
            sorted_inds = obj['mat'][x].indices[np.argsort(-obj['mat'][x].data)[:K]]
            for i, ind in enumerate(sorted_inds):
                attr = 'ENDC'
                # Highlight predictions that are also present in the base matrix.
                if self.base_mat[x, ind] and not obj['base']: attr = 'reverse'
                print(_c('%d : %s[%d] : %.4f'%(i+1, self.get_col_text(ind), ind, obj['mat'][x, ind]), attr=attr))

In [5]:
class CaptureIO(list):
    """List that, used as a context manager, collects everything printed.

    While the ``with`` body runs, ``sys.stdout`` is swapped for an in-memory
    buffer; on exit the captured text is appended to the list as one string
    and the previous ``sys.stdout`` is put back.
    """

    def __enter__(self):
        self._stdout = sys.stdout
        buffer = StringIO()
        sys.stdout = buffer
        self._stringio = buffer
        return self

    def __exit__(self, *args):
        captured = self._stringio.getvalue()
        self.append(captured)
        # Drop the buffer so the captured text is only held once.
        del self._stringio
        sys.stdout = self._stdout
        
def dump_output_log(filename = None, clear=False, overwrite=False):
    """Write the entries of the global ``output`` list to a log file.

    Entries are joined with ``';'`` and a trailing ``';'`` is written after
    the last one.

    Args:
        filename: Destination path; defaults to ``visual_analysis.log`` inside
            the global ``results_dir``.
        clear: When true, ``output`` is replaced afterwards by a fresh
            ``CaptureIO`` that contains only the text printed by ``print('init')``.
        overwrite: When true the file is opened with mode ``'w'``; otherwise
            with ``'a+'``.
    """
    global output
    if filename is None:
        filename = '%s/visual_analysis.log'%results_dir

    mode = 'w' if overwrite else 'a+'
    with open(filename, mode) as f:
        f.write(';'.join([*output, '']))

    if not clear:
        return
    with CaptureIO() as output:
        print('init')
            
def load_output_log(filename = None):
    """Read a log file and return its contents split on ``';'``.

    Args:
        filename: Path to read; defaults to ``visual_analysis.log`` inside the
            global ``results_dir``.

    Returns:
        List of strings produced by splitting the whole file on ``';'``.
    """
    if filename is None:
        filename = '%s/visual_analysis.log'%results_dir
    with open(filename, 'r') as f:
        contents = f.read()
    return contents.split(';')

In [6]:
import decimal
def myprint(*args, sep = ' ', end = '\n'):
    """Print like the built-in ``print`` but show ``float`` arguments with two decimals.

    Args:
        *args: Values to print; every ``float`` instance is formatted with
            ``"%.2f"``, everything else is passed through unchanged.
        sep: Separator between values (forwarded to ``print``).
        end: Line terminator (forwarded to ``print``).
    """
    # ``__builtins__`` is only a module inside ``__main__``; in any imported
    # module it is a plain dict, so ``__builtins__.print`` raises
    # AttributeError there. The ``builtins`` module works everywhere.
    import builtins
    formatted = ("%.2f" % a if isinstance(a, float) else a for a in args)
    builtins.print(*formatted, sep = sep, end = end)

def drange(x, y, jump):
    """Yield floats from ``x`` (inclusive) up to ``y`` (exclusive) in steps of ``jump``.

    The running value is accumulated as a ``decimal.Decimal`` so that errors
    do not build up across steps.

    Args:
        x: Start value.
        y: Exclusive upper bound.
        jump: Step size; must be positive.

    Yields:
        Each value of the progression converted to ``float``.

    Raises:
        ValueError: If ``jump`` is not positive (the loop would never end).
    """
    # Go through ``str`` so that e.g. 0.3 becomes Decimal('0.3') rather than
    # the exact binary expansion 0.29999999999999998889...; with the latter
    # drange(0, 0.9, 0.3) wrongly produced a fourth element.
    current = decimal.Decimal(str(x))
    stop = decimal.Decimal(str(y))
    step = decimal.Decimal(str(jump))
    if step <= 0:
        raise ValueError('jump must be positive, got %r' % (jump,))
    while current < stop:
        yield float(current)
        current += step
        
def recall(spmat, X_Y = None, K = [1, 10, 50, 100]):
    """Compute mean recall@k (in percent) of a sparse score matrix.

    Args:
        spmat: Sparse score matrix handed to ``xclib.utils.sparse.rank``
            (presumably that returns per-row ranks with 1 = best -- confirm
            against the xclib version in use).
        X_Y: Sparse ground-truth matrix; defaults to the global ``tst_X_Y``.
            It is never modified: binarisation happens on a private copy.
        K: Iterable of cut-offs.

    Returns:
        Dict mapping ``'R@<k>'`` to the recall averaged over rows, times 100.
        Rows without any ground-truth entry use a denominator of 1.
    """
    if X_Y is None : X_Y = tst_X_Y
    # Always binarise a copy; previously a caller-supplied matrix had its
    # stored values overwritten with 1 in place.
    truth = X_Y.copy()
    truth.data[:] = 1

    rank_mat = xclib.utils.sparse.rank(spmat)
    # The denominator does not depend on k, so compute it once.
    den = np.maximum(np.array(truth.sum(axis=1)).ravel(), 1)

    ans = {}
    for k in K:
        # Keep only entries ranked within the top k and turn them into 1s.
        topk = rank_mat.copy()
        topk.data[topk.data > k] = 0.0
        topk.eliminate_zeros()
        topk.data[:] = 1.0

        hits = topk.multiply(truth)
        num = np.array(hits.sum(axis=1)).ravel()
        ans['R@%d'%k] = (num/den).mean()*100

    return ans

In [None]:
def count_nzrows(spmat):
    """Return the indices of the rows of ``spmat`` that store at least one entry.

    Args:
        spmat: Sparse matrix exposing a CSR-style ``indptr`` array.

    Returns:
        1-D integer array of row indices whose stored-entry count is positive.
    """
    # Consecutive differences of indptr are the per-row stored-entry counts.
    row_counts = np.diff(spmat.indptr)
    return np.flatnonzero(row_counts > 0)

def _filter(score_mat, filter_mat, copy=True):
    if filter_mat is None:
        return score_mat
    if copy:
        score_mat = score_mat.copy()
    
    temp = filter_mat.tocoo()
    score_mat[temp.row, temp.col] = 0
    del temp
    score_mat = score_mat.tocsr()
    score_mat.eliminate_zeros()
    return score_mat

def get_sorted_spmat(spmat):
    """Return a CSR matrix whose entries are reordered within each row.

    The stored entries are sorted by row, then by decreasing value, then by
    column index. ``spmat`` must expose ``indptr`` and ``shape`` in addition
    to ``tocoo()``.

    NOTE(review): the three COO arrays are stacked into one numpy array, so
    column and row indices go through whatever common dtype numpy picks
    before being cast back to int32 -- confirm this is lossless for the
    matrix sizes in use.
    """
    coo = spmat.tocoo()
    # Stack (col, -value, row); np.lexsort treats the LAST row as the primary
    # key, so the order is: row, then -value (descending value), then col.
    temp = np.array([coo.col, -coo.data, coo.row])
    temp = temp[:, np.lexsort(temp)]
    del coo

    # Rebuild indptr from the number of entries found for each row index.
    inds, cnts = np.unique(temp[2].astype(np.int32), return_counts=True)
    indptr = np.zeros_like(spmat.indptr)
    indptr[inds+1] = cnts
    indptr = np.cumsum(indptr)

    # Negate the value row again to undo the sign flip used for sorting.
    new_spmat = csr_matrix((-temp[1], temp[0].astype(np.int32), indptr), (spmat.shape))
    del inds, cnts, indptr, temp
    return new_spmat

### Binary readers

In [1]:
# Byte widths of the element types used by the binary readers. Kept for any
# code that still looks sizes up here; readbuf itself now asks numpy.
size_dict = {bool : 1, np.int32 : 4, np.float32 : 4, np.int64 : 8}

def readbuf(buf, dtype, offset=0, count=1):
    """Read ``count`` items of ``dtype`` from ``buf`` starting at byte ``offset``.

    Args:
        buf: Bytes-like object.
        dtype: Anything ``np.dtype`` accepts (no longer limited to the keys
            of ``size_dict``).
        offset: Byte position to start reading at.
        count: Number of items to read.

    Returns:
        ``(value, new_offset)``. ``value`` is a scalar when ``count == 1`` and
        an array otherwise; ``new_offset`` is the byte position just past the
        items that were read.
    """
    val = np.frombuffer(buf, offset=offset, dtype=dtype, count=count)
    if count == 1: val = val[0]
    # Use numpy's own item size so every dtype works, not only the four
    # listed in size_dict (which raised KeyError for e.g. np.float64).
    offset += np.dtype(dtype).itemsize*count
    return val, offset

def read_buf_bin_vecif(buf, dtype, offset = 0, totlen = -1):
    """Read a vector of interleaved ``(index, value)`` pairs as a 1-row CSR matrix.

    Args:
        buf: Bytes-like object.
        dtype: Element type used to decode the value half of every pair.
        offset: Byte position to start reading at.
        totlen: Number of pairs; when negative the count is first read from
            the buffer as an ``np.int64``.

    Returns:
        ``(row, new_offset)`` where ``row`` is a ``1 x (max_index + 1)`` CSR
        matrix (``1 x 1`` when there are no pairs).
    """
    if totlen < 0:
        totlen, offset = readbuf(buf, np.int64, offset, 1)
    n_slots = totlen * 2

    # The same byte range is decoded twice: once as int32 to pick the index
    # slot of each pair, once as ``dtype`` to pick the value slot.
    as_int, _ = readbuf(buf, np.int32, offset, n_slots)
    inds = as_int.reshape(-1, 2)[:, 0]
    as_val, offset = readbuf(buf, dtype, offset, n_slots)
    data = as_val.reshape(-1, 2)[:, 1]

    width = inds.max()+1 if inds.shape[0] > 0 else 1
    row = csr_matrix((data, inds, [0, len(data)]), (1, width))
    return row, offset

def read_buf_bin_vec(buf, dtype, offset = 0, totlen = -1):
    """Read a vector of ``dtype`` items from ``buf``.

    When ``totlen`` is negative the item count is first read from the buffer
    as an ``np.int64``; otherwise ``totlen`` items are read directly.

    Returns:
        ``(values, new_offset)`` exactly as produced by ``readbuf``.
    """
    length, start = totlen, offset
    if length < 0:
        length, start = readbuf(buf, np.int64, start, 1)
    return readbuf(buf, dtype, start, length)

def read_buf_bin_spmat(buf, dtype, offset = 0, old = False):
    """Decode a sparse matrix from ``buf`` into CSR form.

    The layout read here is: two int32 values (rows, columns), a vector of
    int32 per-row sizes, then all ``(index, value)`` pairs interleaved.

    Args:
        buf: Bytes-like object.
        dtype: Element type of the values.
        offset: Byte position to start reading at.
        old: When true the per-row size vector has no length prefix and
            exactly ``rows`` sizes are read; otherwise the length prefix is
            read from the buffer.

    Returns:
        ``(matrix, new_offset)``.
    """
    shape, offset = readbuf(buf, np.int32, offset, 2)
    nr, nc = shape

    if old:
        row_sizes, offset = read_buf_bin_vec(buf, np.int32, offset, nr)
    else:
        row_sizes, offset = read_buf_bin_vec(buf, np.int32, offset)

    # Cumulative row sizes give the CSR row pointer.
    indptr = np.zeros(nr+1, int)
    indptr[1:] = row_sizes.cumsum()

    flat, offset = read_buf_bin_vecif(buf, dtype, offset, indptr[-1])
    matrix = csr_matrix((flat.data, flat.indices, indptr), (nr, nc))
    return matrix, offset

def read_bin_spmat(fname, old = False):
    """Load a float32 sparse matrix from the binary file ``fname``.

    Args:
        fname: Path of the binary file.
        old: Forwarded to ``read_buf_bin_spmat`` to select the layout without
            a length prefix on the row-size vector.

    Returns:
        The decoded CSR matrix.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(fname, 'rb') as f:
        buf = f.read()
    print('loaded bin file in buffer')
    spmat, _ = read_buf_bin_spmat(buf, np.float32, 0, old)
    return spmat

def write_pybin_spmat(fname, spmat):
    """Write ``spmat`` to ``fname`` in a flat binary layout.

    Layout, in order: rows (int32), columns (int32), rows again (int32, the
    length of the size vector), per-row stored-entry counts (int32 each),
    all column indices (int32 each), all values (float32 each).

    Args:
        fname: Destination path.
        spmat: Sparse matrix; converted to CSR first so that ``indices`` and
            ``data`` are laid out row by row.
    """
    spmat = spmat.tocsr()
    row_sizes = spmat.getnnz(axis=1)
    # The original opened the undefined name ``filename`` instead of the
    # ``fname`` parameter.
    with open(fname, 'wb') as f:
        f.write(struct.pack('i', spmat.shape[0]))
        f.write(struct.pack('i', spmat.shape[1]))
        f.write(struct.pack('i', spmat.shape[0]))
        f.write(row_sizes.astype(np.int32).tobytes())
        f.write(spmat.indices.astype(np.int32).tobytes())
        f.write(spmat.data.astype(np.float32).tobytes())
        
def read_pybin_spmat(fname):
    """Read a sparse matrix stored in the layout emitted by ``write_pybin_spmat``.

    Layout, in order: rows (int32), columns (int32), length of the size
    vector (int32), per-row stored-entry counts (int32 each), all column
    indices (int32 each), all values (float32 each).

    Args:
        fname: Path of the binary file.

    Returns:
        The decoded CSR matrix with float32 values.

    Raises:
        ValueError: If the stored size-vector length differs from the row count.
    """
    with open(fname, 'rb') as f:
        buf = f.read()

    int_size = np.dtype(np.int32).itemsize
    # The original never initialised ``offset`` or ``nnz`` and read the size
    # vector through a helper that expects an int64 length prefix, whereas
    # the writer stores an int32 one.
    nr, nc, num_sizes = (int(v) for v in np.frombuffer(buf, dtype=np.int32, count=3))
    if num_sizes != nr:
        raise ValueError('size vector has %d entries but matrix has %d rows' % (num_sizes, nr))
    offset = 3*int_size

    size = np.frombuffer(buf, dtype=np.int32, count=num_sizes, offset=offset)
    offset += num_sizes*int_size
    nnz = int(size.sum())

    # Copy so the resulting matrix owns writable arrays instead of read-only
    # views into ``buf``.
    inds = np.frombuffer(buf, dtype=np.int32, count=nnz, offset=offset).copy()
    offset += nnz*int_size
    data = np.frombuffer(buf, dtype=np.float32, count=nnz, offset=offset).copy()

    indptr = np.hstack(([0], size)).cumsum()
    spmat = csr_matrix((data, inds, indptr), (nr, nc))
    return spmat

In [None]:
def load_dmat(filename):
    """Load a dense float32 matrix from a binary (``.bin``) or text file.

    Binary layout: two native ints (rows, columns) followed by float32
    values. Text layout: a first line ``"<rows> <cols>"`` followed by
    whitespace-separated numbers.

    Args:
        filename: Path to read; a name ending in ``.bin`` selects the binary
            reader, anything else the text reader.

    Returns:
        2-D float32 array. For binary files the row count is derived from the
        amount of data actually present; for text files the header is used.
    """
    binary = filename.endswith(".bin")
    if binary:
        # Context managers close the handle, which the original never did.
        with open(filename, 'rb') as fp:
            num = struct.unpack('i', fp.read(4))[0]
            dim = struct.unpack('i', fp.read(4))[0]
            if dim == 0:
                # Zero-width matrix: nothing to read, and dividing by dim
                # below would raise ZeroDivisionError.
                return np.empty((num, 0), dtype=np.float32)
            data = np.fromfile(fp, dtype=np.float32, count=-1, sep='')
        data = np.reshape(data, (int(len(data)/dim), dim))
    else:
        with open(filename, 'r') as fp:
            num, dim = map(int, fp.readline().split(' '))
            data = np.fromfile(fp, dtype=np.float32, sep=' ')
        data = np.reshape(data, (num, dim))
    return data

def load_w_nav(filename):
    """Load a two-level nested list of float32 vectors from a binary file.

    Layout: an int32 outer count; for each outer item an int32 inner count;
    for each inner item an int32 length followed by that many float32 values.

    Args:
        filename: Path of the binary file.

    Returns:
        List of lists of 1-D float32 arrays.
    """
    # The context manager closes the handle, which the original never did.
    with open(filename, 'rb') as fp:
        def read_count():
            # Every count in the file is a single int32.
            return np.fromfile(fp, dtype=np.int32, count=1, sep='')[0]

        num = read_count()
        w_nav = [None]*num
        for i in range(num):
            num2 = read_count()
            w_nav[i] = [None]*num2
            for j in range(num2):
                num3 = read_count()
                w_nav[i][j] = np.fromfile(fp, dtype=np.float32, count=num3, sep='')

    return w_nav

def dump_dmat(filename, data):
    """Write a dense 2-D array to a binary (``.bin``) or text file.

    Binary layout: rows and columns as native ints, then the values as
    float32. Text layout: a ``"<rows> <cols>"`` header line, then one line
    per row with values formatted as ``%.6f`` separated by spaces.

    Args:
        filename: Destination path; a name ending in ``.bin`` selects the
            binary writer.
        data: 2-D numpy array.
    """
    n_rows, n_cols = data.shape[0], data.shape[1]

    if filename.endswith(".bin"):
        with open(filename, 'wb') as f:
            f.write(struct.pack('ii', n_rows, n_cols))
            data.astype(np.float32).tofile(f)
        return

    with open(filename, 'w') as f:
        f.write('%d %d\n'%(n_rows, n_cols))
        for row in data:
            row.tofile(f, sep=' ', format='%.6f')
            f.write('\n')
                
def write_sparse_mat(X, filename, header=True):
    """Write a sparse matrix as text, one row per line of ``col:value`` pairs.

    Args:
        X: Sparse matrix; converted to CSR when it is not one already. Its
            indices are sorted in place before writing.
        filename: Destination path.
        header: When true the first line is ``"<rows> <cols>"``.
    """
    if not isinstance(X, csr_matrix):
        X = X.tocsr()
    X.sort_indices()

    with open(filename, 'w') as f:
        if header:
            f.write("%d %d\n" % (X.shape[0], X.shape[1]))
        for row in X:
            pairs = zip(row.indices, row.data)
            line = ' '.join('%d:%.5f' % (col, val) for col, val in pairs)
            f.write(line + '\n')
            
def read_sparse_mat(filename, use_xclib=True):
    """Read a text sparse matrix (header line, then ``col:value`` rows).

    Args:
        filename: Path of the text file.
        use_xclib: When true the work is delegated to
            ``xclib.data.data_utils.read_sparse_file``; otherwise the file is
            parsed here line by line.

    Returns:
        The matrix (a CSR matrix for the built-in parser).
    """
    if use_xclib:
        return xclib.data.data_utils.read_sparse_file(filename)

    data, indices, indptr = [], [], [0]
    with open(filename) as f:
        nr, nc = map(int, f.readline().split(' '))
        for line in tqdm(f):
            if len(line) <= 1:
                # A line holding at most the newline is an empty row.
                indptr.append(indptr[-1])
                continue
            cols, vals = zip(*(tok.split(':') for tok in line.split()))
            indices.extend(map(int, cols))
            data.extend(map(float, vals))
            indptr.append(indptr[-1] + len(vals))
        score_mat = csr_matrix((data, indices, indptr), (nr, nc))
    return score_mat

In [None]:
# --- Notebook-wide state, filled in by init_dataset() / load_mats() ---------

# Name of the active dataset and the file locations derived from it.
dataset = ""
trn_X_Xf_file = ""
trn_X_Y_file = ""
tst_X_Y_file = ""
tst_X_Xf_file = ""
Y_file = ""
stat_file = ""
model_dir = ""

# Loaded matrices and label text.
Y = None
trn_X_Y = None
tst_X_Y = None
trn_X_Xf = None
tst_X_Xf = None
trn_Y_X = None
tst_Y_X = None

# Size counters.
numy = 0
numxf = 0
trn_numx = 0
tst_numx = 0

# Derived quantities: co-occurrence matrix, inverse propensities and the two
# propensity parameters.
coo_mat = None
inv_prop = None
A = None
B = None

# Root directories used to build every data path.
EXP_DIR = "/nilesh_experiments"
DATA_DIR = "%s/Datasets"%EXP_DIR

def load_y():
    """Fill the global ``Y`` with the stripped lines of the file ``Y_file``."""
    global Y
    Y = []
    with open(Y_file) as f:
        Y.extend(line.strip() for line in f)
            
def printacc(score_mat, K = 5, X_Y = None, disp = True, inv_prop_ = -1):
    """Evaluate ``score_mat`` with ``xc_metrics.Metrics`` and optionally show a table.

    Args:
        score_mat: Score matrix passed to ``Metrics.eval``.
        K: Number of cut-offs; the table has columns ``'1'`` .. ``'K'``.
        X_Y: Ground-truth matrix; defaults to the global ``tst_X_Y``.
        disp: When true the rounded table is shown with ``display``.
        inv_prop_: Inverse propensities. The sentinel ``-1`` selects the
            global ``inv_prop``; ``None`` restricts the table to the two
            rows ``P`` and ``nDCG``.

    Returns:
        Array of metric values multiplied by 100.
    """
    if X_Y is None: X_Y = tst_X_Y
    # Compare the sentinel by value: ``is -1`` relies on int caching and
    # triggers a SyntaxWarning. The isinstance check keeps array arguments
    # away from an element-wise ``==``.
    if isinstance(inv_prop_, int) and inv_prop_ == -1 : inv_prop_ = inv_prop

    # ``np.bool`` was removed from NumPy; ``np.bool_`` is the scalar type
    # that exists in every version.
    acc = xc_metrics.Metrics(X_Y.tocsr().astype(np.bool_), inv_prop_)
    metrics = np.array(acc.eval(score_mat, K))*100
    df = pd.DataFrame(metrics)

    if inv_prop_ is None : df.index = ['P', 'nDCG']
    else : df.index = ['P', 'nDCG', 'PSP', 'PSnDCG']

    df.columns = [str(i+1) for i in range(K)]
    if disp: display(df.round(2))
    return metrics
            
def _load_dense_features(feat_file):
    """Load one dense feature matrix for ``load_mats``.

    A ``.bin`` file is returned exactly as ``load_dmat`` reads it. Any other
    file is L2-normalised row-wise and gets one extra trailing column filled
    with ones.
    """
    if feat_file.endswith(".bin"):
        return load_dmat(feat_file)

    raw = normalize(load_dmat(feat_file), axis=1, norm='l2')
    feats = np.ones((raw.shape[0], raw.shape[1]+1))
    feats[:, :-1] = raw
    return feats

def load_mats(loadxf=True):
    """Populate the notebook globals from the files chosen by ``init_dataset``.

    Loads the train/test label matrices and their transposes, picks the
    propensity parameters ``A``/``B`` from the dataset name, computes
    ``inv_prop`` and the size counters, and -- when ``loadxf`` is true --
    loads the dense train/test feature matrices.

    Args:
        loadxf: Whether to load the dense feature matrices as well.
    """
    global trn_X_Y, tst_X_Y, trn_X_Xf, tst_X_Xf, trn_Y_X, tst_Y_X, numy, trn_numx, tst_numx, numxf, coo_mat, inv_prop, A, B
    trn_X_Y = read_sparse_mat(trn_X_Y_file)
    tst_X_Y = read_sparse_mat(tst_X_Y_file)
    trn_Y_X = trn_X_Y.transpose().tocsr()
    tst_Y_X = tst_X_Y.transpose().tocsr()

    # Propensity parameters depend on the dataset family named in ``dataset``.
    if "Amazon" in dataset: A = 0.6; B = 2.6
    elif "Wiki" in dataset: A = 0.5; B = 0.4
    else : A = 0.55; B = 1.5
    inv_prop = xc_metrics.compute_inv_propesity(trn_X_Y, A, B)

    numy = trn_X_Y.shape[1]
    trn_numx = trn_X_Y.shape[0]
    tst_numx = tst_X_Y.shape[0]

#     coo_mat = trn_X_Y*trn_Y_X
#     coo_mat.setdiag(0)

    if loadxf:
        # Train and test features go through the same loader; this used to
        # be two copies of the same code.
        trn_X_Xf = _load_dense_features(trn_X_Xf_file)
        tst_X_Xf = _load_dense_features(tst_X_Xf_file)
        numxf = trn_X_Xf.shape[1]

def init_dataset(_dataset, loadxf=True):
    """Select a dataset: set every path global, then call ``load_mats``.

    For the dense feature files the ``.bin`` variant is chosen when it exists
    on disk, otherwise the ``.txt`` variant.

    Args:
        _dataset: Dataset directory name under ``DATA_DIR``.
        loadxf: Forwarded to ``load_mats``.
    """
    global dataset, trn_X_Xf_file, trn_X_Y_file, tst_X_Y_file, tst_X_Xf_file, Y_file, stat_file, model_dir
    dataset = _dataset
    data_root = "%s/%s"%(DATA_DIR, dataset)

    trn_X_Y_file = "%s/trn_X_Y.txt"%data_root
    tst_X_Y_file = "%s/tst_X_Y.txt"%data_root

    trn_bin = "%s/dense_trn_X_Xf.bin"%data_root
    trn_X_Xf_file = trn_bin if os.path.isfile(trn_bin) else "%s/dense_trn_X_Xf.txt"%data_root
    tst_bin = "%s/dense_tst_X_Xf.bin"%data_root
    tst_X_Xf_file = tst_bin if os.path.isfile(tst_bin) else "%s/dense_tst_X_Xf.txt"%data_root

    Y_file = "%s/Y.txt"%data_root
    model_dir = "%s/graphxml-manik/Results/%s/model"%(EXP_DIR, dataset)
    stat_file = "%s/w_stats.txt"%model_dir

    # Label text is not loaded here (the load_y() call was already disabled).
    load_mats(loadxf)
    
class bcolors:
    # ANSI escape sequences, looked up by attribute name.

    # Reset and text attributes.
    ENDC = '\033[0m'
    bold = '\033[1m'
    underline = '\033[4m'
    reverse = '\033[7m'

    # Foreground colours (30-37 range).
    red = '\033[31m'
    yellow = '\033[33m'
    white = '\033[37m'

    # Foreground colours (90-97 range).
    fail = '\033[91m' # dark red
    green = '\033[92m'
    warn = '\033[93m' # dark yellow
    blue = '\033[94m'
    purple = '\033[95m'

    # Background colours.
    on_grey = '\033[40m'
    on_red = '\033[41m'
    on_green = '\033[42m'
    on_yellow = '\033[43m'
    on_blue = '\033[44m'
    on_magenta = '\033[45m'

    
def _c(*args, attr='bold'):
    """Join ``args`` with spaces and wrap them in ANSI styling.

    Args:
        *args: Values to render; each is converted with ``str``.
        attr: Space-separated attribute names of ``bcolors`` whose escape
            codes are emitted, in order, before the text.

    Returns:
        The styled string, terminated by ``bcolors.ENDC``.
    """
    codes = vars(bcolors)
    prefix = ''.join(codes[name] for name in attr.split())
    body = ' '.join(map(str, args))
    return prefix + body + bcolors.ENDC

In [None]:
def getscore(prod, prevscore=0.0):
    """Return ``prevscore`` minus the squared hinge ``max(0, 1 - prod) ** 2``.

    Works element-wise when ``prod`` is an array.
    """
    hinge = np.maximum(0, 1-prod)
    penalty = np.square(hinge)
    return np.negative(penalty) + prevscore

def get_pos_pts(lbls, Y_X = None):
    """Return the sorted column indices stored in any of the rows ``lbls`` of ``Y_X``.

    Args:
        lbls: Iterable of row indices into ``Y_X``.
        Y_X: Sparse matrix with CSR-style rows; defaults to the global
            ``trn_Y_X``.

    Returns:
        1-D integer array of the distinct column indices, ascending.
    """
    if Y_X is None:
        Y_X = trn_Y_X
    seen = np.zeros(Y_X.shape[1], bool)
    for lbl in lbls:
        seen[Y_X[lbl].indices] = True
    return np.flatnonzero(seen)

def get_pos_lbls(pts, X_Y = None):
    """Return the sorted column indices stored in any of the rows ``pts`` of ``X_Y``.

    Args:
        pts: Iterable of row indices into ``X_Y``.
        X_Y: Sparse matrix with CSR-style rows; defaults to the global
            ``trn_X_Y``.

    Returns:
        1-D integer array of the distinct column indices, ascending.
    """
    if X_Y is None:
        X_Y = trn_X_Y
    seen = np.zeros(X_Y.shape[1], bool)
    for pt in pts:
        seen[X_Y[pt].indices] = True
    return np.flatnonzero(seen)

def get_w_acc(w, pos_inds, all_inds = None, X_Xf = None):
    """Print and return a soft accuracy of weight vector ``w``.

    Every point gets the score ``exp(getscore(x . w))``. Positives contribute
    their score, negatives contribute ``1 - score``; the sum is divided by
    ``len(all_inds)``. A density plot of each score group is drawn with
    seaborn.

    Args:
        w: Weight vector.
        pos_inds: Row indices of the positive points.
        all_inds: Row indices that make up the evaluation set; defaults to
            ``range(trn_numx)``.
        X_Xf: Dense feature matrix; defaults to the global ``trn_X_Xf``.

    Returns:
        The accuracy value that is also printed.
    """
    if X_Xf is None : X_Xf = trn_X_Xf
    if all_inds is None : all_inds = range(trn_numx)
    acc = 0.0

    pos_pts = X_Xf[pos_inds]
    scores = np.dot(pos_pts, w.transpose())
    scores = np.exp(getscore(scores))
    sns.distplot(scores, hist=False, rug=False)
    acc += np.sum(scores)
    print('pos acc : %.4f'%(np.mean(scores)))

    # Negatives are the members of all_inds that are not positives.
    mask = np.zeros(X_Xf.shape[0], bool)
    mask[all_inds] = True; mask[pos_inds] = False
    # Take negatives from the matrix that was passed in; the original always
    # indexed the global trn_X_Xf here, ignoring the X_Xf argument.
    neg_pts = X_Xf[mask]
    scores = np.dot(neg_pts, w.transpose())
    scores = np.exp(getscore(scores))
    sns.distplot(scores, hist=False, rug=False)
    acc += np.sum(1-scores)
    acc /= len(all_inds)
    print('neg acc : %.4f'%(np.mean(1-scores)))

    print("acc : %.4f"%(acc))
    return acc

In [None]:
def train_classifier(pos_inds, all_inds = None, X_Xf = None):
    """Fit a linear SVM that separates ``pos_inds`` from the rest of ``all_inds``.

    Args:
        pos_inds: Row indices of the positive points.
        all_inds: Row indices that form the training set; defaults to
            ``range(trn_numx)``.
        X_Xf: Dense feature matrix; defaults to the global ``trn_X_Xf``.

    Returns:
        Weight vector of length ``X_Xf.shape[1]``. When no training point is
        positive every weight is ``-100.0``; when every training point is
        positive every weight is ``100.0``; otherwise the ``LinearSVC``
        coefficients.
    """
    if X_Xf is None : X_Xf = trn_X_Xf
    if all_inds is None : all_inds = range(trn_numx)

    if len(pos_inds) == 0:
        return np.full(X_Xf.shape[1], -100.0)

    all_arr = np.asarray(all_inds)
    # Mark the training points that are positives. The previous
    # dict + np.vectorize mapping raised ValueError whenever none of the
    # positives was part of all_inds (vectorize rejects empty input).
    is_pos = np.isin(all_arr, pos_inds)
    num_pos = np.count_nonzero(is_pos)

    if num_pos == 0:
        # Same answer as for an empty pos_inds: nothing to learn from.
        return np.full(X_Xf.shape[1], -100.0)
    if num_pos == len(all_arr):
        return np.full(X_Xf.shape[1], 100.0)

    y = np.where(is_pos, 1, -1)
    clf = LinearSVC(random_state=0, fit_intercept=False, tol=1e-5, C=1.0)
    clf.fit(X_Xf[all_arr], y)

    return clf.coef_[0]