In [101]:
# NOTE(review): hard-coded absolute, machine-specific working directory; the relative
# 'Results/...' paths used in later cells are resolved against it.
%cd /mnt/disks/disk2/nileshgupta/ELIAS

/mnt/disks/disk2/nileshgupta/ELIAS


In [2]:
import sys, os, time, socket, yaml, wandb, logging
import logging.config
from tqdm import tqdm

from nets import *
from losses import *
from optimizer_bundles import *
from resources import _c, load_config_and_runtime_args, dump_diff_config, get_free_gpu
from datasets import DATA_MANAGERS, XMCEvaluator, XMCDataManager
from dl_helper import unwrap

import torch
import transformers
# Fix the random seed through transformers' set_seed helper.
transformers.set_seed(42)
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build `args` from the config of a saved run. The string is split on whitespace into an
# argv-style list (['python', '<config path>']) before being handed to the project loader.
args = load_config_and_runtime_args(f'python Results/ELIAS-new/amazon-670k/ELIAS-2/config.yaml'.split())

In [4]:
# Load two previously saved sparse score matrices for the test split from .npz files.
# NOTE(review): `sp` is only imported explicitly in a later cell (`import scipy.sparse as sp`);
# here it presumably leaks in through one of the wildcard imports -- confirm on a fresh kernel.
score_mat = sp.load_npz(f'Results/ELIAS-new/amazon-670k/ELIAS-2/tst_score_mat.npz')
ranker_score_mat = sp.load_npz(f'Results/ELIAS-new/amazon-670k/ELIAS-2/tst_ranker_score_mat.npz')

In [5]:
# No accelerator object is used; this value is forwarded to the predict()/get_embs() calls below.
accelerator=None

In [6]:
# Instantiate the data manager registered under the 'xmc' key with the loaded args.
data_manager = DATA_MANAGERS['xmc'](args)

In [7]:
# Build the three loaders (train / validation / test) used by every later cell.
trn_loader, val_loader, tst_loader = data_manager.build_data_loaders()

In [27]:
import scipy.sparse as sp
import numpy as np
import pandas as pd

# Adapted sorted_csr from https://github.com/amzn/pecos/blob/f73537aa7e963eedce1373fbb4184143e2381e6d/pecos/utils/smat_util.py#L256
def sorted_csr_from_coo(shape, row_idx, col_idx, val, only_topk=None, replace_val_with_rank=False):
    """Build a CSR matrix from COO triplets with each row ordered by descending value.

    Args:
        shape: (n_rows, n_cols) of the resulting matrix.
        row_idx: row index of every entry.
        col_idx: column index of every entry.
        val: value of every entry.
        only_topk: if not None, keep at most this many highest-valued entries per
            row. Must be an ``int``; values below 1 are clamped to 1.
        replace_val_with_rank: if True, overwrite the stored values with their
            1-based rank inside the row (1 = highest value).

    Returns:
        A ``scipy.sparse.csr_matrix`` whose entries inside each row are stored in
        descending value order (so column indices are generally NOT sorted).
        Ties keep ascending column order because the sort is stable.
    """
    csr = sp.csr_matrix((val, (row_idx, col_idx)), shape=shape)
    # Canonical column order first, so the stable sort below breaks ties by column.
    csr.sort_indices()
    for i in range(shape[0]):
        rng = slice(csr.indptr[i], csr.indptr[i + 1])
        sorted_idx = np.argsort(-csr.data[rng], kind="mergesort")
        csr.indices[rng] = csr.indices[rng][sorted_idx]
        csr.data[rng] = csr.data[rng][sorted_idx]

    if only_topk is not None:
        assert isinstance(only_topk, int), f"Wrong type: type(only_topk) = {type(only_topk)}"
        # Keep at least one entry per row. The previous expression
        # max(min(1, k), k) always evaluated to k, so it never clamped anything.
        only_topk = max(1, only_topk)
        nnz_of_insts = csr.indptr[1:] - csr.indptr[:-1]
        row_idx = np.repeat(np.arange(shape[0], dtype=csr.indices.dtype), nnz_of_insts)
        # Position of each entry inside its own row; rows are already sorted, so
        # positions below only_topk are exactly the top-k entries.
        selected_idx = (np.arange(len(csr.data)) - csr.indptr[row_idx]) < only_topk
        row_idx = row_idx[selected_idx]
        col_idx = csr.indices[selected_idx]
        val = csr.data[selected_idx]
        indptr = np.cumsum(np.bincount(row_idx + 1, minlength=(shape[0] + 1)))
        csr = sp.csr_matrix((val, col_idx, indptr), shape=shape, dtype=val.dtype)

    if replace_val_with_rank:
        # Vectorised 1-based rank: global position minus the start offset of the row.
        # Unlike concatenating one arange per row, this also works for zero-row input.
        row_lengths = csr.indptr[1:] - csr.indptr[:-1]
        row_starts = np.repeat(csr.indptr[:-1], row_lengths)
        csr.data = np.arange(row_starts.size) - row_starts + 1

    return csr

def sorted_csr(csr, only_topk=None, replace_val_with_rank=False):
    """Return a copy of ``csr`` whose rows are ordered by descending value.

    The matrix is expanded to COO-style triplets and delegated to
    ``sorted_csr_from_coo``; ``only_topk`` and ``replace_val_with_rank`` are
    forwarded unchanged.

    Raises:
        ValueError: if ``csr`` is not a ``scipy.sparse.csr_matrix``.
    """
    if isinstance(csr, sp.csr_matrix):
        entries_per_row = csr.indptr[1:] - csr.indptr[:-1]
        all_rows = np.arange(csr.shape[0], dtype=np.uint32)
        # One row id per stored entry, in storage order.
        expanded_rows = np.repeat(all_rows, entries_per_row)
        return sorted_csr_from_coo(
            csr.shape,
            expanded_rows,
            csr.indices,
            csr.data,
            only_topk,
            replace_val_with_rank,
        )

    raise ValueError("the input matrix must be a csr_matrix.")

def compute_precision_recall(score_mat, true_mat, pK=[1,3,5], rK=[10,20,50,100]):
    """Compute P@k and R@k (in percent) of ranked predictions against ground truth.

    Args:
        score_mat: CSR matrix of prediction scores, one row per evaluated instance;
            it is ranked per row through ``sorted_csr``.
        true_mat: sparse ground-truth matrix with the same shape; every stored
            entry is treated as a relevant label regardless of its value.
        pK: cut-offs at which precision is reported.
        rK: cut-offs at which recall is reported.

    Returns:
        A one-row ``pandas.DataFrame`` (index ``'Metrics'``) rounded to two
        decimals, with columns ordered by metric name and then by cut-off.
        Rows without any relevant label contribute a recall of 0.
    """
    allK = list(set([*pK, *rK]))
    # Plain int: sorted_csr_from_coo asserts isinstance(only_topk, int), which numpy ints fail.
    maxK = int(max(allK))
    bin_true_mat = true_mat.copy()
    bin_true_mat.data[:] = 1.0
    true_per_row = bin_true_mat.getnnz(1)
    # A row with no relevant label would give 0/0 = NaN and turn the whole mean into NaN;
    # clamping the denominator to 1 makes such a row count as recall 0 instead.
    recall_denom = np.maximum(true_per_row, 1)
    rank_score_mat = sorted_csr(score_mat, only_topk=maxK, replace_val_with_rank=True)
    metrics = {}
    # Largest k first, so that ranks beyond k can be dropped in place step by step.
    for k in sorted(allK, reverse=True):
        rank_score_mat.data[rank_score_mat.data > k] = 0
        rank_score_mat.eliminate_zeros()
        intrsxn = rank_score_mat.multiply(bin_true_mat)
        # Number of relevant labels among the top-k predictions of every row.
        row_wise_overlap = intrsxn.getnnz(1)
        if k in rK:
            metrics[f'R@{k}'] = 100*(row_wise_overlap/recall_denom).mean()
        if k in pK:
            metrics[f'P@{k}'] = 100*row_wise_overlap.mean()/k
            # nDCG@k (currently disabled):
            # intrsxn.data = 1 / np.log2(intrsxn.data + 1)
            # ndcg_denom = np.cumsum(1 / np.log2(np.arange(k) + 2))[np.minimum(k-1, bin_true_mat.getnnz(1))]
            # metrics[f'nDCG@{k}'] = 100*(np.array(intrsxn.sum(axis=1)).ravel()/ndcg_denom).mean()

    metrics = pd.DataFrame(metrics, index=['Metrics']).round(2)
    metrics = metrics.reindex(sorted(metrics.columns, key=lambda x: (x.split('@')[0], int(x.split('@')[1]))), axis=1)
    return metrics

In [30]:
# P@k / R@k of the saved ranker scores against the test labels held by the data manager.
compute_precision_recall(ranker_score_mat, data_manager.tst_X_Y)

Unnamed: 0,P@1,P@3,P@5,R@10,R@20,R@50,R@100
Metrics,50.79,45.33,41.29,51.43,58.1,64.16,68.87


In [28]:
# Same evaluation for the other saved score matrix (tst_score_mat.npz), for comparison.
compute_precision_recall(score_mat, data_manager.tst_X_Y)

Unnamed: 0,P@1,P@3,P@5,R@10,R@20,R@50,R@100
Metrics,48.73,43.85,40.03,50.26,57.6,64.58,68.87


In [10]:
# NOTE(review): compute_xmc_metrics is not defined in the visible cells; presumably it comes
# from one of the wildcard imports -- confirm. The trailing ';' suppresses the cell's return value.
compute_xmc_metrics(ranker_score_mat, data_manager.tst_X_Y, data_manager.inv_prop);

P@1	P@3	P@5	nDCG@1	nDCG@3	nDCG@5	PSP@1	PSP@3	PSP@5	R@10	R@20	R@50	R@100
50.56	45.08	41.08	50.56	47.74	46.01	30.19	34.88	39.2	51.09	57.7	63.61	67.76

P@1 P@3 P@5 nDCG@1 nDCG@3 nDCG@5 PSP@1 PSP@3 PSP@5 R@10 R@20 R@50 R@100
50.56 45.08 41.08 50.56 47.74 46.01 30.19 34.88 39.2 51.09 57.7 63.61 67.76



In [57]:
# Number of stored entries per column of the training label matrix.
# NOTE(review): trn_labels is only assigned in a later cell of this notebook, so this cell
# fails on a fresh top-to-bottom run; the same assignment is repeated further down.
nnz = trn_labels.getnnz(0)

In [60]:
# Column indices whose training count is zero.
zero_nnz_labels = np.where(nnz == 0)[0]

In [None]:
# Label matrix of the validation dataset (this assignment is repeated in a later cell).
val_labels = val_loader.dataset.labels

In [61]:
# Display the indices of the columns with zero training count.
zero_nnz_labels

array([    11,    185,    437, ..., 669779, 669824, 669885])

In [31]:
# Instantiate the network class selected by args.net, load the checkpoint stored in the
# run's output directory, and move the model to the configured device.
net = NETS[args.net](args)
net.load(f'{args.OUT_DIR}/model.pt')
net.to(args.device);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
# Score matrices for each split, requesting K=args.eval_topk from predict().
trn_score_mat = unwrap(net).predict(trn_loader, K=args.eval_topk, accelerator=accelerator)
val_score_mat = unwrap(net).predict(val_loader, K=args.eval_topk, accelerator=accelerator)
tst_score_mat = unwrap(net).predict(tst_loader, K=args.eval_topk, accelerator=accelerator)

# Embeddings for the same three splits; they feed the ranker features built in the next cell.
trn_embs = unwrap(net).get_embs(trn_loader, accelerator=accelerator)
val_embs = unwrap(net).get_embs(val_loader, accelerator=accelerator)
tst_embs = unwrap(net).get_embs(tst_loader, accelerator=accelerator)

Predicting: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1901/1901 [02:48<00:00, 11.29it/s]
Predicting: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.29it/s]
Predicting: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 299/299 [00:49<00:00,  6.00it/s]
Embedding: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1901/1901 [02:27<00:00, 12.91it/s]
Embedding: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  5.96it/s]
Embedding:

In [36]:
from sklearn.preprocessing import normalize
# Bag-of-words features for the three splits, loaded with normalize=True.
trn_X_Xf, val_X_Xf, tst_X_Xf = data_manager.load_bow_fts(normalize=True)
# Ranker input = [row-normalised embeddings | bag-of-words features], stored as float32 CSR.
# NOTE(review): copy=False lets sklearn normalise the embedding arrays in place -- confirm
# that nothing later relies on the un-normalised embeddings.
trn_ranker_embs = sp.hstack((normalize(trn_embs, copy=False), trn_X_Xf)).tocsr().astype(np.float32)
val_ranker_embs = sp.hstack((normalize(val_embs, copy=False), val_X_Xf)).tocsr().astype(np.float32)
tst_ranker_embs = sp.hstack((normalize(tst_embs, copy=False), tst_X_Xf)).tocsr().astype(np.float32)

# Training label matrix cast to float32 for the ranker.
trn_labels = trn_loader.dataset.labels.astype(np.float32)

In [37]:
from pecos.xmc import MLModel, MLProblem
# Train a PECOS MLModel on the ranker features. C is an identity matrix with one row/column
# per label, and the training score matrix is passed as M.
# NOTE(review): presumably M restricts training to the labels shortlisted by the network --
# confirm against the PECOS MLProblem documentation.
prob = MLProblem(trn_ranker_embs, trn_labels, C=sp.identity(trn_labels.shape[1]).tocsr(), M=trn_score_mat, R=None)
mlm = MLModel.train(prob)
# Select the 'l3-hinge' post-processor for prediction.
mlm.pred_params.post_processor = 'l3-hinge'
# The network's score matrices are passed as csr_codes; only_topk equals the number of
# label columns, so no additional top-k truncation is requested here.
val_ranker_score_mat = mlm.predict(val_ranker_embs, csr_codes=val_score_mat.astype(np.float32), only_topk=val_score_mat.shape[1])    
tst_ranker_score_mat = mlm.predict(tst_ranker_embs, csr_codes=tst_score_mat.astype(np.float32), only_topk=tst_score_mat.shape[1])

In [41]:
# P@k / R@k of the freshly trained ranker on the validation split.
compute_precision_recall(val_ranker_score_mat, val_loader.dataset.labels)

Unnamed: 0,P@1,P@3,P@5,R@10,R@20,R@50,R@100
Metrics,44.55,40.48,36.92,45.58,53.38,59.81,64.17


In [82]:
# rand_inds = np.random.choice(np.arange(val_score_mat.shape[0]), size=min(5000, val_score_mat.shape[0]))
# Number of stored entries per column of the training label matrix; used as a tree feature below.
nnz = trn_labels.getnnz(0)

In [124]:
from sklearn import tree
def get_tree_fts(smats, nnz, tmat = None, clf = None, mode='test'):
    """Build per-entry features for the re-scoring classifier, or apply it.

    One sample is created for every stored entry of ``smats[0]``. Its features
    are the value of each matrix in ``smats`` at that (row, col) position,
    followed by ``nnz[col]``.

    Args:
        smats: list of sparse score matrices; ``smats[0]`` defines the positions.
        nnz: 1-D array indexed by column id.
        tmat: sparse target matrix, read only when ``mode == 'train'``.
        clf: fitted classifier exposing ``predict_proba``, used otherwise.
        mode: ``'train'`` returns training data; any other value re-scores.

    Returns:
        ``(features, targets)`` when ``mode == 'train'``; otherwise a CSR copy of
        ``smats[0]`` whose stored values are replaced by ``predict_proba(...)[:, 1]``.
    """
    anchor = smats[0].tocoo()
    rows, cols = anchor.row, anchor.col

    # One feature column per score matrix, then the per-column count.
    feature_cols = [np.array(mat[rows, cols]).reshape(-1, 1) for mat in smats]
    feature_cols.append(nnz[cols].reshape(-1, 1))
    features = np.hstack(feature_cols)

    if mode != 'train':
        rescored = smats[0].copy()
        rescored[rows, cols] = clf.predict_proba(features)[:, 1]
        return rescored.tocsr()

    return features, np.array(tmat[rows, cols]).ravel()
        
# Fit a depth-5 decision tree on validation-split entries. Features are the ranker score,
# the network score and nnz[col]; the target is the validation label at the same position.
# NOTE(review): the tree is fit on the validation split, so validation metrics computed with
# it afterwards are measured on its own training data.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(
        *get_tree_fts(
            [val_ranker_score_mat, val_score_mat], 
            nnz=nnz,
            tmat=val_loader.dataset.labels, 
            mode='train'
            )
    )

In [125]:
# Re-score the validation entries with the fitted tree (same feature layout as in fitting).
tree_score_mat = get_tree_fts([val_ranker_score_mat, val_score_mat], nnz=nnz, clf=clf, mode='test')

In [126]:
# Evaluate an equal-weight blend of tree scores and ranker scores on the validation split.
compute_precision_recall(tree_score_mat*0.5 + val_ranker_score_mat*0.5, val_loader.dataset.labels)

Unnamed: 0,P@1,P@3,P@5,R@10,R@20,R@50,R@100
Metrics,46.52,42.06,38.14,46.54,53.98,60.68,64.17


In [127]:
# Re-score the test entries with the same tree; this rebinds tree_score_mat to the test split.
tree_score_mat = get_tree_fts([tst_ranker_score_mat, tst_score_mat], nnz=nnz, clf=clf, mode='test')

In [135]:
# Evaluate the equal-weight blend of tree scores and ranker scores on the test split.
compute_precision_recall(tree_score_mat*0.5 + tst_ranker_score_mat*0.5, tst_loader.dataset.labels)

Unnamed: 0,P@1,P@3,P@5,R@10,R@20,R@50,R@100
Metrics,50.89,45.4,41.35,51.74,58.67,65.14,68.87


In [62]:
# Label matrix of the validation dataset.
val_labels = val_loader.dataset.labels

In [65]:
# Column indices that have at least one stored entry in the training label matrix.
nz_label_inds = np.where(nnz > 0)[0]

In [63]:
# Validation labels restricted to the columns that never occur in training; the displayed
# repr shows how many validation entries fall on such columns.
val_labels[:, zero_nnz_labels]

<4000x3369 sparse matrix of type '<class 'numpy.int64'>'
	with 619 stored elements in Compressed Sparse Row format>

In [None]:

# Blend tree re-scoring (weight 0.8) with the raw ranker scores (weight 0.2) on the test split.
# The call must use the feature layout the tree was fitted with -- ranker score, network
# score and nnz[col]. The earlier form passed a single matrix and omitted the required `nnz`
# argument, which raises a TypeError.
# NOTE(review): this rebinds tst_ranker_score_mat, so running the cell twice blends twice.
# The 0.8 / 0.2 weights differ from the 0.5 / 0.5 blend evaluated in the cells above.
tst_ranker_score_mat = get_tree_fts([tst_ranker_score_mat, tst_score_mat], nnz=nnz, clf=clf, mode='test')*0.8 + tst_ranker_score_mat*0.2

# Evaluate the blended scores with the project evaluator, log the table as TSV, and save the
# matrix into the run's output directory.
evaluator = XMCEvaluator(args, tst_loader, data_manager, prefix='tst_ranker')
metrics = evaluator.eval(tst_ranker_score_mat)
logging.info('\n'+metrics.to_csv(sep='\t', index=False))
sp.save_npz(f'{args.OUT_DIR}/tst_ranker_score_mat.npz', tst_ranker_score_mat)