In [1]:
import sys
sys.path.append("..")

In [2]:
import utils
import mydatasets
import os
import numpy as np
import torch
import mymodels
from sklearn.manifold import TSNE
import ipywidgets as widgets
from ipywidgets import interact,fixed,interact_manual
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sklearn import metrics as skmet

In [3]:
# Name of the dataset to load. The loading cell below recognises
# "mediamill", "delicious", "bibtex" (small datasets) and "eurlex" (large).
DATASET="bibtex"
# Passed to mydatasets.get_small_dataset_split; presumably the index of the
# predefined train/test split to use (only read for the small datasets).
SPLIT=0

In [4]:
# Load the train/test arrays for DATASET.
# mydatasets has its data paths hardcoded relative to the parent directory, so
# temporarily change the working directory while loading.
curr_dir=os.getcwd()
os.chdir("..")
try:
    if DATASET in ["mediamill","delicious","bibtex"]:
        # Small datasets ship as one file plus split definitions; pick split SPLIT.
        full_dataset,trn_splits,tst_splits=mydatasets.load_small_dataset(DATASET)
        trn_data,tst_data=mydatasets.get_small_dataset_split(full_dataset,trn_splits,tst_splits,SPLIT)
    elif DATASET in ["eurlex"]:
        # Large datasets come with a fixed train/test partition.
        trn_data,tst_data=mydatasets.load_large_dataset(DATASET)
    else:
        # Fail with a clear message instead of a NameError on trn_data below.
        raise ValueError("Unsupported DATASET: %r" % (DATASET,))
    x_mat,y_mat,x_tst,y_tst=mydatasets.get_arrays(trn_data,tst_data)
finally:
    # Always change back, even if loading failed, so later relative paths
    # (and a re-run of this cell) still resolve from the notebook directory.
    os.chdir(curr_dir)

Loading datasets
../data/Bibtex/Bibtex_data.txt
../data/Bibtex/bibtex_trSplit.txt
../data/Bibtex/bibtex_tstSplit.txt
Number of splits : 10
## HEADER ##
#Point : 7395 , #Features : 1836 , #Labels : 159


In [8]:
# Directory holding the saved models; the checkpoint that gets loaded is
# load_model_dir + "model_" + str(model_num).
load_model_dir="../runs/bibtex_30_3500/"
model_num=4
# File handed to mydatasets.get_validation_split.
# NOTE(review): presumably a pickled train/validation split definition (".p") — confirm.
val_file_name="../runs/bibtex_datadict.p"

In [6]:
# Derive training and validation arrays from the full training matrices via val_file_name.
# NOTE(review): the meaning of the trailing None argument is not visible in this
# notebook — check mydatasets.get_validation_split before changing it.
x_trn,y_trn,x_val,y_val=mydatasets.get_validation_split(x_mat,y_mat,val_file_name,None)

In [9]:
# Load the saved model object.
# SECURITY: torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
# map_location="cpu": the following cell feeds CPU tensors to the model and
# calls .numpy() on the result, so the model must live on the CPU even if the
# checkpoint was written from a GPU run.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects whole-model pickles; pass weights_only=False there if this fails.
model=torch.load(os.path.join(load_model_dir,"model_"+str(model_num)),map_location="cpu")

In [10]:
# Embed the training and validation features with the loaded model.
# This is inference only, so run under no_grad to avoid building (and holding
# in memory) an autograd graph for the whole dataset.
# NOTE(review): if the model contains dropout/batch-norm layers, confirm it is
# in eval mode before trusting these embeddings.
with torch.no_grad():
    emb_trn=model(torch.from_numpy(x_trn.astype('float32'))).detach().numpy()
    emb_val=model(torch.from_numpy(x_val.astype('float32'))).detach().numpy()

### Simple neighbour model

In [11]:
# Evaluate the plain (unweighted) nearest-neighbour predictor for several
# neighbourhood sizes, on both the training and the validation embeddings.
# The per-run results are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the accumulated frame on every iteration.
# Assumes utils.compute_mlr_metrics returns a DataFrame (as the weighted
# variant in this notebook does).
metric_frames=[]
for num_neighbours in [5,10,15,20,50,100,150]:
    nbrs = NearestNeighbors(n_neighbors=num_neighbours, algorithm='ball_tree').fit(emb_trn)
    # Neighbours and their labels always come from the training set; only the
    # query embeddings / ground-truth labels change between "trn" and "val".
    for split_name,emb_query,y_query in (("trn",emb_trn,y_trn),("val",emb_val,y_val)):
        split_metrics=utils.compute_mlr_metrics(nbrs,num_neighbours,y_trn,emb_query,y_query,"")
        split_metrics["trn/val"]=split_name
        split_metrics["num_nbr"]=num_neighbours
        metric_frames.append(split_metrics)
# No ignore_index, so every row keeps the index of its source frame, exactly
# as repeated DataFrame.append calls did.
metrics_df=pd.concat(metric_frames)

### Distance weighted neighbour model

In [12]:
def weighted_mlr_metrics(nbrs,num_neighbours,y_trn,emb_tst,y_tst,prefix):
    """Score a distance-weighted nearest-neighbour label predictor.

    Each row of ``emb_tst`` is looked up in ``nbrs``; the label rows of the
    returned neighbours (taken from ``y_trn``) are scaled by
    ``exp(-distance)`` and averaged over the neighbours to give one score per
    label. Those scores are then evaluated against ``y_tst``.

    Parameters
    ----------
    nbrs : fitted neighbour index exposing ``kneighbors``; it must return
        ``num_neighbours`` neighbours per query row.
    num_neighbours : number of neighbours returned per query row.
    y_trn : label array whose rows are addressed by the neighbour indices.
    emb_tst : embeddings to query with.
    y_tst : ground-truth labels for ``emb_tst``.
    prefix : string prepended to every metric column name.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns p@1, p@3, p@5, ranking_loss,
        coverage_error and avg_prec_score, each prefixed with ``prefix``.
    """
    distances, neighbour_ids = nbrs.kneighbors(emb_tst)
    # exp(-d) makes closer neighbours count more; the trailing axis of length 1
    # lets the weights broadcast across the label dimension.
    neighbour_weights = np.exp(-distances).reshape(-1, num_neighbours, 1)
    weighted_labels = y_trn[neighbour_ids, :] * neighbour_weights
    expected_shape = (emb_tst.shape[0], num_neighbours, y_tst.shape[1])
    assert weighted_labels.shape == expected_shape
    # Collapse the neighbour axis to obtain per-label scores.
    y_pred = np.mean(weighted_labels, axis=1)

    results = pd.DataFrame(index=[0])
    for k in (1, 3, 5):
        results.loc[0, prefix + "p@" + str(k)] = utils.precision_at_k(y_tst, y_pred, k)
    ranking_scorers = (
        ("ranking_loss", skmet.label_ranking_loss),
        ("coverage_error", skmet.coverage_error),
        ("avg_prec_score", skmet.label_ranking_average_precision_score),
    )
    for column, scorer in ranking_scorers:
        results.loc[0, prefix + column] = scorer(y_tst, y_pred)
    return results

In [13]:
# Evaluate the distance-weighted nearest-neighbour predictor for the same
# neighbourhood sizes, on both the training and the validation embeddings.
# Results are gathered in a list and concatenated once because
# DataFrame.append was removed in pandas 2.0 (and is quadratic in a loop).
weighted_frames=[]
for num_neighbours in [5,10,15,20,50,100,150]:
    nbrs = NearestNeighbors(n_neighbors=num_neighbours, algorithm='ball_tree').fit(emb_trn)
    # Neighbours and their labels always come from the training set; only the
    # query embeddings / ground-truth labels change between "trn" and "val".
    for split_name,emb_query,y_query in (("trn",emb_trn,y_trn),("val",emb_val,y_val)):
        split_metrics=weighted_mlr_metrics(nbrs,num_neighbours,y_trn,emb_query,y_query,"")
        split_metrics["trn/val"]=split_name
        split_metrics["num_nbr"]=num_neighbours
        weighted_frames.append(split_metrics)
# No ignore_index, so every row keeps index 0 from its single-row source
# frame, exactly as repeated DataFrame.append calls did.
wmetrics_df=pd.concat(weighted_frames)

In [14]:
# Show the validation rows of both result tables, simple model first and
# distance-weighted model second, for a side-by-side comparison.
simple_val=metrics_df.loc[metrics_df["trn/val"]=="val"]
weighted_val=wmetrics_df.loc[wmetrics_df["trn/val"]=="val"]
display(simple_val,weighted_val)

Unnamed: 0,p@1,p@3,p@5,ranking_loss,coverage_error,avg_prec_score,trn/val,num_nbr
0,0.508197,0.286202,0.204508,0.427308,98.836066,0.385078,val,5
0,0.518443,0.300546,0.218852,0.316805,78.901639,0.416717,val,10
0,0.534836,0.304645,0.225,0.269047,69.692623,0.435849,val,15
0,0.516393,0.305328,0.228279,0.236617,61.963115,0.443147,val,20
0,0.522541,0.299863,0.221311,0.154823,41.891393,0.451256,val,50
0,0.5,0.285519,0.222131,0.120227,32.952869,0.441439,val,100
0,0.485656,0.277322,0.215574,0.106594,29.268443,0.432866,val,150


Unnamed: 0,p@1,p@3,p@5,ranking_loss,coverage_error,avg_prec_score,trn/val,num_nbr
0,0.508197,0.298497,0.214344,0.422631,98.247951,0.425638,val,5
0,0.514344,0.306694,0.22623,0.307989,77.565574,0.451442,val,10
0,0.520492,0.312158,0.22582,0.258888,68.084016,0.46534,val,15
0,0.516393,0.312842,0.225,0.224842,59.981557,0.467567,val,20
0,0.52459,0.301913,0.22418,0.141881,39.17418,0.471794,val,50
0,0.502049,0.289617,0.225,0.107844,30.286885,0.452267,val,100
0,0.487705,0.281421,0.216393,0.09595,26.997951,0.441803,val,150


**Clearly a distance weighted scheme is beneficial.** 

This reflects favourably on the model: similarity and distance in the embedding space are correctly related, which is the objective the model was trained for.

In [16]:
# Scratch 4x5 matrix of random integers drawn from [0, 10).
a = np.random.randint(0, 10, size=(4, 5))

In [22]:
# Display the random integer matrix `a`.
a

array([[9, 1, 5, 0, 2],
       [2, 2, 4, 2, 8],
       [5, 3, 2, 0, 3],
       [7, 3, 6, 7, 4]])

In [25]:
# For each row of `a`, the column indices that would sort that row ascending.
a.argsort(axis=1)

array([[3, 1, 4, 2, 0],
       [0, 1, 3, 2, 4],
       [3, 2, 1, 4, 0],
       [1, 4, 2, 0, 3]])