In [1]:

from scipy.special import logit, expit
from scipy.stats import gaussian_kde
import numpy as np 

from statsmodels.distributions import ECDF

from seaborn import kdeplot
import altair as alt

In [167]:
# One (file stem, display label) pair per algorithm, in plotting order.
# Keeping both in a single table guarantees the two lists stay aligned.
_ALGORITHMS = [
    ('hpo_implicit_bpr', 'Implicit-BPR'),
    ('label_correlation', 'Label Correlation'),
    ('hpo_lightfm_warp', 'LightFM-WARP'),
    ('rfc', 'Random Forest Classifier'),
    ('sea', 'SEA'),
    ('nearest_neighbor', 'Nearest Neighbour'),
]
filenames = [stem for stem, _label in _ALGORITHMS]
niceNames = [label for _stem, label in _ALGORITHMS]

# Year prefix used when building the paths of the rank files.
year = 2015

In [168]:
def simple_bootstrap(data, n=1000, take=350, k=3, rng=None):
    """Bootstrap the fraction of values in ``data`` that are ``<= k``.

    Draws ``n`` resamples of ``take`` values each from ``data`` (sampling
    with replacement, the default of ``choice``) and returns, for every
    resample, the fraction of drawn values that are ``<= k``.

    Parameters
    ----------
    data : 1-D array-like
        Values to resample from.
    n : int, default 1000
        Number of bootstrap resamples.
    take : int, default 350
        Number of values drawn per resample.
    k : number, default 3
        Inclusive cut-off; the default reproduces the original "<= 3" test.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted the global ``np.random`` state is
        used, exactly as before; pass a seeded generator for reproducible
        results.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n,)`` with one fraction per resample.
    """
    sampler = np.random if rng is None else rng
    resamples = sampler.choice(data, size=(n, take))
    # Count hits per resample (row) and normalise by the resample size.
    return (resamples <= k).sum(axis=1) / take

def simple_ci(data):
    """Return a ``(lower, upper)`` pair picked from the sorted ``data``.

    The values are sorted in ascending order and two of them are selected by
    position: the lower bound sits at index ``len // (1/0.025)`` and the upper
    bound at index ``len // (1/0.975)``, i.e. roughly the 2.5 % and 97.5 %
    points of the sample. Both indices come from float floor division, so the
    arithmetic is deliberately kept in this exact form.
    """
    ordered = np.sort(data)
    size = ordered.shape[0]
    lower_idx, upper_idx = (int(size // (1 / frac)) for frac in (0.025, 0.975))
    return ordered[lower_idx], ordered[upper_idx]

def simple_ecdf(ranks, maxrank):
    """Evaluate the empirical CDF of ``ranks`` at the integers 1 .. maxrank-1.

    Parameters
    ----------
    ranks : numpy.ndarray
        Observed values; compared element-wise against each threshold.
    maxrank : int
        Exclusive upper bound of the thresholds (``np.arange(1, maxrank)``).

    Returns
    -------
    (numpy.ndarray, list)
        The thresholds, and for each threshold the fraction of ``ranks``
        that are less than or equal to it.
    """
    thresholds = np.arange(1, maxrank)
    fractions = []
    for threshold in thresholds:
        at_or_below = np.sum(ranks <= threshold)
        fractions.append(at_or_below / len(ranks))
    return thresholds, fractions



In [288]:
# Ranks produced by the nearest-neighbour method. The path is built from the
# shared `year` setting (same pattern as the per-algorithm loads) so that
# changing `year` cannot silently mix data from different years.
nnranks = np.load('./processed_data/' + str(year) + '_nearest_neighbor.npy')
# Boolean selector: True where the nearest-neighbour rank is greater than 10.
mask = nnranks > 10

In [289]:
# Long-format columns for the ECDF figure: one entry per (algorithm, threshold).
x = []
y = []
y_nn = []
names = []

n = 20  # keep only the first n thresholds (1..n) of each ECDF
for name, nice_name in zip(filenames, niceNames):
    # load
    ranks = np.load('./processed_data/' + str(year) + '_' + name + '.npy')

    # ECDF over every entry (243 is the exclusive upper threshold)
    x_, y_ = simple_ecdf(ranks, 243)
    x += list(x_[:n])
    y += list(y_[:n])

    # ECDF restricted to the entries selected by `mask`
    _, y_ = simple_ecdf(ranks[mask], 243)
    y_nn += list(y_[:n])

    names += [nice_name] * n


In [290]:
import pandas as pd

# Build the frame column by column. Stacking numbers and labels into a single
# numpy array would coerce every value to a string, so the numeric columns
# would no longer be numeric.
df = pd.DataFrame({'x': x, 'y': y, 'y_nn': y_nn, 'Algorithm': names})
df

Unnamed: 0,x,y,y_nn,Algorithm
0,1,0.24878048780487805,0.23036649214659685,Implicit-BPR
1,2,0.35853658536585364,0.29842931937172773,Implicit-BPR
2,3,0.43658536585365854,0.35602094240837695,Implicit-BPR
3,4,0.4585365853658537,0.3612565445026178,Implicit-BPR
4,5,0.5048780487804878,0.39267015706806285,Implicit-BPR
...,...,...,...,...
115,16,0.5707317073170731,0.07853403141361257,Nearest Neighbour
116,17,0.5756097560975609,0.08900523560209424,Nearest Neighbour
117,18,0.5804878048780487,0.09947643979057591,Nearest Neighbour
118,19,0.5878048780487805,0.11518324607329843,Nearest Neighbour


In [294]:
# Base chart for the ECDF over all entries (column `y`).
ch = (
    alt.Chart(df)
    .encode(
        alt.X('x:Q', title='Rank'),
        alt.Y('y:Q', scale=alt.Scale(domain=[0, 1]), title='Fraction correct'),
        alt.Color('Algorithm:N', sort=niceNames),
    )
    .properties(height=200)
)

# Base chart for the ECDF over the masked subset (column `y_nn`).
ch_nn = (
    alt.Chart(df)
    .encode(
        alt.X('x:Q'),
        alt.Y('y_nn:Q', scale=alt.Scale(domain=[0, 1])),
        alt.Color('Algorithm:N', sort=niceNames),
    )
    .properties(height=200)
)




In [297]:
# Solid line + filled points for the full ECDF; dashed line + hollow points
# for the masked ECDF. Both pairs are layered into a single chart.
full_layer = ch.mark_line() + ch.mark_point(filled=True)
masked_layer = ch_nn.mark_line(strokeDash=[6,5]) + ch_nn.mark_point()
mega = full_layer + masked_layer

# One panel per algorithm, three panels per row, in the order of `niceNames`.
faceted = mega.properties(width=200).facet(
    facet=alt.Facet('Algorithm:N', sort=alt.SortArray(niceNames)),
    columns=3,
)
mega_ch = faceted.configure_axis(
    titleFontSize=14,
).configure_header(
    titleFontSize=14,
    labelFontSize=14,
)

mega_ch.save('./figures/ecdf_gt10.html')

mega_ch

In [283]:
# Full ECDF on top, masked ECDF (dashed line) underneath.
top_panel = ch.mark_line() + ch.mark_point(filled=True)
bottom_panel = ch_nn.mark_line(strokeDash=[1,2]) + ch_nn.mark_point()
top_panel & bottom_panel


In [284]:
# Full ECDF on the left, masked ECDF (dashed line) on the right.
left_panel = ch.mark_line() + ch.mark_point(filled=True)
right_panel = ch_nn.mark_line(strokeDash=[1,2]) + ch_nn.mark_point()
left_panel | right_panel

In [285]:

# Display the un-faceted layered ECDF chart (`mega`) with Altair's
# `interactive()` applied.
mega.interactive()

# p@3 (pAt3) as a function of the nearest-neighbour rank threshold (nnranks)

In [305]:
def pAt3_mask(ranks, nnranks):
    """Bootstrap statistics of ``ranks`` on progressively filtered subsets.

    For every threshold ``i`` from 1 to 250 (inclusive) only the entries with
    ``nnranks > i`` are kept. ``simple_bootstrap`` is run on that subset with
    a resample size equal to the subset size, and the mean of the bootstrap
    values plus the pair returned by ``simple_ci`` are recorded.

    Parameters
    ----------
    ranks : numpy.ndarray
        Values to resample; indexed with a boolean mask derived from
        ``nnranks``.
    nnranks : numpy.ndarray
        Values compared against each threshold to build the mask.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The 250 bootstrap means, and the 250 ``simple_ci`` pairs stacked into
        one array.
    """
    means = []
    intervals = []
    for threshold in range(1, 251):
        keep = nnranks > threshold
        samples = simple_bootstrap(ranks[keep], take=keep.sum())
        means.append(samples.mean())
        intervals.append(simple_ci(samples))

    return np.array(means), np.array(intervals)

In [306]:
import matplotlib.pyplot as plt

# Long-format columns for the p@3 figure: 250 entries per algorithm.
p = []
high = []
low = []
n = []
nn = []
for name, nice_name in zip(filenames, niceNames):
    # load
    ranks = np.load('./processed_data/' + str(year) + '_' + name + '.npy')
    pAt3, ci = pAt3_mask(ranks, nnranks)

    p.extend(pAt3)
    low.extend(ci[:, 0])
    high.extend(ci[:, 1])
    n.extend([nice_name] * 250)
    nn.extend(range(1, 251))

In [307]:
# Build the frame column by column so each column keeps its own dtype.
# Stacking numbers and labels into one numpy array would turn every value,
# including the p@3 estimates and their bounds, into a string.
source = pd.DataFrame({'pAt3': p, 'low': low, 'high': high, 'Algorithm': n, 'NN Rank': nn})

In [308]:
# Base chart for the p@3 figure: NN rank on x, one colour per algorithm.
# Note that this rebinds `ch`, which earlier cells used for the ECDF chart.
ch = (
    alt.Chart(source)
    .encode(
        alt.X('NN Rank:Q', title='NN Rank'),
        alt.Color('Algorithm:N', sort=niceNames),
    )
    .properties(height=200, width=200)
)

In [337]:
# Line for the p@3 values, with a translucent band between `low` and `high`.
p_line = ch.mark_line().encode(y=alt.Y('pAt3:Q', title='p@3'))
ci_band = ch.mark_area(opacity=0.2).encode(y='low:Q', y2='high:Q')

# One panel per algorithm, three panels per row, in the order of `niceNames`.
out = (p_line + ci_band).facet(
    facet=alt.Facet('Algorithm', sort=alt.SortArray(niceNames)),
    columns=3,
).configure_axis(
    titleFontSize=14,
).configure_header(
    titleFontSize=14,
    labelFontSize=14,
)

out.save('./figures/pAt3_vs_nnrank.html')
out

