In [5]:
import pyterrier as pt
pt.init()

terrier-assemblies 5.10 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.10 (built by craigm on 2024-08-22 17:33) and terrier-helper 0.0.8



In [4]:
#rm ~/.pyterrier

In [6]:
import os
import torch
import pandas as pd
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.cluster import KMeans
from scipy import stats
import pickle
import random
from scipy.stats import spearmanr,kendalltau
from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance_matrix

In [7]:
import pyterrier_crs

In [8]:
from pyterrier_crs.index import ResNetIndex
model_name = "resnet101"

In [9]:
from pyterrier_crs import datasets
datasets.setup_datasets()

### Load the dataset index using the corresponding location, alternatively check the pyterrier-fcrs test experiment

In [10]:
r = ResNetIndex("/nfs/primary/from_yashon/irecsys/shoes_test", model_name)

Downloading: "https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth" to /root/.cache/torch/hub/checkpoints/resnet101-5d3b4d8f.pth


  0%|          | 0.00/170M [00:00<?, ?B/s]

In [11]:
pyterrier_crs.datasets.setup_datasets()

In [12]:
from pyterrier_crs.usersim import UserSim
import pyterrier_crs.models

In [13]:
ege_checkpoint = "http://www.dcs.gla.ac.uk/~craigm/fcrs/model_checkpoints/ege-rl-10000.pt"
usersim_path =   "http://www.dcs.gla.ac.uk/~craigm/fcrs/model_checkpoints/caption_model_shoes"

usersim = UserSim(usersim_path, r)
#ege = pyterrier_crs.models.EGE(ege_checkpoint, r, usersim[0].vocabSize, top_K = 100)
#transformer = pyterrier_crs.models.EGE(ege_checkpoint, r, usersim.vocabSize, top_K = 100, export_image_query_rep = True)

print("Models loaded")

relative captioning is called


  "num_layers={}".format(dropout, num_layers))


Models loaded


In [14]:
from collections import defaultdict

In [15]:
from typing import List, Tuple
def parse_qid(qid : str) -> Tuple[int, List[int], int]:
    
        # qid can be "u100,101-t0"
    first, second = qid.split("-")
    turn = int( second.replace("t", "") )
    first = first.replace("u", "")
    all_ids = first.split(",")
    target = int(all_ids[0])
    alternatives = [ int(item) for item in all_ids[1 : ] ]
        
    return (target, alternatives, turn)

## New Metasimulator (MetaSimProb) with Memory

In [16]:
class MetaUserSim_memory(pt.Transformer):
    
    def __init__(self, inner : UserSim, ranker, threshold = 0.5, tolerance = 1, target_as_alt = False):
        self.inner = inner
        self.tolerance = tolerance
        self.ranker = ranker
        self.target_as_alt = target_as_alt
        self.threshold = threshold
        self.memory = {}
        self.counter = defaultdict(int)
    
    def sim_rank1_target(self, qidgroup, target):
        candidate_id = qidgroup[qidgroup["rank"] == 0].iloc[0].docid
        candidate_rep = self.ranker.feat[candidate_id]
        target_rep = self.ranker.feat[target]
        val = target_rep - candidate_rep
        val = val ** 2
        val = val.sum()
        return val

    def transform(self, df):
        rtr = []
        for qid, qidgroup in df.groupby("qid"):
            #parse qid (target, alternatives, turn) TODO
            target, alternatives, turn = parse_qid(qid)
            if self.target_as_alt:
                alternatives.append(target)

            # if this is the first turn, forget anything this target
            if turn == 0 and target in self.memory:
                del self.memory[target]

            if turn <= self.tolerance:
                new_qid = "u"+str(target)+"-t"+str(turn)
                qidgroup = qidgroup.copy()
                qidgroup['qid'] = new_qid
                new_qid_group = self.inner.transform(qidgroup)
                new_qid_group["qid"] = qid
                rtr.append(new_qid_group)
                
                # save how similar we are
                self.memory[target] = self.sim_rank1_target(qidgroup, target)
                
            else:
                # its not the first turn
                assert target in self.memory

                sim_1 = self.sim_rank1_target(qidgroup, target)
                
                # how much did we improve
                delta = sim_1 - self.memory[target]

                # record the memory
                self.memory[target] = sim_1

                # perceived gain, keep using the same target, or if we dont pass the probabilistic threshold
                if delta > 0 or random.random() > self.threshold: 
                    new_qid = "u"+str(target)+"-t"+str(turn)
                    qidgroup = qidgroup.copy()
                    qidgroup['qid'] = new_qid
                    new_qid_group = self.inner.transform(qidgroup)
                    new_qid_group["qid"] = qid
                    rtr.append(new_qid_group)
                
                else: #perceived loss, pick a new alternative 
                    
                    #find ranker function (find nearest index from candidate)
                    candidate_id = qidgroup[qidgroup["rank"] == 0].iloc[0].docid
                    candidate_rep = self.ranker.feat[candidate_id]
                    val = self.ranker.feat[alternatives] - candidate_rep
                    val = val ** 2
                    val = val.sum()
                    v, offset = val.min(0)
                    #pick the most similar item 
                    nearest_docid = alternatives[offset]
                    #print(nearest_docid)
                    # construct new qid
                    new_qid = "u"+str(nearest_docid)+"-t"+str(turn)
                    if nearest_docid!= target:
                        self.counter[turn] +=1
                    qidgroup = qidgroup.copy()
                    qidgroup['qid'] = new_qid
                    #qid_group["nearest"] = str(nearest_docid)
                    new_qid_group = self.inner.transform(qidgroup)
                    new_qid_group["qid"] = qid
                    rtr.append(new_qid_group)
        return pd.concat(rtr)

## Define CRS model

In [17]:
import pyterrier_crs.models
transformer = pyterrier_crs.models.EGE(ege_checkpoint, r, usersim.vocabSize, top_K = 100, export_image_query_rep = True)

Downloading: "http://www.dcs.gla.ac.uk/~craigm/fcrs/model_checkpoints/ege-rl-10000.pt" to /root/.cache/torch/hub/checkpoints/ege-rl-10000.pt


  0%|          | 0.00/8.40M [00:00<?, ?B/s]

In [102]:
metasim = MetaUserSim_memory(
    usersim, transformer.ranker, threshold = 0.95, tolerance = 4, target_as_alt = True
) ## modify threshold and tolerance to obtain the values in the paper

In [103]:
metasim.image_name = metasim.inner.image_name

### Load input csv in its final pre-processed form

In [104]:
input_df = pd.read_csv('/nfs/primary/from_yashon/irecsys/ege_shoes_analysis/input_shoes_CRS_df.csv')
input_df

Unnamed: 0,qid,docno,docid,rank
0,"u1231,2401,4511-t0",img_womens_clogs_783.jpg,3593,0
1,"u3915,1141,2482,96,2588,877,1202-t0",img_womens_sneakers_158.jpg,2273,0
2,"u2536,2119,1141-t0",img_womens_high_heels_958.jpg,173,0
3,"u2426,4098,230-t0",img_womens_sneakers_1098.jpg,2546,0
4,"u4441,2401,1021,1426,169-t0",img_womens_clogs_915.jpg,1843,0
...,...,...,...,...
185,"u4116,543,3536-t0",img_womens_pumps_220.jpg,2129,0
186,"u2361,1387,1390,2909,2361,3322-t0",img_womens_flats_1302.jpg,2930,0
187,"u1720,3039,4334,1817,3524-t0",img_womens_rain_boots_483.jpg,29,0
188,"u3098,3865,3678,4233,3865,3822,938,4577,1141,2...",img_womens_clogs_728.jpg,3331,0


### Run CRS experiment with the above-defined specifications and hyperparameters. Use the option export_to_csv to obtain the per query results.

In [105]:
from pyterrier.measures import *
from pyterrier_crs.display import CRS_Experiment

exp_df = CRS_Experiment(
    input_df, 
    [transformer], 
    metasim, 
    [NDCG@10, 'recip_rank', Success@1, Success@10], 
    num_turns=10, 
    test_batch_size=64, 
    names=["EGE"],
    progress=True
    #export_to_csv="./ege_shoes_alter_tol4_0.95"
)
exp_df

100%|██████████| 3/3 [00:19<00:00,  6.61s/batch]


Unnamed: 0,name,measure,turn,value
0,EGE,Success@1,1,0.047368
1,EGE,Success@1,2,0.152632
2,EGE,Success@1,3,0.236842
3,EGE,Success@1,4,0.3
4,EGE,Success@1,5,0.378947
5,EGE,Success@1,6,0.447368
6,EGE,Success@1,7,0.521053
7,EGE,Success@1,8,0.552632
8,EGE,Success@1,9,0.563158
9,EGE,Success@1,10,0.568421


### Check how many times an alternative is selected

In [106]:
metasim.counter

defaultdict(int, {5: 120, 6: 93, 7: 106, 8: 100, 9: 98, 10: 111})

In [107]:
#exp_df.to_csv('exp_df_ege_tol4_0.95.csv', index = False)