In [1]:
import json
import pandas as pd
from pyserini.search.lucene import LuceneImpactSearcher
from pyserini.pyclass import JFloat, JInt, JHashMap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def dot_product(dict1, dict2, verbose=True):
    """Compute the dot product of two sparse vectors stored as dicts.

    Each dict maps a term to a numeric weight. Only terms present in both
    dicts contribute to the result.

    Args:
        dict1: Mapping of term -> weight.
        dict2: Mapping of term -> weight.
        verbose: When True (the default, which keeps the original behaviour),
            print the per-term contributions of the shared terms as a list of
            ``[term, product]`` pairs sorted by descending product.

    Returns:
        The sum of ``dict1[t] * dict2[t]`` over every term ``t`` found in both
        dicts; ``0`` when the dicts share no terms.
    """
    # Compute each shared-term product once and reuse it for both the debug
    # listing and the final sum. Iteration follows dict1's order, so the
    # summation order matches a plain pass over dict1.
    contributions = [
        [term, weight * dict2[term]]
        for term, weight in dict1.items()
        if term in dict2
    ]

    if verbose:
        print(sorted(contributions, key=lambda pair: -pair[1]))

    return sum(product for _, product in contributions)


def create_jquery(encoded_query, searcher, fields=None):
    """Convert an encoded query into a Java ``HashMap`` of token -> integer weight.

    Args:
        encoded_query: Mapping of token -> weight. Each weight is wrapped with
            ``JInt`` before being stored.
        searcher: Not used by this function; kept so existing calls that pass
            a searcher keep working.
        fields: Not used by this function; kept for call compatibility. The
            default is ``None`` rather than a shared mutable ``{}``.

    Returns:
        A ``JHashMap`` holding one entry per token of ``encoded_query``.
    """
    # Every token is kept: no IDF-based filtering against the searcher is
    # applied here.
    jquery = JHashMap()
    for token, weight in encoded_query.items():
        jquery.put(token, JInt(weight))

    return jquery

In [46]:
# Input locations for this comparison. The shared directory prefixes are
# spelled out once so that pointing the notebook at another checkout only
# requires editing the base constants below.
SPLADE_ROOT = "/home/lamdo/splade"
METADATA_DIR = SPLADE_ROOT + "/pyserini_evaluation/metadata"
DORIS_MAE_DIR = SPLADE_ROOT + "/data/doris_mae/doris_mae"

# Per-run evaluation dumps (loaded as JSON further down).
phrase_splade_metadata_path = METADATA_DIR + "/doris_mae__phrase_splade_52"
normal_splade_metadata_path = METADATA_DIR + "/doris_mae__original_spladev2_max"

# Relevance judgements (TSV) and query collection (JSON Lines).
qrel_path = DORIS_MAE_DIR + "/qrels/test.tsv"
queries_path = DORIS_MAE_DIR + "/queries.jsonl"

In [47]:
# Open one impact searcher per index. No query encoder is attached: later
# cells search through `.object.search` with a hand-built token -> weight map.
phrase_splade_searcher = LuceneImpactSearcher(
    "/scratch/lamdo/beir_splade/indexes/doris_mae__phrase_splade_52",
    query_encoder=None,
)
normal_splade_searcher = LuceneImpactSearcher(
    "/scratch/lamdo/beir_splade/indexes/doris_mae__original_spladev2_max",
    query_encoder=None,
)

In [48]:
# Read the qrels TSV and group its judgements by query id:
#   {query_id (str): [{"docid": ..., "score": ...}, ...]}
_qrels = pd.read_csv(qrel_path, sep='\t').to_dict("records")

qrel_metadata = {}
for judgement in _qrels:
    # Query ids are normalised to strings; document ids are stored as read.
    qid = str(judgement["query-id"])
    entry = {"docid": judgement["corpus-id"], "score": judgement["score"]}
    qrel_metadata.setdefault(qid, []).append(entry)

In [49]:
# Load the query collection (JSON Lines: one object per line carrying "_id"
# and "text") into a dict of query id -> query text.
queries = {}
# JSON text is UTF-8; decode explicitly instead of relying on the locale default.
with open(queries_path, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        jline = json.loads(line)
        # Assign directly instead of going through a global named `id`, which
        # would shadow the builtin `id()` for the rest of the kernel session.
        queries[jline["_id"]] = jline["text"]

In [50]:
# Load the evaluation dump of each model. Later cells read their
# "encoded_queries" and "predictions" entries.
# JSON text is UTF-8, so decode explicitly instead of relying on the locale default.
with open(phrase_splade_metadata_path, encoding="utf-8") as f:
    phrase_splade_metadata = json.load(f)

with open(normal_splade_metadata_path, encoding="utf-8") as f:
    normal_splade_metadata = json.load(f)

In [79]:
# Pick one encoded query by its position in the dumps and show its sparse
# representation under both models, heaviest terms first.
chosen_index = 2

phrase_entry = phrase_splade_metadata["encoded_queries"][chosen_index]
normal_entry = normal_splade_metadata["encoded_queries"][chosen_index]

phrase_splade_query_rep = phrase_entry["vector"]
normal_splade_query_rep = normal_entry["vector"]

# The query id is read from the normal-SPLADE dump only.
# NOTE(review): this assumes both dumps list queries in the same order - confirm.
query_id = str(normal_entry["query_id"])

print("QueryId", query_id, "->", queries[query_id])
print(dict(sorted(phrase_splade_query_rep.items(), key=lambda pair: -pair[1])), end="\n\n")
print(dict(sorted(normal_splade_query_rep.items(), key=lambda pair: -pair[1])))

QueryId 2 -> I am a user marketing manager, and I am planning to train a deep learning model to analyze user engagement data and thus identify key factors that influence it. With such a model, I can continuously analyze the data and adjust marketing strategies, recommendations and rewards programs to maintain user engagement or increase future revenue. Also, results from the model can also allow me to treat users with personalized marketing campaigns.  Therefore, I hope the model can track and analyze the data from customers in real-time.
{'subtitle': 178.0, 'personalization': 165.0, 'social-media': 158.0, 'recommendation-system': 155.0, 'deep-learning': 144.0, 'user-interaction': 139.0, 'marketing': 125.0, 'deep': 114.0, 'company': 110.0, '.': 108.0, 'user': 98.0, 'marketing-communication': 97.0, 'i': 94.0, 'am': 94.0, 'facebook': 94.0, 'real-time': 93.0, 'malaysia': 86.0, 'people': 85.0, 'smart-phone': 83.0, 'track': 81.0, 'price': 80.0, 'data': 77.0, 'software': 76.0, 'word': 75.0, 

In [80]:
# Collect, as strings, the ids of the documents judged relevant (truthy
# score) for the query selected above.
chosen_query_id = query_id
relevant_judgements = filter(
    lambda judgement: judgement["score"],
    qrel_metadata[chosen_query_id],
)
doc_ids = [str(judgement["docid"]) for judgement in relevant_judgements]

In [81]:
# Compare the score each model gave to every relevant document of the chosen
# query against that model's overall score range for the same query.
normal_scores = normal_splade_metadata["predictions"][chosen_query_id]
phrase_scores = phrase_splade_metadata["predictions"][chosen_query_id]

min_normal = min(normal_scores.values())
max_normal = max(normal_scores.values())
min_phrase = min(phrase_scores.values())
max_phrase = max(phrase_scores.values())

print("score range normal", min_normal, max_normal)
print("score range phrase", min_phrase, max_phrase)
for docid in doc_ids:
    # A document absent from a model's predictions is reported with score 0.
    print(docid)
    print("normal splade", normal_scores.get(docid, 0))
    print("phrase splade", phrase_scores.get(docid, 0))

    print()

score range normal 191113.0 329239.0
score range phrase 298642.0 415275.0
119812
normal splade 0
phrase splade 354750.0

111207
normal splade 220158.0
phrase splade 0

249012
normal splade 0
phrase splade 0

218889
normal splade 254600.0
phrase splade 345013.0

178962
normal splade 0
phrase splade 0

205610
normal splade 254376.0
phrase splade 354305.0

31550
normal splade 0
phrase splade 0

28533
normal splade 0
phrase splade 321714.0

228216
normal splade 203204.0
phrase splade 306459.0

143766
normal splade 0
phrase splade 0

29085
normal splade 257984.0
phrase splade 342083.0

130033
normal splade 0
phrase splade 329908.0



In [82]:
# Fetch one document from each index and keep its sparse representation.
chosen_doc_id = "130033"

# The stored "raw" field is a JSON string; parse it once per index and reuse
# the result for both the text and the vector instead of fetching it twice.
phrase_splade_raw_doc = json.loads(phrase_splade_searcher.doc(chosen_doc_id).lucene_document().get("raw"))

print(phrase_splade_raw_doc["text"])

phrase_splade_doc_rep = phrase_splade_raw_doc["vector"]
normal_splade_doc_rep = json.loads(normal_splade_searcher.doc(chosen_doc_id).lucene_document().get("raw"))["vector"]

In this paper, we consider the detection of a decrease of engagement by users spontaneously interacting with a socially assistive robot in a public space. We first describe the UE-HRI dataset that collects spontaneous Human-Robot Interactions following the guidelines provided by the Affective Computing research community to collect data "in-the-wild". We then analyze the users' behaviors, focusing on proxemics, gaze, head motion, facial expressions and speech during interactions with the robot. Finally, we investigate the use of deep learning techniques (Recurrent and Deep Neural Networks) to detect user engagement decrease in realtime. The results of this work highlight, in particular, the relevance of taking into account the temporal dynamics of a user's behavior. Allowing 1 to 2 seconds as buffer delay improves the performance of taking a decision on user engagement.


In [83]:
# Show the chosen document's term weights under each model, heaviest first
# (phrase SPLADE, a blank line, then normal SPLADE).
print(dict(sorted(phrase_splade_doc_rep.items(), key=lambda pair: -pair[1])), end="\n\n")
print(dict(sorted(normal_splade_doc_rep.items(), key=lambda pair: -pair[1])))

{'deep-learning': 231, 'humanoid-robot': 203, 'robotic-system': 179, 'human-behavior': 178, 'parallel-robot': 158, 'object-manipulation': 144, 'human-robot-interaction': 140, 'user-interaction': 132, 'social-interaction': 130, 'emotional-expression': 130, 'spontaneous': 129, 'dataset': 126, 'robot': 125, 'detection-method': 123, 'hr': 120, 'multi-agent': 120, 'artificial-intelligence': 115, 'recurring': 110, 'subtitle': 108, 'disabled-people': 107, 'recurrent': 106, 'virtual-reality': 105, 'delay-time': 105, 'deep': 104, 'dynamic-behavior': 103, 'fly': 100, '##emi': 99, 'intelligent-robot': 99, 'detection': 97, 'spontaneously': 97, 'ic': 96, 'detection-algorithm': 96, 'engagement': 95, 'buffer': 95, 'wild': 94, 'neural-network': 94, 'guidelines': 93, 'affect': 91, 'emotion': 91, 'u': 90, 'negative-affect': 90, 'buffers': 90, 'assist': 89, 'interaction': 88, 'delay': 88, 'hrd': 88, 'cognitive-architecture': 87, 'virtual-space': 86, 'physical-space': 86, '##x': 85, 'pro': 85, 'decrease':

In [84]:
# Recompute the document/query score for each model from the sparse vectors;
# the cell displays the pair (phrase score, normal score).
phrase_score = dot_product(phrase_splade_doc_rep, phrase_splade_query_rep)
normal_score = dot_product(normal_splade_doc_rep, normal_splade_query_rep)
phrase_score, normal_score

[['deep-learning', 33264.0], ['subtitle', 19224.0], ['user-interaction', 18348.0], ['social-media', 12956.0], ['deep', 11856.0], ['.', 7668.0], ['company', 7370.0], ['engagement', 6840.0], ['artificial-intelligence', 6785.0], ['personalization', 6600.0], ['people', 6545.0], ['malaysia', 6450.0], ['human-behavior', 6408.0], ['user', 6370.0], ['facebook', 6204.0], ['word', 6150.0], ['algorithm', 5100.0], ['technology', 5041.0], ['real-time', 4836.0], ['usability', 4672.0], ['neural-network', 4512.0], ['software', 4332.0], ['smart-phone', 4150.0], ['video', 4095.0], ['recommendation-system', 4030.0], ['location-management', 4028.0], ['the', 3960.0], ['innovation-diffusion', 3854.0], ['data', 3542.0], ['game', 3540.0], ['image', 3021.0], [',', 2856.0], ['dataset', 2646.0], ['technology-acceptance', 2550.0], ['learning', 2520.0], ['network', 2499.0], ['store', 2376.0], ['cloud-computing', 2254.0], ['school', 2146.0], ['price', 2080.0], ['clinical-guidelines', 2072.0], ['smart', 2052.0], ['e

(329908.0, 185036)

In [30]:
# Show the current query id, then its term weights under each model, heaviest
# first (phrase SPLADE, a blank line, then normal SPLADE).
print("QueryId", query_id)
print(dict(sorted(phrase_splade_query_rep.items(), key=lambda pair: -pair[1])), end="\n\n")
print(dict(sorted(normal_splade_query_rep.items(), key=lambda pair: -pair[1])))

QueryId PLAIN-2
{'statins': 242.0, 'cholesterol': 187.0, 'stat': 145.0, 'fat': 138.0, 'breast': 138.0, 'cecil': 131.0, 'breast-cancer': 131.0, 'cancer': 130.0, 'al.': 124.0, '.': 118.0, 'drug': 110.0, 'patient': 102.0, '?': 101.0, 'cancer-research': 101.0, 'zein': 101.0, '##in': 96.0, 'do': 91.0, 'blood': 90.0, 'new-drug': 80.0, 'cause': 79.0, 'mammary-gland': 79.0, 'tumor': 78.0, 'drugs': 77.0, 'tax': 75.0, 'causation': 75.0, 'people': 73.0, 'gene': 73.0, 'drug-treatment': 71.0, 'women': 69.0, 'lipid': 69.0, 'price': 65.0, 'protective-effect': 65.0, 'breast-carcinoma': 65.0, 'us': 64.0, 'vldl': 62.0, 'compound': 60.0, 'mammography': 58.0, 'menopause': 56.0, 'pathogenesis': 55.0, 'compounds': 53.0, 'sipm': 53.0, 'content': 51.0, 'therapies': 51.0, 'breast-cancers': 50.0, 'medication': 49.0, 'hdl': 49.0, 'misuse': 48.0, 'squalene': 48.0, 'cusum': 48.0, 'serum-cholesterol': 48.0, 'ld': 47.0, 'prescription-drugs': 47.0, 'food': 46.0, 'medicine': 46.0, 'pregnancy': 46.0, 'sterol': 45.0, 'm

In [127]:
# Display the phrase-SPLADE predictions (document id -> score) stored for the
# current query.
phrase_predictions_for_query = phrase_splade_metadata["predictions"][query_id]
phrase_predictions_for_query

{'MED-2431': 75067.0,
 'MED-10': 73880.0,
 'MED-2429': 73096.0,
 'MED-14': 73082.0,
 'MED-2439': 64919.0,
 'MED-2435': 64462.0,
 'MED-4695': 64196.0,
 'MED-4226': 63963.0,
 'MED-2434': 63489.0,
 'MED-4117': 62829.0,
 'MED-4223': 62120.0,
 'MED-3856': 61849.0,
 'MED-1371': 60873.0,
 'MED-3832': 60854.0,
 'MED-1825': 60557.0,
 'MED-2122': 60474.0,
 'MED-2436': 59550.0,
 'MED-4097': 59150.0,
 'MED-3849': 58640.0,
 'MED-2437': 58320.0,
 'MED-2440': 58168.0,
 'MED-3833': 57984.0,
 'MED-3841': 57984.0,
 'MED-1564': 57860.0,
 'MED-4827': 57724.0,
 'MED-4096': 57613.0,
 'MED-950': 57523.0,
 'MED-4652': 57403.0,
 'MED-3799': 57374.0,
 'MED-3860': 57340.0,
 'MED-1193': 57338.0,
 'MED-2427': 57254.0,
 'MED-1829': 56792.0,
 'MED-5357': 56745.0,
 'MED-4057': 56610.0,
 'MED-4465': 56494.0,
 'MED-4787': 56346.0,
 'MED-2103': 56155.0,
 'MED-3205': 55775.0,
 'MED-3766': 55582.0,
 'MED-838': 55308.0,
 'MED-4440': 55103.0,
 'MED-4049': 55088.0,
 'MED-4643': 54946.0,
 'MED-3862': 54922.0,
 'MED-3447': 549

In [131]:
# Build a one-term probe query to inspect what the phrase index returns for it.
single_term_query = {"clinical-trial": 100}
jquery = create_jquery(single_term_query, phrase_splade_searcher)

In [132]:
# Run the pre-built query map directly against the wrapped Java searcher,
# bypassing query encoding, and keep the requested number of hits.
num_hits = 100
hits = phrase_splade_searcher.object.search(jquery, num_hits)

In [133]:
# Display the hit list returned by the search above. The cell output is the
# default repr of each Java hit object, not its document id or score.
hits

[<io.anserini.search.ScoredDoc at 0x7f8fc4fce610 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b20e2 at 0x7f8fc5af71b0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc4fce1b0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b20f2 at 0x7f8fc5c170d0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc4fcf970 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b20fa at 0x7f8fc5c16ad0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc4fce2f0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b210a at 0x7f8fc5c16d10>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc562b8d0 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b2112 at 0x7f8fc5c16af0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc5628900 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b2122 at 0x7f8fc5c173d0>>,
 <io.anserini.search.ScoredDoc at 0x7f8fc562b510 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x5556164b212a at 0x7f8fc5c16b30>>,
 <io.anserini

In [105]:
# Display the raw stored JSON string of the chosen document as held in the
# phrase-SPLADE index.
chosen_phrase_doc = phrase_splade_searcher.doc(chosen_doc_id)
chosen_phrase_doc.lucene_document().get("raw")

'{\n  "_id" : "MED-2437",\n  "title" : "Diet and breast cancer: understanding risks and benefits.",\n  "text" : "BACKGROUND: Breast cancer is the most commonly diagnosed cancer among women in the United States. Extensive research has been completed to evaluate the relationship between dietary factors and breast cancer risk and survival after breast cancer; however, a summary report with clinical inference is needed. Materials and METHODS: This review summarizes the current epidemiological and clinical trial evidence relating diet to breast cancer incidence, recurrence, survival, and mortality. The review includes emerging epidemiological studies that assess risk within breast cancer subtypes as well as a summary of previous and ongoing dietary intervention trials designed to modify breast cancer risk. RESULTS: The available literature suggests that both low-fat and high-fiber diets may be weakly protective against breast cancer, whereas total energy intake and alcohol appear to be posi

In [82]:
# List the tokens currently stored in the Java query map, one per line.
for query_token in jquery.keySet():
    print(query_token)

toxicity
inhibitors
stat
gene
drugs
ch
vitamin d
cause
breast cancer
do
cho
drug
does
causes
cancer
breast
?
status


In [70]:
# Display the phrase-SPLADE query representation (token -> weight) currently
# held in the kernel.
phrase_splade_query_rep

{'?': 1,
 'do': 96,
 'does': 34,
 'cause': 99,
 'status': 1,
 'drug': 75,
 'cancer': 129,
 'gene': 122,
 'causes': 29,
 'drugs': 94,
 'breast': 98,
 'ch': 2,
 'cho': 164,
 'toxicity': 25,
 'inhibitors': 17,
 'stat': 138,
 'breast cancer': 158,
 'vitamin d': 51}