# Installs

In [None]:
!pip install transformers -q
!pip install sentence-transformers -q
!pip install gdown -q

## PyTorch CUDA config

In [1]:
# `!export ...` runs in a throw-away subshell, so the variable never reaches the
# kernel process. Set it on the kernel's own environment instead.
import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'garbage_collection_threshold:0.6'

# Paths

In [2]:
# Output directory for generated embeddings (created in the next cell).
EMBEDDINGS_PATH = './EMBEDDINGS2/'
# Checkout of the upstream CSFCube repository; the paper abstracts are read from here.
CSFCUBE_REPO_PATH = './CSFCube-repo/'
# Unzipped custom CSFCube copy; its eval scripts and annotation files drive the evaluation.
CSFCUBE_EVAL_SCRIPTS_PATH = './CSFCube-master/'

In [3]:
import os

# exist_ok=True removes the check-then-create race and keeps the cell safe to re-run;
# makedirs also creates any missing parent directories.
os.makedirs(EMBEDDINGS_PATH, exist_ok=True)

# CSFCube

In [4]:
# Refresh the apt package index before installing system packages.
!apt-get update -y

Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease                      
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 119 kB in 1s (155 kB/s)
Reading package lists... Done


In [5]:
# Install git (needed to clone the CSFCube repository) and the zip utility.
# NOTE(review): the archive is later extracted with `unzip`, which is a separate
# apt package from `zip` — confirm it is available in the target image.
!apt-get install git-all -y
!apt-get install zip -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-all is already the newest version (1:2.34.1-1ubuntu1.10).
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
zip is already the newest version (3.0-12build2).
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.


## Clone CSFCube Repository

In [6]:
!git clone https://github.com/iesl/CSFCube.git $CSFCUBE_REPO_PATH

fatal: destination path './CSFCube-repo' already exists and is not an empty directory.


## Download Custom Version

The eval scripts were changed to output all metrics (nDCG, MAP, etc.) as a dict.

In [7]:
# Download CSFCube-master.zip (the customised CSFCube copy) from Google Drive by file id.
!gdown '1-Ltm1fAhXPW64XROBmX_qc_WOiLr4AGX'

Downloading...
From: https://drive.google.com/uc?id=1-Ltm1fAhXPW64XROBmX_qc_WOiLr4AGX
To: /notebooks/paper2/CSFCube-master.zip
100%|██████████████████████████████████████| 5.50M/5.50M [00:00<00:00, 68.5MB/s]


In [8]:
# Extract the archive into ./CSFCube-master/; -o overwrites existing files without prompting.
!unzip -o CSFCube-master.zip

Archive:  CSFCube-master.zip
  inflating: CSFCube-master/abstracts-csfcube-preds.jsonl  
  inflating: CSFCube-master/queries-release.csv  
  inflating: CSFCube-master/README.md  
 extracting: CSFCube-master/.gitignore  
  inflating: CSFCube-master/test-pid2pool-csfcube.json  
  inflating: CSFCube-master/test-pid2anns-csfcube-method.json  
  inflating: CSFCube-master/ann_guidelines.pdf  
  inflating: CSFCube-master/readable_annotations/6431039-background-adju.txt  
  inflating: CSFCube-master/readable_annotations/10695055-background-adju.txt  
  inflating: CSFCube-master/readable_annotations/7898033-background-adju.txt  
  inflating: CSFCube-master/readable_annotations/8781666-background-adju.txt  
  inflating: CSFCube-master/readable_annotations/1936997-background-adju.txt  
  inflating: CSFCube-master/readable_annotations/6431039-result-adju.txt  
  inflating: CSFCube-master/readable_annotations/53080736-result-adju.txt  
  inflating: CSFCube-master/readable_annotations/5052952-method

## Read all Paper Abstracts

## Read Abstracts

In [9]:
import json

# Maps paper_id -> the full JSON record of that paper.
pid2abstract = {}

# Read in paper text data (JSON Lines: one JSON object per line).
with open(f"{CSFCUBE_REPO_PATH}abstracts-csfcube-preds.jsonl", 'r', encoding='utf-8') as absfile:
    for line in absfile:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads fail on an empty string.
        if not line:
            continue
        injson = json.loads(line)
        pid2abstract[injson['paper_id']] = injson

In [10]:
# Store, on every paper record, the character length of "<title> <abstract sentences>".
for paper_dict in pid2abstract.values():
    full_text = paper_dict['title'] + ' ' + ' '.join(paper_dict['abstract'])
    paper_dict['len'] = len(full_text)

In [11]:
# Re-insert the papers from shortest to longest text: batches of similar length
# need less padding, which speeds up inference.
pid2abstract = {
    pid: paper
    for pid, paper in sorted(pid2abstract.items(), key=lambda entry: entry[1]['len'])
}

# Generate Embeddings

## Functions

In [12]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForMaskedLM, AutoTokenizer
from tqdm.auto import tqdm
import torch
from typing import Dict, List

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

@torch.no_grad()
def get_dense_embeddings(dense_model, texts):
    """Encode ``texts`` with a dense model.

    Args:
        dense_model: object exposing ``encode(texts, convert_to_tensor=...)``
            (a ``SentenceTransformer`` in this notebook).
        texts: list of input strings.

    Returns:
        Whatever ``dense_model.encode`` returns when asked for tensor output.
    """
    # Pass a real boolean: the original string 'True' only worked because any
    # non-empty string is truthy.
    return dense_model.encode(texts, convert_to_tensor=True)

@torch.no_grad()
def get_sparse_embeddings(sparse_model, tokenizer, texts: List[str]):
    """Encode ``texts`` into one sparse-style vector per text from masked-LM logits.

    Each token's logits are transformed with ``log(1 + relu(logit))``, padding
    positions are zeroed through the attention mask, and the maximum over
    dim 1 is taken (assumed to be the token axis of ``(batch, tokens, vocab)``
    logits — TODO confirm for non-HF models).

    Args:
        sparse_model: torch module returning an object with a ``.logits`` tensor.
        tokenizer: callable producing ``input_ids``/``attention_mask`` tensors
            with a ``.to(device)`` method.
        texts: list of input strings; tokenized with truncation and padded to
            the longest item in the batch.

    Returns:
        Tensor with one row per input text, located on the model's device.
    """
    tokens = tokenizer(texts,
                       return_tensors='pt',
                       padding='longest',
                       truncation=True,)

    # Follow the model's own device instead of a module-level global, so the
    # inputs always land where the weights are.
    model_device = next(sparse_model.parameters()).device
    tokens = tokens.to(model_device)

    output = sparse_model(**tokens)

    # Per-token term weights, zeroed at padding positions.
    token_weights = torch.log(
        1 + torch.relu(output.logits)
    ) * tokens.attention_mask.unsqueeze(-1)

    # torch.max returns (values, indices); only the values are needed.
    vec = torch.max(token_weights, dim=1)[0]

    return vec

@torch.no_grad()
def generate_embeddings(**kwargs):
    """Embed every paper in the module-level ``pid2abstract`` with one model.

    Keyword Args:
        model_type: ``"dense"`` (loaded with ``SentenceTransformer``) or
            ``"sparse"`` (loaded with ``AutoModelForMaskedLM``).
        model_id: model identifier passed to the loader.
        batch_size: number of papers encoded per step.
        tokenizer_id: optional, sparse models only; tokenizer identifier to use
            instead of ``model_id``.
        Any other keys are ignored.

    Returns:
        dict mapping paper_id -> embedding tensor moved to the CPU.

    Raises:
        ValueError: if ``model_type`` is neither ``"dense"`` nor ``"sparse"``.
    """
    model_type = kwargs['model_type']
    model_id = kwargs['model_id']
    tokenizer_id = kwargs.get('tokenizer_id')
    batch_size = kwargs['batch_size']

    # Fail early and clearly; previously an unknown type surfaced later as an
    # unbound-variable error inside the batching loop.
    if model_type not in ("dense", "sparse"):
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'dense' or 'sparse'."
        )

    tokenizer = None
    if model_type == "dense":
        model = SentenceTransformer(model_id)
    else:
        model = AutoModelForMaskedLM.from_pretrained(model_id).to(device)
        model.eval()
        # Fall back to the model's own tokenizer when none is given explicitly.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_id if tokenizer_id else model_id)
        # Tokenizers reporting a max length above 1024 are capped to 512 tokens
        # (presumably the reported value is a placeholder — TODO confirm).
        if tokenizer.model_max_length > 1024:
            tokenizer.model_max_length = 512

    output_embeddings = {}

    paper_ids = list(pid2abstract.keys())

    for i in tqdm(range(0, len(paper_ids), batch_size)):
        batch_papers_ids = paper_ids[i:i+batch_size]

        papers_batch_text = []
        for batch_paper_id in batch_papers_ids:
            paper_dict = pid2abstract[batch_paper_id]

            paper_title = paper_dict['title']
            paper_abstract = ' '.join(paper_dict['abstract'])

            if model_type == "dense":
                # Dense input: lower-cased title and abstract joined by the
                # tokenizer's separator token.
                paper_input_text = (paper_title.lower()
                                    + model.tokenizer.sep_token
                                    + paper_abstract.lower())
            else:
                # Sparse input: original casing, joined by a single space.
                paper_input_text = paper_title + ' ' + paper_abstract
            papers_batch_text.append(paper_input_text)

        if model_type == "dense":
            embeddings = get_dense_embeddings(model, papers_batch_text)
        else:
            embeddings = get_sparse_embeddings(model, tokenizer, papers_batch_text)

        # Move every vector to the CPU so results do not accumulate on the GPU.
        for embedding, paper_id in zip(embeddings, batch_papers_ids):
            output_embeddings[paper_id] = embedding.detach().cpu()

    return output_embeddings

In [13]:
import pickle

# Models to evaluate. Each entry is passed as keyword arguments to
# generate_embeddings; 'model_name' is the short label used for run/result names.
models = [
    {
        "model_id": "gubartz/dense_model",
        "model_name": "hf_dense_model",
        "batch_size": 128,
        "model_type": "dense"
    },
    {
        "model_id": "naver/splade-cocondenser-ensembledistil",
        "model_name": "splade-cocondenser-ensembledistil",
        "batch_size": 32,
        "model_type": "sparse"},
    {
        "model_id": "gubartz/sparse_model",
        "model_name": "hf_sparse_model",
        "batch_size": 32,
        "model_type": "sparse",
        "tokenizer_id": "allenai/scibert_scivocab_uncased"
    }
]

# Saving was previously disabled by wrapping the code in a bare string literal.
# Set this to True to also pickle each model entry (including its embeddings)
# under EMBEDDINGS_PATH.
SAVE_EMBEDDINGS = False

for model in tqdm(models):
    print(model['model_id'])
    embeddings = generate_embeddings(**model)
    # Keep the embeddings on the entry so the evaluation cells can read them.
    model['embeddings'] = embeddings

    if SAVE_EMBEDDINGS:
        file_name = f"{model['model_type']}_{model['model_name']}.plk"
        save_path = f"{EMBEDDINGS_PATH}{file_name}"
        with open(save_path, "wb") as fOut:
            pickle.dump(model, fOut, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/3 [00:00<?, ?it/s]

gubartz/dense_model


  0%|          | 0/33 [00:00<?, ?it/s]

naver/splade-cocondenser-ensembledistil


  0%|          | 0/132 [00:00<?, ?it/s]

gubartz/sparse_model


  0%|          | 0/132 [00:00<?, ?it/s]

In [14]:
# Remove the leftover loop name. The tensors themselves stay alive, because each
# entry in `models` still references its embeddings via model['embeddings'].
del embeddings

# First Stage Results

In [15]:
import sys

# Make the CSFCube eval scripts importable (e.g. `import ranking_eval`); the
# membership test keeps re-runs from adding the same entry twice.
eval_scripts_dir = f"{CSFCUBE_EVAL_SCRIPTS_PATH}eval_scripts"
if eval_scripts_dir not in sys.path:
    sys.path.append(eval_scripts_dir)

In [16]:
import json
import ranking_eval

# Cosine similarity reduced along dim 0; the experiment functions call it on one
# query embedding and one candidate embedding at a time.
cos = torch.nn.CosineSimilarity(dim=0)

def run_experiment(facet, experiment_name, doc_dense_embeddings, doc_sparse_embeddings, split='test'):
    """Rank each query's candidate pool by cosine similarity and evaluate the run.

    For a concrete facet, the candidates of every query are scored with the
    cosine similarity of the supplied embeddings (dense only, sparse only, or
    the sum of both when both are given). The ranked pools are written to
    ``test-pid2pool-csfcube-<experiment_name>-<facet>-ranked.json`` in the
    current directory and then evaluated.

    For ``facet == 'all'`` nothing is ranked or written; only the evaluation is
    invoked (presumably it relies on the per-facet files written by earlier
    calls — confirm against ranking_eval).

    Args:
        facet: facet name used to pick the annotation file, or ``'all'``.
        experiment_name: run name used in the output file name and passed to
            the eval script as ``method_name``.
        doc_dense_embeddings: mapping paper_id -> dense embedding, or None.
        doc_sparse_embeddings: mapping paper_id -> sparse embedding, or None.
        split: forwarded unchanged to the eval script.

    Returns:
        The value returned by ``ranking_eval.graded_eval_pool_rerank``.

    Raises:
        ValueError: if a concrete facet is requested but both embedding
            mappings are None (there would be nothing to score with).
    """
    if facet != 'all':
        # Embedding tables that take part in the score; their similarities are summed.
        embedding_tables = [table
                            for table in (doc_dense_embeddings, doc_sparse_embeddings)
                            if table is not None]
        if not embedding_tables:
            raise ValueError(
                "At least one of doc_dense_embeddings / doc_sparse_embeddings must be provided."
            )

        # Read in pools for the queries per facet.
        with open(f"{CSFCUBE_EVAL_SCRIPTS_PATH}test-pid2anns-csfcube-" + facet + '.json', 'r', encoding='utf-8') as fp:
            qpid2pool = json.load(fp)

        qpid2pool_ranked = {}

        for qpid in tqdm(qpid2pool.keys(), leave=False):
            # Get the paper-ids for candidates.
            cand_pids = qpid2pool[qpid]['cands']

            query_cand_sim = []
            for cpid in cand_pids:
                sim = sum(cos(table[qpid], table[cpid]).item() for table in embedding_tables)
                query_cand_sim.append((cpid, sim))

            # Sort the candidates in predicted rank order - largest to smallest cosine sim.
            qpid2pool_ranked[qpid] = sorted(query_cand_sim, key=lambda cd: cd[1], reverse=True)

        # Write out the ranked pool in a format consumed by the eval script.
        with open('test-pid2pool-csfcube-' + experiment_name + '-' + facet + '-ranked.json', 'w', encoding='utf-8') as fp:
            json.dump(qpid2pool_ranked, fp)

    return ranking_eval.graded_eval_pool_rerank(data_path=f'{CSFCUBE_EVAL_SCRIPTS_PATH}',
                                                method_name=experiment_name,
                                                facet=facet,
                                                dataset='csfcube',
                                                run_path='./',
                                                split=split)

In [17]:
# 'all' comes last: for that facet run_experiment writes no ranking and only
# calls the eval script (presumably on the per-facet files written just before — confirm).
facets = ['background', 'method', 'result', 'all']

results = []

# Single-model runs: every model is evaluated on its own for every facet.
for model in models:
    embeddings = model['embeddings']
    experiment_name = model['model_name']
    uses_dense = model['model_type'] == "dense"
    for facet in facets:
        result = run_experiment(facet=facet,
                                experiment_name=experiment_name,
                                doc_dense_embeddings=embeddings if uses_dense else None,
                                doc_sparse_embeddings=None if uses_dense else embeddings)
        result['experiment_name'] = experiment_name
        result['facet'] = facet
        results.append(result)

# dense + sparse: the first two entries of `models`, scored by summed similarities.
dense_model, sparse_model = models[0], models[1]
dense_embeddings = dense_model['embeddings']
sparse_embeddings = sparse_model['embeddings']

for facet in facets:
    # '_' joins the names for the run/file name, '+' for the label stored in the results.
    experiment_name = f"{dense_model['model_name']}_{sparse_model['model_name']}"
    result = run_experiment(facet=facet,
                            experiment_name=experiment_name,
                            doc_dense_embeddings=dense_embeddings,
                            doc_sparse_embeddings=sparse_embeddings)

    result['experiment_name'] = f"{dense_model['model_name']}+{sparse_model['model_name']}"
    result['facet'] = facet
    results.append(result)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

# Rerank - Sparse + Dense

In [20]:
def run_experiment_rerank(facet, experiment_name, doc_dense_embeddings, doc_sparse_embeddings, split='test',
                          rerank_fraction=0.5, keep_remaining=False):
    """Two-stage ranking: sparse first stage, dense rerank of the top candidates.

    For a concrete facet, every query's candidates are first ranked by sparse
    cosine similarity; the top ``rerank_fraction`` of them are then re-scored
    and re-sorted by dense cosine similarity. The result is written to
    ``test-pid2pool-csfcube-<experiment_name>-<facet>-ranked.json`` and
    evaluated. For ``facet == 'all'`` only the evaluation is invoked.

    NOTE: a later cell defines another function with this same name (dense
    first stage, sparse rerank); whichever cell ran last is the active one.

    Args:
        facet: facet name used to pick the annotation file, or ``'all'``.
        experiment_name: run name used in the output file name and passed to
            the eval script as ``method_name``.
        doc_dense_embeddings: mapping paper_id -> dense embedding.
        doc_sparse_embeddings: mapping paper_id -> sparse embedding.
        split: forwarded unchanged to the eval script.
        rerank_fraction: share of the first-stage ranking (0..1) that is
            reranked; the default 0.5 is the previously hard-coded value.
        keep_remaining: when False (default, the original behaviour) candidates
            outside the reranked head are dropped from the output; when True
            they are appended after it in first-stage order.

    Returns:
        The value returned by ``ranking_eval.graded_eval_pool_rerank``.

    Raises:
        ValueError: if ``rerank_fraction`` is outside [0, 1].
    """
    if not 0 <= rerank_fraction <= 1:
        raise ValueError(f"rerank_fraction must be within [0, 1], got {rerank_fraction!r}")

    if facet != 'all':
        # Read in pools for the queries per facet.
        with open(f"{CSFCUBE_EVAL_SCRIPTS_PATH}test-pid2anns-csfcube-" + facet + '.json', 'r', encoding='utf-8') as fp:
            qpid2pool = json.load(fp)

        qpid2pool_ranked = {}

        for qpid in tqdm(qpid2pool.keys(), leave=False):
            # Get the paper-ids for candidates.
            cand_pids = qpid2pool[qpid]['cands']

            # First stage: sparse similarity over the whole pool.
            q_sparse_emb = doc_sparse_embeddings[qpid]
            query_cand_sim = [(cpid, cos(q_sparse_emb, doc_sparse_embeddings[cpid]).item())
                              for cpid in cand_pids]

            # Sort the candidates in predicted rank order - largest to smallest cosine sim.
            ranked_pool_sparse = sorted(query_cand_sim, key=lambda cd: cd[1], reverse=True)

            # Rerank: dense similarity over the head of the first-stage ranking.
            rerank_len = int(len(cand_pids) * rerank_fraction)
            q_dense_emb = doc_dense_embeddings[qpid]
            query_cand_sim_rerank = [(cpid, cos(q_dense_emb, doc_dense_embeddings[cpid]).item())
                                     for cpid, _ in ranked_pool_sparse[:rerank_len]]

            # Sort the reranked head - largest to smallest cosine sim.
            ranked_pool = sorted(query_cand_sim_rerank, key=lambda cd: cd[1], reverse=True)

            if keep_remaining:
                # Keep the non-reranked tail after the head, in first-stage order.
                ranked_pool = ranked_pool + ranked_pool_sparse[rerank_len:]

            qpid2pool_ranked[qpid] = ranked_pool

        # Write out the ranked pool in a format consumed by the eval script.
        with open('test-pid2pool-csfcube-' + experiment_name + '-' + facet + '-ranked.json', 'w', encoding='utf-8') as fp:
            json.dump(qpid2pool_ranked, fp)

    return ranking_eval.graded_eval_pool_rerank(data_path=f'{CSFCUBE_EVAL_SCRIPTS_PATH}',
                                                method_name=experiment_name,
                                                facet=facet,
                                                dataset='csfcube',
                                                run_path='./',
                                                split=split)

In [21]:
# Sparse first stage + dense rerank, using the first two entries of `models`.
dense_model, sparse_model = models[0], models[1]
dense_embeddings = dense_model['embeddings']
sparse_embeddings = sparse_model['embeddings']

for facet in facets:
    # Run/file name: sparse model first, joined with '_'.
    experiment_name = f"{sparse_model['model_name']}_{dense_model['model_name']}"
    result = run_experiment_rerank(facet=facet,
                                   experiment_name=experiment_name,
                                   doc_dense_embeddings=dense_embeddings,
                                   doc_sparse_embeddings=sparse_embeddings)

    # Label stored with the metrics in `results`.
    result['experiment_name'] = f"rerank-{sparse_model['model_name']}+{dense_model['model_name']}"
    result['facet'] = facet
    results.append(result)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

# Rerank - Dense + Sparse

In [22]:
def run_experiment_rerank(facet, experiment_name, doc_dense_embeddings, doc_sparse_embeddings, split='test',
                          rerank_fraction=0.5):
    """Two-stage ranking: dense first stage, sparse rerank of the top candidates.

    For a concrete facet, every query's candidates are first ranked by dense
    cosine similarity; the top ``rerank_fraction`` of them are then re-scored
    and re-sorted by sparse cosine similarity, and the remaining candidates are
    appended after them in first-stage order. The result is written to
    ``test-pid2pool-csfcube-<experiment_name>-<facet>-ranked.json`` and
    evaluated. For ``facet == 'all'`` only the evaluation is invoked.

    NOTE: this definition replaces the earlier function of the same name
    (sparse first stage, dense rerank) once this cell has been run.

    Args:
        facet: facet name used to pick the annotation file, or ``'all'``.
        experiment_name: run name used in the output file name and passed to
            the eval script as ``method_name``.
        doc_dense_embeddings: mapping paper_id -> dense embedding.
        doc_sparse_embeddings: mapping paper_id -> sparse embedding.
        split: forwarded unchanged to the eval script.
        rerank_fraction: share of the first-stage ranking (0..1) that is
            reranked; the default 0.5 is the previously hard-coded value.

    Returns:
        The value returned by ``ranking_eval.graded_eval_pool_rerank``.

    Raises:
        ValueError: if ``rerank_fraction`` is outside [0, 1].
    """
    if not 0 <= rerank_fraction <= 1:
        raise ValueError(f"rerank_fraction must be within [0, 1], got {rerank_fraction!r}")

    if facet != 'all':
        # Read in pools for the queries per facet.
        with open(f"{CSFCUBE_EVAL_SCRIPTS_PATH}test-pid2anns-csfcube-" + facet + '.json', 'r', encoding='utf-8') as fp:
            qpid2pool = json.load(fp)

        qpid2pool_ranked = {}

        for qpid in tqdm(qpid2pool.keys(), leave=False):
            # Get the paper-ids for candidates.
            cand_pids = qpid2pool[qpid]['cands']

            # First stage: dense similarity over the whole pool.
            q_dense_emb = doc_dense_embeddings[qpid]
            query_cand_sim = [(cpid, cos(q_dense_emb, doc_dense_embeddings[cpid]).item())
                              for cpid in cand_pids]

            # Sort the candidates in predicted rank order - largest to smallest cosine sim.
            ranked_pool_dense = sorted(query_cand_sim, key=lambda cd: cd[1], reverse=True)

            # Rerank: sparse similarity over the head of the first-stage ranking.
            rerank_len = int(len(cand_pids) * rerank_fraction)
            q_sparse_emb = doc_sparse_embeddings[qpid]
            query_cand_sim_rerank = [(cpid, cos(q_sparse_emb, doc_sparse_embeddings[cpid]).item())
                                     for cpid, _ in ranked_pool_dense[:rerank_len]]

            # Sort the reranked head - largest to smallest cosine sim.
            ranked_pool = sorted(query_cand_sim_rerank, key=lambda cd: cd[1], reverse=True)

            # Append every candidate that was not reranked, keeping its first-stage
            # order (and its first-stage score).
            reranked_ids = set(entry[0] for entry in ranked_pool)
            remain = [entry for entry in ranked_pool_dense if entry[0] not in reranked_ids]

            qpid2pool_ranked[qpid] = ranked_pool + remain

        # Write out the ranked pool in a format consumed by the eval script.
        with open('test-pid2pool-csfcube-' + experiment_name + '-' + facet + '-ranked.json', 'w', encoding='utf-8') as fp:
            json.dump(qpid2pool_ranked, fp)

    return ranking_eval.graded_eval_pool_rerank(data_path=f'{CSFCUBE_EVAL_SCRIPTS_PATH}',
                                                method_name=experiment_name,
                                                facet=facet,
                                                dataset='csfcube',
                                                run_path='./',
                                                split=split)

In [23]:
# Dense first stage + sparse rerank, using the first two entries of `models`.
dense_model = models[0]
sparse_model = models[1]

for facet in facets:
    dense_embeddings = dense_model['embeddings']
    sparse_embeddings = sparse_model['embeddings']
    # The plain "<dense>_<sparse>" name is already used by the dense + sparse sum
    # experiment in the "First Stage Results" section; reusing it here would
    # overwrite that experiment's ranked-pool files. Prefix the run name instead.
    experiment_name = 'rerank_' + dense_model['model_name'] + "_" + sparse_model['model_name']
    result = run_experiment_rerank(facet=facet,
                                   doc_dense_embeddings=dense_embeddings,
                                   doc_sparse_embeddings=sparse_embeddings,
                                   experiment_name=experiment_name)

    # Label stored with the metrics in `results` (unchanged).
    result['experiment_name'] = 'rerank-' + dense_model['model_name'] + "+" + sparse_model['model_name']
    result['facet'] = facet
    results.append(result)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

# Paper Results

In [24]:
import pandas as pd

# One row per (experiment, facet); express both reported metrics as percentages.
df = pd.DataFrame(results)
for metric in ('ndcg%20', 'mean_av_precision'):
    df[metric] = df[metric] * 100

In [25]:
# Export the reported columns to results.csv, using ',' as the decimal separator.
report_columns = ['experiment_name', 'facet', 'mean_av_precision', 'ndcg%20']
x = df.loc[:, report_columns]
x.to_csv("results.csv", decimal=",")

In [26]:
# Display the reported metrics per experiment and facet (both metric columns
# were multiplied by 100 when `df` was built).
df[['experiment_name', 'facet', 'mean_av_precision', 'ndcg%20']]

Unnamed: 0,experiment_name,facet,mean_av_precision,ndcg%20
0,hf_dense_model,background,55.326459,74.032761
1,hf_dense_model,method,26.229211,46.414835
2,hf_dense_model,result,41.839249,65.401976
3,hf_dense_model,all,40.865179,61.679622
4,splade-cocondenser-ensembledistil,background,48.953631,70.326641
5,splade-cocondenser-ensembledistil,method,31.005963,47.839076
6,splade-cocondenser-ensembledistil,result,38.900234,56.623795
7,splade-cocondenser-ensembledistil,all,39.49526,58.032655
8,hf_sparse_model,background,54.829069,73.818765
9,hf_sparse_model,method,28.977155,49.988563
