## Installing packages

In [None]:
!pip install pyserini openai faiss-cpu -qq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.1/154.1 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m84.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m20.6 MB/s[0m eta

## Downloading data

In [None]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz
!tar -xvf ./queries.tar.gz

--2023-06-14 20:46:19--  https://msmarco.blob.core.windows.net/msmarcoranking/queries.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 20.150.34.4
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|20.150.34.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18882551 (18M) [application/gzip]
Saving to: ‘queries.tar.gz’


2023-06-14 20:46:23 (4.39 MB/s) - ‘queries.tar.gz’ saved [18882551/18882551]

queries.dev.tsv
queries.eval.tsv
queries.train.tsv


## Import packages

In [None]:
import os
import random
import time

import openai
import json
import pandas as pd
from pyserini.search.lucene import LuceneSearcher
from tqdm.auto import tqdm

# Read the key from the environment so it never has to be pasted into (and saved
# with) the notebook. Falls back to the empty string when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
# Name of the prebuilt Lucene index to search; it is downloaded when not cached
# locally (the recorded output shows a ~2 GB archive being fetched).
prebuilt_index_name = 'msmarco-v1-passage'
searcher = LuceneSearcher.from_prebuilt_index(prebuilt_index_name)

Downloading index at https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz...


lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz: 2.02GB [01:14, 29.2MB/s]                            


In [None]:
# Use a path relative to the working directory: that is where the download cell
# extracts queries.train.tsv, so this no longer depends on the cwd being /content.
# The TSV has no header row; the two columns are named explicitly below.
df = pd.read_csv("queries.train.tsv", sep="\t", header=None)
df.columns = ["id", "query"]
df.head()

Unnamed: 0,id,query
0,121352,define extreme
1,634306,what does chattel mean on credit history
2,920825,what was the great leap forward brainly
3,510633,tattoo fixers how much does it cost
4,737889,what is decentralization process.


In [None]:
def generate_permutation_chat(query, passages):
  """Build the chat-format prompt that asks the model to rank `passages` for `query`.

  The conversation consists of a system message, an opening user/assistant
  exchange, one user/assistant pair per passage (passages are numbered from 1),
  and a final user message requesting the ranking in `[] > []` form.

  Args:
    query: the search query text.
    passages: sized iterable of passage texts, in the order they should be numbered.

  Returns:
    A list of {"role": ..., "content": ...} dicts.
  """
  num_passages = len(passages)

  conversation = [
    {"role": "system", "content": "You are RankGPT, an intelligent assistant that can rank passages based on their relevancy to the query."},
    {"role": "user", "content": f"I will provide you with {num_passages} passages, each indicated by number identifier []. Rank them based on their relevance to query: {query}."},
    {"role": "assistant", "content": "Okay, please provide the passages."},
  ]

  # Each passage is sent as its own user turn and acknowledged by the assistant.
  for number, passage in enumerate(passages, start=1):
    conversation.extend([
      {"role": "user", "content": f"[{number}] {passage}"},
      {"role": "assistant", "content": f"Received passage [{number}]"},
    ])

  final_instruction = f"Search Query: {query}. Rank the {num_passages} passages above based on their relevance to the search query. The passages should be listed in descending order using identifiers, and the most relevant passages should be listed first, and the output format should be [] > [], e.g., [1] > [2]. Only response the ranking results, do not say any word or explain."
  conversation.append({"role": "user", "content": final_instruction})
  return conversation

In [None]:
def generate_permutation_text(query, passages):
  """Build the single-string (non-chat) version of the ranking prompt.

  The prompt is a newline-joined sequence of: two introductory lines, one
  `[n] passage` line per passage (numbered from 1), the query, the ranking
  instructions, and a trailing line that the ranking is meant to follow.

  Args:
    query: the search query text.
    passages: sized iterable of passage texts, in the order they should be numbered.

  Returns:
    The prompt as one string.
  """
  num_passages = len(passages)

  preamble = [
    "You are an intelligent assistant that can rank passages based on their relevancy to the query.",
    f"The following are {num_passages} passages, each indicated by number identifier []. I can rank them based on their relevance to query: {query}",
  ]
  numbered_passages = [
    f"[{number}] {passage}" for number, passage in enumerate(passages, start=1)
  ]
  closing = [
    f"The search query is: {query}",
    f"I will rank the {num_passages} passages above based on their relevance to the search query. The passages will be listed in descending order using identifiers, and the most relevant passages should be listed first, and the output format should be [] > [] > etc, e.g., [1] > [2] > etc.",
    f"The ranking results of the {num_passages} passages (only identifiers) is:",
  ]

  return "\n".join(preamble + numbered_passages + closing)

In [None]:
def generate_listwise_ranking(searcher, model, query, k=10):
  """Retrieve passages for `query` and ask a chat model to rank them listwise.

  Args:
    searcher: object whose `search(query, k=...)` returns hits; each hit's
      `.raw` must be a JSON string with a "contents" field.
    model: model name forwarded to `openai.ChatCompletion.create`.
    query: the search query text.
    k: number of passages to retrieve. Defaults to 10, which is the value the
      previous `random.randint(10, 10)` call always produced.

  Returns:
    A `(permutation_text, listwise_ranking)` tuple: the plain-text prompt built
    by `generate_permutation_text` for the retrieved passages, and the content
    of the first choice returned by the chat completion.
  """
  hits = searcher.search(query, k=k)
  ranked_passages = [json.loads(hit.raw)["contents"] for hit in hits]

  # Both prompt variants are built from the same passage list: the chat form is
  # sent to the model, the text form is returned alongside the model's answer.
  chat_messages = generate_permutation_chat(query, ranked_passages)
  permutation_text = generate_permutation_text(query, ranked_passages)

  completion = openai.ChatCompletion.create(
    model=model,
    messages=chat_messages
  )
  listwise_ranking = completion.choices[0].message["content"]

  return permutation_text, listwise_ranking

In [None]:
# Start with an empty dataset on a fresh kernel, but keep any examples already
# collected in this session so that generation can resume where it stopped.
if "longt5_dataset" not in globals():
  longt5_dataset = []

len(longt5_dataset)

162

In [None]:
# Resume support: skip the queries that already have an entry in
# longt5_dataset and stop at the 1000th training query.
all_queries = df["query"].tolist()
remaining_queries = all_queries[len(longt5_dataset):1000]

for query in tqdm(remaining_queries):
  example = generate_listwise_ranking(searcher, "gpt-3.5-turbo", query)
  longt5_dataset.append(example)
  # Pause between consecutive API requests (presumably to stay under rate limits).
  time.sleep(5)

  0%|          | 0/345 [00:00<?, ?it/s]

In [None]:
# Write the collected (prompt, ranking) pairs as a tab-separated file with
# neither a header row nor an index column.
output_path = "longt5_dataset.tsv"
df_longt5 = pd.DataFrame(longt5_dataset)
df_longt5.to_csv(
  output_path,
  sep="\t",
  header=False,
  index=None,
)