In [2]:
import sys
sys.path.insert(0, '../utils')
from helper import load_chroma, word_wrap, project_embeddings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import numpy as np

In [4]:
embedding_function = SentenceTransformerEmbeddingFunction()

chroma_collection = load_chroma(filename='../Evaluation Sets/Raptor Contract.docx', collection_name='contract', embedding_function=embedding_function)
chroma_collection.count()

650

In [5]:
query = "What is the purpose of the escrow?"
results = chroma_collection.query(query_texts=query, n_results=10, include=['documents', 'embeddings'])

retrieved_documents = results['documents'][0]

for document in results['documents'][0]:
    print(word_wrap(document))
    print('')

escrow.

this agreement, the escrow agreement or the contemplated transactions ;

“ escrow amount ” means, $ 1, 000, 000 “ facilities ” means any
buildings, plants, improvements or structures located on the real
property.

at the closing, buyer shall deposit the escrow amount with the escrow
agent to be held in a separate account ( the “ escrow account ” ). the
escrow account shall be governed by the terms of the escrow agreement.
all funds contained in the escrow account shall be held in escrow and
shall be available solely for the purpose of satisfying obligations of
the company securityholders, if any, as provided in section 2. 07 ( f
). closing deliveries.

to the sellers ’ representative, the employment agreements and any
other ancillary agreements to be entered into by buyer or parent at
closing, executed by buyer or parent ( as applicable ) ; to the escrow
agent, by wire transfer of immediately available funds, the escrow
amount ;

. in so doing, the buyer may rely on any and al

In [6]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [7]:
pairs = [[query, doc] for doc in retrieved_documents]
scores = cross_encoder.predict(pairs)
print("Scores:")
for score in scores:
    print(score)

Scores:
0.9157961
-0.65759283
2.1176157
4.324339
-1.2751368
-0.13444367
-0.00770735
-0.101533696
2.0051513
-0.9458619


In [8]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o+1)

New Ordering:
4
3
9
1
7
8
6
2
10
5


Re-ranking with Query Expansion

In [9]:
original_query = "What is the purpose of the escrow?"
generated_queries = [
    "Who is responsible for managing the escrow?"
    "How is the escrow amount determined?"
    "What conditions need to be met before the escrow is released?"
    "Can the escrow amount be adjusted during the contract term?"
    "What happens if there is a dispute regarding the escrow funds?"
]

In [10]:
queries = [original_query] + generated_queries

results = chroma_collection.query(query_texts=queries, n_results=10, include=['documents', 'embeddings'])
retrieved_documents = results['documents']

In [11]:
# Deduplicate the retrieved documents
unique_documents = set()
for documents in retrieved_documents:
    for document in documents:
        unique_documents.add(document)

unique_documents = list(unique_documents)

In [12]:
pairs = []
for doc in unique_documents:
    pairs.append([original_query, doc])

In [13]:
scores = cross_encoder.predict(pairs)

In [14]:
print("Scores:")
for score in scores:
    print(score)

Scores:
-0.65759283
-0.101533696
-0.13444367
2.0051513
0.03871134
4.3243384
-1.2751368
-1.3882294
2.1176157
-0.00770735
0.58933103
0.9157961
-1.4596148
-0.9458619


In [15]:
print("New Ordering:")
for o in np.argsort(scores)[::-1]:
    print(o)

New Ordering:
5
8
3
11
10
4
9
1
2
0
13
6
7
12
