In [1]:
! pip install --upgrade pymilvus


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [115]:
! pip install --upgrade ollama tqdm pypdf voyageai openai wget


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Import API Keys

In [2]:
import os
from dotenv import load_dotenv

# Load variables defined in a local `.env` file into the process environment.
load_dotenv()

VOYAGE_API_KEY = os.getenv('VOYAGE_API_KEY')

# Fail fast with an actionable message: the Voyage client is later constructed
# without an explicit key, so a missing variable would otherwise only surface
# as an error at client construction or on the first embedding request.
if not VOYAGE_API_KEY:
    raise RuntimeError(
        "VOYAGE_API_KEY is not set. Export it in your shell or add it to a .env file "
        "next to this notebook."
    )


## Document 
URL of the document: https://www.judiciary.uk/wp-content/uploads/2024/07/Final-Judgment-CA-2023-001978-BBC-v-BBC-Pension-Trust-another.pdf

In [2]:
! wget https://www.judiciary.uk/wp-content/uploads/2024/07/Final-Judgment-CA-2023-001978-BBC-v-BBC-Pension-Trust-another.pdf

--2024-07-11 15:07:58--  https://www.judiciary.uk/wp-content/uploads/2024/07/Final-Judgment-CA-2023-001978-BBC-v-BBC-Pension-Trust-another.pdf
Resolving www.judiciary.uk (www.judiciary.uk)... 2600:9000:243d:7800:1f:7f0e:b300:93a1, 2600:9000:243d:cc00:1f:7f0e:b300:93a1, 2600:9000:243d:1400:1f:7f0e:b300:93a1, ...
Connecting to www.judiciary.uk (www.judiciary.uk)|2600:9000:243d:7800:1f:7f0e:b300:93a1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 283459 (277K) [application/pdf]
Saving to: ‘Final-Judgment-CA-2023-001978-BBC-v-BBC-Pension-Trust-another.pdf.2’


2024-07-11 15:07:58 (3.78 MB/s) - ‘Final-Judgment-CA-2023-001978-BBC-v-BBC-Pension-Trust-another.pdf.2’ saved [283459/283459]



In [4]:
from pypdf import PdfReader

# Open the downloaded judgment from the working directory.
reader = PdfReader("Final-Judgment-CA-2023-001978-BBC-v-BBC-Pension-Trust-another.pdf")

# Collect the extracted text of every page, preserving page order.
pages = []
for pdf_page in reader.pages:
    pages.append(pdf_page.extract_text())

# Show the first page as a quick sanity check of the extraction.
print(pages[0])

  
 
Neutral Citation Number  [2024] EWCA Civ 767   
 
Case No:  CA-2023 -001978   
IN THE COURT OF APPEAL ( CIVIL  DIVISION)  
ON APPEAL FROM  THE HIGH COURT OF JUSTICE  
BUSINESS AND PROPERTY COURTS OF ENGLAND AND WALES  
BUSINESS LIST : PENSIONS (ChD)        
The Hon Mr Justice Adam Johnson  
[2023] EWHC 1965  (Ch)  
Royal Courts of Justice  
Strand, London, WC2A 2LL  
 
Date:  09/07 /2024   
Before :  
 
LORD JUSTICE LEWISON  
LADY JUSTICE FALK  
and 
SIR CHRISTOPHER FLOYD  
 
- - - - - - - - - - - - - - - - - - - - - 
 
Between :  
 
 BRITISH BROADCASTING CORPORATION  Appellant  
 - and -  
 (1) BBC P ENSION TRUST LIMITED  
(2) CHRISTINA BURNS   
Respondent s 
 
- - - - - - - - - - - - - - - - - - - - - 
 
     Michael Tennet KC and Edward Sawyer  (instructed by  Linklaters LLP ) 
for the Appellant       
      Brian Green KC and Joseph Steadman (instructed by  Slaughter and May  Solicitors ) 
for the  First Respondent  
Andrew Spink KC and Saul Margo  (instructed by Stephenson Ha

# Prepare Embedding Models

For this example we use `voyage-law-2`, an embedding model specialised for the legal domain.

In [5]:
import voyageai

# No key is passed explicitly here.
voyage_client = voyageai.Client()

# Embed a throwaway sentence; the next cell inspects this result to learn the
# vector dimensionality of the model.
probe_texts = ["hello world"]
result = voyage_client.embed(probe_texts, model="voyage-law-2")

In [7]:
# The probe request contained a single text, so its vector is at index 0.
sample_vector = result.embeddings[0]

# Length of the vector; reused later as the Milvus collection dimension.
embedding_dim = len(sample_vector)

print(embedding_dim)
print(sample_vector[:10])

1024
[0.000756315013859421, -0.02162403240799904, 0.0052010356448590755, -0.02917512319982052, -0.00796651840209961, -0.03238343447446823, 0.0660339742898941, 0.03845587745308876, -0.01913367211818695, 0.05562642216682434]


In [8]:
from typing import List, Optional


def embed_text(
    text: str,
    model: str = "voyage-law-2",
    input_type: Optional[str] = None,
) -> List[float]:
    """Embed a single piece of text with the Voyage client.

    Args:
        text: The text to embed.
        model: Name of the Voyage embedding model. Defaults to the legal-domain
            model used throughout this notebook.
        input_type: Optional hint forwarded to the Voyage API (for example
            ``"query"`` or ``"document"``). ``None`` keeps the API default,
            which matches the previous behaviour of this helper.

    Returns:
        The embedding vector for ``text`` (the first and only entry of the
        response's ``embeddings`` list).
    """
    # The API is batch-oriented, so wrap the single text in a list and unwrap
    # the single vector from the response.
    response = voyage_client.embed([text], model=model, input_type=input_type)
    return response.embeddings[0]


In [9]:
from pymilvus import MilvusClient

# Name of the collection that will hold one record per PDF page.
collection_name = "my_rag_collection"

# Client pointed at a database file path in the working directory.
milvus_client = MilvusClient(uri="./milvus_legal.db")


In [10]:
# Start from a clean slate: remove a collection left over from a previous run
# so the create/insert cells below can be re-executed.
collection_exists = milvus_client.has_collection(collection_name)
if collection_exists:
    milvus_client.drop_collection(collection_name)


In [11]:
# Create the collection sized to the embedding model's output.
milvus_client.create_collection(
    collection_name=collection_name,
    # Must equal the length of the vectors produced by the embedding model.
    dimension=embedding_dim,
    # Similarity is scored by inner product.
    metric_type="IP",
    # Request the "Strong" consistency level.
    consistency_level="Strong",
)


In [13]:
from tqdm import tqdm

# One record per page: the page index is the primary key, the embedding is the
# searchable vector, and the raw text is stored alongside it for retrieval.
data = [
    {"id": page_number, "vector": embed_text(page_text), "text": page_text}
    for page_number, page_text in enumerate(tqdm(pages, desc="Creating embeddings"))
]

# Insert every record in a single call; the returned summary is the cell output.
milvus_client.insert(collection_name=collection_name, data=data)

Creating embeddings: 100%|███████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:08<00:00,  2.45it/s]


{'insert_count': 20,
 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 'cost': 0}

In [15]:
# Natural-language query: it is embedded for the vector search below and also
# interpolated into the user prompt sent to the LLM.
question = "who are the lawyers?"

In [16]:
# Turn the question into a vector with the same helper used for the pages.
query_vector = embed_text(question)

search_res = milvus_client.search(
    collection_name=collection_name,
    # `data` takes a list of query vectors; only one is searched here.
    data=[query_vector],
    # Keep the three best-scoring pages.
    limit=3,
    # Score with inner product, matching the collection's metric.
    search_params={"metric_type": "IP", "params": {}},
    # Return the stored page text with each hit.
    output_fields=["text"],
)


In [17]:
import json

# `search_res` holds one hit list per query vector; a single query was sent,
# so its hits are at index 0. Keep (page text, score) for each hit.
retrieved_lines_with_distances = []
for hit in search_res[0]:
    retrieved_lines_with_distances.append((hit["entity"]["text"], hit["distance"]))

print(json.dumps(retrieved_lines_with_distances, indent=4))


[
    [
        "BBC v BBC Pensions Trust  \n13 meaning. Indeed , it would be the starting point for a reasonable reader wishing to \nunderstand the scope of rule 19.2.  \n48. Although the use of a dictionary is a permissible aid to the interpretation  of a written \ninstrument, as Stey n LJ said in Arbuthnott v Fagan  [1995] CLC 1396, 1402:  \n\u201cDictionaries never solve concrete problems of construction. \nThe meaning of words cannot be ascertained divorced from their \ncontext. And part of the contextual scene is the purpose of the \nprovision. \u201d \n49. In addition, as Lord Hoffmann pointed out in R v Brown  [1996] AC 543 , 561 it is a \nfallacy to treat the words of an English sentence as building blocks whose meaning \ncannot be affected by the rest of the sentence. The unit of communication by means of \nlanguage is the sentence and not the parts of which it is composed. The signifi cance of \nindividual words is affected by other words and the syntax of the whole.  \n50. 

## Use an LLM to get a RAG response

In [18]:
# Concatenate the retrieved page texts (scores are dropped) into a single
# newline-separated context string for the prompt.
context = "\n".join(
    page_text for page_text, _score in retrieved_lines_with_distances
)


In [19]:
# System-role instructions. This text is sent with role "system", so it must not
# carry a "Human:" turn marker (that prefix belongs to a user turn and
# contradicts the role of the message).
SYSTEM_PROMPT = """
You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""

# User-role message. The f-string is evaluated when this cell runs, so the cell
# must be re-executed after `context` or `question` change.
USER_PROMPT = f"""
    Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
    <context>
    {context}
    </context>
    <question>
    {question}
    </question>
"""


In [21]:
from openai import OpenAI

# OpenAI-compatible client pointed at a server on localhost:11434; the API key
# is required by the client but unused by that server.
client = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

# Conversation sent to the model: system instructions, then the user prompt
# that embeds the retrieved context and the question.
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]

response = client.chat.completions.create(model="llama3", messages=messages)

# Print the text of the first (only) completion choice.
print(response.choices[0].message.content)


According to the text, the lawyers involved in this case are:

* Andrew Spink KC (instructed by Stephenson Harwood LLP) for the First Respondent
* Saul Margo (also instructed by Stephenson Harwood LLP) for the First Respondent
* Mr. Tennet (representing the Second Respondent)
* Mr. Spink KC (also representing the Second Respondent)
* Arden LJ (mentioned as having given a previous judgment in Stena Line)

Note that "KC" stands for King's Counsel, which is a title of distinction conferred upon certain senior barristers in England and Wales.
