In [299]:
# `warnings` must be imported here: this cell runs before the notebook's main
# import cell, so on a fresh kernel the name would otherwise be undefined.
import warnings

# Silence all warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")

In [300]:
# Use the %pip magic (not !pip) so the packages are installed into the environment of the running kernel.
%pip install transformers sentence-transformers datasets cohere



In [301]:
# Use the %pip magic (not !pip) so the package is installed into the environment of the running kernel.
%pip install pinecone-client



In [302]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display

##### First things first - APIs

In [303]:
# Read the Cohere and Pinecone API keys from local text files (stripped of surrounding
# whitespace) so the secrets are not hardcoded in the notebook.
# NOTE(review): "chohere_api_key.txt" looks like a misspelling of "cohere" — confirm it
# matches the actual file name on disk before renaming anything.
with open("chohere_api_key.txt") as f:
    COHERE_API_KEY = f.read().strip()
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

## First Element - Embedding Model

In [304]:
# NOTE(review): SentenceTransformer is already imported in the notebook's import cell;
# this repeated import is redundant.
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model; the `model` instance built from it is the one
# passed to the embedding and retrieval calls later in the notebook.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

More models can be found [here](https://huggingface.co/models?pipeline_tag=sentence-similarity&sort=trending)

In [305]:
def chunk_text(text, chunk_size=100, overlap=10):
    """
    Split a text into overlapping chunks of whitespace-separated words
    Args:
        text: The text to split
        chunk_size: The maximum number of words per chunk
        overlap: The number of words shared between two consecutive chunks
    Returns:
        list: The chunks, each one a string of words joined by single spaces.
              An empty (or whitespace-only) text yields an empty list.
    Raises:
        ValueError: If chunk_size is not positive or overlap is not in [0, chunk_size)
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero or less would never advance through the text.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        # The last window may be shorter than chunk_size; it is still emitted so that
        # the trailing words of the text are never dropped.
        chunks.append(' '.join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already reached the end of the text; any further window
            # would only repeat words from the overlap.
            break
    return chunks

In [306]:
def load_and_embedd_dataset(
        dataset_name: str = 'RealTimeData/News_August_2023',
        split: str = 'train',
        model: "SentenceTransformer | None" = None,
        text_field: str = 'description',
        rec_num: int = 50
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding. When omitted, an 'all-MiniLM-L6-v2'
               SentenceTransformer is created on first use (not at definition time).
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: A tuple containing the full dataset and the embeddings of the text chunks
               produced from the first `rec_num` records.
               NOTE(review): the embeddings have one row per chunk, not per record, so they
               line up with dataset rows only when every text yields exactly one chunk.
    """
    from datasets import load_dataset

    print("Loading and embedding the dataset")

    if model is None:
        # Lazy default: avoids loading a second model whenever the caller supplies one.
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load the dataset
    dataset = load_dataset(dataset_name, split=split)

    # Split the text of the first `rec_num` rows into chunks
    chunked_contexts = []
    for context in dataset[text_field][:rec_num]:
        chunked_contexts.extend(chunk_text(context))

    # Embed every chunk
    embeddings = model.encode(chunked_contexts)

    print("Done!")
    return dataset, embeddings

In [307]:
# Hugging Face dataset identifier used for the knowledge base.
DATASET_NAME = 'RealTimeData/News_August_2023'

# Load the dataset and embed the texts of its first 50 records with the shared `model`.
dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=50,
    model=model,
)
# Kept for later: shape[1] is passed as the dimension when the Pinecone index is created.
shape = embeddings.shape

Loading and embedding the dataset
Done!


Let us look at the dataset and the embeddings

In [308]:
# Convert the dataset to a pandas DataFrame; the bare last expression renders it as a table.
pd_dataset = dataset.to_pandas()
pd_dataset

Unnamed: 0,authors,date_download,date_modify,date_publish,description,filename,image_url,language,localpath,maintext,source_domain,title,title_page,title_rss,url
0,[],2023-08-01 01:20:55+00:00,_,2023-08-01 01:10:02,A consultant cardiologist at the Federal Medic...,https%3A%2F%2Fdailytrust.com%2Ftherapeutic-lif...,https://dailytrust.com/wp-content/uploads/2018...,en,_,A consultant cardiologist at the Federal Medic...,dailytrust.com,‘Therapeutic lifestyle modification’ lowers ri...,_,_,https://dailytrust.com/therapeutic-lifestyle-m...
1,[],2023-08-01 01:20:06+00:00,_,2023-08-01 01:13:54,The Nasarawa State government is taking measur...,https%3A%2F%2Fdailytrust.com%2Fhow-nasarawa-go...,https://dailytrust.com/wp-content/uploads/2022...,en,_,The Nasarawa State government is taking measur...,dailytrust.com,How Nasarawa govt is responding to diphtheria ...,_,_,https://dailytrust.com/how-nasarawa-govt-is-re...
2,[],2023-08-01 01:20:19+00:00,_,2023-08-01 01:07:57,Lawyers are divided over the renewed moves to ...,https%3A%2F%2Fdailytrust.com%2Fnba-conference-...,https://dailytrust.com/wp-content/uploads/2022...,en,_,Lawyers are divided over the renewed moves to ...,dailytrust.com,NBA Conference: Lawyers divided over parallel ...,_,_,https://dailytrust.com/nba-conference-lawyers-...
3,[],2023-08-01 01:20:00+00:00,_,2023-08-01 00:37:29,D’Tigress will face the winners between Mozamb...,https%3A%2F%2Fdailytrust.com%2Fdtigress-to-fac...,https://dailytrust.com/wp-content/uploads/2022...,en,_,D’Tigress will face the winners between Mozamb...,dailytrust.com,D’Tigress to face Mozambique or Cote d’Ivoire ...,_,_,https://dailytrust.com/dtigress-to-face-mozamb...
4,[],2023-08-01 01:20:37+00:00,_,2023-08-01 01:11:50,Liver cancer patients are being spared overnig...,https%3A%2F%2Fdailytrust.com%2Fradioactive-bea...,https://dailytrust.com/wp-content/uploads/2022...,en,_,Liver cancer patients are being spared overnig...,dailytrust.com,Radioactive beads in the wrist that can fight ...,_,_,https://dailytrust.com/radioactive-beads-in-th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5054,"[Carrie Young, Natalie Herbick]",2023-08-01 01:26:26+00:00,_,2023-08-01 01:01:07,She was diagnosed with stage 2B HER2-POSITIVE ...,https%3A%2F%2Fwww.wkbn.com%2Fnews%2Fohio%2Fbre...,https://www.wkbn.com/wp-content/uploads/sites/...,en,_,CLEVELAND (WJW) – Lesley Kiraly Hosta was just...,www.wkbn.com,"Breast cancer survivor says research, newer dr...",_,_,https://www.wkbn.com/news/ohio/breast-cancer-s...
5055,[Brooke Williams],2023-08-01 01:26:20+00:00,_,2023-08-01 00:19:52,Country artist Luke Bryan invited a local girl...,https%3A%2F%2Fwww.wkbn.com%2Fnews%2Fnational-w...,https://www.wkbn.com/wp-content/uploads/sites/...,en,_,DENVER (KDVR) — Country artist Luke Bryan invi...,www.wkbn.com,Child with cancer gets invited back stage to L...,_,_,https://www.wkbn.com/news/national-world/luke-...
5056,[Stephanie Whiteside],2023-08-01 01:04:45+00:00,_,2023-07-31 23:15:01,Social media has gone wild as people claim the...,https%3A%2F%2Ffox2now.com%2Fnews%2Fnational%2F...,https://fox2now.com/wp-content/uploads/sites/1...,en,_,"(NewsNation) — As in decades past, the questio...",fox2now.com,Did the government confirm aliens exist?,_,_,https://fox2now.com/news/national/did-the-gove...
5057,[Brooke Williams],2023-08-01 01:04:51+00:00,_,2023-08-01 00:18:40,Country artist Luke Bryan invited a local girl...,https%3A%2F%2Ffox2now.com%2Fnews%2Fnational%2F...,https://fox2now.com/wp-content/uploads/sites/1...,en,_,DENVER (KDVR) — Country artist Luke Bryan invi...,fox2now.com,Child with cancer gets invited back stage to L...,_,_,https://fox2now.com/news/national/luke-bryan-i...


In [309]:
# Shape of the array returned by model.encode: one row per embedded text chunk.
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (50, 384)


## Second Element - Vector Database
We will use Pinecone's free-to-use vectorDB

In [310]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
):
    """
    Make sure a Pinecone index with the given name exists, creating it when it is missing
    Args:
        index_name: The name of the index to look up or create
        dimension: The vector dimension used when the index has to be created
        metric: The similarity metric used when the index has to be created
    Returns:
        Pinecone: The Pinecone client object, which can later be used for connecting to the
                  index and upserting vectors
    """
    from pinecone import Pinecone, ServerlessSpec
    print("Creating a Pinecone index...")
    client = Pinecone(api_key=PINECONE_API_KEY)

    # An index is only created when no existing index carries this name.
    already_exists = any(
        info["name"] == index_name for info in client.list_indexes()
    )
    if not already_exists:
        # Remember! It is crucial that the metric you will use in your VectorDB will also be a metric your embedding
        # model works well with!
        serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
        client.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=serverless_spec,
        )
    print("Done!")
    return client

In [311]:
# Name of the Pinecone index used by the rest of the notebook.
INDEX_NAME = 'news-august-2023-desc-new'

# Create the vector database (only created if no index with this name exists yet)
# We are passing the index_name and the size of our embeddings
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


Now that we have created the vector database, let's add some data to it!

In [312]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'description',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index object (as returned by `pc.Index(...)`)
        embeddings: The embeddings to upsert, one vector per row
        dataset: The dataset containing the metadata
        text_field: The field of the dataset whose text is stored as metadata;
                    row i of `embeddings` is paired with item i of `dataset[text_field]`
        batch_size: The batch size to use for upserting
    Returns:
        The same pinecone index object that was passed in
    Raises:
        ValueError: If batch_size is not positive, or if the dataset holds fewer texts
                    than there are embeddings
    """
    print("Upserting the embeddings to the Pinecone index...")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    num_vectors = embeddings.shape[0]

    # Only the first `num_vectors` texts are needed; slicing first avoids building
    # metadata dicts for the whole dataset column.
    texts = list(dataset[text_field][:num_vectors])
    if len(texts) < num_vectors:
        # zip() would silently drop the unmatched vectors and later batches would be short or empty.
        raise ValueError(
            f"dataset['{text_field}'] has {len(texts)} items but {num_vectors} embeddings were given"
        )

    # create list of (id, vector, metadata) tuples to be upserted;
    # ids are the stringified row positions and vectors are sent as plain lists of floats
    to_upsert = [
        (str(position), vector.tolist(), {text_field: text})
        for position, (vector, text) in enumerate(zip(embeddings, texts))
    ]

    for start in tqdm(range(0, num_vectors, batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [313]:
# Upsert the embeddings to the Pinecone index
# `index` is the handle to the index; it is also the object queried by augment_prompt below.
index = pc.Index(INDEX_NAME)
# upsert_vectors returns the index object it was given.
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


Let's view the index statistics!

In [314]:
# NOTE(review): the recorded output reports total_vector_count 0 right after the upsert, yet the
# queries further down return matches — presumably the stats lag behind writes; confirm.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Third Element - LLM
We will use [Cohere's chat API](https://cohere.com/chat)

First example: a question that the model can't answer without the help of the RAG process:

In [326]:
# NOTE(review): cohere is already imported in the notebook's import cell; this repeated import is redundant.
import cohere

# First, let's write a query for the LLM
query = "Which country do Germany suspend bilateral cooperation with?"

# Baseline: send the raw query to Cohere's chat endpoint without any retrieved context.
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query,
    )
response.text

"On June 7, 2022, Germany announced that it was suspending bilateral development cooperation with Guatemala, Honduras, and Nicaragua due to concerns about democratic and human rights violations in those countries. This suspension means that Germany will halt any ongoing development projects and financial assistance programs that it has with these three countries.\n\nGermany's decision to suspend cooperation was likely influenced by a variety of factors, including reports of corruption, restrictions on freedom of speech and assembly, and the erosion of the rule of law in the aforementioned countries. This move by Germany sends a strong signal that it prioritizes democracy and human rights in its international engagements and is willing to take concrete actions to promote those values.\n\nIt's important to note that the suspension of bilateral cooperation does not necessarily mean a complete cessation of all diplomatic relations or interactions between Germany and these Central American 

We have a vector database and it is populated with vectors.<br>
Let's write a function that queries vectors from the database!

In [327]:
def augment_prompt(
        query: str,
        text_field: str = 'description',
        model: "SentenceTransformer | None" = None,
        index=None,
        top_k: int = 3,
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        text_field: The metadata field of each match that holds its text
        model: The model used to embed the query. When omitted, an 'all-MiniLM-L6-v2'
               SentenceTransformer is created on first use (not at definition time).
        index: The vectorstore object (required despite the None default)
        top_k: The number of matches to retrieve from the knowledge base
    Returns:
        tuple: (augmented_prompt, source_knowledge) - the prompt to send to the LLM and the
               retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is given
    """
    if index is None:
        raise ValueError("augment_prompt requires a vector index; pass it via `index=`")
    if model is None:
        # Lazy default: avoids loading a second model whenever the caller supplies one.
        model = SentenceTransformer('all-MiniLM-L6-v2')

    # The query vector is sent as a plain list of Python floats
    query_vector = [float(val) for val in model.encode(query)]

    # get top results from knowledge base; only the metadata is read below,
    # so the stored vector values are not requested
    matches = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True
    )['matches']

    # get the text from the results
    text_matches = [match['metadata'][text_field] for match in matches]
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

In [328]:
# Let us remember our query
query = "Which country do Germany suspend bilateral cooperation with?"
# Retrieve matching texts from the index and wrap them, together with the query, into one prompt.
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
# Same model as the baseline call; only the message differs (augmented prompt instead of raw query).
response = co.chat(
        model='command-r-plus',
        message=augmented_prompt,
    )
response.text

'Germany has suspended bilateral cooperation with the Niger Republic.'

In [329]:
# Show the retrieved texts that were inserted into the augmented prompt.
print(source_knowledge)

Coup: Germany To Suspend Bilateral Cooperation With Niger Republic

D’Tigress will face the winners between Mozambique and Cote d’Ivoire in the quarter-finals of the 2023 FIBA Women’s Afrobasket taking place in Rwanda. The two sides will face off today and will meet the defending champions, Nigeria who qualified for the quarterfinals after beating Congo DR and Egypt in their two group games. Nigeria, Cameroon,…

On July 26, Sen. Patty Murray announced that UW will be the recipient of a $10 million grant for semiconductor manufacturing and research development.


2nd Example:

In [330]:
query = "What was the score in the game between Nigeria and Ireland in the 2023 FIFA Women’s World Cup?"

# Baseline answer without retrieved context.
# NOTE(review): the client is re-created here although `co` already exists from the first example.
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query,
    )
response.text

"The game between Nigeria and Ireland in the 2023 FIFA Women's World Cup ended with a score of 3-0 in favor of Nigeria. The Super Falcons of Nigeria dominated the game and secured their place in the knockout stages with a convincing victory."

In [331]:
# Same `query` as in the previous cell, now answered with retrieved context in the prompt.
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
response = co.chat(
        model='command-r-plus',
        message=augmented_prompt,
    )
response.text

'The score was 0-0.'

In [332]:
# Show the retrieved texts that were inserted into the augmented prompt.
print(source_knowledge)

... Samoura praises team for making women’s football proud The Super Falcons of Nigeria are through to the round of 16 of the 2023 FIFA Women’s World Cup after a 0-0 draw against Ireland yesterday. The feat means Nigeria is the first African country to have reached the World Cup knockout rounds in two successive…

D’Tigress will face the winners between Mozambique and Cote d’Ivoire in the quarter-finals of the 2023 FIBA Women’s Afrobasket taking place in Rwanda. The two sides will face off today and will meet the defending champions, Nigeria who qualified for the quarterfinals after beating Congo DR and Egypt in their two group games. Nigeria, Cameroon,…

Home SportsAshes 2023: Stuart Broad Gets Winning Send-Off As England Beat Australia By 49 Runs; Draw Series 2-2 Veteran fast-bowler Stuart Broad got his perfect fairytale ending to a glorious cricketing career by picking the last two wickets as England beat Australia by 49 runs in the fifth and final Ashes Test at The Oval…


3rd Example:

In [333]:
query = "How did former UFC middleweight champion Alex Pereira won his match in his light heavyweight debut at UFC 291?"

# Baseline answer without retrieved context.
# NOTE(review): the client is re-created here although `co` already exists from the earlier examples.
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(
        model='command-r-plus',
        message=query,
    )
response.text

"Alex Pereira, the former UFC middleweight champion, won his light heavyweight debut at UFC 291 on June 29, 2024, with a stunning second-round knockout of his opponent, Nikita Krylov.\n\nPereira, who is known for his exceptional knockout power, showcased his skills once again in this fight. In the second round, he landed a perfectly timed counter right hook that caught Krylov off guard, sending him crashing to the canvas. Pereira followed up with additional strikes on the ground to secure the knockout victory.\n\nThe win improved Pereira's professional MMA record to 8-1, with all of his wins coming by way of knockout. This impressive performance in his light heavyweight debut is sure to generate even more excitement around Pereira's future in the UFC, as he continues to establish himself as one of the most exciting knockout artists in the sport."

In [334]:
# Same `query` as in the previous cell, now answered with retrieved context in the prompt.
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
response = co.chat(
        model='command-r-plus',
        message=augmented_prompt,
    )
response.text

'Alex Pereira won his match by split decision.'

In [335]:
# Show the retrieved texts that were inserted into the augmented prompt.
print(source_knowledge)

Former UFC middleweight champion Alex Pereira won his match by split decision in his light heavyweight debut at UFC 291 on Saturday but one name that remains attached to the kickboxing star is Nigerian-born fighter Israel Adesanya. During the post-fight interview, the Brazilian made it his condition to accept a third fight against “The Last…

Home SportsAshes 2023: Stuart Broad Gets Winning Send-Off As England Beat Australia By 49 Runs; Draw Series 2-2 Veteran fast-bowler Stuart Broad got his perfect fairytale ending to a glorious cricketing career by picking the last two wickets as England beat Australia by 49 runs in the fifth and final Ashes Test at The Oval…

... Samoura praises team for making women’s football proud The Super Falcons of Nigeria are through to the round of 16 of the 2023 FIFA Women’s World Cup after a 0-0 draw against Ireland yesterday. The feat means Nigeria is the first African country to have reached the World Cup knockout rounds in two successive…
