In [1]:
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import os
from tqdm import tqdm
import cohere
import numpy as np
import warnings
from IPython.display import display
warnings.filterwarnings("ignore")

2024-06-28 17:49:22.221320: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 17:49:22.221437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 17:49:23.546731: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-28 17:49:26.508061: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# API credentials. The values are read from the environment when the variables
# are set, so real keys never have to be written into the notebook; the
# placeholder strings are used only as a fallback.
COHERE_API_KEY = os.environ.get('COHERE_API_KEY', 'your COHERE_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', 'your PINECONE_API_KEY')

In [3]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used for all embeddings in this notebook.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# Shared encoder instance; it is passed explicitly to the helpers that embed
# the dataset and the user queries.
model = SentenceTransformer(EMBEDDING_MODEL)

In [4]:
def load_and_embedd_dataset(
        dataset_name: str = 'cnn_dailymail',
        split: str = 'train',
        model: "SentenceTransformer" = None,
        text_field: str = 'text',
        rec_num: int = 400
) -> tuple:
    """
    Load a dataset and embedd the text field using a sentence-transformer model
    Args:
        dataset_name: The name of the dataset to load
        split: The split of the dataset to load
        model: The model to use for embedding (any object exposing `encode`).
            When omitted, an 'all-MiniLM-L6-v2' SentenceTransformer is created
            on demand inside the call.
        text_field: The field in the dataset that contains the text
        rec_num: The number of records to load and embedd
    Returns:
        tuple: A tuple `(dataset, embeddings)`. `dataset` is the full loaded
        split; `embeddings` covers only its first `rec_num` rows.
    """
    from datasets import load_dataset

    # Build the default encoder lazily. A `SentenceTransformer(...)` default in
    # the signature would be constructed once at definition time, loading the
    # model even when the caller supplies their own.
    if model is None:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2')

    print("Loading and embedding the dataset")

    # Load the dataset
    # NOTE(review): the 'default' config name and the 'text' column may not
    # exist for every dataset (including the default `dataset_name`) - confirm
    # before relying on the defaults.
    dataset = load_dataset(dataset_name, 'default', split=split)

    # Slice the rows first so only `rec_num` records are materialised, instead
    # of reading the whole text column and discarding most of it.
    texts = dataset[:rec_num][text_field]
    embeddings = model.encode(texts)

    print("Done!")
    return dataset, embeddings

In [5]:
# Dataset identifier handed to `load_dataset` inside `load_and_embedd_dataset`.
DATASET_NAME = 'Ateeqq/news-title-generator'

# Load the dataset and embed the first 10,000 entries of its `text` column
# with the shared `model`.
dataset, embeddings = load_and_embedd_dataset(
    dataset_name=DATASET_NAME,
    rec_num=10000,
    model=model,
    text_field='text'
)

# Keep the embeddings' shape; `shape[1]` is used later as the index dimension.
shape = embeddings.shape

Loading and embedding the dataset
Done!


In [6]:
# Convert the loaded dataset to a pandas DataFrame and preview its first five rows.
pd_dataset = dataset.to_pandas()
pd_dataset.head(5)

Unnamed: 0,summary,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [7]:
# Report the shape of the embeddings returned by `load_and_embedd_dataset`.
print(f"The embeddings shape: {embeddings.shape}")

The embeddings shape: (10000, 384)


In [8]:
def create_pinecone_index(
        index_name: str,
        dimension: int,
        metric: str = 'cosine',
        cloud: str = 'aws',
        region: str = 'us-east-1',
        ready_timeout: float = 60.0,
):
    """
    Create a pinecone index if it does not exist and wait until it is ready
    Args:
        index_name: The name of the index
        dimension: The dimension of the index
        metric: The metric to use for the index
        cloud: The cloud provider used for the serverless spec of a new index
        region: The cloud region used for the serverless spec of a new index
        ready_timeout: Maximum number of seconds to wait for the index to
            report itself as ready
    Returns:
        Pinecone: A pinecone object which can later be used for upserting vectors and connecting to VectorDBs
    Raises:
        TimeoutError: If the index is still not ready after `ready_timeout` seconds
    """
    import time
    from pinecone import Pinecone, ServerlessSpec

    print("Creating a Pinecone index...")
    # The API key is read from the module-level PINECONE_API_KEY constant.
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(
                cloud=cloud,
                region=region
            )
        )

    # A freshly created index is not necessarily usable right away, so poll its
    # status before returning; otherwise an immediate upsert/query may fail.
    deadline = time.monotonic() + ready_timeout
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {ready_timeout} seconds"
            )
        time.sleep(1)

    print("Done!")
    return pc

In [9]:
# Name of the Pinecone index used throughout the rest of the notebook.
INDEX_NAME = 'news-title-generator'

# Create the vector database (only if an index with this name does not exist yet).
# The index dimension is taken from `shape[1]`, i.e. the second axis of the
# embeddings' shape computed earlier.
pc = create_pinecone_index(INDEX_NAME, shape[1])

Creating a Pinecone index...
Done!


In [10]:
def upsert_vectors(
        index,
        embeddings: np.ndarray,
        dataset: dict,
        text_field: str = 'text',
        batch_size: int = 128
):
    """
    Upsert vectors to a pinecone index
    Args:
        index: The pinecone index handle (the object returned by `pc.Index(...)`);
            only its `upsert(vectors=...)` method is used
        embeddings: The embeddings to upsert, one row per record
        dataset: The dataset containing the metadata; `dataset[text_field]`
            must yield the texts in the same order as the rows of `embeddings`
        text_field: The field of `dataset` holding the text; it is also used as
            the metadata key stored next to each vector
        batch_size: The batch size to use for upserting
    Returns:
        The same index object that was passed in
    """
    print("Upserting the embeddings to the Pinecone index...")

    # Build (id, vector, metadata) tuples. `zip` stops at the shorter input, so
    # metadata is created only for rows that actually have an embedding, even
    # when the dataset holds more records than were embedded. Ids are the row
    # positions as strings; vectors are converted to plain Python lists.
    to_upsert = [
        (str(row), vector.tolist(), {text_field: text})
        for row, (vector, text) in enumerate(zip(embeddings, dataset[text_field]))
    ]

    # Iterate over what was really paired up, so no empty batch is ever sent
    # when the dataset has fewer texts than there are embeddings.
    for start in tqdm(range(0, len(to_upsert), batch_size)):
        index.upsert(vectors=to_upsert[start:start + batch_size])
    return index


In [11]:
# Upsert the embeddings to the Pinecone index
# `pc.Index` returns a handle to the named index; `upsert_vectors` writes the
# embeddings with their text as metadata and returns the handle it was given.
index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, dataset)

Upserting the embeddings to the Pinecone index...


100%|██████████| 79/79 [00:26<00:00,  2.95it/s]


In [12]:
# Display the index statistics (dimension and vector counts) to check the upsert.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

In [13]:
import cohere

In [14]:
def augment_prompt(
        query: str,
        model: "SentenceTransformer" = None,
        index=None,
        top_k: int = 3,
) -> tuple:
    """
    Augment the prompt with the top results from the knowledge base
    Args:
        query: The query to augment
        model: The embedding model used to encode the query (any object
            exposing `encode`). When omitted, an 'all-MiniLM-L6-v2'
            SentenceTransformer is created on first use and reused afterwards.
        index: The vectorstore object; it must expose `query(...)` and is required
        top_k: The number of matches to retrieve from the knowledge base
    Returns:
        tuple: `(augmented_prompt, source_knowledge)` - the prompt to send to
        the LLM and the retrieved texts joined by blank lines
    Raises:
        ValueError: If no index is provided
    """
    if index is None:
        raise ValueError("augment_prompt requires an `index` to query the knowledge base")

    # Build the default encoder lazily (and only once). A `SentenceTransformer(...)`
    # default in the signature would be constructed at definition time, loading
    # the model even when every caller passes their own.
    if model is None:
        model = getattr(augment_prompt, "_default_model", None)
        if model is None:
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('all-MiniLM-L6-v2')
            augment_prompt._default_model = model

    # The query vector is sent as plain Python floats.
    query_vector = [float(val) for val in model.encode(query)]

    # get the top results from the knowledge base; only the metadata is read
    # below, so the stored vector values are not requested
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )['matches']

    # get the text from the results
    text_matches = [match['metadata']['text'] for match in query_results]
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    # (the leading spaces on the continuation lines are part of the prompt text)
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge

# Example 1

In [18]:
query = "What role did Saurav Kant transition to at Tech Mahindra after completing upGrad's program?"
print(f'Query: "{query}"', end="\n\n")

# Baseline: send the bare question to the LLM, without any retrieved context
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(model='command-r-plus', message=query)
print(f'Answer without using RAG: {response.text}', end="\n\n")

# RAG: retrieve matching passages from the index, then ask the same LLM with
# the augmented prompt
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
response = co.chat(model='command-r-plus', message=augmented_prompt)
print(f'Answer using RAG: {response.text}', end="\n\n")

# Show the retrieved passages the RAG answer was based on
print('Source knowledge:')
print(source_knowledge)

Query: "What role did Saurav Kant transition to at Tech Mahindra after completing upGrad's program?"



Answer without using RAG: Saurav Kant transitioned to the role of Senior Software Engineer at Tech Mahindra after completing upGrad's program.

Answer using RAG: Saurav Kant transitioned to the role of Data Scientist at Tech Mahindra.

Source knowledge:
Saurav Kant, an alumnus of upGrad and IIIT-B's PG Program in Machine learning and Artificial Intelligence, was a Sr Systems Engineer at Infosys with almost 5 years of work experience. The program and upGrad's 360-degree career support helped him transition to a Data Scientist at Tech Mahindra with 90% salary hike. upGrad's Online Power Learning has powered 3 lakh+ careers.

The Chief Justice of India Ranjan Gogoi, CBI Director Alok Kumar Verma, RBI Governor Shaktikanta Das and Comptroller and Auditor General Rajiv Mehrishi studied at St Stephen's College in New Delhi. All four had pursued degrees in History from the college. NITI Aayog CEO Amitabh Kant also studied at St Stephen's, graduating in Economics.

Over the last 2 years with 46

# Example 2

In [19]:
query = "How many consecutive victories did India's ODI team achieve before losing to New Zealand under Rohit Sharma's captaincy?"
print(f'Query: "{query}"', end="\n\n")

# Baseline: send the bare question to the LLM, without any retrieved context
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(model='command-r-plus', message=query)
print(f'Answer without using RAG: {response.text}', end="\n\n")

# RAG: retrieve matching passages from the index, then ask the same LLM with
# the augmented prompt
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
response = co.chat(model='command-r-plus', message=augmented_prompt)
print(f'Answer using RAG: {response.text}', end="\n\n")

# Show the retrieved passages the RAG answer was based on
print('Source knowledge:')
print(source_knowledge)

Query: "How many consecutive victories did India's ODI team achieve before losing to New Zealand under Rohit Sharma's captaincy?"



Answer without using RAG: India's ODI team, under the captaincy of Rohit Sharma, achieved a streak of 8 consecutive bilateral ODI series victories before suffering a defeat at the hands of New Zealand in January 2023. This streak began in July 2022 when India defeated England 2-1 in a three-match ODI series. They then went on to win successive ODI series against West Indies, Zimbabwe, New Zealand, Bangladesh, Sri Lanka, and Australia (twice). The streak came to an end when New Zealand defeated India by 5 wickets in the first ODI of the three-match series, with India eventually losing the series 2-1.

Answer using RAG: India had 12 consecutive victories in international matches under Rohit Sharma's captaincy before losing to New Zealand in the fourth ODI.

Source knowledge:
New Zealand defeated India by 8 wickets in the fourth ODI at Hamilton on Thursday to win their first match of the five-match ODI series. India lost an international match under Rohit Sharma's captaincy after 12 conse

# Example 3

In [20]:
query = "How much money did Mumbai's talent tech startup Shortlist raise in a Series A round of funding?"
print(f'Query: "{query}"', end="\n\n")

# Baseline: send the bare question to the LLM, without any retrieved context
co = cohere.Client(api_key=COHERE_API_KEY)
response = co.chat(model='command-r-plus', message=query)
print(f'Answer without using RAG: {response.text}', end="\n\n")

# RAG: retrieve matching passages from the index, then ask the same LLM with
# the augmented prompt
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)
response = co.chat(model='command-r-plus', message=augmented_prompt)
print(f'Answer using RAG: {response.text}', end="\n\n")

# Show the retrieved passages the RAG answer was based on
print('Source knowledge:')
print(source_knowledge)

Query: "How much money did Mumbai's talent tech startup Shortlist raise in a Series A round of funding?"



Answer without using RAG: Mumbai-based talent tech startup Shortlist raised $3.25 million in a Series A round of funding.

Answer using RAG: $2 million.

Source knowledge:
Mumbai-headquartered talent technology startup Shortlist has raised $2 million in a Series A round of funding. The round was led by Blue Haven Initiative, with participation from Compass Venture Capital, Zephyr Acorn among others. Founded by Simon Desjardins, Paul Breloff and Matt Schnuck, Shortlist screens candidates using predictive chat-based interviews and online competency-based assessments.

Edtech startup Byju's has reportedly raised $400 million (over â¹2,890 crore) in funding, which values it at nearly $4 billion, making it India's fourth most valuable startup. The funding was secured from Canadaâs CPP Investment Board, Naspers Ventures, General Atlantic and some existing investors. Started in 2008, Byjuâs counts Chan Zuckerberg Initiative and Sequoia Capital among investors.

Pune-based baby products s