In [None]:
!pip install transformers sentence-transformers datasets cohere
!pip install pinecone-client

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cohere
  Downloading cohere-5.5.8-py3-none-any.whl (173 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.8/173.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [8]:
# API keys
with open("chohere_api_keys.txt") as f:
    COHERE_API_KEY = f.read().strip()
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

In [32]:
# Import necessary libraries
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
import cohere
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

warnings.filterwarnings("ignore")
nltk.download('punkt')


# Load embedding model
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL)

# Function to split text into sentences
def split_into_sentences(text):
    return nltk.sent_tokenize(text)

# Function to load and embed dataset with sentence chunking
def load_and_embed_dataset(dataset_name: str, split: str, model: SentenceTransformer, text_field: str, rec_num: int) -> tuple:
    print("Loading and embedding the dataset")
    dataset = load_dataset(dataset_name, split=split)
    sentence_chunks = []
    origin_docs = {}

    for idx, doc in enumerate(dataset[text_field][:rec_num]):
        sentences = split_into_sentences(doc)
        sentence_chunks.extend(sentences)
        for sentence in sentences:
            normalized_sentence = sentence.strip().lower()
            origin_docs[normalized_sentence] = idx  # Maintain the origin document index for each sentence

    embeddings = model.encode(sentence_chunks)
    print("Done!")
    return dataset, embeddings, sentence_chunks, origin_docs

# Dataset parameters
DATASET_NAME = 'ag_news'
TEXT_FIELD = 'text'
RECORDS_NUM = 200

# Load and embed dataset
dataset, embeddings, sentence_chunks, origin_docs = load_and_embed_dataset(
    dataset_name=DATASET_NAME,
    split='train',
    model=model,
    text_field=TEXT_FIELD,
    rec_num=RECORDS_NUM
)

# Create Pinecone index
def create_pinecone_index(index_name: str, dimension: int, metric: str = 'cosine'):
    print("Creating a Pinecone index...")
    pc = Pinecone(api_key=PINECONE_API_KEY)
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name not in existing_indexes:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud="aws", region="us-east-1")
        )
    print("Done!")
    return pc

INDEX_NAME = 'ag-news-index'
pc = create_pinecone_index(INDEX_NAME, embeddings.shape[1])

# Upsert vectors to Pinecone index
def upsert_vectors(index: Pinecone, embeddings: np.ndarray, sentence_chunks: list, batch_size: int = 128):
    print("Upserting the embeddings to the Pinecone index...")
    ids = [str(i) for i in range(embeddings.shape[0])]
    meta = [{'text': text} for text in sentence_chunks]
    to_upsert = list(zip(ids, embeddings, meta))
    for i in tqdm(range(0, embeddings.shape[0], batch_size)):
        i_end = min(i + batch_size, embeddings.shape[0])
        index.upsert(vectors=to_upsert[i:i_end])
    print("Done!")
    return index

index = pc.Index(INDEX_NAME)
index_upserted = upsert_vectors(index, embeddings, sentence_chunks)

# Cohere client
co = cohere.Client(api_key=COHERE_API_KEY)

# Function to augment prompt with context based on origin documents
def augment_prompt(query: str, model: SentenceTransformer, index=None, top_k: int = 3) -> str:
    query_embedding = model.encode(query).tolist()
    query_results = index.query(
        vector=query_embedding,
        top_k=top_k * 3,  # Retrieve more sentences to ensure we get top-k documents
        include_values=True,
        include_metadata=True
    )['matches']

    # Get the embeddings of the retrieved chunks
    retrieved_embeddings = np.array([match['values'] for match in query_results])
    retrieved_texts = [match['metadata']['text'].strip().lower() for match in query_results]

    # Debugging: Print sentences that caused KeyError
    for text in retrieved_texts:
        if text not in origin_docs:
            print(f"Sentence causing KeyError: {text}")

    retrieved_doc_indices = [origin_docs.get(text, -1) for text in retrieved_texts]

    # Compute cosine similarity between the query and the retrieved chunks
    similarities = cosine_similarity([query_embedding], retrieved_embeddings)[0]

    # Group sentences by their origin documents
    doc_similarities = defaultdict(float)
    doc_sentences = defaultdict(list)
    for idx, doc_index in enumerate(retrieved_doc_indices):
        if doc_index != -1:
            doc_similarities[doc_index] += similarities[idx]
            doc_sentences[doc_index].append(retrieved_texts[idx])

    # Rank documents based on cumulative similarity
    ranked_doc_indices = sorted(doc_similarities, key=doc_similarities.get, reverse=True)

    # Select the top-ranked documents to use as context
    selected_docs = ranked_doc_indices[:top_k]
    source_knowledge = "\n\n".join([" ".join(doc_sentences[doc_index]) for doc_index in selected_docs])

    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Loading and embedding the dataset
Done!
Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


100%|██████████| 4/4 [00:02<00:00,  1.69it/s]

Done!





In [34]:
hallucinations_queries=["How much will Hewlett-Packard pay for Synstar?",
                      "What did the scientists oppose in the Bush administration and how many of them were Nobel Prize winners?",
                      "What was the annual base salary of Danny Bazil Riley when he started working as the general manager at a commercial real estate firm?"
                      ]

In [37]:
q1=hallucinations_queries[0]
q2=hallucinations_queries[1]
q3=hallucinations_queries[2]

In [38]:
q=q1
print("-" * 80)
print(f"Query: {q}")
print("-" * 80)
response = co.chat(model='command-r-plus', message=q)
print("Without context:" )
print(response.text)
augmented_prompt, source_knowledge = augment_prompt(q, model=model, index=index)
response_with_context = co.chat(model='command-r-plus', message=augmented_prompt)
print("With context:")
print(response_with_context.text)
print("\nSource knowledge used:")
print(source_knowledge)
print("\n" + "-" * 80 + "\n")

--------------------------------------------------------------------------------
Query: How much will Hewlett-Packard pay for Synstar?
--------------------------------------------------------------------------------
Without context:
Hewlett-Packard will pay $420 million for Synstar.
With context:
Hewlett-Packard will pay $297 million for Synstar.

Source knowledge used:
hp to buy synstar hewlett-packard will pay \$297 million for the british company.

just all of it that isn't dell. delightful dell the company's results show that it's not grim all over tech world.

microsoft has 40-50 billion in the bank.

--------------------------------------------------------------------------------



In [41]:
q=q2
print("-" * 80)
print(f"Query: {q}")
print("-" * 80)
response = co.chat(model='command-r-plus', message=q)
print("Without context:" )
print(response.text)
augmented_prompt, source_knowledge = augment_prompt(q, model=model, index=index)
response_with_context = co.chat(model='command-r-plus', message=augmented_prompt)
print("With context:")
print(response_with_context.text)
print("\nSource knowledge used:")
print(source_knowledge)
print("\n" + "-" * 80 + "\n")

--------------------------------------------------------------------------------
Query: What did the scientists oppose in the Bush administration and how many of them were Nobel Prize winners?
--------------------------------------------------------------------------------
Without context:
The scientists opposed the Bush administration's policies on stem cell research and global warming. Out of the 62 signatories, 20 were Nobel Prize winners.
With context:
The scientists opposed the Bush administration's use of scientific advice. There were 48 Nobel Prize winners among them.

Source knowledge used:
science, politics collide in election year (ap) ap - with more than 4,000 scientists, including 48 nobel prize winners, having signed a statement opposing the bush administration's use of scientific advice, this election year is seeing a new development in the uneasy relationship between science and politics.

\\are you paying attention bush administration?\\

that's about the state of knowl

In [40]:
q=q3
print("-" * 80)
print(f"Query: {q}")
print("-" * 80)
response = co.chat(model='command-r-plus', message=q)
print("Without context:" )
print(response.text)
augmented_prompt, source_knowledge = augment_prompt(q, model=model, index=index)
response_with_context = co.chat(model='command-r-plus', message=augmented_prompt)
print("With context:")
print(response_with_context.text)
print("\nSource knowledge used:")
print(source_knowledge)
print("\n" + "-" * 80 + "\n")

--------------------------------------------------------------------------------
Query: What was the annual base salary of Danny Bazil Riley when he started working as the general manager at a commercial real estate firm?
--------------------------------------------------------------------------------
Without context:
$120,000
With context:
Danny Bazil Riley's annual base salary when he started working as the general manager at a commercial real estate firm was $70,000.

Source knowledge used:
safety net (forbes.com) forbes.com - after earning a ph.d. in sociology, danny bazil riley started to work as the general manager at a commercial real estate firm at an annual base salary of  #36;70,000. but, at 32, "buying insurance was the furthest thing from my mind," says riley.

making your insurer pay if hurricane charley blows your house down, how can you make your insurance company pay?

not to worry: you can find a financial planner for every specialized need

---------------------------