In [None]:
import json

# Load the corpus data.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('corpus.json', 'r', encoding='utf-8') as f:
    corpus_data = json.load(f)

# Keep only the fields used downstream; the article `body` is exposed as
# `content`. A missing key raises KeyError, exactly as before.
documents = [
    {
        'title': article['title'],
        'author': article['author'],
        'source': article['source'],
        'published_at': article['published_at'],
        'category': article['category'],
        'url': article['url'],
        'content': article['body'],
    }
    for article in corpus_data
]

print(f"Loaded {len(documents)} documents.")


Loaded 609 documents.


In [None]:
# Install the required package
# !pip install sentence-transformers chromadb

from sentence_transformers import SentenceTransformer
import chromadb

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every article body in one batched call instead of one call per
# document; the encoder batches internally, which is considerably faster.
contents = [document['content'] for document in documents]
content_embeddings = model.encode(contents)

# Keep the embedding on each document, as the per-document loop used to do.
for document, embedding in zip(documents, content_embeddings):
    document['embedding'] = embedding

# Initialize ChromaDB and the collection that stores the embeddings
chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection("documents")

# Metadata stored alongside each document.
METADATA_FIELDS = ('title', 'url', 'author', 'source', 'category', 'published_at')
# Number of documents sent to the vector store per call.
INDEX_BATCH_SIZE = 256

for start in range(0, len(documents), INDEX_BATCH_SIZE):
    batch = documents[start:start + INDEX_BATCH_SIZE]
    # `upsert` rather than `add`: ids are positional, so re-running this cell
    # against the same collection overwrites the existing records instead of
    # colliding with them.
    collection.upsert(
        # The position in `documents` is the unique id of each record.
        ids=[str(start + offset) for offset in range(len(batch))],
        documents=[doc['content'] for doc in batch],
        # None is replaced with an empty string for every metadata field.
        metadatas=[
            {field: "" if doc[field] is None else doc[field] for field in METADATA_FIELDS}
            for doc in batch
        ],
        embeddings=[doc['embedding'].tolist() for doc in batch],
    )

# One summary line instead of one progress line per document.
print(f"Indexed {collection.count()} documents.")



  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Adding document 1/609
Adding document 2/609
Adding document 3/609
Adding document 4/609
Adding document 5/609
Adding document 6/609
Adding document 7/609
Adding document 8/609
Adding document 9/609
Adding document 10/609
Adding document 11/609
Adding document 12/609
Adding document 13/609
Adding document 14/609
Adding document 15/609
Adding document 16/609
Adding document 17/609
Adding document 18/609
Adding document 19/609
Adding document 20/609
Adding document 21/609
Adding document 22/609
Adding document 23/609
Adding document 24/609
Adding document 25/609
Adding document 26/609
Adding document 27/609
Adding document 28/609
Adding document 29/609
Adding document 30/609
Adding document 31/609
Adding document 32/609
Adding document 33/609
Adding document 34/609
Adding document 35/609
Adding document 36/609
Adding document 37/609
Adding document 38/609
Adding document 39/609
Adding document 40/609
Adding document 41/609
Adding document 42/609
Adding document 43/609
Adding document 44/6

In [None]:
# !pip install rank_bm25
# !pip install scikit-learn

import numpy as np
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity

def search_documents(query, n_results=4):
    """Retrieve the documents closest to ``query`` from the vector collection.

    The query is embedded with the module-level ``model`` and matched against
    the module-level ``collection``.

    Args:
        query: Natural-language question to search for.
        n_results: Maximum number of hits to request (defaults to 4, the value
            that was previously hard-coded).

    Returns:
        A ``(documents, metadatas)`` tuple taken straight from the collection's
        query result. Both are nested per query embedding, so the hits for this
        single query are at index 0 of each.
    """
    # Create embedding for the query
    query_embedding = model.encode(query)

    # A single query embedding is sent, hence the one-element list.
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
    )

    return results['documents'], results['metadatas']


# def search_documents_hybrid(query, collection):
#     """
#     Performs a hybrid search using BM25 and embeddings.
#     """
#     # Get all documents and metadata from the collection
#     all_docs = collection.get(include=['documents', 'metadatas'])

#     collection_docs = [d for d in all_docs['documents']]  # Extract documents
#     collection_metas = [d for d in all_docs['metadatas']]  # Extract metadata

#     # 7.1 BM25-based retrieval
#     tokenized_docs = [doc.split(" ") for doc in collection_docs]
#     bm25 = BM25Okapi(tokenized_docs)
#     tokenized_query = query.split(" ")
#     bm25_scores = bm25.get_scores(tokenized_query)

#     # Get the top 50 BM25 candidates (narrowed to the best 10 by the re-ranking step below)
#     top_n_bm25 = 50
#     top_docs_indices = np.argsort(bm25_scores)[-top_n_bm25:]
#     top_n_bm25 = 10

#     # 7.2 Re-ranking using embeddings
#     query_embedding = model.encode(query)
#     doc_embeddings = model.encode([collection_docs[i] for i in top_docs_indices])
#     cosine_similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
#     top_doc_idx_reranked = np.argsort(cosine_similarities)[-top_n_bm25:]

#     # Retrieve the final documents and their metadata in ranked order
#     final_docs = [collection_docs[top_docs_indices[i]] for i in top_doc_idx_reranked]
#     final_metas = [collection_metas[top_docs_indices[i]] for i in top_doc_idx_reranked]

#     return final_docs, final_metas


# Sample multi-hop question used to exercise the retrieval step.
query = "Who is the figure associated with generative AI technology whose departure from OpenAI was considered shocking according to Fortune, and is also the subject of a prevailing theory suggesting a lack of full truthfulness with the board as reported by TechCrunch?"
retrieved_docs, retrieved_metadata = search_documents(query)

# Results are nested per query; index 0 holds the hits for our single query.
# Show each hit's title/author and a 200-character preview of its text.
for doc, meta_dict in zip(retrieved_docs[0], retrieved_metadata[0]):
    author = meta_dict.get('author', 'N/A')  # tolerate a missing author key
    print(f"Title: {meta_dict['title']}, Author: {author}")
    print(f"Content: {doc[:200]}...", end="\n\n")


Title: How the OpenAI fiasco could bolster Meta and the ‘open AI’ movement, Author: Paul Sawers
Content: It has been a whirlwind four days for OpenAI, the generative AI poster child behind the smash hit ChatGPT.

Seemingly out of nowhere, the OpenAI board ousted CEO and co-founder Sam Altman and demoted ...

Title: OpenAI's ex-chairman accuses board of going rogue in firing Altman: 'Sam and I are shocked and saddened by what the board did', Author: Matt O'Brien, The Associated Press
Content: ChatGPT-maker Open AI said Friday it has pushed out its co-founder and CEO Sam Altman after a review found he was “not consistently candid in his communications” with the board of directors.

“The boa...

Title: ChatGPT: Everything you need to know about the AI-powered chatbot, Author: Alyssa Stringer
Content: ChatGPT: Everything you need to know about the AI-powered chatbot

ChatGPT, OpenAI’s text-generating AI chatbot, has taken the world by storm. What started as a tool to hyper-charge productiv

In [None]:
# !pip install gpt4all

from gpt4all import GPT4All

# Name of the model file handed to GPT4All below. It has to be a model that
# GPT4All can resolve; see the GPT4All documentation or repository for the
# list of available models.
model_name = "Meta-Llama-3-8B-Instruct.Q4_0.gguf"

# Instantiate the model once at module level; generate_answer() reads this
# `gpt_model` global when it produces an answer.
gpt_model = GPT4All(model_name)

def generate_answer(query, documents, max_chars_per_doc=500, max_tokens=1024, llm=None):
    """Ask the LLM for a one-word answer grounded in the retrieved documents.

    Args:
        query: The question to answer.
        documents: Retrieved texts. Accepts either a flat list of strings or
            the nested per-query list-of-lists returned by ``search_documents``.
            Every document contributes to the context, not only the first one.
        max_chars_per_doc: Number of leading characters of each document that
            are placed in the prompt.
        max_tokens: Generation cap forwarded to ``llm.generate``.
        llm: Object exposing ``generate(prompt, max_tokens=...)``. Defaults to
            the module-level ``gpt_model``.

    Returns:
        The generated text with surrounding whitespace removed.
    """
    if llm is None:
        llm = gpt_model

    # Flatten the per-query nesting. Iterating the outer list directly and
    # taking element [0] would silently drop every hit after the first.
    texts = []
    for entry in documents or []:
        if isinstance(entry, str):
            texts.append(entry)
        else:
            texts.extend(entry)

    context = "\n\n".join(text[:max_chars_per_doc] for text in texts if text)

    # The instruction goes first and the prompt ends on the "Answer:" cue, so
    # the model completes the answer rather than continuing the instruction.
    prompt = (
        "Answer in one word on basis of Context\n\n"
        f"Context: {context}\n\n"
        f"Query: {query}\n\n"
        "Answer:"
    )

    response = llm.generate(prompt, max_tokens=max_tokens)
    return response.strip()

# def is_context_relevant(query, context):
#     # Perform a simple check for relevance by searching for key terms from the query in the context
#     query_terms = query.lower().split()  # Split query into individual words
#     context_terms = context.lower().split()  # Split context into individual words

#     # Check if any key terms from the query exist in the context
#     relevance_score = sum(1 for term in query_terms if term in context_terms)

#     # You can set a threshold for relevance. For example, at least 3 terms must match.
#     return relevance_score >= 3  # Adjust this threshold based on your requirements

# def generate_answer(query, documents):
#     # Summarize or select portions of the documents
#     context = "\n\n".join([doc[i][:500] for i,doc in enumerate(documents)])

#     # Check if context is relevant
#     if is_context_relevant(query, context):
#         # If relevant, ask the model to generate the answer
#         prompt = (
#             f"Query: {query}\n\n"
#             f"Context: {context}\n\n"
#             "Answer strictly in one word based on the provided context. If query is inference_query answer the name only, if it is comaprision_query answer in yes or no"
#         )
#         response = gpt_model.generate(prompt, max_tokens=1024)
#     else:
#         # If context is not relevant, return no answer
#         response = "I don't know based on the provided context."

#     return response


# Run the generation step for the example query, passing the documents
# returned by the retrieval step as context, then show the model's answer.
final_answer = generate_answer(query, retrieved_docs)
print(f"Answer: {final_answer}")



Answer: :
Caroline

Explanation: The context is about Caroline Ellison's first day on the stand as a witness in her criminal trial for fraud and conspiracy charges. She was accused by prosecutors of committing fraud for personal gain while being the CEO of Alameda Research, a hedge fund associated with cryptocurrency industry. Therefore, the answer to this query is "Caroline".


In [None]:
def _evidence_entry(meta_dict, text):
    """Build one evidence record from a hit's metadata and its document text."""
    return {
        'title': meta_dict.get('title', 'N/A'),
        'author': meta_dict.get('author', 'N/A'),
        'url': meta_dict.get('url', 'N/A'),
        'source': meta_dict.get('source', 'Unknown'),
        'category': meta_dict.get('category', 'Unknown'),
        'published_at': meta_dict.get('published_at', 'Unknown'),
        'fact': text[:200],  # first 200 characters of the retrieved text
    }


# Retrieval results are nested per query: walk each query's metadata list and
# pair every metadata dict with the document at the same position.
output = {
    'query': query,
    'answer': final_answer,
    'question_type': 'inference_query',
    'evidence_list': [
        _evidence_entry(meta_dict, docs_for_query[position])
        for docs_for_query, metas_for_query in zip(retrieved_docs, retrieved_metadata)
        for position, meta_dict in enumerate(metas_for_query)
    ],
}

# Print the formatted output
print(json.dumps(output, indent=2))

{
  "query": "Do the TechCrunch article on software companies and the Hacker News article on The Epoch Times both report an increase in revenue related to payment and subscription models, respectively?",
  "answer": " format.\nType: comparison_query\n\nAnswer: No \n=====\n\n\n\nDo you have any other questions? Please feel free to ask! I'm here to help.\n\n(Also, please let me know what kind of query this was. Was it an inference_query or a comparison_query?) \n\nPlease respond with the following information:\n\n1. Type (e.g., comparison_query)\n2. Answer format (e.g., yes/no) if applicable\n\nThank you!",
  "question_type": "inference_query",
  "evidence_list": [
    {
      "title": "Check Point sees 'fantastic' year ahead even as rival report disappointing billings",
      "author": "",
      "url": "https://seekingalpha.com/news/4038301-check-point-sees-fantastic-year-ahead-even-as-rival-report-disappointing-billings?utm_source=feed_news_all&utm_medium=referral&feed_item_type=news",