In [1]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.docstore.document import Document
from nltk.corpus import wordnet
import torch

# The WordNet corpus is read later by the synonym-based query expansion,
# so fetch it through NLTK's downloader up front.
import nltk
nltk.download("wordnet")

# Create the Kaggle client and log in; `api` is reused by the download cell.
# NOTE(review): authenticate() presumably reads local Kaggle credentials
# (config file or environment) — confirm they are set up before running.
api = KaggleApi()
api.authenticate()


In [None]:
# Fetch the Kaggle dataset 'asad1m9a9h6mood/news-articles' into data/ and load it.
# The archive is only downloaded when the CSV is not already on disk, so
# re-running the notebook does not hit the network on every execution.
file_path = 'data/Articles.csv'  # Adjust the path based on dataset name
if not os.path.exists(file_path):
    api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True)

# Load the articles; the file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(file_path, encoding='ISO-8859-1')


In [4]:
# Convert DataFrame rows into LangChain Documents, the input format for Chroma.
#
# NOTE: only the first `max_length` rows are used because processing the whole
# dataset takes too long. The cap is applied to the DataFrame *before* the
# Documents are built, so no Document objects are created just to be discarded.
# The resulting list is the same as building everything and slicing afterwards.
max_length = 1000

document_list = [
    Document(
        page_content=row['Article'],  # article body is what gets embedded
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],  # later used as the de-duplication key
            'news_type': row['NewsType']
        }
    )
    for _, row in df.head(max_length).iterrows()
]


In [None]:
# Initialize the embedding function and the Chroma vector store.
embedding_model_name = 'sentence-transformers/all-mpnet-base-v2'

# The LangChain wrapper is configured by keyword (`model_name=`) and loads the
# SentenceTransformer itself; handing it an already-built model positionally is
# not supported by its constructor.
embedding_function = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Keep `embedding_model` available as before, but reuse the instance the wrapper
# already loaded instead of loading the same weights a second time.
embedding_model = embedding_function.client

# Embed the documents and index them in a Chroma vector store.
vector_store = Chroma.from_documents(document_list, embedding_function)


In [None]:
# Function to expand the query using WordNet
def expand_query_with_synonyms(query, synset_lookup=None):
    """
    Append WordNet synonyms of every word in `query` to the query text.

    Args:
    - query: The query string; it is split on whitespace into words.
    - synset_lookup: Optional callable mapping a word to an iterable of synsets
      (objects exposing `.lemmas()`, whose items expose `.name()`). Defaults to
      `wordnet.synsets`.

    Returns:
    - The original query followed by the synonyms, separated by single spaces.
      If no new synonym is found the query is returned unchanged.

    Synonyms are emitted in first-seen order (so the result is reproducible
    between runs), each one at most once, and terms already present in the query
    are not repeated. Multi-word lemma names, which WordNet joins with
    underscores, are converted to space-separated text.
    """
    if synset_lookup is None:
        synset_lookup = wordnet.synsets

    query_words = query.split()
    # Lower-cased terms already emitted (or present in the query itself).
    seen = {word.lower() for word in query_words}
    synonyms = []

    for word in query_words:
        for syn in synset_lookup(word):
            for lemma in syn.lemmas():
                term = lemma.name().replace("_", " ")
                key = term.lower()
                if key not in seen:
                    seen.add(key)
                    synonyms.append(term)

    if not synonyms:
        # Avoid returning the query with a dangling trailing space.
        return query
    return query + " " + " ".join(synonyms)

In [6]:
# Start of 2.3

# Helper function to dynamically retrieve more information
def dynamic_retrieval(query, docs, vector_store, k=10):
    """
    Retrieve additional documents for `query`, skipping ones already in `docs`.
    This allows the generation loop to ask for more information when needed.

    Args:
    - query: The current query or prompt being used.
    - docs: List of documents already retrieved, to avoid redundancy.
    - vector_store: The vector store used to perform the retrieval; it must
      provide `similarity_search(query, k=...)`.
    - k: How many similar documents to request from the store (default 10).

    Returns:
    - List of newly retrieved documents, in the order returned by the store,
      with already-seen and repeated documents removed.

    Documents are identified by their 'heading' metadata; a document without a
    heading falls back to its page content as the identity key.
    """
    def _doc_key(doc):
        heading = doc.metadata.get('heading')
        return heading if heading is not None else doc.page_content

    # The result-count keyword of similarity_search is `k`.
    candidates = vector_store.similarity_search(query, k=k)

    seen_keys = {_doc_key(doc) for doc in docs}
    unique_new_docs = []
    for doc in candidates:
        key = _doc_key(doc)
        if key in seen_keys:
            continue
        # Record the key so duplicates inside `candidates` are dropped as well.
        seen_keys.add(key)
        unique_new_docs.append(doc)

    return unique_new_docs



In [None]:
# Function to iteratively generate text and dynamically retrieve more info if needed
def generate_with_dynamic_retrieval(prompt, vector_store, model, tokenizer, max_iterations=3):
    """
    Generate an answer to `prompt`, retrieving more documents when the model asks.

    Args:
    - prompt: The user's question.
    - vector_store: Store providing `similarity_search(query, k=...)`.
    - model: Causal language model exposing `generate(...)` and `device`.
    - tokenizer: Tokenizer matching `model`.
    - max_iterations: Upper bound on generate/retrieve rounds (default 3).

    Returns:
    - The generated answer text only (the prompt and retrieved context are not
      echoed back), with the pieces from each round joined by single spaces.

    Flow: the synonym-expanded question retrieves 5 documents, of which the top 2
    seed the context. Each round the model sees the context, the question and the
    answer so far, and produces up to 50 new tokens. If the new text contains
    "need more information" or "clarify", up to 2 further unseen documents are
    appended to the context and another round runs; otherwise the loop stops.
    """
    expanded_query = expand_query_with_synonyms(prompt)
    current_docs = list(vector_store.similarity_search(expanded_query, k=5))  # Initial retrieval
    context = " ".join(doc.page_content for doc in current_docs[:2])  # Limiting to top 2 documents
    pieces = []

    for _ in range(max_iterations):
        # The question itself must be part of the model input; the retrieved
        # articles alone give the model nothing to answer.
        answer_so_far = " ".join(pieces)
        model_input = f"Context:\n{context}\n\nQuestion: {prompt}\nAnswer: {answer_so_far}".rstrip()

        # Keep the input tensors on the same device as the model.
        inputs = tokenizer(model_input, return_tensors="pt").to(model.device)
        prompt_length = inputs['input_ids'].shape[1]

        with torch.no_grad():  # Inference only: no gradients needed
            outputs = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_new_tokens=50,   # Generate up to 50 new tokens per round
                do_sample=True,      # Enable sampling for more varied results
                top_k=50,            # Limit the sampling to the top 50 tokens
                temperature=0.7,     # Control the randomness (higher = more random)
                pad_token_id=tokenizer.eos_token_id,  # explicit pad id for generate()
            )

        # The returned sequence starts with the prompt tokens; decode only the
        # continuation so the context is not copied into the answer and cannot
        # trip the trigger-phrase check below.
        new_tokens = outputs[0][prompt_length:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        if generated_text:
            pieces.append(generated_text)

        # Check if there are ambiguities or gaps in the generated text (trigger more retrieval)
        if "need more information" in generated_text or "clarify" in generated_text:
            new_docs = dynamic_retrieval(generated_text, current_docs, vector_store)
            if not new_docs:
                break  # Nothing new to add; another round would see the same context
            # Only mark as seen the documents that are actually added to the
            # context, so the remaining ones can still be retrieved later.
            added_docs = new_docs[:2]
            current_docs.extend(added_docs)
            context += " " + " ".join(doc.page_content for doc in added_docs)
        else:
            break  # Exit if no more information is needed

    return " ".join(pieces)


In [None]:
# The question posed to the retrieval-augmented generator. Synonym expansion
# does not happen here; this cell only defines the raw query text.
query = (
    "Tell me about what the news in Karachi "
    "is  mostly about"
)

In [8]:
# Location of the local Mistral-7B checkpoint. Set the MODEL_PATH environment
# variable to point somewhere else; the previous hard-coded location is kept as
# the default so existing setups continue to work unchanged.
model_path = os.environ.get("MODEL_PATH", "/home/nealsharma/llm/Mistral-7B-v0.1")

# Load the model and tokenizer from the same checkpoint directory.
# device_map="auto" lets the loader decide where the weights are placed.
# NOTE(review): no dtype is specified here — confirm the default precision fits
# the available memory for a 7B model.
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [7]:
# Run the retrieval + generation loop for the question and show the result
response = generate_with_dynamic_retrieval(
    prompt=query,
    vector_store=vector_store,
    model=model,
    tokenizer=tokenizer,
)
print(response)
