In [None]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.docstore.document import Document
from nltk.corpus import wordnet
import torch

# # Ensure NLTK resources are downloaded
# import nltk
# nltk.download('wordnet')

# Create a Kaggle API client and authenticate it.
# `api` is reused further down to download the dataset.
# NOTE(review): credentials are not set anywhere in this notebook, so they are
# presumably picked up from the environment / local Kaggle config — confirm.
api = KaggleApi()
api.authenticate()


In [None]:
import os
import shutil

# Directory under the user's home where the Hugging Face cache is expected
cache_dir = os.path.expanduser("~/.cache/huggingface")

# Nothing to do when the directory is absent; otherwise remove the whole tree
if not os.path.exists(cache_dir):
    print("Hugging Face cache is already cleared.")
else:
    shutil.rmtree(cache_dir)
    print(f"Hugging Face cache at {cache_dir} has been cleared.")


In [None]:
# Prune Git LFS objects in the local Mistral-7B-v0.1 checkout to free up space.
# NOTE(review): absolute, user-specific path; it is the same directory that is
# later assigned to `model_path` when the model is loaded.
!git -C /home/nealsharma/llm/Mistral-7B-v0.1 lfs prune


In [None]:
# Download the Kaggle dataset `asad1m9a9h6mood/news-articles` into data/ and unzip it,
# but only when the extracted CSV is not already there, so that re-running the
# notebook does not fetch the archive again.
file_path = 'data/Articles.csv'  # Adjust the path based on dataset name
if not os.path.exists(file_path):
    api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True)

# Load the articles into a DataFrame, decoding the file as ISO-8859-1
df = pd.read_csv(file_path, encoding='ISO-8859-1')


In [None]:
# NOTE Only the first `max_length` rows are used because processing the whole
# dataset takes too long.
max_length = 10

# Convert the selected rows into the Document format required by Chroma.
# The frame is truncated *before* the conversion so no Document is built only
# to be discarded afterwards; `head` returns every row when the frame is shorter.
document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in df.head(max_length).iterrows()
]


In [None]:
# Embedding backend: the 'all-MiniLM-L6-v2' sentence-transformers model
embedding_function = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Embed the prepared documents and index them in a Chroma vector store
vector_store = Chroma.from_documents(
    documents=document_list,
    embedding=embedding_function,
)


In [None]:
# Define a function to perform dynamic retrieval
def dynamic_retrieval(query, docs, vector_store, k=2):
    """
    Retrieve additional documents for ``query`` that are not already in ``docs``.

    Args:
        query: Text used for the similarity search (e.g. previously generated text).
        docs: Documents retrieved so far; each must expose ``metadata['heading']``.
        vector_store: Object exposing ``similarity_search(query, k=...)``.
        k: Maximum number of candidates to request from the vector store.

    Returns:
        list: The candidates whose heading does not match the heading of any
        document in ``docs``, in the order the store returned them. May be empty.
    """
    # The result count must be passed as `k` (as the initial retrieval does);
    # the former `top_k` keyword did not set the number of results.
    candidates = vector_store.similarity_search(query, k=k)

    # Headings identify documents that were already retrieved
    seen_headings = {doc.metadata['heading'] for doc in docs}
    return [doc for doc in candidates if doc.metadata['heading'] not in seen_headings]

In [None]:
# The user question that drives retrieval
query = "Tell me about what the news in Karachi is  mostly about"

# Fetch the two most similar documents for the question and merge their text
# into one space-separated context string
current_docs = vector_store.similarity_search(query, k=2)
context = " ".join(doc.page_content for doc in current_docs)

In [None]:
# Local directory containing the Mistral-7B-v0.1 checkpoint
model_path = "/home/nealsharma/llm/Mistral-7B-v0.1"

# Instantiate the tokenizer and the causal LM from that directory;
# the weights are loaded in bfloat16 with automatic device placement
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Placeholder for the tokenized context so it can be reused instead of being
# re-tokenized; it is filled in by the generation cell below.
cached_inputs = None

In [None]:
import time


# Tokenize the initial context (cache tokenized inputs) and move the tensors to
# the model's device: the model is loaded with device_map="auto", while the
# tokenizer returns CPU tensors.
# NOTE(review): the prompt consists of the retrieved articles only; `query` is
# never shown to the model — confirm whether that is intended.
cached_inputs = tokenizer(context, return_tensors="pt").to(model.device)

# Generate text using the loaded model
start_time = time.perf_counter()  # monotonic clock, suited to measuring durations
with torch.no_grad():
    outputs = model.generate(
        input_ids=cached_inputs['input_ids'],
        # Pass the mask produced by the tokenizer instead of letting generate() infer one
        attention_mask=cached_inputs['attention_mask'],
        max_new_tokens=20,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        # `length_penalty` was dropped: it only applies to beam-based generation
        # and has no effect when sampling.
        # Explicit pad token so generate() does not have to fall back to one itself
        pad_token_id=tokenizer.eos_token_id,
    )
print(f"Model inference took {time.perf_counter() - start_time} seconds")

# Decode the generated tokens back into text (the decoded sequence includes the prompt)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Check if more information is needed
# if "need more information" in generated_text or "clarify" in generated_text:
#     # Perform dynamic retrieval based on the generated text
#     new_docs = dynamic_retrieval(generated_text, current_docs, vector_store)
    
#     if new_docs:
#         # Update the context with new information
#         current_docs += new_docs
#         new_context = " " + " ".join([doc.page_content[:300] for doc in new_docs[:2]])  # Update context, limit to 300 chars
#         new_inputs = tokenizer(new_context, return_tensors="pt")  # Only tokenize new part
        
#         # Combine cached inputs with newly tokenized inputs
#         inputs['input_ids'] = torch.cat([cached_inputs['input_ids'], new_inputs['input_ids']], dim=1)

#         # Generate more text using the updated context
#         start_time = time.time()
#         with torch.no_grad():
#             outputs = model.generate(
#                 inputs['input_ids'],
#                 max_new_tokens=20,
#                 do_sample=True,
#                 top_k=50,
#                 temperature=0.7,
#                 length_penalty=1.5
#             )
#         print(f"Model inference (second iteration) took {time.time() - start_time} seconds")

#         generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         print(generated_text)