In [1]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.docstore.document import Document
from nltk.corpus import wordnet
import torch

# # Ensure NLTK resources are downloaded
# nltk.download('wordnet')

# Authenticate with the Kaggle API
api = KaggleApi()
api.authenticate()


  from tqdm.autonotebook import tqdm, trange
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [2]:
import os
import shutil

# Path to Hugging Face cache
cache_dir = os.path.expanduser("~/.cache/huggingface")

# Check if the cache directory exists and delete it
if os.path.exists(cache_dir):
    shutil.rmtree(cache_dir)
    print(f"Hugging Face cache at {cache_dir} has been cleared.")
else:
    print("Hugging Face cache is already cleared.")


Hugging Face cache at /home/nealsharma/.cache/huggingface has been cleared.


In [3]:
# Prune Git LFS objects to free up space
!git -C /home/nealsharma/llm/Mistral-7B-v0.1 lfs prune


prune: 5 local object(s), 5 retained, done.                                     


In [4]:
# Download the "All The News" dataset from Kaggle
api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True)

# Load and process the dataset
file_path = 'data/Articles.csv'  # Adjust the path based on dataset name
df = pd.read_csv(file_path, encoding='ISO-8859-1')


Dataset URL: https://www.kaggle.com/datasets/asad1m9a9h6mood/news-articles


In [5]:
# Convert documents into the required format for Chroma 
document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in df.iterrows()
]

# NOTE Truncate document list because it takes too long to process
max_length = 10
if len(document_list) > max_length:
    document_list = document_list[:max_length]


In [6]:
# Initialize the SentenceTransformer and Chroma vectorstore
# Embed documents using SentenceTransformer

embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2')

# Create a vector store using Chroma
vector_store = Chroma.from_documents(document_list, embedding_function)


  embedding_function = SentenceTransformerEmbeddings(model_name = 'all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
def expand_query(query):
    expanded_terms = []
    for word in query.split():
        synsets = wordnet.synsets(word)
        if synsets:
            expanded_terms.extend([lemma.name() for lemma in synsets[0].lemmas()])
    return " ".join(set(query.split() + expanded_terms))

def dynamic_retrieval(query, docs, vector_store, k=2):
    expanded_query = expand_query(query)
    new_docs = vector_store.similarity_search(expanded_query, k=k)
    retrieved_docs = {doc.metadata['heading'] for doc in docs}
    unique_new_docs = [doc for doc in new_docs if doc.metadata['heading'] not in retrieved_docs]
    return unique_new_docs


In [8]:
# Set the correct model path
model_path = "/home/nealsharma/llm/Mistral-7B-v0.1"

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Expand the initial query
query = "Tell me about what the news in Karachi is based on what you know"

# Initial document retrieval using the query
current_docs = vector_store.similarity_search(query, k=2)
context = " ".join([doc.page_content for doc in current_docs])

# Caching tokenized context to avoid re-tokenization
cached_inputs = tokenizer(context, return_tensors="pt")

In [None]:
max_iterations = 3
for i in range(max_iterations):
    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            cached_inputs['input_ids'],
            max_new_tokens=40,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            length_penalty=1.5
        )
    
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Iteration {i+1} generated text:\n{generated_text}\n")
    
    # Check if more information is needed
    if len(generated_text.split()) < 20 or "need more information" in generated_text.lower():
        new_docs = dynamic_retrieval(generated_text, current_docs, vector_store, k=2)
        if new_docs:
            current_docs += new_docs
            new_context = " ".join([doc.page_content for doc in new_docs])
            context += " " + new_context
            cached_inputs = tokenizer(context, return_tensors="pt")
        else:
            print("No new relevant information found. Ending generation.")
            break
    else:
        print("Sufficient information generated. Ending generation.")
        break

# Cell 10: Clean up (optional)
del model
torch.cuda.empty_cache()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
