In [None]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.docstore.document import Document
from nltk.corpus import wordnet

# Ensure the WordNet corpus is available. Look for it locally first and only
# invoke the NLTK downloader when the lookup fails, so re-running this cell
# does not go through the downloader every time.
import nltk
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

# Create the Kaggle API client and authenticate it; `api` is used by the
# dataset download cell below.
api = KaggleApi()
api.authenticate()


In [None]:
# Fetch the Kaggle dataset 'asad1m9a9h6mood/news-articles' into data/ only
# when the CSV is not already on disk, so re-running the notebook does not
# re-download and re-unzip the archive every time.
data_dir = 'data/'
file_path = 'data/Articles.csv'  # Adjust the path based on dataset name
if not os.path.exists(file_path):
    api.dataset_download_files('asad1m9a9h6mood/news-articles', path=data_dir, unzip=True)

# Load the articles; the file is read as ISO-8859-1 rather than UTF-8.
df = pd.read_csv(file_path, encoding='ISO-8859-1')


In [None]:
# Convert the articles into LangChain Documents for Chroma.

# TODO: the corpus is capped because embedding every article takes too long.
max_documents = 1000

# Drop rows with no article text (a missing value is not valid page content)
# and apply the cap BEFORE building Documents, so objects are only created for
# the rows that are actually kept.
articles = df.dropna(subset=['Article']).head(max_documents)

document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in articles.iterrows()
]


In [None]:
# Sentence-embedding model name shared by both objects created below.
embedding_model_name = 'all-MiniLM-L6-v2'

# Raw SentenceTransformer handle. NOTE(review): nothing in the visible cells
# reads `embedder`; Chroma only receives the LangChain wrapper below.
embedder = SentenceTransformer(embedding_model_name)

# LangChain embedding wrapper handed to the vector store.
embeddings = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Build the Chroma vector store from the prepared documents.
vector_store = Chroma.from_documents(documents=document_list, embedding=embeddings)


In [None]:
# Start of 2.2
# Function to expand the query using WordNet

def expand_query_with_synonyms(query):
    """Append WordNet synonyms of each query word to the query.

    The query is split on whitespace; for every word, the lemma names of all
    of its WordNet synsets are collected and appended after the original text.

    Args:
        query: The search query string.

    Returns:
        The original query followed by a space-separated list of synonyms,
        or the query unchanged when no synonym is found.
    """
    words = query.split()
    # Seed with the query's own words so they are not appended a second time.
    seen = {word.lower() for word in words}
    # A list (not a set) keeps first-seen order, so the expanded query is the
    # same on every run instead of depending on string hash randomization.
    synonyms = []
    for word in words:
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                # WordNet joins multi-word lemmas with underscores.
                term = lemma.name().replace("_", " ")
                key = term.lower()
                if key not in seen:
                    seen.add(key)
                    synonyms.append(term)
    if not synonyms:
        # Avoid returning the query with a dangling trailing space.
        return query
    return query + " " + " ".join(synonyms)


In [None]:
# The user's question, and the same question enriched with WordNet synonyms.
query = "Tell me about the latest news in the world"
expanded_query = expand_query_with_synonyms(query)

# Number of nearest documents to fetch from the vector store.
top_k = 5

# Semantic search over the vector store using the expanded query.
retrieved_docs = vector_store.similarity_search(query=expanded_query, k=top_k)



In [None]:
# Read the Hugging Face access token from the environment
# (the value is None when HUGGING_FACE_TOKEN is not set).
hf_token = os.environ.get("HUGGING_FACE_TOKEN")

# Initialize the text-generation LLM.
# Alternatives tried earlier: model="gpt2", or passing the path of a locally
# cached snapshot of this same Mistral model as `model` for offline use on a VM.
llm = pipeline(
    task="text-generation",
    model="mistralai/Mistral-7B-v0.1",
    token=hf_token,
    batch_size=50,
)



In [None]:
# Combine the top-ranked documents into a context string (limited to the top 2).
context = "\n\n".join(doc.page_content for doc in retrieved_docs[:2])

# Put the user's question in the prompt together with the retrieved context.
# Without the question the model only continues the article text and never
# answers the query.
prompt = (
    "Answer the question using only the context below.\n\n"
    f"Context:\n{context}\n\n"
    f"Question: {query}\n"
    "Answer:"
)

# Generate the final response. return_full_text=False keeps only the newly
# generated tokens; by default the pipeline echoes the whole prompt back.
outputs = llm(
    prompt,
    max_new_tokens=100,
    do_sample=True,
    top_k=50,
    return_full_text=False,
)
response = outputs[0]['generated_text'].strip()
print(response)
