In [None]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.docstore.document import Document
from nltk.corpus import wordnet
import torch

# Fetch the WordNet corpus; the query-expansion step looks up synsets in it
import nltk
nltk.download("wordnet")

# Create the Kaggle API client and authenticate it before any dataset call
api = KaggleApi()
api.authenticate()


In [None]:
# Location of the CSV extracted from the Kaggle dataset 'asad1m9a9h6mood/news-articles'
file_path = 'data/Articles.csv'  # Adjust the path based on dataset name

# Only hit the Kaggle API when the CSV is not on disk yet, so that re-running the
# notebook does not download and unzip the archive again.
if not os.path.exists(file_path):
    api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True)

# Load the dataset; the file is decoded as ISO-8859-1 (Latin-1)
df = pd.read_csv(file_path, encoding='ISO-8859-1')


In [None]:
# Upper bound on the number of articles to index.
# TODO: processing the full dataset takes too long, so only the first rows are used.
max_length = 3000

# Convert the first `max_length` rows into the Document format required by Chroma.
# The frame is truncated *before* the documents are built, so no Document objects
# are created only to be discarded by a later slice.
document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in df.head(max_length).iterrows()
]


In [None]:
# Checkpoint name shared by both embedding objects below
embedding_model_name = 'all-MiniLM-L6-v2'

# Standalone SentenceTransformer instance (this is not the object passed to Chroma)
embedder = SentenceTransformer(embedding_model_name)

# LangChain embedding wrapper for the same checkpoint; Chroma embeds with this one
embeddings = SentenceTransformerEmbeddings(model_name=embedding_model_name)

# Embed the documents and index them in a Chroma vector store
vector_store = Chroma.from_documents(document_list, embedding=embeddings)


In [None]:
# Start of 2.2
# Function to expand the query using WordNet

def expand_query_with_synonyms(query, synsets_fn=None):
    """Append synonyms of every word in ``query`` to the query text.

    Each whitespace-separated word is looked up, and the lemma names of all
    returned synsets are appended after the original query.

    Args:
        query: The query string to expand.
        synsets_fn: Optional callable mapping a word to an iterable of synsets,
            where each synset has ``lemmas()`` and each lemma has ``name()``.
            Defaults to ``wordnet.synsets``.

    Returns:
        The original query followed by a space-separated list of synonyms, or
        the unchanged query when no new synonym is found.
    """
    if synsets_fn is None:
        synsets_fn = wordnet.synsets

    words = query.split()
    # Track terms case-insensitively so neither the query's own words nor a
    # synonym shared by several synsets is appended more than once.
    seen = {word.lower() for word in words}

    # A list (not a set) keeps first-seen order, so the expanded query is the
    # same on every run instead of depending on string hash randomization.
    synonyms = []
    for word in words:
        for syn in synsets_fn(word):
            for lemma in syn.lemmas():
                # Multi-word lemma names use underscores (e.g. "tell_apart");
                # turn them back into plain words for the embedding model.
                term = lemma.name().replace("_", " ")
                key = term.lower()
                if key not in seen:
                    seen.add(key)
                    synonyms.append(term)

    if not synonyms:
        # Avoid returning the query with a dangling trailing space.
        return query
    return query + " " + " ".join(synonyms)


In [None]:
# User question, enriched with synonyms before it is sent to the retriever
query = "Tell me about what the news in Karachi is  mostly about"
expanded_query = expand_query_with_synonyms(query)

# Semantic search over the vector store: ask for the 5 most similar documents
retrieved_docs = vector_store.similarity_search(expanded_query, k=5)

# Keep only the first two hits and join their text into a single context string
top_docs = retrieved_docs[:2]
context = " ".join(doc.page_content for doc in top_docs)


In [None]:
# Location of the Mistral-7B checkpoint. Set the MISTRAL_MODEL_PATH environment
# variable to use a different location; the original local path stays the default
# so existing setups keep working unchanged.
model_path = os.environ.get("MISTRAL_MODEL_PATH", "/home/nealsharma/llm/Mistral-7B-v0.1")

In [None]:
# Tokenizer and causal-LM weights are both read from the same checkpoint path
tokenizer = AutoTokenizer.from_pretrained(model_path)

# device_map="auto" leaves the placement of the weights to the loader
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
)

In [None]:
# Tokenize the context and move the tensors to the device the model lives on.
# `model.device` is used because no standalone `device` variable is defined
# earlier in the notebook, which made this cell fail with a NameError.
inputs = tokenizer(context, return_tensors="pt").to(model.device)


In [None]:
# Sample a continuation of the tokenized context with the loaded model
with torch.no_grad():  # Disables gradient computation for faster inference
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=50,  # Generate up to 50 new tokens
        do_sample=True,      # Enable sampling for more varied results
        top_k=50,            # Limit the sampling to the top 50 tokens
        temperature=0.7,      # Control the randomness (higher = more random)
        # NOTE: `length_penalty` was removed. It only affects beam-based
        # generation, so it did nothing in this sampling setup (and positive
        # values favour longer, not shorter, sequences).
    )

# Decode the generated tokens back into text. For a causal LM the returned
# sequence starts with the prompt tokens, so the text includes the context.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

