In [9]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.docstore.document import Document


In [10]:
# Step 1: Authenticate with the Kaggle API
api = KaggleApi()
api.authenticate()

# Download the dataset (e.g., "All the News" dataset)
api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True) 


Dataset URL: https://www.kaggle.com/datasets/asad1m9a9h6mood/news-articles


In [11]:

# Step 2: Load and Process the Dataset
file_path = 'data/Articles.csv'  # path based on dataset name
df = pd.read_csv(file_path, encoding='ISO-8859-1')
#print(df.columns)

# Extract the content (text) column from the dataset
documents = df['Article'].dropna().tolist()


# Convert documents into the required format for Chroma
document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in df.iterrows()
]

In [12]:
# Step 3: Embed Documents
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

#Truncate document list because it takes too long to process
if len(document_list) > 1000:
    document_list = document_list[:1000]

# Create a vector store using Chroma
vector_store = Chroma.from_documents(document_list, embeddings)






In [13]:
# Step 4: Query the Vector Store
query = "Tell me about what most likely happens in the news for Istanbul"
retrieved_docs = vector_store.similarity_search(query, k=2)


In [14]:
# Step 5: Generate a Response Using Retrieved Documents as Context
context = " ".join([doc.page_content for doc in retrieved_docs])

# Get the Hugging Face token from the environment variables
hf_token = os.getenv("HUGGING_FACE_TOKEN")

# Use an LLM for response generation, in this case GPT2
#llm = pipeline("text-generation", model="gpt2")  # Replace with another model if needed
llm = pipeline("text-generation", model="mistralai/Mistral-7B-v0.1", token=hf_token)



Loading checkpoint shards: 100%|██████████| 2/2 [01:11<00:00, 35.84s/it]


In [15]:
# Step 6: Generate the Final Response
response = llm(context, max_new_tokens=30, do_sample=True, top_k=50)[0]['generated_text']
print(f"Generated Response: {response}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Generated Response: strong>ISTANBUL: The foiled coup attempt seeking to unseat the government of President Recep Tayyip Erdogan has cost the Turkish economy 300 billion lira ($100 billion/90 billion euro), the trade minister was quoted as saying on Tuesday.</strongCustoms and Commerce Minister Bulent Tufenkci was quoted as saying by the Hurriyet newspaper the heavy price tag may even go up but insisted that Turkey´s economic fundamentals were solid."Warplanes, helicopters, weapons, bombs, buildings: 300 billion lira. Maybe I am underestimating a bit," he said, referring to the heavy destruction on the night of the coup."It might go up even more," he said.But the minister warned that the entire picture should be seen in a medium-term context even if some investors had been put off in the short-term."The putschists made Turkey seem like a third world country," he fumed."They (investors) are not coming after the images revealed tanks were deployed on the streets, parliament was bombed," h