In [12]:
import os
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi
from langchain.vectorstores import Chroma
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.docstore.document import Document


Dataset URL: https://www.kaggle.com/datasets/asad1m9a9h6mood/news-articles


In [None]:
# Step 1: Authenticate with the Kaggle API
api = KaggleApi()
api.authenticate()

# Download the dataset (e.g., "All the News" dataset)
api.dataset_download_files('asad1m9a9h6mood/news-articles', path='data/', unzip=True)


In [14]:

# Step 2: Load and Process the Dataset
file_path = 'data/Articles.csv'  # path based on dataset name
df = pd.read_csv(file_path, encoding='ISO-8859-1')
print(df.columns)

# Extract the content (text) column from the dataset
documents = df['Article'].dropna().tolist()


# Convert documents into the required format for Chroma
document_list = [
    Document(
        page_content=row['Article'],
        metadata={
            'date': row['Date'],
            'heading': row['Heading'],
            'news_type': row['NewsType']
        }
    )
    for _, row in df.iterrows()
]

Index(['Article', 'Date', 'Heading', 'NewsType'], dtype='object')


In [17]:
# Step 3: Embed Documents
embedder = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

#Truncate document list because it takes too long to process
if len(document_list) > 1000:
    document_list = document_list[:1000]

# Create a vector store using Chroma
vector_store = Chroma.from_documents(document_list, embeddings)






In [21]:
# Step 4: Query the Vector Store
query = "Tell me about what most likely happens in the news for Sweden"
retrieved_docs = vector_store.similarity_search(query, k=2)

# Step 5: Generate a Response Using Retrieved Documents as Context
context = " ".join([doc.page_content for doc in retrieved_docs])


# Use an LLM for response generation, in this case GPT2
llm = pipeline("text-generation", model="gpt2")  # Replace with another model if needed
response = llm(context, max_new_tokens=100, do_sample=True, top_k=50)[0]['generated_text']

print(f"Generated Response: {response}")


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Response: strong>WASHINGTON: International Monetary Fund chief Christine Lagarde said that Britain´s shock vote to quit the European Union has injected significant uncertainty into the global economy but is unlikely to cause a world recession.</strongBut in an exclusive interview with AFP, she also said that Brexit underscores the need for the EU to do a better explaining how it benefits Europeans, amid "disenchantment" with the institution.And she said that Britain´s move to cut corporate taxes to counter the expected economic fallout from its choice to break with the EU was just a "race to the bottom" that could hurt everyone.Two weeks after the British referendum on cutting its EU ties, Lagarde, speaking in her Washington offices at the beginning of her second five year term as IMF managing director, called the event a "major downside risk" for the world."We don´t think that a global recession is very likely. The immediate effects will be on the UK," with some spillover in