In [None]:
import requests
import pandas as pd
import os
import time
from dotenv import load_dotenv

In [None]:
# Pull variables from the local .env file into the process environment,
# then read the Newsdata.io API key from it (None when the variable is unset).
load_dotenv()
NEWSDATA_IO_KEY = os.environ.get("NEWSDATA_IO_KEY")

In [None]:
QUERY = "Artificial Intelligence"
TARGET_ARTICLE_COUNT = 200
REQUEST_DELAY_SECONDS = 5
# Upper bound (seconds) on a single HTTP request, so a stalled connection
# cannot hang this cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# One normalised dict per article, accumulated across all fetched pages.
all_articles = []
# Pagination cursor taken from the "nextPage" field of the previous response;
# None means "request the first page".
next_page_token = None

print(f"Starting paginated fetch from Newsdata.io. Target: {TARGET_ARTICLE_COUNT} articles.")

# Keep requesting pages until the target number of articles is reached.
# A page is always kept whole, so the final count may exceed the target.
while len(all_articles) < TARGET_ARTICLE_COUNT:
    url = "https://newsdata.io/api/1/news"
    params = {
        "q": QUERY,
        "language": "en",
        "apikey": NEWSDATA_IO_KEY,
    }

    # If we have a token from a previous request, ask for that page.
    if next_page_token:
        params["page"] = next_page_token

    print(f"Fetching page... (Collected {len(all_articles)} articles so far)")

    # Only the request itself is guarded here, so that errors raised while
    # processing the payload are not misreported as network failures.
    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException as e:
        print(f"A network error occurred: {e}. Stopping.")
        break

    if response.status_code != 200:
        print(f"Error: {response.status_code}. Stopping.")
        break  # Exit the loop on an HTTP error

    try:
        data = response.json()
    except ValueError as e:
        # Raised when the body is not valid JSON (e.g. an HTML error page).
        print(f"Could not decode the API response as JSON: {e}. Stopping.")
        break

    if not isinstance(data, dict) or data.get("status") != "success":
        # On failure "results" is expected to be a dict carrying "message";
        # fall back to printing whatever is there if it has another shape.
        error_info = data.get("results") if isinstance(data, dict) else data
        message = error_info.get("message") if isinstance(error_info, dict) else error_info
        print(f"API Error: {message}. Stopping.")
        break  # Exit the loop on an API-level error

    # "results" may be present but null; treat that as an empty page.
    articles_on_this_page = data.get("results") or []

    # Map each article onto the column names used by the NewsAPI dataset.
    for article in articles_on_this_page:
        creators = article.get("creator")
        if isinstance(creators, (list, tuple)):
            # Skip empty entries and tolerate non-string ones.
            authors = ", ".join(str(name) for name in creators if name) or "N/A"
        elif creators:
            authors = str(creators)
        else:
            authors = "N/A"

        all_articles.append({
            "source": article.get("source_id", "N/A"),
            "author": authors,
            "title": article.get("title"),
            "description": article.get("description"),
            "url": article.get("link"),
            "publishedAt": article.get("pubDate"),
        })

    next_page_token = data.get("nextPage")
    if not next_page_token:
        print("No more pages of results available.")
        break  # Exit the loop when the API reports no further page

    # Pause between requests to stay within the API rate limit, but only
    # when another request is actually going to be made.
    if len(all_articles) < TARGET_ARTICLE_COUNT:
        time.sleep(REQUEST_DELAY_SECONDS)

In [None]:
# Write the collected articles to a CSV file, or report that there are none.
if all_articles:
    # Columns written to the CSV, in this order.
    # NOTE(review): each article dict also carries an "author" field that is
    # not listed here, so it is not exported -- confirm this is intended.
    output_columns = ["source", "title", "description", "url", "publishedAt"]

    # Build the table from the list of dicts and keep only the export columns.
    df = pd.DataFrame(all_articles).loc[:, output_columns]

    # Save to its own CSV file, without the DataFrame index.
    df.to_csv("newsdata_io.csv", index=False, encoding="utf-8")

    print("\nSuccess!")
    print(f"Saved a total of {len(df)} articles to 'newsdata_io.csv'.")
else:
    print("\nNo articles were collected.")