In [26]:
import requests
import json
import google.generativeai as genai
from google.colab import userdata
import pprint
from google import genai

# Read the API credentials through google.colab's `userdata` helper rather than
# hardcoding them in the notebook.
news_key=userdata.get("CURRENT_KEY")  # sent as the `apiKey` query parameter to the Currents news API
gemma=userdata.get("GOOGLE_API_KEY")  # passed to genai.Client(api_key=...) when the Gemini client is created

In [27]:
# Create Gemini client
# NOTE: `genai` here is the module bound by `from google import genai`; that import
# runs after `import google.generativeai as genai` in the imports cell and rebinds
# the name, so the earlier import is effectively unused.
client = genai.Client(api_key=gemma)

# Utility helper for talking to Gemini
def get_response(prompt, model='gemini-2.0-flash-lite'):
    """Send a prompt to Gemini and return the text of the reply.

    Args:
        prompt: Content forwarded unchanged as `contents` to
            `client.models.generate_content`.
        model: Name of the model to query; defaults to 'gemini-2.0-flash-lite'.

    Returns:
        The `text` attribute of the response object returned by the client.
    """
    # Relies on the module-level `client` created in the same cell.
    reply = client.models.generate_content(model=model, contents=prompt)
    return reply.text

In [18]:
# Smoke test for get_response: send a single prompt using the default model.
# The bare call on the last line makes the notebook display the returned text.
get_response("What is a zero shot prompt?")

'A zero-shot prompt is a way to get a language model (like GPT-3 or similar) to perform a task **without** providing it with any examples of the desired input-output mapping. In other words, you ask it to do something it\'s never explicitly been trained on *with examples*.\n\nHere\'s a breakdown:\n\n* **Zero Examples:**  The "zero-shot" part means you\'re providing *zero* examples in your prompt to demonstrate the desired behavior.  You\'re not showing it how to do it.\n\n* **Prompt Engineering:** You rely on carefully crafting your prompt to clearly define the task you want the model to perform. This is key.  The prompt has to be very specific and unambiguous.\n\n* **Leveraging Pre-existing Knowledge:**  The power of zero-shot learning comes from the language model\'s massive training dataset. It implicitly learns a lot of world knowledge, relationships, and patterns from the vast amount of text it\'s been exposed to.  You\'re hoping it can apply this general knowledge to a specific t

In [31]:
# Query the Currents API for English-language, health-related articles.
# The query string is passed via `params` so requests handles the URL encoding;
# `url` therefore holds only the endpoint.
url = 'https://api.currentsapi.services/v1/search'
params = {
    'keywords': 'Health',
    'language': 'en',
    'page_size': 150,
    'apiKey': news_key,
}
# A timeout keeps the cell from hanging indefinitely if the service stalls.
response = requests.get(url, params=params, timeout=30)
# Fail loudly on HTTP errors instead of treating an error payload as results.
# NOTE: the raised error message contains the request URL, including the apiKey
# parameter -- clear the cell output before sharing the notebook if this fires.
response.raise_for_status()
data = response.json()
# Report only the article count; printing the whole payload floods the notebook.
print(f"Total Results: {len(data.get('news', []))}")  # Access 'news' from JSON data

Total Results: 150


In [38]:
# Display one sample article from the fetched results.
# NOTE(review): index 65 appears to be an arbitrary pick; this raises IndexError
# if the response contained fewer than 66 articles.
pprint.pprint(data['news'][65])

{'author': 'Joseph Choi',
 'category': ['insurance', 'health'],
 'description': 'The Centers for Disease Control and Prevention (CDC) '
                'published a webpage Friday listing the conflicts of interest '
                'reported by members of a key vaccine advisory committee.\n'
                'Health and Human Services (HHS) Secretary Robert F. Kennedy '
                'Jr. hailed it as another step toward "radical transparency," '
                'while on...',
 'id': '288ba563-12f6-4708-9e79-b74c53f2e762',
 'image': 'None',
 'language': 'en',
 'published': '2025-03-07 21:08:05 +0000',
 'title': "CDC launches 'conflicts of interest' page about vaccine advisory "
          'panel',
 'url': 'https://thehill.com/policy/healthcare/5183223-cdc-vaccine-advisory-committee-conflicts/'}


In [73]:
# Preprocessing retrieved articles: handling duplicates and missing data
def preprocess_articles(articles, title_prefix="PROCESSED: ", verbose=True):
    """Deduplicate articles by URL and fill in defaults for missing fields.

    Args:
        articles: List of article dicts (as found under the 'news' key of the
            API response). Only the list passed in is inspected; no notebook
            globals are read.
        title_prefix: String prepended to every kept title. The default keeps
            the original "PROCESSED: " marker; pass "" to leave titles as-is.
        verbose: When True, print the input size and the sorted keys of the
            first article as a quick sanity check.

    Returns:
        A new list of dicts with the keys id, title, description, url, author,
        image, language, category and published. Articles are dropped when
        they have no URL, repeat an already-seen URL, or have neither a title
        nor a description.
    """
    missing_title = "No title available"
    missing_description = "No description available"

    if verbose:
        print(f"Starting preprocessing on {len(articles)} articles")
        # Inspect the first article of the input rather than a fixed index of a
        # global, so the function also works on short or empty lists.
        if articles:
            print("\nOriginal article keys:")
            print(sorted(articles[0].keys()))

    seen_urls = set()
    preprocessed_articles = []
    for article in articles:
        # skip articles without url
        url = article.get('url')
        if not url:
            continue
        # skip duplicate urls
        if url in seen_urls:
            continue
        seen_urls.add(url)

        # `or` also covers keys that are present but None/empty, which would
        # otherwise break the string concatenation below.
        title = article.get("title") or missing_title
        description = article.get("description") or missing_description

        # Skip articles with both title and description missing. This must be
        # checked on the raw values: once the prefix is applied the title can
        # never equal the placeholder again.
        if title == missing_title and description == missing_description:
            continue

        preprocessed_articles.append({
            "id": article.get("id", ""),
            "title": title_prefix + title,
            "description": description,
            "url": url,
            "author": article.get("author", "Unknown"),
            "image": article.get("image", ""),
            "language": article.get("language", "en"),
            "category": article.get("category", []),
            "published": article.get("published", "na"),
        })
    return preprocessed_articles

raw_articles = data.get('news', [])
processed_data = preprocess_articles(raw_articles)

print(f"Total articles after preprocessing: {len(processed_data)}")

# Show one article before and after preprocessing. The original is looked up by
# URL rather than by position: once any article has been dropped, the same index
# in the raw and processed lists no longer refers to the same article.
SAMPLE_INDEX = 17
if processed_data:
    # Clamp so short result sets still show a sample instead of raising IndexError.
    sample_processed = processed_data[min(SAMPLE_INDEX, len(processed_data) - 1)]
    sample_original = next(
        (a for a in raw_articles if a.get('url') == sample_processed['url']),
        None,
    )

    #print sample original article
    print("\nOriginal article:")
    pprint.pprint(sample_original)

    #print sample preprocessed article
    print("\nPreprocessed article:")
    pprint.pprint(sample_processed)

Starting preprocessing on 150 articles

Original article keys:
['author', 'category', 'description', 'id', 'image', 'language', 'published', 'title', 'url']
Total articles after preprocessing: 150

Original article:
{'author': 'Jessica Patterson',
 'category': ['regional', 'springfield', 'massachusetts'],
 'description': 'MARTIN COUNTY, KY (WOWK) - As communities across the region '
                "continue to rebuild after February's catastrophic flooding, "
                'efforts are still underway to lend a hand to those in need.\n'
                'Thanks to the generosity of others, the Martin County Health '
                "Department's Rebuilding Joy Initiative collected thousands "
                '...',
 'id': 'f285f03f-5c7c-4e16-a511-87dabaaa0f70',
 'image': 'None',
 'language': 'en',
 'published': '2025-03-07 23:52:09 +0000',
 'title': 'Martin County rebuilds joy for kids with toy drive after floods',
 'url': 'https://www.wowktv.com/news/kentucky/martin-county-ky/martin-