Use Python 3.11.5

In [1]:
import os
import json
import requests
from bs4 import BeautifulSoup
from langdetect import detect, DetectorFactory
from datetime import datetime
from titlecase import titlecase

# Set a seed for consistent language detection results.
DetectorFactory.seed = 0

# === CONFIGURATION ===

# Database file path (this file will be updated by the pipeline)
DB_FILE = "link_cards.json"

# List of Altmetric news page URLs to scrape.
ALTMETRIC_BASE_URLS = [
    "https://nature.altmetric.com/details/49134871/news",
    "https://nature.altmetric.com/details/86875450/news",
    "https://royalsociety.altmetric.com/details/67113987/news",
    "https://cambridge.altmetric.com/details/148065072/news",
    "https://plos.altmetric.com/details/50457092/news",
    "https://www.altmetric.com/details/109906728/news",
    "https://nature.altmetric.com/details/154178347/news",
    "https://www.altmetric.com/details/121359234/news",
    "https://tandf.altmetric.com/details/81579019/news",
    "https://www.altmetric.com/details/142953324/chapter/155827676/news",
    "https://royalsociety.altmetric.com/details/161515855/news",
    "https://www.altmetric.com/details/108380522/news",
    "https://oxfordjournals.altmetric.com/details/168344032/news",
    "https://plos.altmetric.com/details/166068334/news",
    "https://scienceadvances.altmetric.com/details/173863208/news",
    "https://springeropen.altmetric.com/details/163437325/news",
    "https://science.altmetric.com/details/173914272/news"
]

# other links, some radio and some other languages
# https://www.cbc.ca/listen/live-radio/1-8-your-world-tonight/clip/16126567-shipping-tax-online-gambling-whale-songs
# https://www.swr.de/wissen/evolutionaere-parallelen-bei-walgesaengen-und-menschlicher-sprache-100.html
# https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences-week-end/le-chant-des-baleines-un-langage-complexe-comparable-a-celui-des-humains-7789902
# https://www.haaretz.co.il/science/biology/2025-02-07/ty-article-magazine/.premium/00000194-db9b-df08-a3bc-dfbfa34a0000?gift=82191ff5ae4e40c981df2354adf1d19b
# https://super.abril.com.br/ciencia/baleias-se-comunicam-de-forma-tao-eficiente-quanto-humanos-segundo-estudo/
# https://www.deutschlandfunk.de/sprachmuster-walgesaenge-haben-aehnlichkeiten-zur-menschlichen-sprache-100.html

# Manual articles can be added here.
# Each article should be a dictionary with these keys:
# 'link', 'image', 'alt_text', 'title', 'source', 'time_text', 'summary', and 'lang'
MANUAL_ARTICLES = [
    # {
    #     'link': 'https://cosmiclog.com/2025/02/06/scientists-find-links-between-whale-songs-and-languages/',
    #     'image': 'https://i0.wp.com/cosmiclog.com/wp-content/uploads/2020/07/cropped-costype4.jpg?w=928&ssl=1',
    #     'alt_text': 'Cosmic Log',
    #     'title': 'Scientists find links between whale songs and languages',
    #     'source': 'Cosmic Log',
    #     'time_text': '06 Feb 2025',
    #     'summary': "When whales sing, what do they sing about? Researchers haven't yet cracked that code, but they say a statistical analysis shows...",
    #     'lang': 'en',
    #     'dt': datetime.strptime('06 Feb 2025', '%d %b %Y')
    # },
    {
        'link': 'https://www.youtube.com/watch?v=1gEI7Ihibac',
        'image': 'https://yt3.googleusercontent.com/wDzBAf9UB_rSJZlt_pHtRjGx0JzbK5bwhbdQxKwaW8D5254qsAuzTgm2w6V7q13yKghazQruAEA=s160-c-k-c0x00ffffff-no-rj',
        'alt_text': 'Wild Resonance',
        'title': 'Wild Resonance #1: animal syntax in the trees and seas - a conversation with Dr. Mason Youngblood',
        'source': 'Wild Resonance',
        'time_text': '07 Aug 2025',
        'summary': 'Dr. Youngblood and Stephen explore Mason`s fascinating research on vocal efficiency in house finches and cetaceans...',
        'lang': 'en',
        'dt': datetime.strptime('07 Aug 2025', '%d %b %Y')
    },
    {
        'link': 'https://www.cbsnews.com/news/billion-bird-deaths-glass-window-building-crashes/',
        'image': 'https://upload.wikimedia.org/wikipedia/commons/d/dc/CBS_News_logo_%282020%29.svg',
        'alt_text': 'CBS News',
        'title': 'More than 1 billion birds die each year after crashing into buildings, study finds',
        'source': 'CBS News',
        'time_text': '07 Aug 2024',
        'summary': 'Buildings are one of the top bird killers in the United States, with more than a billion birds dying after a building crash each year...',
        'lang': 'en',
        'dt': datetime.strptime('07 Aug 2024', '%d %b %Y')
    },
    {
        'link': 'https://www.mpg.de/21947859/0521-evan-taking-the-lead-in-music-150495-x',
        'image': 'https://www.mpg.de/assets/og-logo-281c44f14f2114ed3fe50e666618ff96341055a2f8ce31aa0fd70471a30ca9ed.jpg',
        'alt_text': 'Max Planck Society',
        'title': 'Taking the lead in music: First-mover advantage may be key to success in the music industry',
        'source': 'Max Planck Society',
        'time_text': '21 May 2024',
        'summary': 'Researchers studying the science of cultural evolution have conducted a comprehensive study of the role of first-mover advantage in music...',
        'lang': 'en',
        'dt': datetime.strptime('21 May 2024', '%d %b %Y')
    },
    {
        'link': 'https://www.discovermagazine.com/the-sciences/music-and-mutations-could-be-linked',
        'image': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQlkWEaNFhbfisTT-Kc48pYYzR7aNme4L1xSA&s',
        'alt_text': 'Discover Magazine',
        'title': 'Music and Mutations Could Be Linked',
        'source': 'Discover Magazine',
        'time_text': '17 Nov 2022',
        'summary': 'Some researchers think that evolution drives transformations in both species and songs...',
        'lang': 'en',
        'dt': datetime.strptime('17 Nov 2022', '%d %b %Y')
    },
    {
        'link': 'https://www.apa.org/monitor/2021/07/cover-domestic-extremists',
        'image': 'https://aspirace.com/wp-content/uploads/2019/04/APA-Logo.jpg',
        'alt_text': 'Monitor on Psychology',
        'title': 'Deradicalizing domestic extremists',
        'source': 'Monitor on Psychology',
        'time_text': '01 Jul 2021',
        'summary': 'Psychologists are using their expertise in human behavior to identify ways to deradicalize and disengage domestic extremists...',
        'lang': 'en',
        'dt': datetime.strptime('01 Jul 2021', '%d %b %Y')
    },
    {
        'link': 'https://cosmosmagazine.com/people/breaking-up-is-hard-to-do-hip-hop-is-a-cauldron-of-conformity/',
        'image': 'https://cosmosmagazine.com/wp-content/uploads/2021/09/cosmoslogo-396x168-notrans.png',
        'alt_text': 'Cosmos',
        'title': 'Hip Hop is a cauldron of conformity',
        'source': 'Cosmos',
        'time_text': '24 Sep 2021',
        'summary': 'Sampling, the incorporation of short snippets from previously recorded songs, is very popular among hip hop and electronic dance artists...',
        'lang': 'en',
        'dt': datetime.strptime('24 Sep 2021', '%d %b %Y')
    },
    {
        'link': 'https://www.nytimes.com/2020/07/02/science/sparrow-bird-song.html',
        'image': 'https://logos-world.net/wp-content/uploads/2020/11/The-New-York-Times-Logo.png',
        'alt_text': 'New York Times',
        'title': "Canada's Sparrows Are Singing a New Song. You'll Hear It Soon",
        'source': 'New York Times',
        'time_text': '2 Jul 2020',
        'summary': 'Over 20 years, scientists tracked the transformation of the traditional trill of a common bird from western Canada to Ontario...',
        'lang': 'en',
        'dt': datetime.strptime('2 Jul 2020', '%d %b %Y')
    },
    {
        'link': 'https://soundcloud.com/voislam/drive-time-show-podcast-08-02-2021-justice',
        'image': 'https://pbs.twimg.com/profile_images/1338157913056153603/ERXMZcAZ_400x400.jpg',
        'alt_text': 'Voices of Islam Radio',
        'title': "Extremism: Rise from the far right",
        'source': 'Voices of Islam Radio',
        'time_text': '2 Feb 2021',
        'summary': 'Research has shown that higher levels of poverty are linked to increases in violence and extremism...',
        'lang': 'en',
        'dt': datetime.strptime('2 Feb 2021', '%d %b %Y')
    },
    {
        'link': 'https://open.spotify.com/episode/2QCsTvlA7X9GAmOwku98Ff',
        'image': 'https://i.scdn.co/image/ab6765630000ba8aeb58ac03947f5c65fac92b05',
        'alt_text': 'WFUV',
        'title': "'What's What' Podcast",
        'source': 'WFUV',
        'time_text': '20 Nov 2024',
        'summary': 'New research published this August in the scientific journal PLOS ONE suggests that more than a billion birds in the US die annually from building collisions...',
        'lang': 'en',
        'dt': datetime.strptime('20 Nov 2024', '%d %b %Y')
    },
    {
        'link': 'https://open.spotify.com/episode/5xL3CBZE8punJqWqcpAaio',
        'image': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRZh_YSsve2MOnbIakH-RBTmOiF7-95g9OGA&s',
        'alt_text': 'Parsing Science',
        'title': "Sampling music networks",
        'source': 'Parsing Science',
        'time_text': '30 Aug 2019',
        'summary': 'Can the sharing of drum break samples among musicians help us better understand how networks of artists collaborate?',
        'lang': 'en',
        'dt': datetime.strptime('30 Aug 2019', '%d %b %Y')
    },
    {
        'link': 'https://open.spotify.com/episode/5xL3CBZE8punJqWqcpAaio',
        'image': 'https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRZh_YSsve2MOnbIakH-RBTmOiF7-95g9OGA&s',
        'alt_text': 'Parsing Science',
        'title': "Sampling music networks",
        'source': 'Parsing Science',
        'time_text': '30 Aug 2019',
        'summary': 'Can the sharing of drum break samples among musicians help us better understand how networks of artists collaborate?',
        'lang': 'en',
        'dt': datetime.strptime('30 Aug 2019', '%d %b %Y')
    },
    {
        'link': 'https://www.theguardian.com/environment/article/2024/aug/07/death-toll-for-birds-hitting-buildings-may-be-over-1-billion-a-year-in-us-report',
        'image': 'https://assets-legacy.floridarrc.com/2023/01/the-guardian-logo.jpeg',
        'alt_text': 'The Guardian',
        'title': "Death toll for birds hitting buildings may be over 1 billion a year in US",
        'source': 'The Guardian',
        'time_text': '07 Aug 2024',
        'summary': "Only 40 percent of birds survive collisions with windows, researchers say, suggesting that mortality rates are far higher than previously thought...",
        'lang': 'en',
        'dt': datetime.strptime('07 Aug 2024', '%d %b %Y')
    },
    # {
    #     'link': 'https://queenseagle.com/all/queens-college-bird-research-mason-youngblood',
    #     'image': 'https://images.squarespace-cdn.com/content/v1/5b9ffe0f1137a680c2c08250/1537994256644-S7EXI3SF0JJWY5B0K9WF/qde+logo.jpeg?format=1500w',
    #     'alt_text': 'Queens Daily Eagle',
    #     'title': "Queens College student's research is for the birds",
    #     'source': 'Queens Daily Eagle',
    #     'time_text': '17 Apr 2019',
    #     'summary': "Queens College Ph.D. student Mason Youngblood’s work traces the cultural evolution of birdsong...",
    #     'lang': 'en',
    #     'dt': datetime.strptime('17 Apr 2019', '%d %b %Y')
    # },
    # {
    #     'link': 'https://open.spotify.com/episode/1A8mhEbVKfB1Q8k2YgOXji',
    #     'image': 'https://is1-ssl.mzstatic.com/image/thumb/Podcasts211/v4/63/1c/ab/631cab1d-b27e-6fa4-b603-5bd1063e4676/mza_671126361412530562.jpg/300x300bb.webp',
    #     'alt_text': 'The Life Plot',
    #     'title': "019. Mason Youngblood",
    #     'source': 'The Life Plot',
    #     'time_text': '24 Jan 2019',
    #     'summary': "Mario Benitez and Joel Daness invited me on to The Life Plot to talk about public perceptions of evolutionary biology, common cultural transmission mechanisms in humans and animals, and the current state of pop music...",
    #     'lang': 'en',
    #     'dt': datetime.strptime('24 Jan 2019', '%d %b %Y')
    # }
]

# OPTIONAL: Define which sources are considered important.
IMPORTANT_SOURCES = [
    "National Geographic",
    "Scientific American",
    "New York Times",
    "The Guardian",
    "El País",
    "New Scientist",
    "ABC.net.au",
    "Tech Crunch",
    "Medium US",
    "Phys.org",
    "World Economic Forum",
    "The Conversation",
    "RNZ",
    "Los Angeles Times",
    "CBC",
    "CNN News",
    "Washington Post",
    "The Guardian",
    "Popular Science",
    "Futurity",
    "Yahoo! News",
    "Iflscience"
]

# === DATABASE FUNCTIONS ===

def load_database():
    """Load the database of link cards from the JSON file (or return an empty list if not present)."""
    if os.path.exists(DB_FILE):
        with open(DB_FILE, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except Exception as e:
                print("Error loading database:", e)
                return []
    return []

def save_database(db):
    """Save the database list to the JSON file."""
    with open(DB_FILE, "w", encoding="utf-8") as f:
        # Use default=str so datetime objects are converted to strings.
        json.dump(db, f, indent=2, default=str)

# === SCRAPING FUNCTIONS ===

def scrape_altmetric_news(url):
    """Scrapes a given Altmetric news page URL and returns a list of article dictionaries."""
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article', class_='post msm')
    article_list = []
    
    for art in articles:
        a_tag = art.find('a', class_='block_link')
        if not a_tag:
            continue
        link = a_tag.get('href')
        
        img_tag = art.find('img', class_='avatar')
        image = img_tag.get('src') if img_tag else ""
        alt_text = img_tag.get('alt') if img_tag else ""
        
        h3_tag = art.find('h3')
        title = h3_tag.get_text(strip=True) if h3_tag else "No Title"
        
        dt = None
        h4_tag = art.find('h4')
        if h4_tag:
            time_tag = h4_tag.find('time')
            if time_tag and time_tag.has_attr("datetime"):
                datetime_str = time_tag["datetime"]
                if datetime_str.endswith("Z"):
                    datetime_str = datetime_str[:-1]
                try:
                    dt = datetime.fromisoformat(datetime_str)
                except Exception:
                    dt = None
            h4_text = h4_tag.get_text(strip=True)
            if ',' in h4_text:
                source, time_text = h4_text.split(',', 1)
                source = source.strip()
                time_text = time_text.strip()
            else:
                source = h4_text.strip()
                time_text = ""
        else:
            source = "Unknown"
            time_text = ""
        
        # Only include articles from important sources.
        if not any(key.lower() in source.lower() for key in IMPORTANT_SOURCES):
            continue
        
        p_tag = art.find('p', class_='summary')
        summary = p_tag.get_text(strip=True) if p_tag else ""
        
        try:
            lang = detect(title)
        except Exception:
            lang = "unknown"
        
        article_data = {
            'link': link,
            'image': image,
            'alt_text': alt_text,
            'title': titlecase(title),
            'source': source,
            'time_text': time_text,
            'summary': summary,
            'lang': lang,
            'dt': dt,
            'active': True  # New items default to active.
        }
        article_list.append(article_data)
    
    return article_list

def scrape_altmetric_news_pages(base_url, max_pages=10):
    """Iterates through multiple pages of an Altmetric base URL and returns a list of article dictionaries."""
    all_articles = []
    for page in range(1, max_pages + 1):
        url = base_url if page == 1 else f"{base_url.rstrip('/')}/page:{page}"
        print(f"Scraping: {url}")
        try:
            articles = scrape_altmetric_news(url)
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            break
        if not articles:
            break
        all_articles.extend(articles)
    return all_articles

# === HTML GENERATION FUNCTIONS ===

def generate_cards_html(articles):
    """Generates HTML cards for each article using inline styles."""
    cards_html = ""
    for article in articles:
        card_html = f"""
    <article style="background: #fff; border-radius: 10px; box-shadow: 0 4px 10px rgba(0,0,0,0.1); overflow: hidden; width: 100%; margin: 20px 0; transition: transform 0.2s, box-shadow 0.2s;"
      onmouseover="this.style.transform='translateY(-5px)'; this.style.boxShadow='0 6px 15px rgba(0,0,0,0.15)';"
      onmouseout="this.style.transform='none'; this.style.boxShadow='0 4px 10px rgba(0,0,0,0.1)';">
      <a target="_blank" href="{article['link']}" style="text-decoration: none; display: flex; flex-wrap: wrap; align-items: center; padding: 15px; color: inherit;">
        <img alt="{article['alt_text']}" src="{article['image']}" style="width:120px; height:120px; object-fit: contain; border-radius: 8px; margin-right: 15px; flex-shrink: 0;">
        <div style="flex: 1; min-width: 0;">
          <h3 style="margin: 0 0 5px; font-size: 1.1em; color: #222; font-weight: bold;">{article['title']}</h3>
          <h4 style="margin: 0 0 10px; font-size: 0.8em; color: #666;">{article['source']} <span style="color: #999;">| {article['time_text']}</span></h4>
          <p style="margin: 0; font-size: 0.85em; color: #444; line-height: 1.4;">{article['summary']}</p>
        </div>
      </a>
    </article>
        """
        cards_html += card_html
    return cards_html

def generate_html_page(cards_html):
    """Wraps the cards HTML in a complete HTML page with inline styles."""
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>News Coverage</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body style="background: linear-gradient(135deg, #f2f2f2, #cccccc); padding: 20px; margin: 0; font-family: Arial, sans-serif;">
  {cards_html}
</body>
</html>
"""
    return html_content

# === HELPER FUNCTION FOR SORTING ===

def get_dt(article):
    """Return a datetime object from an article's 'dt' field.
       If the field is missing or invalid, return datetime.min."""
    dt_val = article.get('dt')
    if dt_val is None:
        return datetime.min
    if isinstance(dt_val, datetime):
        return dt_val
    try:
        return datetime.fromisoformat(dt_val)
    except Exception:
        return datetime.min

# === MAIN PIPELINE ===

if __name__ == "__main__":
    # Load existing database (or empty list if none).
    db = load_database()
    
    # Build an index of existing links for quick lookup.
    existing_links = {entry['link'] for entry in db}
    
    new_articles = []
    
    # Scrape altmetric pages.
    for base_url in ALTMETRIC_BASE_URLS:
        scraped = scrape_altmetric_news_pages(base_url, max_pages=10)
        new_articles.extend(scraped)
    
    # Add manually defined articles.
    for article in MANUAL_ARTICLES:
        article['title'] = titlecase(article['title'])
        new_articles.append(article)
    
    # Add new articles to the database if they don't already exist.
    for article in new_articles:
        if article['link'] not in existing_links:
            db.append(article)
            existing_links.add(article['link'])
    
    # Sort the updated database by date (newest first) before saving.
    db.sort(key=get_dt, reverse=True)
    
    # Save the updated database.
    save_database(db)
    
    # Filter to only active articles.
    active_articles = [a for a in db if a.get('active', True)]
    # Optionally, filter only English articles.
    active_articles = [a for a in active_articles if a.get('lang', 'unknown') == 'en']
    
    # Sort active articles (they should already be sorted in the db, but ensure here too).
    active_articles.sort(key=get_dt, reverse=True)
    
    # Generate HTML from active articles.
    cards_html = generate_cards_html(active_articles)
    html_page = generate_html_page(cards_html)
    
    # Save the final HTML page.
    output_file = "news_coverage.html"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_page)
    
    print(f"HTML file generated: {output_file}")

Scraping: https://nature.altmetric.com/details/49134871/news
Scraping: https://nature.altmetric.com/details/86875450/news
Scraping: https://nature.altmetric.com/details/86875450/news/page:2
Scraping: https://royalsociety.altmetric.com/details/67113987/news
Scraping: https://royalsociety.altmetric.com/details/67113987/news/page:2
Scraping: https://cambridge.altmetric.com/details/148065072/news
Scraping: https://plos.altmetric.com/details/50457092/news
Scraping: https://www.altmetric.com/details/109906728/news
Scraping: https://nature.altmetric.com/details/154178347/news
Scraping: https://nature.altmetric.com/details/154178347/news/page:2
Scraping: https://www.altmetric.com/details/121359234/news
Scraping: https://tandf.altmetric.com/details/81579019/news
Scraping: https://www.altmetric.com/details/142953324/chapter/155827676/news
Scraping: https://royalsociety.altmetric.com/details/161515855/news
Scraping: https://www.altmetric.com/details/108380522/news
Scraping: https://oxfordjournals