This notebook runs all the code needed to create the data for the analysis of media mentions of causes of death.

It can be run either as part of the OWID ETL repo or locally. Differences in the code between these two setups are clearly marked.

In [18]:
import datetime as dt
from pathlib import Path
import click
import mediacloud.api
import pandas as pd
from query_generation import create_full_queries, create_queries
import time


## Set overall variables for analysis

In [19]:
# Year used as the default for the media queries below.
YEAR = 2023
# When True, each query prints its progress and result.
VERBOSE = True

# These are the causes of death we are using for the 2023 version.
CAUSES_OF_DEATH = [
    # The 12 leading causes of death in the US for 2023 ...
    "heart disease",
    "cancer",
    "accidents",
    "stroke",
    "respiratory",
    "alzheimers",
    "diabetes",
    "kidney",
    "liver",
    "covid",
    "suicide",
    "influenza",
] + [
    # ... plus drug overdoses, homicides, and terrorism.
    "drug overdose",
    "homicide",
    "terrorism",
]

## Get deaths for each cause of death

## Get Media Mentions from Media Cloud
To get the number of articles that mention each cause of death in three major news outlets (the New York Times, the Washington Post and Fox News), we use the open-source database [Media Cloud](https://www.mediacloud.org/).

Before you can run this code, you need to create a free account on Media Cloud [here](https://search.mediacloud.org/sign-up) and create an [API token](https://github.com/mediacloud/api-tutorial-notebooks/blob/main/MC01%20-%20setup.ipynb). The code below reads this token from the `MC_API_TOKEN` environment variable (which can be set in a `.env` file); alternatively, paste the token directly into the cell below so you can access the database.

In [20]:
#TODO remove this part:
from dotenv import load_dotenv
import os

load_dotenv()

# source IDs for each newspaper, plus the ID of the US collection
NYT_ID, WAPO_ID, FOX_ID = 1, 2, 1092
US_COLLECTION_ID = 34412234

# The token is taken from the MC_API_TOKEN environment variable; it is None when the variable is not set.
# Alternatively paste your API TOKEN here e.g. "46971fa7d873b615238234a777c2d867fbbb444b" - this is NOT a valid token
MC_API_TOKEN = os.environ.get("MC_API_TOKEN")
# Client used by all Media Cloud queries in this notebook.
search_api = mediacloud.api.SearchApi(MC_API_TOKEN)

# create queries - see query_generation.py
QUERIES = create_queries()
STR_QUERIES = create_full_queries()

In [None]:
def get_start_end(year):
    """Return a (start, end) tuple holding January 1 and December 31 of `year` as dates."""
    first_day = dt.date(year, month=1, day=1)
    last_day = dt.date(year, month=12, day=31)
    return first_day, last_day

# helper function to use Media Cloud API
def query_results(query, source_ids, year, collection_ids=None):
    """
    Return the "relevant" story count reported by Media Cloud for a query in one year.
    Args:
        query (str): Query string passed to the search API.
        source_ids (list): Source IDs to search; only used when collection_ids is empty or None.
        year (int): Year to query; the search spans January 1 to December 31.
        collection_ids (list): Collection IDs to search; when non-empty they are used instead of source_ids.
    Returns:
        The value stored under the "relevant" key of the story_count response.
    """
    first_day, last_day = get_start_end(year)
    # Search either the given collections or the given sources, never both.
    if collection_ids:
        scope = {"collection_ids": collection_ids}
    else:
        scope = {"source_ids": source_ids}
    response = search_api.story_count(query=query, start_date=first_day, end_date=last_day, **scope)
    return response["relevant"]

# function to get mentions from a specific source
def get_mentions_from_source(
    source_ids: list,
    source_name: str,
    queries: dict,
    year=YEAR,
    collection_ids=None,
    wait_seconds: float = 30,
):
    """
    Get mentions of causes of death from a specific source.
    Args:
        source_ids (list): List of source IDs to query.
        source_name (str): Name of the source, used for logging and in the "source" column.
        queries (dict): Dictionary mapping each cause name to the query to run.
        year (int): Year to query for.
        collection_ids (list): List of collection IDs to query.
        wait_seconds (float): Pause before every request to avoid hitting API rate limits.
            Defaults to 30; values <= 0 disable the pause.
    Returns:
        pd.DataFrame: One row per query with the columns "cause", "mentions", "source" and "year".
    """
    query_count = []
    for name, query in queries.items():
        # Pause before every request (including the first) so that back-to-back
        # calls for different sources are also spaced out.
        if wait_seconds > 0:
            time.sleep(wait_seconds)
        # Announce the query before sending it, so a slow or failing request
        # can be attributed to a specific cause of death.
        if VERBOSE:
            print(f"Querying: {source_name} for CoD {name}")
            print(f"Query: {query}")
        # perf_counter is monotonic, so the elapsed time is unaffected by clock adjustments.
        start_time = time.perf_counter()
        cnt = query_results(query, source_ids, collection_ids=collection_ids, year=year)
        elapsed = time.perf_counter() - start_time
        if VERBOSE:
            print(f"Count: {cnt} mentions for {name} in the {source_name} in {year} - retrieved in {elapsed:.2f} seconds")
            print("-" * 40)
        query_count.append(
            {
                "cause": name,
                "mentions": cnt,
                "source": source_name,
                "year": year,
            }
        )
    return pd.DataFrame(query_count)


In [None]:
# Stop early when no Media Cloud token has been configured.
assert MC_API_TOKEN is not None, "Get API key from https://www.mediacloud.org/ in order to access this data"

# Newspaper IDs and their display names, kept in matching order.
source_ids = [NYT_ID, WAPO_ID, FOX_ID]
sources = ["The New York Times", "The Washington Post", "Fox News"]

# Keep only the queries whose cause is part of this analysis.
queries_in_use = {
    cause: cause_query
    for cause, cause_query in STR_QUERIES.items()
    if cause in CAUSES_OF_DEATH
}

# Collect one DataFrame of mention counts per newspaper.
mentions_ls = []
for s_id, s_name in zip(source_ids, sources):
    mentions = get_mentions_from_source(
        source_ids=[s_id],
        source_name=s_name,
        queries=queries_in_use,
        year=YEAR,
    )
    mentions_ls.append(mentions.copy(deep=True))

# add mentions for US collection (queried by collection ID, so no source IDs are passed)
collection_mentions = get_mentions_from_source(
    [],
    "US Collection",
    queries_in_use,
    year=YEAR,
    collection_ids=[US_COLLECTION_ID],
)
mentions_ls.append(collection_mentions.copy(deep=True))

# concatenate all mentions into a single DataFrame
mentions_df = pd.concat(mentions_ls, ignore_index=True)

Querying: The New York Times for CoD heart disease
Query: ("heart disease heart"~1000 OR "heart attack heart"~1000 OR "heart cardiac"~1000 OR "cardiac cardiac"~1000 OR "heart infarct artery"~1000 OR "heart coronary artery"~1000 OR "cardiac coronary artery"~1000 OR "heart arrhythmia"~1000 OR "arrhythmia cardiac"~1000 OR "heart failure heart"~1000 OR "heart failure cardiac"~1000 OR "heart pericarditis"~1000 OR "heart endocarditis"~1000 OR "heart cardiomyopathy"~1000 OR "heart hypotension"~1000 OR "cardiac hypertension"~1000 OR "heart infection heart"~1000 OR "heart cardiology"~1000 OR "heart cardiologist"~1000 OR "heart disease blood pressure"~1000 OR "heart attack blood pressure"~1000) AND ("heart disease" OR "heart attack" OR "cardiac arrest" OR "infarct" OR "coronary artery disease" OR "arrhythmia" OR "heart failure" OR "pericarditis" OR "endocarditis" OR "cardiomyopathy" OR "high blood pressure" OR "hypertension" OR "heart infection" OR "cardiology" OR "cardiologist")
Count: 436 ment