# Media Cloud: Measuring Attention

Reference:
- https://github.com/mediacloud/api-tutorial-notebooks
- https://www.mediacloud.org

In [1]:
# ! pip install mediacloud

In [2]:
import json
import os

home_dir = os.environ['HOME']

with open(f'{home_dir}/.api-keys.json') as f:
    keys = json.load(f)

# Your API key
API_KEY = keys['MEDIACLOUD_API_KEY']

## Set Up API Key

In [3]:
# Set up your API key and import needed things
import os, mediacloud.api
from importlib.metadata import version
from dotenv import load_dotenv
import datetime as dt
from IPython.display import JSON
import bokeh.io

bokeh.io.reset_output()
bokeh.io.output_notebook()
search_api = mediacloud.api.SearchApi(API_KEY)
f'Using Media Cloud python client v{version("mediacloud")}'

'Using Media Cloud python client v4.3.0'

## Attention from a Single Media Source

In [4]:
# check how many stories include the query phrase in the Washington Post (media id #2)
my_query = 'politics' # note the double quotes used to indicate use of the whole phrase
start_date = dt.date(2019, 7, 1)
end_date = dt.date(2023, 7, 1)
sources = [1, 2, 3] # NY Times, Washington Post, CS Monitor

search_api.story_count(my_query, start_date, end_date, source_ids=sources)
my_query_name = my_query.replace('"', '')

In [5]:
# you can see this count by day as well
results = search_api.story_count_over_time(my_query, start_date, end_date, source_ids=sources)
results

[{'date': datetime.date(2019, 7, 1),
  'total_count': 21,
  'count': 5,
  'ratio': 0.23809523809523808},
 {'date': datetime.date(2019, 7, 2),
  'total_count': 21,
  'count': 3,
  'ratio': 0.14285714285714285},
 {'date': datetime.date(2019, 7, 3),
  'total_count': 17,
  'count': 2,
  'ratio': 0.11764705882352941},
 {'date': datetime.date(2019, 7, 4),
  'total_count': 9,
  'count': 1,
  'ratio': 0.1111111111111111},
 {'date': datetime.date(2019, 7, 5),
  'total_count': 15,
  'count': 2,
  'ratio': 0.13333333333333333},
 {'date': datetime.date(2019, 7, 6),
  'total_count': 4,
  'count': 2,
  'ratio': 0.5},
 {'date': datetime.date(2019, 7, 7), 'total_count': 3, 'count': 0, 'ratio': 0},
 {'date': datetime.date(2019, 7, 8),
  'total_count': 19,
  'count': 4,
  'ratio': 0.21052631578947367},
 {'date': datetime.date(2019, 7, 9),
  'total_count': 17,
  'count': 3,
  'ratio': 0.17647058823529413},
 {'date': datetime.date(2019, 7, 10),
  'total_count': 20,
  'count': 4,
  'ratio': 0.2},
 {'date':

In [6]:
# and you can chart attention over time with some simple notebook work (using Bokeh here)
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
df = pd.DataFrame.from_dict(results)
df['date']= pd.to_datetime(df['date'])
source = ColumnDataSource(df)
p = figure(x_axis_type="datetime", width=900, height=250)
p.line(x='date', y='count', line_width=2, source=source)  # your could use `ratio` instead of `count` to see normalized attention
show(p)

### Normalizing within a Source

In [7]:
results = search_api.story_count(my_query, start_date, end_date, source_ids=sources)
source_ratio = results['relevant'] / results['total']
f'{source_ratio:.2%} of the media sources {sources} stories were about {my_query_name}'

'11.46% of the media sources [1, 2, 3] stories were about politics'

## Research Within a Country - using collections

In [8]:
# check in our collection of country-level US National media sources
US_NATIONAL_COLLECTION = 34412234
results = search_api.story_count(my_query, start_date, end_date, collection_ids=[US_NATIONAL_COLLECTION])
us_country_ratio = results['relevant'] / results['total']
# '{:.2%} of stories from national-level US media sources mentioneded "climate change"'.format(us_country_ratio)
f'{us_country_ratio:.2%} of stories from national-level US media sources mentioneded {my_query_name}'

'4.94% of stories from national-level US media sources mentioneded politics'

In [16]:
# now we can compare this to the source-level coverage
coverage_ratio = 1 / (source_ratio / us_country_ratio)
f'{my_query_name} received {coverage_ratio:.2%} times less coverage in {sources} than you might expect based on other US national papers'

'politics received 43.07% times less coverage in [1, 2, 3] than you might expect based on other US national papers'

In [10]:
# or compare to another country (India in this case)
INDIA_NATIONAL = 34412118
results = search_api.story_count('"climate change"', start_date, end_date, collection_ids=[INDIA_NATIONAL])
india_country_ratio = results['relevant'] / results['total']
f'{india_country_ratio:.2%} of stories from national-level Indian media sources mentioned {my_query_name}'

'0.50% of stories from national-level Indian media sources mentioned politics'

In [11]:
coverage_ratio =  1 / (india_country_ratio / us_country_ratio)
f'at the national level {my_query_name} is covered {coverage_ratio:.2%} times less in India than the US'

'at the national level politics is covered 987.39% times less in India than the US'

## Listing Stories

In [12]:
# grab the most recent stories about this issue
stories, _ = search_api.story_list(my_query, start_date, end_date)
stories[:3]

[{'id': '5b95e9e6c1aafd926e6adf48bb3fd28e7241fff54322889d811bda065e6bbde8',
  'media_name': 'yourmiddleeast.com',
  'media_url': 'yourmiddleeast.com',
  'title': 'The Arabs’ Moment',
  'publish_date': datetime.date(2020, 3, 6),
  'url': 'https://yourmiddleeast.com/2020/03/06/the-arabs-moment/',
  'language': 'en',
  'indexed_date': datetime.datetime(2024, 11, 25, 4, 59, 44, 377339)},
 {'id': 'f709469e307fcba31cfd1fca3a3165c76754c9b8a42f72f1e42423b38f791977',
  'media_name': 'yahoo.com',
  'media_url': 'yahoo.com',
  'title': "Black voters power Joe Biden's Super Tuesday success",
  'publish_date': datetime.date(2020, 3, 5),
  'url': 'https://news.yahoo.com/black-voters-power-bidens-super-055038667.html',
  'language': 'en',
  'indexed_date': datetime.datetime(2024, 11, 25, 4, 59, 42, 376413)},
 {'id': 'cd8607202f26bed7d766b5a884decf83e73ebbeea46095ebbb6f0fb1379ba6b9',
  'media_name': 'patrika.com',
  'media_url': 'patrika.com',
  'title': 'ईरान: Coronavirus से विदेश मंत्री के सलाहकार क

In [17]:
# let's fetch all the stories matching our query on one day
all_stories = []
more_stories = True
pagination_token = None
story_start_date = dt.date(2023,11,29)
story_end_date = dt.date(2023,11,30)
while more_stories:
    page, pagination_token = search_api.story_list(my_query, story_start_date, story_end_date,
                                                   collection_ids=[US_NATIONAL_COLLECTION],
                                                   pagination_token=pagination_token)
    all_stories += page
    more_stories = pagination_token is not None
len(all_stories)

1203

### Writing a CSV of Story Data

In [24]:
import csv

story_start_date_str = story_start_date.strftime('%Y%m%d')
story_end_date_str = story_end_date.strftime('%Y%m%d')

fieldnames = ['id', 'publish_date', 'title', 'url', 'language', 'media_name', 'media_url', 'indexed_date']
output_filename = f"./mediacloud_api_data/{my_query_name.replace(' ', '_')}-storylist-{story_start_date_str}_{story_end_date_str}.csv"
output_dir = os.path.dirname(output_filename)

if output_dir and not os.path.exists(output_dir):
    os.makedirs(output_dir)  # Create the directory if it doesn't exist
    
with open(output_filename, 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    for s in all_stories:
        writer.writerow(s)

## Top Media Sources

In [15]:
# List media producing the most stories matching the search
INDIA_NATIONAL = 34412118
results = search_api.sources(my_query, start_date, end_date, collection_ids=[INDIA_NATIONAL])
results

[{'source': 'amarujala.com', 'count': 392592},
 {'source': 'indiatimes.com', 'count': 178544},
 {'source': 'patrika.com', 'count': 66994},
 {'source': 'jagran.com', 'count': 50268},
 {'source': 'news18.com', 'count': 28673},
 {'source': 'business-standard.com', 'count': 16273},
 {'source': 'indianexpress.com', 'count': 16006},
 {'source': 'thehindu.com', 'count': 13750},
 {'source': 'newindianexpress.com', 'count': 12385},
 {'source': 'india.com', 'count': 8146},
 {'source': 'hindustantimes.com', 'count': 7252},
 {'source': 'ndtv.com', 'count': 7020},
 {'source': 'freepressjournal.in', 'count': 6847},
 {'source': 'firstpost.com', 'count': 4973},
 {'source': 'thewire.in', 'count': 4889},
 {'source': 'thequint.com', 'count': 4038},
 {'source': 'sify.com', 'count': 3908},
 {'source': 'scroll.in', 'count': 3602},
 {'source': 'tribuneindia.com', 'count': 3583},
 {'source': 'rediff.com', 'count': 3539},
 {'source': 'newsclick.in', 'count': 3473},
 {'source': 'indiasnews.net', 'count': 3073},