MediaCloud GeoDiversity Study
================

In [6]:
import datetime
import json
import os
import time
from datetime import date, timedelta
from multiprocessing import Pool

import mediacloud
import numpy as np
import pandas as pd
# Read the API key from the environment so a real key never has to be saved in
# the notebook; the 'MY_KEY' placeholder is kept as the fallback value.
API_KEY = os.environ.get('MEDIACLOUD_API_KEY', 'MY_KEY')
mc = mediacloud.api.AdminMediaCloud(API_KEY)

In [7]:
# Constants
GEONAMES_TAG_SET_ID = 1011            # story tags in this tag set carry a geonames id
CLIFF_CLAVIN_2_3_0_TAG_ID = 9353691   # a story carrying this tag is counted as geocoded
GEONAMES_TAG_PREFIX = "geonames_"

# Media collections under study, identified by their tags_id.
COLLECTION_US = 9361368
COLLECTION_UK = 9361367
COLLECTION_CN = 9361369  # mapped to 'CAN' just below
COLLECTION_IDS = [COLLECTION_US, COLLECTION_UK, COLLECTION_CN]
# collection tags_id -> alpha-3 code of the country the collection represents
_lookup = dict(zip(COLLECTION_IDS, ['USA', 'GBR', 'CAN']))

# Query fragments shared by the cells below.
MEDIA_QUERY = "(tags_id_media: {})".format(" ".join(map(str, COLLECTION_IDS)))
# NOTE(review): the range opens with '[' and closes with '}' - presumably an
# inclusive start / exclusive end; confirm against the query syntax docs.
DATE_QUERY = "(publish_date:[2017-01-01T00:00:00Z TO 2017-04-01T00:00:00Z})"

# One tag record per collection, fetched from the API.
COLLECTIONS = list(map(mc.tag, COLLECTION_IDS))

In [8]:
# load in analog/digital manual codings
manual_codings_df = pd.read_csv("media-digital-or-analog.csv")
# media_id -> the part of the lower-cased 'media type' value before the first '-'
MEDIA_ID_TO_ANALOG_DIGITAL = {
    coding['media_id']: coding['media type'].lower().partition('-')[0]
    for index, coding in manual_codings_df.iterrows()
}


In [9]:
# Attach each collection's media sources and label every source with the
# collection's country and its manually coded format.
# NOTE(review): mediaList is called without any paging arguments; if the API
# pages its results, a large collection may come back truncated - confirm.
for collection in COLLECTIONS:
    collection_tags_id = collection['tags_id']
    collection['media'] = mc.mediaList(tags_id=collection_tags_id)
    for source in collection['media']:
        source['country'] = _lookup[collection_tags_id]
        # raises KeyError for any source missing from the manual codings file
        source['format'] = MEDIA_ID_TO_ANALOG_DIGITAL[source['media_id']]

In [10]:
COUNTRIES = [ {"geonames_id":3041565, "alpha3":"AND"}, {"geonames_id":290557, "alpha3":"ARE"}, {"geonames_id":1149361, "alpha3":"AFG"}, {"geonames_id":3576396, "alpha3":"ATG"}, {"geonames_id":3573511, "alpha3":"AIA"}, {"geonames_id":783754, "alpha3":"ALB"}, {"geonames_id":174982, "alpha3":"ARM"}, {"geonames_id":3351879, "alpha3":"AGO"}, {"geonames_id":6697173, "alpha3":"ATA"}, {"geonames_id":3865483, "alpha3":"ARG"}, {"geonames_id":5880801, "alpha3":"ASM"}, {"geonames_id":2782113, "alpha3":"AUT"}, {"geonames_id":2077456, "alpha3":"AUS"}, {"geonames_id":3577279, "alpha3":"ABW"}, {"geonames_id":661882, "alpha3":"ALA"}, {"geonames_id":587116, "alpha3":"AZE"}, {"geonames_id":3277605, "alpha3":"BIH"}, {"geonames_id":3374084, "alpha3":"BRB"}, {"geonames_id":1210997, "alpha3":"BGD"}, {"geonames_id":2802361, "alpha3":"BEL"}, {"geonames_id":2361809, "alpha3":"BFA"}, {"geonames_id":732800, "alpha3":"BGR"}, {"geonames_id":290291, "alpha3":"BHR"}, {"geonames_id":433561, "alpha3":"BDI"}, {"geonames_id":2395170, "alpha3":"BEN"}, {"geonames_id":3578476, "alpha3":"BLM"}, {"geonames_id":3573345, "alpha3":"BMU"}, {"geonames_id":1820814, "alpha3":"BRN"}, {"geonames_id":3923057, "alpha3":"BOL"}, {"geonames_id":7626844, "alpha3":"BES"}, {"geonames_id":3469034, "alpha3":"BRA"}, {"geonames_id":3572887, "alpha3":"BHS"}, {"geonames_id":1252634, "alpha3":"BTN"}, {"geonames_id":3371123, "alpha3":"BVT"}, {"geonames_id":933860, "alpha3":"BWA"}, {"geonames_id":630336, "alpha3":"BLR"}, {"geonames_id":3582678, "alpha3":"BLZ"}, {"geonames_id":6251999, "alpha3":"CAN"}, {"geonames_id":1547376, "alpha3":"CCK"}, {"geonames_id":203312, "alpha3":"COD"}, {"geonames_id":239880, "alpha3":"CAF"}, {"geonames_id":2260494, "alpha3":"COG"}, {"geonames_id":2658434, "alpha3":"CHE"}, {"geonames_id":2287781, "alpha3":"CIV"}, {"geonames_id":1899402, "alpha3":"COK"}, {"geonames_id":3895114, "alpha3":"CHL"}, {"geonames_id":2233387, "alpha3":"CMR"}, {"geonames_id":1814991, "alpha3":"CHN"}, {"geonames_id":3686110, 
"alpha3":"COL"}, {"geonames_id":3624060, "alpha3":"CRI"}, {"geonames_id":3562981, "alpha3":"CUB"}, {"geonames_id":3374766, "alpha3":"CPV"}, {"geonames_id":7626836, "alpha3":"CUW"}, {"geonames_id":2078138, "alpha3":"CXR"}, {"geonames_id":146669, "alpha3":"CYP"}, {"geonames_id":3077311, "alpha3":"CZE"}, {"geonames_id":2921044, "alpha3":"DEU"}, {"geonames_id":223816, "alpha3":"DJI"}, {"geonames_id":2623032, "alpha3":"DNK"}, {"geonames_id":3575830, "alpha3":"DMA"}, {"geonames_id":3508796, "alpha3":"DOM"}, {"geonames_id":2589581, "alpha3":"DZA"}, {"geonames_id":3658394, "alpha3":"ECU"}, {"geonames_id":453733, "alpha3":"EST"}, {"geonames_id":357994, "alpha3":"EGY"}, {"geonames_id":2461445, "alpha3":"ESH"}, {"geonames_id":338010, "alpha3":"ERI"}, {"geonames_id":2510769, "alpha3":"ESP"}, {"geonames_id":337996, "alpha3":"ETH"}, {"geonames_id":660013, "alpha3":"FIN"}, {"geonames_id":2205218, "alpha3":"FJI"}, {"geonames_id":3474414, "alpha3":"FLK"}, {"geonames_id":2081918, "alpha3":"FSM"}, {"geonames_id":2622320, "alpha3":"FRO"}, {"geonames_id":3017382, "alpha3":"FRA"}, {"geonames_id":2400553, "alpha3":"GAB"}, {"geonames_id":2635167, "alpha3":"GBR"}, {"geonames_id":3580239, "alpha3":"GRD"}, {"geonames_id":614540, "alpha3":"GEO"}, {"geonames_id":3381670, "alpha3":"GUF"}, {"geonames_id":3042362, "alpha3":"GGY"}, {"geonames_id":2300660, "alpha3":"GHA"}, {"geonames_id":2411586, "alpha3":"GIB"}, {"geonames_id":3425505, "alpha3":"GRL"}, {"geonames_id":2413451, "alpha3":"GMB"}, {"geonames_id":2420477, "alpha3":"GIN"}, {"geonames_id":3579143, "alpha3":"GLP"}, {"geonames_id":2309096, "alpha3":"GNQ"}, {"geonames_id":390903, "alpha3":"GRC"}, {"geonames_id":3474415, "alpha3":"SGS"}, {"geonames_id":3595528, "alpha3":"GTM"}, {"geonames_id":4043988, "alpha3":"GUM"}, {"geonames_id":2372248, "alpha3":"GNB"}, {"geonames_id":3378535, "alpha3":"GUY"}, {"geonames_id":1819730, "alpha3":"HKG"}, {"geonames_id":1547314, "alpha3":"HMD"}, {"geonames_id":3608932, "alpha3":"HND"}, {"geonames_id":3202326, 
"alpha3":"HRV"}, {"geonames_id":3723988, "alpha3":"HTI"}, {"geonames_id":719819, "alpha3":"HUN"}, {"geonames_id":1643084, "alpha3":"IDN"}, {"geonames_id":2963597, "alpha3":"IRL"}, {"geonames_id":294640, "alpha3":"ISR"}, {"geonames_id":3042225, "alpha3":"IMN"}, {"geonames_id":1269750, "alpha3":"IND"}, {"geonames_id":1282588, "alpha3":"IOT"}, {"geonames_id":99237, "alpha3":"IRQ"}, {"geonames_id":130758, "alpha3":"IRN"}, {"geonames_id":2629691, "alpha3":"ISL"}, {"geonames_id":3175395, "alpha3":"ITA"}, {"geonames_id":3042142, "alpha3":"JEY"}, {"geonames_id":3489940, "alpha3":"JAM"}, {"geonames_id":248816, "alpha3":"JOR"}, {"geonames_id":1861060, "alpha3":"JPN"}, {"geonames_id":192950, "alpha3":"KEN"}, {"geonames_id":1527747, "alpha3":"KGZ"}, {"geonames_id":1831722, "alpha3":"KHM"}, {"geonames_id":4030945, "alpha3":"KIR"}, {"geonames_id":921929, "alpha3":"COM"}, {"geonames_id":3575174, "alpha3":"KNA"}, {"geonames_id":1873107, "alpha3":"PRK"}, {"geonames_id":1835841, "alpha3":"KOR"}, {"geonames_id":831053, "alpha3":"XKX"}, {"geonames_id":285570, "alpha3":"KWT"}, {"geonames_id":3580718, "alpha3":"CYM"}, {"geonames_id":1522867, "alpha3":"KAZ"}, {"geonames_id":1655842, "alpha3":"LAO"}, {"geonames_id":272103, "alpha3":"LBN"}, {"geonames_id":3576468, "alpha3":"LCA"}, {"geonames_id":3042058, "alpha3":"LIE"}, {"geonames_id":1227603, "alpha3":"LKA"}, {"geonames_id":2275384, "alpha3":"LBR"}, {"geonames_id":932692, "alpha3":"LSO"}, {"geonames_id":597427, "alpha3":"LTU"}, {"geonames_id":2960313, "alpha3":"LUX"}, {"geonames_id":458258, "alpha3":"LVA"}, {"geonames_id":2215636, "alpha3":"LBY"}, {"geonames_id":2542007, "alpha3":"MAR"}, {"geonames_id":2993457, "alpha3":"MCO"}, {"geonames_id":617790, "alpha3":"MDA"}, {"geonames_id":3194884, "alpha3":"MNE"}, {"geonames_id":3578421, "alpha3":"MAF"}, {"geonames_id":1062947, "alpha3":"MDG"}, {"geonames_id":2080185, "alpha3":"MHL"}, {"geonames_id":718075, "alpha3":"MKD"}, {"geonames_id":2453866, "alpha3":"MLI"}, {"geonames_id":1327865, 
"alpha3":"MMR"}, {"geonames_id":2029969, "alpha3":"MNG"}, {"geonames_id":1821275, "alpha3":"MAC"}, {"geonames_id":4041468, "alpha3":"MNP"}, {"geonames_id":3570311, "alpha3":"MTQ"}, {"geonames_id":2378080, "alpha3":"MRT"}, {"geonames_id":3578097, "alpha3":"MSR"}, {"geonames_id":2562770, "alpha3":"MLT"}, {"geonames_id":934292, "alpha3":"MUS"}, {"geonames_id":1282028, "alpha3":"MDV"}, {"geonames_id":927384, "alpha3":"MWI"}, {"geonames_id":3996063, "alpha3":"MEX"}, {"geonames_id":1733045, "alpha3":"MYS"}, {"geonames_id":1036973, "alpha3":"MOZ"}, {"geonames_id":3355338, "alpha3":"NAM"}, {"geonames_id":2139685, "alpha3":"NCL"}, {"geonames_id":2440476, "alpha3":"NER"}, {"geonames_id":2155115, "alpha3":"NFK"}, {"geonames_id":2328926, "alpha3":"NGA"}, {"geonames_id":3617476, "alpha3":"NIC"}, {"geonames_id":2750405, "alpha3":"NLD"}, {"geonames_id":3144096, "alpha3":"NOR"}, {"geonames_id":1282988, "alpha3":"NPL"}, {"geonames_id":2110425, "alpha3":"NRU"}, {"geonames_id":4036232, "alpha3":"NIU"}, {"geonames_id":2186224, "alpha3":"NZL"}, {"geonames_id":286963, "alpha3":"OMN"}, {"geonames_id":3703430, "alpha3":"PAN"}, {"geonames_id":3932488, "alpha3":"PER"}, {"geonames_id":4030656, "alpha3":"PYF"}, {"geonames_id":2088628, "alpha3":"PNG"}, {"geonames_id":1694008, "alpha3":"PHL"}, {"geonames_id":1168579, "alpha3":"PAK"}, {"geonames_id":798544, "alpha3":"POL"}, {"geonames_id":3424932, "alpha3":"SPM"}, {"geonames_id":4030699, "alpha3":"PCN"}, {"geonames_id":4566966, "alpha3":"PRI"}, {"geonames_id":6254930, "alpha3":"PSE"}, {"geonames_id":2264397, "alpha3":"PRT"}, {"geonames_id":1559582, "alpha3":"PLW"}, {"geonames_id":3437598, "alpha3":"PRY"}, {"geonames_id":289688, "alpha3":"QAT"}, {"geonames_id":935317, "alpha3":"REU"}, {"geonames_id":798549, "alpha3":"ROU"}, {"geonames_id":6290252, "alpha3":"SRB"}, {"geonames_id":2017370, "alpha3":"RUS"}, {"geonames_id":49518, "alpha3":"RWA"}, {"geonames_id":102358, "alpha3":"SAU"}, {"geonames_id":2103350, "alpha3":"SLB"}, {"geonames_id":241170, 
"alpha3":"SYC"}, {"geonames_id":366755, "alpha3":"SDN"}, {"geonames_id":7909807, "alpha3":"SSD"}, {"geonames_id":2661886, "alpha3":"SWE"}, {"geonames_id":1880251, "alpha3":"SGP"}, {"geonames_id":3370751, "alpha3":"SHN"}, {"geonames_id":3190538, "alpha3":"SVN"}, {"geonames_id":607072, "alpha3":"SJM"}, {"geonames_id":3057568, "alpha3":"SVK"}, {"geonames_id":2403846, "alpha3":"SLE"}, {"geonames_id":3168068, "alpha3":"SMR"}, {"geonames_id":2245662, "alpha3":"SEN"}, {"geonames_id":51537, "alpha3":"SOM"}, {"geonames_id":3382998, "alpha3":"SUR"}, {"geonames_id":2410758, "alpha3":"STP"}, {"geonames_id":3585968, "alpha3":"SLV"}, {"geonames_id":7609695, "alpha3":"SXM"}, {"geonames_id":163843, "alpha3":"SYR"}, {"geonames_id":934841, "alpha3":"SWZ"}, {"geonames_id":3576916, "alpha3":"TCA"}, {"geonames_id":2434508, "alpha3":"TCD"}, {"geonames_id":1546748, "alpha3":"ATF"}, {"geonames_id":2363686, "alpha3":"TGO"}, {"geonames_id":1605651, "alpha3":"THA"}, {"geonames_id":1220409, "alpha3":"TJK"}, {"geonames_id":4031074, "alpha3":"TKL"}, {"geonames_id":1966436, "alpha3":"TLS"}, {"geonames_id":1218197, "alpha3":"TKM"}, {"geonames_id":2464461, "alpha3":"TUN"}, {"geonames_id":4032283, "alpha3":"TON"}, {"geonames_id":298795, "alpha3":"TUR"}, {"geonames_id":3573591, "alpha3":"TTO"}, {"geonames_id":2110297, "alpha3":"TUV"}, {"geonames_id":1668284, "alpha3":"TWN"}, {"geonames_id":149590, "alpha3":"TZA"}, {"geonames_id":690791, "alpha3":"UKR"}, {"geonames_id":226074, "alpha3":"UGA"}, {"geonames_id":5854968, "alpha3":"UMI"}, {"geonames_id":6252001, "alpha3":"USA"}, {"geonames_id":3439705, "alpha3":"URY"}, {"geonames_id":1512440, "alpha3":"UZB"}, {"geonames_id":3164670, "alpha3":"VAT"}, {"geonames_id":3577815, "alpha3":"VCT"}, {"geonames_id":3625428, "alpha3":"VEN"}, {"geonames_id":3577718, "alpha3":"VGB"}, {"geonames_id":4796775, "alpha3":"VIR"}, {"geonames_id":1562822, "alpha3":"VNM"}, {"geonames_id":2134431, "alpha3":"VUT"}, {"geonames_id":4034749, "alpha3":"WLF"}, {"geonames_id":4034894, 
"alpha3":"WSM"}, {"geonames_id":69543, "alpha3":"YEM"}, {"geonames_id":1024031, "alpha3":"MYT"}, {"geonames_id":953987, "alpha3":"ZAF"}, {"geonames_id":895949, "alpha3":"ZMB"}, {"geonames_id":878675, "alpha3":"ZWE"} ]
# geonames id -> alpha-3 country code, built from the COUNTRIES table above
GEONAME_TO_ALPHA3 = dict((country['geonames_id'], country['alpha3']) for country in COUNTRIES)
#for country in COUNTRIES:
#    tag = mc.tagList(name_like="{}{}".format(GEONAMES_TAG_PREFIX, country['geonames_id']))[0]
#    country['tags_id'] = tag['tags_id']

## Story Counts

In [11]:
# Full query: the media-collection clause AND-ed with the date-range clause.
total_query = " AND ".join([MEDIA_QUERY, DATE_QUERY])
print(total_query)
#total_stories = mc.storyCount(total_query)['count']
#print("Total Stories: {}".format(total_stories))

(tags_id_media: 9361368 9361367 9361369) AND (publish_date:[2017-01-01T00:00:00Z TO 2017-04-01T00:00:00Z})


In [12]:
# WARNING: takes about 30 seconds
def source_story_count_worker(media):
    """Fetch the story and sentence counts for one media source.

    Used as the job function for pool.map, so the annotated dict is returned
    to the caller instead of relying on in-place changes being visible there.

    Args:
        media: dict describing a media source; must contain 'media_id' and 'name'.

    Returns:
        The same dict with 'story_count' and 'sentence_count' added and the
        'media_source_tags' entry removed when it was present.
    """
    # both counts use the same query, so build it once
    query = "(media_id:{}) AND {}".format(media['media_id'], DATE_QUERY)
    story_count = mc.storyCount(query)['count']
    sentence_count = mc.sentenceCount(query)['count']
    media['story_count'] = story_count
    media['sentence_count'] = sentence_count
    # pop() instead of del: a record without this key must not abort the whole batch
    media.pop('media_source_tags', None)
    print("{}: {} stories".format(media['name'], story_count))
    return media
# fetch story count for each source
# one job per media source, across all collections
jobs = [media for collection in COLLECTIONS for media in collection['media']]
pool = Pool(processes=16)
try:
    media_with_story_counts = pool.map(source_story_count_worker, jobs)  # blocks until they are all done
finally:
    # always stop the worker processes, even when one of the jobs raised
    pool.terminate()

CNN: 10017 stories
Daily Kos: 6488 stories
The Atlantic: 5719 stories
Daily News: 15566 stories
LA Times: 25722 stories
New York Times: 23580 stories
CNET: 13576 stories
ESPN: 17063 stories
Reuters: 54659 stories
Lifehacker: 1961 stories
Bloomberg.com: 2771 stories
Buzzfeed: 14634 stories
Economist: 2510 stories
NY Post: 17179 stories
Time: 10323 stories
Daily Telegraph: 17999 stories
NPR: 7641 stories
cbs news: 15196 stories
Forbes: 22844 stories
Wall Street Journal: 23344 stories
FOX News: 18117 stories
newstatesman.com: 1200 stories
The Sun | The Best for News, Sport, Showbiz, Celebrities & TV | The Sun| The Sun: 40097 stories
bbc: 23675 stories
Washington Post: 48009 stories
ft.com: 12444 stories
metro.co.uk: 18585 stories
news.sky.com: 839 stories
ESPN: The Worldwide Leader in Sports: 1 stories
huffingtonpost.co.uk: 9719 stories
theweek.co.uk: 1373 stories
Daily Star UK: 0 stories
themirror: 25287 stories
Wales Online: 6810 stories
theguardian.com: 24416 stories
express.co.uk: 368

In [13]:
# one row per media source, built from the dicts returned by the count workers
media_df = pd.DataFrame(media_with_story_counts)
media_df

Unnamed: 0,country,editor_notes,format,is_healthy,is_monitored,media_id,name,public_notes,sentence_count,story_count,url
0,USA,,analog,0,1,1,New York Times,,664504,23580,http://nytimes.com
1,USA,,analog,1,1,2,Washington Post,,1176812,48009,http://washingtonpost.com
2,USA,,analog,1,1,6,LA Times,,599867,25722,http://www.latimes.com/
3,USA,some editor notes,analog,1,1,7,NY Post,,376928,17179,http://www.nypost.com/
4,USA,daily news notes,analog,1,1,8,Daily News,Public info about ny daily news,382134,15566,http://www.nydailynews.com/
5,USA,,digital,0,0,115,Daily Kos,,164182,6488,http://www.dailykos.com
6,USA,,analog,1,1,1092,FOX News,,362757,18117,http://www.foxnews.com/
7,USA,an editor note about cnn,analog,0,1,1095,CNN,this is a public note about cnn,306851,10017,http://www.cnn.com/
8,USA,,analog,1,0,1096,NPR,,214634,7641,http://www.npr.org/
9,USA,,analog,1,1,1104,Forbes,,656304,22844,http://www.forbes.com/


In [14]:
# write it out to a CSV
# NOTE(review): no index=False is passed, so the DataFrame index is presumably
# written as an extra unnamed first column - confirm that is intended.
media_df.to_csv("geostudy-media-story-counts.csv", encoding='utf-8')

## Fetch all the Stories

We have to do this story by story because that's the only way to respect the AP flag. So this fetches all the stories and writes them to a CSV we can use afterwards.

In [28]:
# reload the per-source counts from the CSV written above; this replaces the in-memory media_df
media_df = pd.read_csv("geostudy-media-story-counts.csv")

In [38]:
def stories_in_media_source_worker(media):
    """Page through every story of one media source inside the study date range.

    Args:
        media: mapping with at least 'media_id' and 'country' (a row of media_df).

    Returns:
        A list of story dicts, each with 'media_country' added and its
        'description' entry removed when present.
    """
    query = "(media_id:{}) AND {}".format(media['media_id'], DATE_QUERY)
    all_stories = []
    last_processed_stories_id = 0
    while True:
        print("  {}:{}".format(media['media_id'], last_processed_stories_id))
        stories = mc.storyList(query, ap_stories_id=1, last_processed_stories_id=last_processed_stories_id, rows=5000)
        # an empty page means there is nothing left to fetch; test emptiness
        # directly rather than comparing len() to 0 by identity
        if not stories:
            break
        for s in stories:
            s['media_country'] = media['country']  # keep track of which collection it came from
            s.pop('description', None)  # tolerate stories that have no description field
        all_stories += stories
        # DEBUG: add a `break` here to stop after a single page
        # the next page starts after the last story of this one
        last_processed_stories_id = stories[-1]['processed_stories_id']
    return all_stories

In [39]:
# one job per media source (each job is a row of media_df)
jobs = [media for index, media in media_df.iterrows()]
pool = Pool(processes=16)
try:
    stories_by_source = pool.map(stories_in_media_source_worker, jobs)  # blocks until they are all done
finally:
    # always stop the worker processes, even when one of the jobs raised
    pool.terminate()

  115:0
  2:0
  7:0
  8:0
  6:0
  1:0
  1150:0
  4415:0
  1752:0
  4419:0
  1092:0
  1096:0
  1095:0
  1110:0
  1713:0
  1104:0
  1713:990605058
  4442:0
  1:866230264
  1150:860092792
  6:857693622
  1104:859837637
  7:865885831
  1092:861248816
  8:866226588
  115:963526255
  4415:876352800
  2:852228946
  1752:870866041
  1096:889324640
  1095:870884703
  4419:870909631
  1110:968665795
  4442:847817402
  1110:995539375
  4451:0
  115:993810163
  4508:0
  1:881963041
  1150:871908206
  4508:957937764
  4451:938773069
  1104:871895358
  6218:0
  6:871564879
  1092:873776903
  4442:853846062
  7:880333413
  1096:995586477
  2:864003568
  1094:0
  8:882773548
  1752:890361255
  1095:993822099
  4419:962060117
  1095:995346134
  4451:941916352
  1106:0
  4415:890736882
  4419:994960834
  1750:0
  1150:883342596
  1104:884534636
  1:896222078
  6:882534271
  6218:993389821
  1106:994945402
  1754:0
  4442:859090575
  1094:858113706
  2:871277984
  4451:945730007
  1092:893219378
  7:8934

In [40]:
# number of per-source story lists returned by the pool (one per job)
len(stories_by_source)

58

In [41]:
# flatten the per-source lists into one list of stories, keeping source order
stories = []
for media_stories in stories_by_source:
    stories.extend(media_stories)
len(stories)

713616

In [42]:
# Annotate every story with a 'geocoded' flag and the comma-joined alpha-3 codes
# of the countries it is tagged with, then drop the raw tag list.
geonames_prefix_length = len(GEONAMES_TAG_PREFIX)
for s in stories:
    if 'story_tags' not in s:
        # Nothing to read: either the story has no tags, or this cell already
        # ran and consumed them. Keep earlier results and only fill in defaults
        # instead of raising KeyError.
        s.setdefault('geocoded', False)
        s.setdefault('countries', "")
        continue
    story_tags = s['story_tags']
    s['geocoded'] = any(t['tags_id'] == CLIFF_CLAVIN_2_3_0_TAG_ID for t in story_tags)
    # the geonames id is whatever follows the tag-name prefix
    geoname_ids = [int(t['tag'][geonames_prefix_length:])
                   for t in story_tags if t['tag_sets_id'] == GEONAMES_TAG_SET_ID]
    # only ids present in the country table are kept; any other id is ignored
    country_alpha3s = [GEONAME_TO_ALPHA3[gid] for gid in geoname_ids if gid in GEONAME_TO_ALPHA3]
    s['countries'] = ",".join(country_alpha3s)
    del s['story_tags']

In [43]:
# persist the annotated stories so the analysis sections below can reload them from disk
stories_df = pd.DataFrame(stories)
stories_df.to_csv("geostudy-stories.csv", encoding='utf-8')

## Count Country Representation

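For each media source we need the proportion (a value between 0 and 1) of its stories that mention each country, computed three ways: over all stories, over non-AP stories only, and over AP stories only.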

In [15]:
# read the data generated above (so you don't have to fetch again)
#stories_df = pd.read_csv("geostudy-stories.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
def write_country_represenation_csv(df, filename, media_sources=None, all_stories_df=None, countries=None):
    """Write one CSV row per media source with its AP usage and country coverage.

    Each row holds the media source's own fields, totals computed over all of
    its stories ('total_story_count', 'non_ap_story_count', 'ap_story_count',
    'uses.AP', 'proportion.AP'), and one column per country with the fraction
    of the source's stories in `df` whose 'countries' value contains that
    country's alpha-3 code.

    Args:
        df: stories to measure country coverage on (may be a filtered subset);
            needs 'media_id' and 'countries' columns.
        filename: path of the CSV file to write.
        media_sources: list of media dicts with a 'media_id' key; defaults to
            the notebook-level media_with_story_counts.
        all_stories_df: unfiltered stories used for the totals; needs
            'media_id' and 'ap_syndicated' columns. Defaults to the
            notebook-level stories_df.
        countries: list of dicts with an 'alpha3' key; defaults to COUNTRIES.

    Returns:
        None. The table is written to `filename`.
    """
    if media_sources is None:
        media_sources = media_with_story_counts
    if all_stories_df is None:
        all_stories_df = stories_df
    if countries is None:
        countries = COUNTRIES
    rows = []
    for m in media_sources:
        # work on a copy so that calls do not leak columns into the shared media dicts
        row = dict(m)
        # grab totals for this media source
        all_media_stories_df = all_stories_df[all_stories_df.media_id == m['media_id']]
        total_count = all_media_stories_df.shape[0]
        non_ap_count = int((all_media_stories_df.ap_syndicated == 0).sum())
        ap_count = int((all_media_stories_df.ap_syndicated == 1).sum())
        row['total_story_count'] = total_count
        row['non_ap_story_count'] = non_ap_count
        row['ap_story_count'] = ap_count
        row['uses.AP'] = 1 if ap_count > 0 else 0
        if total_count > 0:
            row['proportion.AP'] = float(ap_count) / float(total_count)
        else:
            row['proportion.AP'] = 0
        # now go through just the filtered stories passed in
        media_stories_df = df[df.media_id == m['media_id']]
        filtered_count = media_stories_df.shape[0]
        for c in countries:
            if filtered_count == 0:
                # no stories in this subset: skip the scan and avoid dividing by zero
                row[c['alpha3']] = 0
            else:
                # literal substring match on the comma-joined codes; missing values count as no match
                mentions = media_stories_df['countries'].str.contains(c['alpha3'], regex=False, na=False)
                row[c['alpha3']] = float(mentions.sum()) / float(filtered_count)
        rows.append(row)
    media_df = pd.DataFrame(rows)
    media_df.to_csv(filename, encoding='utf-8')

In [24]:
# One CSV per slice of the stories: everything, non-AP only, AP only.
representation_outputs = [
    (stories_df, "geostudy-media-country-representation-all.csv"),
    (stories_df[stories_df.ap_syndicated == 0], "geostudy-media-country-representation-no-ap.csv"),
    (stories_df[stories_df.ap_syndicated == 1], "geostudy-media-country-representation-only-ap.csv"),
]
for subset_df, csv_path in representation_outputs:
    write_country_represenation_csv(subset_df, csv_path)

## Count Stories About Each Country in Media
One row for each media source / country pair

In [64]:
# read the data generated above (so you don't have to fetch again)
#stories_df = pd.read_csv("geostudy-stories.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
# Build one record per (media source, country) pair with the number of that
# source's stories whose 'countries' value contains the country's alpha-3 code.
results = []
for m in media_with_story_counts:
    print(m['name'])
    media_stories_df = stories_df[(stories_df.media_id == m['media_id'])]
    total_story_count = media_stories_df.shape[0]
    for c in COUNTRIES:
        # story count: literal substring match on the comma-joined codes;
        # missing values count as no match
        about_country = media_stories_df['countries'].str.contains(c['alpha3'], regex=False, na=False)
        results.append({
            'media_id': m['media_id'],
            'media_url': m['url'],
            'media_name': m['name'],
            'origin_country': m['country'],
            'country_covered': c['alpha3'],
            'media_format': m['format'],
            'total_story_count': total_story_count,
            'stories_about_country_count': int(about_country.sum()),
        })

New York Times
Washington Post
LA Times
NY Post
Daily News
Daily Kos
FOX News
CNN
NPR
Forbes
The Atlantic
Wall Street Journal
Bloomberg.com
cbs news
CNET
Time
Reuters
ESPN
Lifehacker
Buzzfeed
bbc
Economist
Daily Telegraph
The Sun | The Best for News, Sport, Showbiz, Celebrities & TV | The Sun| The Sun
ft.com
newstatesman.com
news.sky.com
huffingtonpost.co.uk
metro.co.uk
express.co.uk
ibtimes-uk
themirror
Independent
theguardian.com
theweek.co.uk
standard.co.uk
dailymail.co.uk.
ESPN: The Worldwide Leader in Sports
Wales Online
Daily Star UK
Globe and Mail
news.nationalpost.com
vancouversun.com
torontosun.com
theprovince.com
TorontoStar
montrealgazette
ottawacitizen.com
huffingtonpost.ca
o.canada.com
Canada Standard | National News Service for Canada
Radio Canada International
en.canoe.com
Metro Toronto | Latest News and Local Views
CBC.ca
CTV news - Canada
Global News Canada
Maclean's Canada


In [32]:
# one row per media source / country pair, written out for later analysis
pairwise_df = pd.DataFrame(results)
pairwise_df.to_csv("geostudy-media-stories-by-country.csv", encoding='utf-8')