In [1]:
# Import statements
import time
import datetime
import json
import math
import ast
import os
import csv

import requests

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

% matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Importing API key
nytAPIPath = 'nytDevKey.txt'
fin=open(nytAPIPath,'r')
nyt_api_key = str(fin.readline()).rstrip()

guardianAPIPath = 'guardianAPI.txt'
fin=open(guardianAPIPath,'r')
guardian_api_key = str(fin.readline()).rstrip()

print(nyt_api_key, guardian_api_key)

654dc0be769c45799b40f23a11eba6fe 6ebac18d-5fe7-4f37-a4be-1a9eb72a7136


# Analyzing the News:
We have two different features we can extract from our news database:

1) The number of times the phrase occurs *anywhere* within the text. This has its inherent biases, as certain words will be more common and may not be wholly indicative of what we are trying to measure. However, we can use this to get a feel for the relative frequency that certain words appear in the text, which might be a useful benchmark.

2) The number of times the phrase occurs in *the headline or snippet* of the article. This is a little costlier to measure (as we need to make lots of API calls), but it yields useful information about headlines in particular.

In [3]:
# NYT API:

nyt_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json";

# NYT API doesn't allow us to directly filter by title, so our keywords are ones that 
# exist in the body, title, or url
def searchNYT(string, startDate, endDate, page="0", justHits=False):
    searchString = (nyt_url + '?fq=' + '"' + string + '"' + '&page=' + page 
                    + '&begin_date=' + startDate + '&end_date=' + endDate + '&api-key=' + nyt_api_key)
    
    response = requests.get(searchString).json();

    if (justHits):
        
        # logic to handle different response types, standardizes them
        if 'response' not in response.keys():
            response = ast.literal_eval(response['message'])
    
        return response['response']['meta']['hits']
    
    else:
        return response



# Filtering functions to check *only* for the title
def filterNYT(string, startDate, endDate):
    count = 0
    total = searchNYT(string, startDate, endDate, justHits=True)
    
    print('total hits: ', total)
    
    times = round(math.ceil(total/10))
    
    nytResponse = []
    articlesList = []
    
    # Sleeping to avoid API rate limits
    time.sleep(0.4)
    
    # Iterate through all hits during the time-frame
    for i in range(times):
        
        # print('count: ', count)
        
        response = searchNYT(string, startDate, endDate, page=str(i))
        
        # Sleeping to avoid API rate limits
        time.sleep(0.4)
                
        # logic to handle different response types, standardizes them
        if 'response' in response.keys():
            nytResponse = response['response']['docs']
            
        # 3/1/18 WTF IS THIS DOING?
        else:
            nytResponse = ast.literal_eval(response['message'])
    
    
    
        # Paginate through all times
        for i in range(len(nytResponse)):
            
            articleResponse = nytResponse[i]
            
            # Sleeping to avoid API rate limits
            time.sleep(0.4)
            
            if string in articleResponse['headline']['main'].lower():
                # articlesList.append(articleResponse['headline']['main'])
                print('*', end='')
                count += 1
                
            # Give a little visual indication that it's working
            else:
                print('.', end='')
            
            '''
            elif 'abstract' in articleResponse.keys():
                if string in articleResponse['abstract'].lower():
                    articlesList.append(articleResponse['abstract'])
                    print('*', end='')
                    count += 1
            '''
                
    return count #, articlesList



# Saves results as a csv from filterNYT
def saveCounts(year, word):
    yearStart = str(year)+"-01-01"
    yearEnd = str(year+1)+"-01-01"
    result = filterNYT(word, yearStart, yearEnd)
    resultList = [result]
    
    print(result)
    
    if not os.path.exists('results/' + str(year)):
        os.makedirs('results/' + str(year))
    
    np.savetxt('results/' + str(year) + '/' + str(year) + '_' + word + '.csv', resultList)
        
    print("Done!")

In [36]:
# The Guardian's API
guardian_url = "https://content.guardianapis.com/search"

def searchGuardian(string, startDate, page='1', justHits=False):
    searchString = (guardian_url + '?q=' + '"' + string + '"' + '&from-date=' + startDate + 
                    '&page=' + page + '&page-size=50' + '&order-by=oldest' + 
                    '&api-key=' + guardian_api_key)
    
    response = requests.get(searchString);
    
    response = response.json()
    
    if (justHits):
        return response['response']['total']
    else:
        return response
    
    
    
# Filter Guardian headlines
def filterGuardian(string, startDate, endDate):
    count = 0
    total = searchGuardian(string, startDate, endDate, justHits=True)
    
    print('total hits: ', total)
    
    # We can return up to 50 responses a time
    times = round(math.ceil(total/50))
    
    nytResponse = []
    
    # Sleeping to avoid API rate limits
    time.sleep(0.25)
    
    # Iterate through all hits during the time-frame
    for i in range(1, times+1):
        
        # print('count: ', count)
        
        response = searchGuardian(string, startDate, endDate, page=str(i))
        
        # Sleeping to avoid API rate limits
        time.sleep(0.25)
        
        guardianResponse = response['response']['results']
        
        # Paginate through all pages
        for i in range(len(guardianResponse)):
            
            guardianTitle = guardianResponse[i]['webTitle']
            
            if string in guardianTitle.lower():
                count += 1
                print(guardianResponse[i]['webTitle'])
    
    return count

In [5]:
year = 1999
yearStart = str(year)+"-01-01"
yearEnd = str(year+1)+"-01-01"


In [None]:
yearWordCountList = np.zeros([17, len(wordsList)])

for i in range(len(wordsList)):
    time.sleep(1)
    yearWordCountList[0,i] = searchNYT(wordsList.iloc[i][0], yearStart, yearEnd, justHits=True)
    print(yearWordCountList[0,i])

In [163]:
yearWordCountList[0]

array([  2.42000000e+02,   1.62000000e+02,   3.30000000e+01,
         2.90800000e+03,   0.00000000e+00,   5.00000000e+00,
         3.50000000e+01,   6.10000000e+01,   2.37000000e+02,
         2.00000000e+00,   1.42000000e+02,   2.50000000e+01,
         8.30000000e+01,   8.22000000e+02,   0.00000000e+00,
         1.41000000e+02,   3.32000000e+02,   6.40000000e+01,
         1.78000000e+02,   2.34000000e+02,   3.00000000e+01,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         9.08000000e+02,   0.00000000e+00,   3.66000000e+02,
         2.34400000e+03,   1.72000000e+02,   3.88000000e+02,
         6.86000000e+02,   6.71000000e+02,   3.00000000e+00,
         6.93000000e+02,   1.03000000e+02,   1.00000000e+00,
         1.60000000e+01,   5.30000000e+01,   3.50000000e+01])

In [16]:
tempList = []

year = 2016

for j in range(len(wordsList)):
    time.sleep(0.7)
    tempList.append(searchNYT(wordsList.iloc[j][0], str(year)+"-01-01", str(year+1)+"-01-01", justHits=True))
    print(tempList[j], end=' ')

np.savetxt(str(year) + '.csv', tempList)

print("Done!")

305 196 76 3128 0 15 30 24 202 4 191 54 197 1171 0 234 555 46 392 233 38 0 1 2 2464 33 608 3670 463 544 2973 3299 83 1396 515 47 12 98 92 Done!


In [13]:
searchGuardian(wordsList.iloc[0][0], str(1999)+"-01-01", str(1999+1)+"-01-01", justHits=True)

https://content.guardianapis.com/search?q="heart disease"&from-date=1999-01-01&page=1&page-size=50&order-by=oldest&api-key=6ebac18d-5fe7-4f37-a4be-1a9eb72a7136


5702

In [12]:
wordsList = pd.read_csv('./keywords.csv')

In [14]:
searchGuardian(wordsList.iloc[0][0], str(2000)+"-01-01", str(2000+1)+"-01-01", justHits=True)

https://content.guardianapis.com/search?q="heart disease"&from-date=2000-01-01&page=1&page-size=50&order-by=oldest&api-key=6ebac18d-5fe7-4f37-a4be-1a9eb72a7136


5487

In [22]:
prevTempList = tempList2

year = 2015

while (year > 2010):
    
    tempList = []
    
    for j in range(len(wordsList)):
        time.sleep(0.5)
        tempList.append(searchGuardian(wordsList.iloc[j][0], str(year)+"-01-01", str(year+1)+"-01-01", justHits=True) 
                         - prevTempList[j])
        
        print(tempList[j], end=' ')

    np.savetxt('./results/' + str(year) + '.csv', tempList)

    print("Done!", year)
    
    prevTempList = tempList
    
    year -= 1

651 147 161 5947 0 45 69 52 539 2 390 23 876 2519 0 12 1105 120 373 691 74 0 0 1 5017 822 794 8116 612 1065 6519 5749 1019 1892 515 123 152 247 104 Done! 2015
625 198 133 5975 0 46 83 58 496 2 367 20 793 2459 1 57 1115 95 422 606 53 0 2 2 4786 878 753 8390 522 1201 5602 5026 467 1645 482 130 161 159 111 Done! 2014
939 246 211 8825 1 49 105 72 736 5 567 33 1284 3734 0 100 1606 178 562 1035 102 0 1 1 6795 970 1006 12011 852 1722 8606 7672 1103 2472 712 157 198 294 115 Done! 2013
844 269 166 8480 0 60 112 80 653 2 486 27 1071 3573 1 149 1507 128 594 889 68 0 3 2 6437 1008 971 11802 804 1789 6895 6409 504 2471 562 144 198 211 122 Done! 2012
1170 329 234 11303 1 62 127 95 886 5 676 39 1616 4822 0 185 1976 216 741 1348 124 0 2 1 8578 1084 1188 15445 1106 2406 10073 9131 1157 2972 731 184 254 361 147 Done! 2011


In [27]:
year = 2010

while (year > 1998):
    
    tempList = []
    
    for j in range(len(wordsList)):
        time.sleep(0.3)
        tempList.append(searchGuardian(wordsList.iloc[j][0], str(year)+"-01-01", str(year+1)+"-01-01", justHits=True) 
                         - prevTempList[j])
        
        print(tempList[j], end=' ')

    np.savetxt('./results/' + str(year) + '.csv', tempList)

    print("Done!", year)
    
    prevTempList = tempList
    
    year -= 1

1052 331 193 10505 0 66 134 114 805 3 595 30 1383 4479 1 226 1817 178 721 1262 92 0 5 3 8159 1100 1135 14825 1039 2376 8711 8023 582 2908 577 155 240 272 140 Done! 2010
1379 378 267 13570 2 75 154 104 1052 5 819 43 2003 5902 0 262 2237 374 887 2612 138 0 2 1 10622 1190 1375 18625 1365 2930 12155 10983 1248 3420 744 211 304 426 179 Done! 2009
1346 440 223 13054 0 76 167 155 1019 5 762 35 1749 5769 1 337 2221 228 907 1636 107 0 6 3 10852 1225 1353 19076 1303 3278 11449 10537 684 3345 597 205 417 354 186 Done! 2008
1744 467 302 16053 2 81 179 140 1275 5 978 44 2497 7093 0 377 2702 460 1051 3154 168 1 4 1 13391 1314 1544 22953 1711 3674 15031 13307 1306 3934 775 233 374 503 220 Done! 2007
1640 509 250 15559 0 84 195 183 1233 5 873 36 2033 6984 1 431 2555 333 1040 2349 117 0 6 6 13260 1362 1506 22663 1641 3947 14339 13034 718 3667 616 230 483 402 210 Done! 2006
2092 547 338 18381 2 95 218 173 1482 5 1081 44 2745 8104 1 458 3087 567 1221 3824 182 1 6 3 16134 1426 1663 26583 2102 4264 18082 1

In [34]:
year = 1999

while (year < 2017):
    
    tempList = []
    
    for j in range(len(wordsList)):
        
        time.sleep(0.3)
        
        diff = (searchGuardian(wordsList.iloc[j][0], str(year)+"-01-01", justHits=True) - 
        searchGuardian(wordsList.iloc[j][0], str(year+1)+"-01-01", justHits=True) )
        
        tempList.append(diff)
        
        print(tempList[j], end=' ')
        
        np.savetxt('./results/' + str(year) + '.csv', tempList)

    print("Done!", year, end='\n\n')

    year -= 1

215 31 14 1223 0 5 38 32 158 0 49 0 136 536 0 63 121 42 95 255 11 0 1 0 875 14 79 2552 205 309 627 854 4 333 44 7 18 27 12 Done! 1999

1 3 0 30 0 0 0 0 5 0 3 0 5 43 0 0 2 1 4 13 1 0 0 0 29 0 2 87 3 19 37 39 1 2 0 0 0 1 1 Done! 1998

2 0 0 15 0 0 0 0 1 0 0 0 3 12 0 3 0 0 0 

KeyboardInterrupt: 

In [39]:
year = 1999
for j in range(len(wordsList)):
    time.sleep(0.3)

    diff = (searchGuardian(wordsList.iloc[j][0], str(year)+"-01-01", justHits=True) - 
    searchGuardian(wordsList.iloc[j][0], str(year+1)+"-01-01", justHits=True) )

    tempList.append(diff)

    print(tempList[j], end=' ')

    np.savetxt('./results/' + str(year) + '.csv', tempList)

print("Done!", year, end='\n\n')

2 0 0 15 0 0 0 

From cffi callback <function _verify_callback at 0x0000017A66017D90>:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\OpenSSL\SSL.py", line 221, in wrapper
    @wraps(callback)
KeyboardInterrupt


SSLError: HTTPSConnectionPool(host='content.guardianapis.com', port=443): Max retries exceeded with url: /search?q=%22emphysema%22&from-date=1999-01-01&page=1&page-size=50&order-by=oldest&api-key=6ebac18d-5fe7-4f37-a4be-1a9eb72a7136 (Caused by SSLError(SSLError("bad handshake: Error([('SSL routines', 'ssl3_get_server_certificate', 'certificate verify failed')],)",),))

In [48]:
tempList = []

for j in range(len(wordsList)):
    year = 1999

    diff = (searchGuardian(wordsList.iloc[j][0], str(year)+"-01-01", justHits=True) - 
        searchGuardian(wordsList.iloc[j][0], str(year+1)+"-01-01", justHits=True) )
    
    tempList.append(diff)
    
    print(diff, end=' ')

np.savetxt('./results/' + str(year) + '.csv', tempList)

215 31 14 1223 0 5 38 32 158 0 49 0 136 536 0 63 121 42 95 255 11 0 1 0 875 14 79 2552 205 309 627 854 4 333 44 7 18 27 12 

In [49]:
year = 2000

while (year < 2017):
    tempList = []

    for j in range(len(wordsList)):
        
        time.sleep(0.3)
        
        diff = (searchGuardian(wordsList.iloc[j][0], str(year)+"-01-01", justHits=True) - 
            searchGuardian(wordsList.iloc[j][0], str(year+1)+"-01-01", justHits=True) )

        tempList.append(diff)

        print(diff, end=' ')

    np.savetxt('./results/' + str(year) + '.csv', tempList)

    print('Done!')

    year += 1

292 37 17 1653 0 4 26 25 180 0 83 2 171 667 0 92 214 34 105 349 21 0 0 0 975 33 79 2862 267 349 542 692 2 255 72 19 38 56 14 Done!
285 61 12 2091 0 6 44 49 169 0 88 0 206 781 0 94 201 34 149 244 14 0 0 0 1952 40 103 3089 329 551 3612 3965 65 326 92 12 30 44 13 Done!
317 56 15 2054 0 5 37 38 186 0 109 2 290 891 0 74 262 19 145 235 11 0 1 1 2235 70 107 3325 278 551 2913 3005 42 340 17 7 32 46 25 Done!
392 66 19 1996 0 12 50 26 205 0 127 1 274 994 0 79 352 56 213 347 22 0 1 4 2663 69 144 3384 307 649 2701 2846 54 397 47 21 34 47 14 Done!
469 82 46 2244 0 16 62 50 268 0 111 0 272 1043 0 99 413 59 121 320 17 0 1 4 2475 169 154 3457 307 653 3086 2678 70 274 14 16 43 41 18 Done!
348 80 36 2328 0 14 39 33 207 0 103 0 248 1011 1 81 385 107 170 670 14 0 2 2 2743 112 119 3630 391 590 3051 2775 68 341 14 33 42 36 24 Done!
294 69 27 2505 0 8 28 28 214 0 111 1 284 1215 0 94 334 105 133 713 10 0 0 3 2408 137 153 3587 338 669 2890 2497 34 322 19 25 66 48 24 Done!
365 89 35 2483 0 6 25 36 223 0 159 1 4

# Google Trends Info

In [3]:
from pytrends.request import TrendReq
pytrends = TrendReq(hl='en-US', tz=360)

In [7]:
# pytrends.interest_over_time()

googleTrendSum = pd.DataFrame();

majorWords = ['heart disease', 'cancer', 'respiratory disease', 'car accidents', 'stroke', 'diabetes', 
              'alzheimer\'s', 'pneumonia', 'kidney disease', 'suicide', 'homicide', 'terrorism', 'overdose']

for i in majorWords:
    kw_list = []
    kw_list.append(i)
    pytrends.build_payload(kw_list, cat=0, 
                       timeframe='all', geo='', gprop='')
    df = pytrends.interest_over_time()[i]
    googleTrendSum[i] = df

In [8]:
googleTrendSum = googleTrendSum.transpose()

In [9]:
googleTrendSum

date,2004-01-01 00:00:00,2004-02-01 00:00:00,2004-03-01 00:00:00,2004-04-01 00:00:00,2004-05-01 00:00:00,2004-06-01 00:00:00,2004-07-01 00:00:00,2004-08-01 00:00:00,2004-09-01 00:00:00,2004-10-01 00:00:00,...,2017-06-01 00:00:00,2017-07-01 00:00:00,2017-08-01 00:00:00,2017-09-01 00:00:00,2017-10-01 00:00:00,2017-11-01 00:00:00,2017-12-01 00:00:00,2018-01-01 00:00:00,2018-02-01 00:00:00,2018-03-01 00:00:00
heart disease,75,100,94,86,80,63,54,62,79,84,...,37,34,33,36,40,40,35,41,46,44
cancer,83,86,89,90,89,81,79,81,89,100,...,71,68,67,67,79,73,65,71,71,71
respiratory disease,77,92,100,74,87,61,50,65,76,81,...,43,38,36,45,52,58,44,46,52,53
car accidents,79,100,95,90,78,73,73,66,86,90,...,34,37,35,33,36,36,35,36,40,36
stroke,91,97,96,98,98,93,96,95,89,86,...,99,96,91,91,92,93,86,89,94,92
diabetes,86,95,96,98,89,88,81,84,92,96,...,64,64,62,66,70,77,60,65,69,70
alzheimer's,95,90,90,87,78,100,91,70,81,81,...,26,26,25,29,28,29,24,26,27,27
pneumonia,51,44,46,45,41,37,32,34,39,47,...,51,46,50,56,64,70,80,100,86,75
kidney disease,74,80,76,74,64,67,70,70,71,69,...,85,86,88,89,90,96,85,96,100,100
suicide,21,23,22,23,21,17,17,17,18,21,...,18,19,17,18,17,17,17,23,16,15


In [11]:
newGoogle = pd.DataFrame()
for i in range(0,13):
    newGoogle[str(i+2004)] = googleTrendSum.iloc[:, (i*12):(i+1)*12].sum(axis=1)

In [12]:
newGoogle.to_csv('./new_google_trends.csv')

In [14]:
google_totals = newGoogle.iloc[:, :].sum()

In [18]:
newGoogle.divide(google_totals).to_csv('./new_google_normalized_trends.csv')

# Newspaper Scraping

In [2]:
nyt_hits = pd.read_csv('./results/NYT_counts.csv')

In [34]:
nyt_totals = nyt_hits.iloc[:, 2:].sum()

In [49]:
nyt_hits.iloc[:, 2:].divide(nyt_totals).to_csv('./results/nyt_normalized_counts.csv')

In [50]:
guardian_hits = pd.read_csv('./results/Guardian_counts.csv')

In [55]:
guardian_totals = guardian_hits.iloc[:, 2:].sum()
guardian_hits.iloc[:, 2:].divide(guardian_totals).to_csv('./results/Guardian_normalized_counts.csv')

In [22]:
google_hits = pd.read_csv('./new_google_trends.csv')

In [23]:
google_sum = google_hits.iloc[:, 1:].sum()
google_hits.iloc[:, 1:].divide(google_sum).to_csv('./results/new_Google_trends_normalized_counts.csv')

In [36]:
nyt_final = pd.read_csv('./results/NYT/NYT_counts.csv').drop(['Unnamed: 0'], axis=1)

In [37]:
nyt_final

Unnamed: 0,Words,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,heart disease,242.0,234.0,249.0,304.0,250.0,322.0,300.0,423.0,426.0,365.0,401.0,322.0,328.0,402.0,355.0,270.0,290.0,305.0
1,heart failure,162.0,102.0,145.0,139.0,120.0,177.0,233.0,265.0,252.0,285.0,280.0,239.0,243.0,205.0,188.0,159.0,186.0,196.0
2,cardiovascular disease,33.0,33.0,32.0,50.0,49.0,61.0,48.0,73.0,71.0,81.0,78.0,61.0,59.0,70.0,82.0,70.0,97.0,76.0
3,cancer,2908.0,3068.0,2775.0,2931.0,2572.0,2609.0,2661.0,3417.0,3999.0,4019.0,4609.0,4161.0,4239.0,3895.0,3137.0,3300.0,3184.0,3128.0
4,malignant neoplasms,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,respiratory disease,5.0,4.0,1.0,7.0,120.0,8.0,16.0,11.0,16.0,12.0,26.0,15.0,13.0,17.0,11.0,11.0,13.0,15.0
6,bronchitis,35.0,29.0,34.0,26.0,26.0,27.0,28.0,55.0,49.0,27.0,21.0,29.0,14.0,35.0,36.0,36.0,25.0,30.0
7,emphysema,61.0,46.0,44.0,39.0,53.0,70.0,64.0,60.0,58.0,55.0,58.0,46.0,51.0,38.0,42.0,41.0,29.0,24.0
8,asthma,237.0,247.0,290.0,184.0,221.0,215.0,180.0,243.0,332.0,209.0,274.0,291.0,245.0,243.0,191.0,210.0,142.0,202.0
9,unintentional injuries,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,4.0,3.0,2.0,4.0


In [38]:
nyt_total = pd.DataFrame()

nyt_total['Heart Disease'] = nyt_final.iloc[0:3, 1:].sum()
nyt_total['Cancer'] = nyt_final.iloc[3:5, 1:].sum()
nyt_total['Lower Respiratory Disease'] = nyt_final.iloc[5:9, 1:].sum()
nyt_total['Car Accidents'] = nyt_final.iloc[10:13, 1:].sum()
nyt_total['Stroke'] = nyt_final.iloc[13:15, 1:].sum()
nyt_total['Alzheimer\'s Disease'] = nyt_final.iloc[15:16, 1:].sum()
nyt_total['Diabetes'] = nyt_final.iloc[16:17, 1:].sum()
nyt_total['Pneumonia & Influenza'] = nyt_final.iloc[17:20, 1:].sum()
nyt_total['Kidney'] = nyt_final.iloc[20:24, 1:].sum()
nyt_total['Suicide'] = nyt_final.iloc[24:26, 1:].sum()
nyt_total['Homicide'] = nyt_final.iloc[26:30, 1:].sum()
nyt_total['Terrorism'] = nyt_final.iloc[30:33, 1:].sum() + nyt_final.iloc[33:35, 1:].sum() 
nyt_total['Overdose'] = nyt_final.iloc[38:39, 1:].sum()

In [39]:
nyt_total = nyt_total.transpose()

In [40]:
nyt_total.to_csv('./new_nyt_counts.csv')

In [44]:
nyt_sums = nyt_total.iloc[:, 0:].sum()

In [45]:
nyt_sums

1999    12103.0
2000    11748.0
2001    19657.0
2002    20322.0
2003    17643.0
2004    17230.0
2005    15905.0
2006    20602.0
2007    21651.0
2008    18163.0
2009    21097.0
2010    18243.0
2011    18668.0
2012    17869.0
2013    16878.0
2014    16540.0
2015    18335.0
2016    23230.0
dtype: float64

In [46]:
nyt_total.divide(nyt_sums).to_csv('./new_nyt_counts_normalized.csv')

In [61]:
guardian_final = pd.read_csv('./results/Guardian/Guardian_counts.csv').drop(['Unnamed: 0'], axis=1)

In [62]:
guardian_final

Unnamed: 0,Words,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,heart disease,215.0,292.0,285.0,317.0,392.0,469.0,348.0,294.0,365.0,294.0,209.0,208.0,231.0,219.0,288.0,287.0,304.0,338.0
1,heart failure,31.0,37.0,61.0,56.0,66.0,82.0,80.0,69.0,89.0,109.0,49.0,62.0,83.0,71.0,99.0,97.0,66.0,101.0
2,cardiovascular disease,14.0,17.0,12.0,15.0,19.0,46.0,36.0,27.0,35.0,30.0,33.0,27.0,23.0,33.0,50.0,64.0,80.0,69.0
3,cancer,1223.0,1653.0,2091.0,2054.0,1996.0,2244.0,2328.0,2505.0,2483.0,2549.0,2267.0,2025.0,2478.0,2505.0,2878.0,2926.0,2990.0,3049.0
4,malignant neoplasms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,respiratory disease,5.0,4.0,6.0,5.0,12.0,16.0,14.0,8.0,6.0,10.0,13.0,6.0,13.0,14.0,4.0,26.0,23.0,20.0
6,bronchitis,38.0,26.0,44.0,37.0,50.0,62.0,39.0,28.0,25.0,33.0,27.0,22.0,22.0,29.0,36.0,33.0,41.0,50.0
7,emphysema,32.0,25.0,49.0,38.0,26.0,50.0,33.0,28.0,36.0,41.0,9.0,34.0,23.0,22.0,20.0,30.0,24.0,28.0
8,asthma,158.0,180.0,169.0,186.0,205.0,268.0,207.0,214.0,223.0,214.0,166.0,152.0,150.0,157.0,197.0,211.0,245.0,285.0
9,unintentional injuries,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,3.0,0.0,1.0,2.0


In [69]:
guardian_total = pd.DataFrame()

guardian_total['Heart Disease'] = guardian_final.iloc[0:3, 1:].sum()
guardian_total['Cancer'] = guardian_final.iloc[3:5, 1:].sum()
guardian_total['Lower Respiratory Disease'] = guardian_final.iloc[5:9, 1:].sum()
guardian_total['Car Accidents'] = guardian_final.iloc[10:13, 1:].sum()
guardian_total['Stroke'] = guardian_final.iloc[13:15, 1:].sum()
guardian_total['Alzheimer\'s Disease'] = guardian_final.iloc[15:16, 1:].sum()
guardian_total['Diabetes'] = guardian_final.iloc[16:17, 1:].sum()
guardian_total['Pneumonia & Influenza'] = guardian_final.iloc[17:20, 1:].sum()
guardian_total['Kidney'] = guardian_final.iloc[20:24, 1:].sum()
guardian_total['Suicide'] = guardian_final.iloc[24:26, 1:].sum()
guardian_total['Homicide'] = guardian_final.iloc[26:30, 1:].sum()
guardian_total['Terrorism'] = guardian_final.iloc[30:33, 1:].sum() + guardian_final.iloc[33:35, 1:].sum()
guardian_total['Overdose'] = guardian_final.iloc[38:39, 1:].sum()

In [70]:
guardian_total = guardian_total.transpose()

In [71]:
guardian_total

Unnamed: 0,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Heart Disease,260.0,346.0,358.0,388.0,477.0,597.0,464.0,390.0,489.0,433.0,291.0,297.0,337.0,323.0,437.0,448.0,450.0,508.0
Cancer,1223.0,1653.0,2091.0,2054.0,1996.0,2244.0,2328.0,2505.0,2483.0,2549.0,2268.0,2025.0,2478.0,2505.0,2879.0,2926.0,2990.0,3049.0
Lower Respiratory Disease,233.0,235.0,268.0,266.0,293.0,396.0,293.0,278.0,290.0,298.0,215.0,214.0,208.0,222.0,257.0,300.0,333.0,383.0
Car Accidents,185.0,256.0,294.0,401.0,402.0,383.0,351.0,396.0,654.0,538.0,534.0,424.0,447.0,404.0,595.0,503.0,642.0,677.0
Stroke,536.0,667.0,781.0,891.0,994.0,1043.0,1012.0,1215.0,1191.0,1290.0,1080.0,906.0,1088.0,1114.0,1215.0,1214.0,1231.0,1246.0
Alzheimer's Disease,63.0,92.0,94.0,74.0,79.0,99.0,81.0,94.0,115.0,111.0,77.0,77.0,85.0,92.0,88.0,56.0,10.0,1.0
Diabetes,121.0,214.0,201.0,262.0,352.0,413.0,385.0,334.0,465.0,404.0,261.0,310.0,370.0,392.0,501.0,574.0,556.0,541.0
Pneumonia & Influenza,392.0,488.0,427.0,399.0,616.0,500.0,947.0,951.0,792.0,610.0,1568.0,550.0,530.0,488.0,591.0,568.0,551.0,555.0
Kidney,12.0,21.0,14.0,13.0,27.0,22.0,18.0,13.0,33.0,16.0,14.0,27.0,23.0,16.0,29.0,23.0,37.0,34.0
Suicide,889.0,1008.0,1992.0,2305.0,2732.0,2644.0,2855.0,2545.0,2893.0,2818.0,2150.0,1814.0,1897.0,1781.0,1926.0,2294.0,2919.0,3370.0


In [72]:
guardian_total.to_csv('./new_guardian_counts.csv')

In [178]:
guardian_total.to_csv('./Guardian_counts.csv')

In [73]:
guardian_sums = guardian_total.iloc[:, 0:].sum()

In [74]:
guardian_total.divide(guardian_sums).to_csv('./new_guardian_normalized_counts.csv')

In [59]:
guardian_total

Unnamed: 0,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Heart Disease,260.0,346.0,358.0,388.0,477.0,597.0,464.0,390.0,489.0,433.0,291.0,297.0,337.0,323.0,437.0,448.0,450.0,508.0
Cancer,1223.0,1653.0,2091.0,2054.0,1996.0,2244.0,2328.0,2505.0,2483.0,2549.0,2268.0,2025.0,2478.0,2505.0,2879.0,2926.0,2990.0,3049.0
Lower Respiratory Disease,233.0,235.0,268.0,266.0,293.0,396.0,293.0,278.0,290.0,298.0,215.0,214.0,208.0,222.0,257.0,300.0,333.0,383.0
Car Accidents,185.0,256.0,294.0,401.0,402.0,383.0,351.0,396.0,654.0,538.0,534.0,424.0,447.0,404.0,595.0,503.0,642.0,677.0
Stroke,536.0,667.0,781.0,891.0,994.0,1043.0,1012.0,1215.0,1191.0,1290.0,1080.0,906.0,1088.0,1114.0,1215.0,1214.0,1231.0,1246.0
Alzheimer's Disease,63.0,92.0,94.0,74.0,79.0,99.0,81.0,94.0,115.0,111.0,77.0,77.0,85.0,92.0,88.0,56.0,10.0,1.0
Diabetes,121.0,214.0,201.0,262.0,352.0,413.0,385.0,334.0,465.0,404.0,261.0,310.0,370.0,392.0,501.0,574.0,556.0,541.0
Pneumonia & Influenza,392.0,488.0,427.0,399.0,616.0,500.0,947.0,951.0,792.0,610.0,1568.0,550.0,530.0,488.0,591.0,568.0,551.0,555.0
Kidney,12.0,21.0,14.0,13.0,27.0,22.0,18.0,13.0,33.0,16.0,14.0,27.0,23.0,16.0,29.0,23.0,37.0,34.0
Suicide,889.0,1008.0,1992.0,2305.0,2732.0,2644.0,2855.0,2545.0,2893.0,2818.0,2150.0,1814.0,1897.0,1781.0,1926.0,2294.0,2919.0,3370.0
