In [8]:
# credits to Dinesh Daultani (@dineshdaultani)

import requests
import json
from requests.exceptions import HTTPError
from db_manager import *
import datetime
import pandas as pd
from newspaper import Article
from tqdm import tqdm
import sys

nyt_db = '/home/ostapkharysh/Documents/bt_data/DB/NYT'

#api = NewsApiClient(api_key='0763606ae56d49e08902365e0dbcb239')
#sources = api.sources(params)
#articles = api.articles(sources[0]['id'], params) 

In [9]:
def get_text(link):
    try:
        r = requests.get(link.strip())
        r.raise_for_status()
        article = Article(link.strip())
        article.download()
        article.parse()
        return article.text
    except HTTPError:
        print("No article found by this link!: " + link)
        
def conv_time(time):
    try:
        return datetime.datetime.strptime(time, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        return datetime.datetime.strptime(time.split("+")[0] +"Z", "%Y-%m-%dT%H:%M:%SZ")
        

In [10]:
class ArchiveAPI(object):
    def __init__(self, key=None):
        self.key = key
        """
        Initializes the ArchiveAPI class. Raises an exception if no API key is given.
        :param key: New York Times API Key
        """
        self.root = 'http://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}' 
        if not self.key:
            nyt_dev_page = 'http://developer.nytimes.com/docs/reference/keys'
            exception_str = 'API Key required.{}'
            raise NoAPIKeyException(exception_str.format(nyt_dev_page))

    def query(self, year=None, month=None, key=None,):
        """
        Calls the archive API and returns the results as a dictionary.
        :param key: Defaults to the API key used to initialize the ArchiveAPI class.
        """
        if not key: key = self.key
        if (year < 1882) or not (0 < month < 13):
            # currently the Archive API only supports year >= 1882
            exception_str = 'Invalid query: See http://developer.nytimes.com/archive_api.json'
            raise InvalidQueryException(exception_str)
        url = self.root.format(year, month, key)
        r = requests.get(url)
        return r.json()

In [11]:
my_api = ArchiveAPI('0ba6dc04a8cb44e0a890c00df88c393a')

In [13]:
years = [2013] # 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007]
months = [1] #, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

conn = create_connection(nyt_db)
try:
    for year in years:
        for month in months:
            mydict = my_api.query(year, month)
            #file_str = '/home/ostapkharysh/Documents/bt_data/NYT_news/' + str(year) + '-' + '{:02}'.format(month) + '.json'
            df = pd.DataFrame(mydict)
            df = df['response']['docs']
            print("DONE")
            with conn:
                for el in tqdm(df):
                    news = (conv_time(el['pub_date']), el['headline']['main'], get_text(el['web_url']), 
                            el['web_url']) 
                    create_task(conn, news)

            #with open(file_str, 'w') as fout:
            #    json.dump(mydict, fout)
            #fout.close()

    close_db(conn)
    print("FINISHED!")
except:
    print ("Unexpected error:", sys.exc_info()[0])
    raise
    
"""
ConnectionError: HTTPSConnectionPool(host='dealbook.nytimes.com', port=443): 
Max retries exceeded with url: /2013/01/02/deferring-six-figures-on-wall-street-for-teachers-salary/ 
(Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f9ae8d7bd30>: 
Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))


NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x7f9ae8d7bd30>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution


MaxRetryError: HTTPSConnectionPool(host='dealbook.nytimes.com', port=443): Max retries exceeded with url: /2013/01/02/deferring-six-figures-on-wall-street-for-teachers-salary/ 
(Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f9ae8d7bd30>: \
Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))



ConnectionError: HTTPSConnectionPool(host='dealbook.nytimes.com', port=443): 
Max retries exceeded with url: /2013/01/02/deferring-six-figures-on-wall-street-for-teachers-salary/ 
(Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f9ae8d7bd30>: 
Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))
"""    

#HTTPSConnectionPool, ConnectionRefusedError



  0%|          | 0/8496 [00:00<?, ?it/s][A[A

DONE




  0%|          | 1/8496 [00:00<1:35:15,  1.49it/s][A[A

  0%|          | 2/8496 [00:01<1:30:19,  1.57it/s][A[A

  0%|          | 3/8496 [00:01<1:21:15,  1.74it/s][A[A

  0%|          | 4/8496 [00:02<1:16:43,  1.84it/s][A[A

  0%|          | 5/8496 [00:02<1:10:14,  2.01it/s][A[A

  0%|          | 6/8496 [00:02<1:07:06,  2.11it/s][A[A

  0%|          | 7/8496 [00:03<1:08:50,  2.06it/s][A[A

  0%|          | 8/8496 [00:03<1:09:04,  2.05it/s][A[A

  0%|          | 9/8496 [00:04<1:09:29,  2.04it/s][A[A

  0%|          | 10/8496 [00:04<1:08:06,  2.08it/s][A[A

  0%|          | 11/8496 [00:05<1:04:23,  2.20it/s][A[A

No article found by this link!: https://straightsets.blogs.nytimes.com/2013/01/01/court-comes-apart-at-ferrers-feet/




  0%|          | 12/8496 [00:05<1:08:10,  2.07it/s][A[A

  0%|          | 13/8496 [00:06<1:07:40,  2.09it/s][A[A

  0%|          | 14/8496 [00:06<1:06:26,  2.13it/s][A[A

  0%|          | 15/8496 [00:07<1:03:38,  2.22it/s][A[A

  0%|          | 16/8496 [00:07<1:04:27,  2.19it/s][A[A

  0%|          | 17/8496 [00:08<1:19:08,  1.79it/s][A[A

  0%|          | 18/8496 [00:08<1:14:40,  1.89it/s][A[A

  0%|          | 19/8496 [00:09<1:12:49,  1.94it/s][A[A

  0%|          | 20/8496 [00:09<1:08:33,  2.06it/s][A[A

  0%|          | 21/8496 [00:10<1:05:02,  2.17it/s][A[A

  0%|          | 22/8496 [00:11<1:30:30,  1.56it/s][A[A

  0%|          | 23/8496 [00:11<1:22:49,  1.70it/s][A[A

  0%|          | 24/8496 [00:12<1:16:22,  1.85it/s][A[A

  0%|          | 25/8496 [00:12<1:16:47,  1.84it/s][A[A

  0%|          | 26/8496 [00:13<1:10:56,  1.99it/s][A[A

  0%|          | 27/8496 [00:13<1:10:08,  2.01it/s][A[A

  0%|          | 28/8496 [00:14<1:12:43,  1.94it/s][A

Unexpected error: <class 'requests.exceptions.ConnectionError'>


ConnectionError: HTTPSConnectionPool(host='dealbook.nytimes.com', port=443): Max retries exceeded with url: /2013/01/02/deferring-six-figures-on-wall-street-for-teachers-salary/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f9ae8d7bd30>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution',))

In [None]:
import pandas as pd
with open('/home/ostapkharysh/Documents/bt_data/NYT_news/2016-02.json', 'r') as f: 
    data = json.load(f)
df = pd.DataFrame(data)
df

In [None]:
#  'pub_date', 'web_url','word_count'
data = df['response']['docs']

#len(df)#[0]['word_count']
len(data)
df['response']['docs'][5]

In [22]:
for el in df['response']['docs']:
    print(el['pub_date'])

2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T23:42:57Z
2016-02-01T00:00:00Z
2016-02-01T22:00:58Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T20:27:37Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T18:56:10Z
2016-02-01T18:30:10Z
2016-02-01T00:00:00Z
2016-02-01T18:13:55Z
2016-02-01T00:00:00Z
2016-02-01T18:06:45Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T17:24:55Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T16:26:45+0000
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T15:06:19Z
2016-02-01T00:00:00Z
2016-02-01T14:59:37Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T13:49:08Z
2016-02-01T13:43:16Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T00:00:00Z
2016-02-01T12:30:42Z
2016-02-01T00:00:00Z
2016-02-0

2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00:00:00Z
2016-02-10T00

2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00:00:00Z
2016-02-20T00

In [7]:
k = df['response']['docs'][5]['web_url']
print(k)
print(df['response']['docs'][5]['pub_date'])
df['response']['docs'][5]['word_count']

https://takingnote.blogs.nytimes.com/2016/02/01/so-this-is-the-humble-trump/
2016-02-01T23:42:57Z


'255'

In [8]:
from newspaper import Article
import newspaper

In [9]:
article = Article(k)
try:
    article.download()
except:
    print('***FAILED TO DOWNLOAD***', article.url)
article.parse()

In [13]:
article.publish_date

datetime.datetime(2016, 2, 1, 0, 0)

In [None]:
article.text

In [None]:
article.title

In [33]:
for el in df['response']['docs']:
    print(el['web_url'])


https://www.nytimes.com/slideshow/2016/02/01/nyregion/movers-on-bikes.html
https://www.nytimes.com/slideshow/2016/02/01/t-magazine/lady-of-the-valley-jane-ormsby-gores-home.html
https://www.nytimes.com/interactive/2016/02/01/arts/television/oj-simpson-murder-trial-coverage.html
https://www.nytimes.com/slideshow/2016/02/01/blogs/life-among-colombias-farc-rebels-as-peace-nears.html
https://www.nytimes.com/slideshow/2016/02/01/blogs/la-vida-al-interior-de-las-farc-antes-de-la-paz.html
https://takingnote.blogs.nytimes.com/2016/02/01/so-this-is-the-humble-trump/
https://www.nytimes.com/slideshow/2016/02/01/us/winners-in-iowa.html
https://wordplay.blogs.nytimes.com/2016/02/01/supersonic-speed/
https://www.nytimes.com/2016/02/01/arts/television/the-x-files-season-10-episode-3-were-monster-review.html
https://www.nytimes.com/interactive/2016/02/01/us/elections/iowa-republican-poll.html
https://www.nytimes.com/interactive/2016/02/01/us/elections/iowa-democrat-poll.html
https://www.nytimes.com/p

https://www.nytimes.com/2016/02/11/world/asia/china-patriotic-education.html
https://www.nytimes.com/2016/02/11/business/dealbook/china-opera-kunlun-qihoo-golden-brick.html
https://www.nytimes.com/2016/02/11/business/dealbook/hikma-pharmaceuticals-cuts-offer-price-for-generic-drug-maker-roxane.html
https://www.nytimes.com/2016/02/11/us/politics/new-hampshire-highlights.html
https://www.nytimes.com/2016/02/11/fashion/fashion-week-vladimir-teriokhin-secret-knitwear-weapon.html
https://www.nytimes.com/slideshow/2016/02/11/fashion/fashion-set-celebrates-chinese-new-year.html
https://www.nytimes.com/2016/02/11/fashion/cocha-rocha-model-spin-athlesiure.html
https://www.nytimes.com/2016/02/11/fashion/natalia-vodianova-beauty-routine.html
https://www.nytimes.com/2016/02/09/science/earth/richard-p-von-herzen-explorer-of-earths-undersea-furnaces-dies-at-85.html
https://www.nytimes.com/2016/02/05/fashion/fashion-snapchat-app.html
https://www.nytimes.com/interactive/2016/02/16/world/europe/france-

https://www.nytimes.com/2016/02/21/arts/dance/pacific-northwest-ballet-returns-to-city-center.html
https://www.nytimes.com/2016/02/21/sports/a-civil-rights-warrior-at-grambling-armed-with-silverware-and-thank-you-notes.html
https://www.nytimes.com/interactive/2016/02/21/magazine/larry-levis-threshold-of-the-oblivious-blossoming.html
https://www.nytimes.com/2016/02/21/magazine/learning-to-recycle-in-switzerland-and-paying-for-it.html
https://www.nytimes.com/2016/02/21/magazine/how-to-lull-a-grown-up-to-sleep.html
https://www.nytimes.com/2016/02/21/magazine/the-2-716-issue.html
https://www.nytimes.com/2016/02/21/jobs/the-resume-and-references-check-out-how-about-social-media.html
https://www.nytimes.com/2016/02/21/fashion/what-luck-means-now.html
https://www.nytimes.com/2016/02/21/business/lorna-borenstein-of-grokker-invest-in-people-for-the-long-term.html
https://www.nytimes.com/2016/02/20/us/storm-water-long-a-nuisance-may-be-a-parched-californias-salvation.html
https://www.nytimes.com

In [8]:
get_text('https://straightsets.blogs.nytimes.com/2013/01/01/court-comes-apart-at-ferrers-feet/')

No article found by this link!


In [25]:
datetime.datetime.strptime('2013-01-02T19:07:27+0000'.split("+")[0] +"Z", "%Y-%m-%dT%H:%M:%SZ")

datetime.datetime(2013, 1, 2, 19, 7, 27)