#Sentence extractor for The New York Times

In [1]:
import logging
import math
import os
from datetime import timedelta, date, datetime
from dateutil import parser
from time import sleep, time

import requests
from joblib import Parallel, delayed
from pymongo import MongoClient
from pymongo.errors import BulkWriteError, DuplicateKeyError

##Logging

In [2]:
# Start every run with a fresh log file. Removal is best-effort: any
# filesystem error (most commonly "file does not exist" on the first run)
# is ignored, but KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    os.remove('search.log')
except OSError:
    pass

In [3]:
# Detach any handlers already installed on the root logger;
# logging.basicConfig() is a no-op when the root logger already has handlers.
logging.getLogger().handlers = []
# Only let WARNING and above through from the urllib3 logger bundled with requests.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
# Send INFO-and-above records from the root logger to search.log.
logging.basicConfig(filename='search.log', level=logging.INFO)

In [4]:
def write_log(*args, status=None):
    """Write one INFO record describing an event to the root logger.

    The record has the shape ``<timestamp> -- <args> ==> <status>`` where
    ``<args>`` is the tuple of positional arguments rendered with ``str``.

    Args:
        *args: Values that identify the event (query, dates, API key, ...).
        status: Outcome to report; keyword-only.
    """
    stamp = datetime.now().strftime('%Y-%m-%d @ %I:%M:%S %p')
    logging.info('{} -- {} ==> {}'.format(stamp, args, status))

##MongoDB

In [5]:
# Connect using MongoClient's default arguments.
client = MongoClient()
# WARNING: destructive. The whole 'nytimes' database is dropped here, so
# every execution of this cell starts from an empty store.
client.drop_database('nytimes')
# Handle used by insert_documents() to reach the 'articles' collection.
db = client.nytimes

In [6]:
def insert_documents(docs):
    """Insert *docs* into ``db.articles`` and return how many were stored.

    The batch is sent as an unordered bulk insert, so one failing document
    (typically a duplicate ``_id``) does not prevent the remaining ones from
    being written. The previous ordered insert stopped at the first error and
    then retried every document one by one, which hit duplicate-key errors
    for the rows already stored and under-reported the inserted count.

    Args:
        docs: List of documents (dicts) to insert.

    Returns:
        Number of documents actually written. Errors are logged through
        ``write_log`` and never propagate to the caller.
    """
    if not docs:
        # insert_many() rejects an empty batch; nothing to do.
        return 0
    try:
        result = db.articles.insert_many(docs, ordered=False)
        return len(result.inserted_ids)
    except BulkWriteError as e:
        write_log(e, status='EXCEPTION')
        # The server reports how many documents of the batch did get written.
        return e.details.get('nInserted', 0)
    except Exception as e:
        write_log(e, status='EXCEPTION')
        return 0

##Search terms

In [7]:
def preprocess_terms(term_list):
    """Expand raw terms into quoted search phrases.

    For every term:
      * containing ``-``: the term itself, the term with hyphens removed and
        the term with hyphens replaced by spaces are emitted;
      * containing ``' & '``: the term with ``' & '`` replaced by a space;
      * containing ``' and '``: the term with ``' and '`` replaced by a space;
      * otherwise: the term unchanged.
    The rules are cumulative, in the order listed above.

    Args:
        term_list: Iterable of raw term strings.

    Returns:
        A list of phrases, each wrapped in double quotes. A real list is
        returned (rather than a one-shot ``map`` iterator) so the result can
        be measured and iterated more than once.
    """
    search_terms = []
    for term in term_list:
        variants = []
        if '-' in term:
            variants.extend([term, term.replace('-', ''), term.replace('-', ' ')])
        if ' & ' in term:
            variants.append(term.replace(' & ', ' '))
        if ' and ' in term:
            variants.append(term.replace(' and ', ' '))
        # No rule matched: keep the term as it is.
        search_terms.extend(variants or [term])
    return ['"{}"'.format(term) for term in search_terms]

In [8]:
# Read one search term per line. The ``with`` block closes the file as soon
# as it has been read, and blank lines are skipped so that they do not turn
# into empty '""' queries.
with open('search_terms.txt') as st_file:
    term_list = [line.strip() for line in st_file if line.strip()]
search_terms = preprocess_terms(term_list)

##NYTimes API keys

In [9]:
# One API key for each of the cores
# NOTE(review): these credentials are hard-coded in the source; consider
# loading them from the environment or an untracked config file instead.
# Each key is paired with one chunk of search terms further down.
api_keys = [
#     "3439a9084efa80c4f5fb1d290dfc1b44:11:70233981", # my api key
#     "a5c709f3168b829711241b243457e9d6:13:70235641", # the other api key
    "c7ba2eac72924572152e63f4516210d7:14:72380734", # my second api key
    "7e692d35c7bd20618395859a3c4cbef6:15:72380785", # my third api key
    "ba47374fd391c9bc5fd3ca51ff953a44:14:70229228",
    "4557e02788189abb3642a33bca7469ff:11:69136863",
    "2b3d39fd4c7836168a2a370c25ad6232:16:70235576",
    "87d7b22c0feec4f3112d80b71d0b500a:1:69642501",
    "d7655429355ab2df4621a10c01d04865:8:69135199",
    "1944df13b86dd83e4a8c4ea82e767975:2:65092848",
#     "730e30f5220059551e666430644fbf87:11:69642501", # developer inactive
]

In [10]:
def next_multiple(n, m):
    """Return the smallest multiple of *n* that is not below *m*.

    Example: ``next_multiple(4, 17) == 20`` and ``next_multiple(4, 16) == 16``.
    """
    # (-m) % n is the distance from m up to the next multiple of n
    # (zero when m already is one).
    return m + (-m) % n

def chunks(l, n_chunks):
    """Split *l* into at most *n_chunks* consecutive slices.

    Every slice holds ``ceil(len(l) / n_chunks)`` items except possibly the
    last one, so fewer than *n_chunks* slices may be produced.

    Args:
        l: Any iterable; it is materialised into a list first.
        n_chunks: Upper bound on the number of slices.

    Yields:
        Lists of consecutive items of *l*. Nothing is yielded for an empty
        input (previously this ended in ``range()`` with a zero step, which
        raises ``ValueError``).
    """
    l = list(l)
    size = len(l)
    if size == 0:
        return
    # Ceiling division without going through floats.
    chunk_size = -(-size // n_chunks)
    for start in range(0, size, chunk_size):
        yield l[start:start + chunk_size]

In [11]:
# Assign one chunk of search terms to every API key. chunks() may produce
# fewer chunks than there are keys (or cannot be used at all when there are no
# terms or no keys); keys without a chunk get an empty list so that a later
# lookup by key never raises KeyError.
_all_terms = list(search_terms)
if _all_terms and api_keys:
    _term_chunks = list(chunks(_all_terms, len(api_keys)))
else:
    _term_chunks = []
search_terms_by_api_key = {
    api_key: (_term_chunks[i] if i < len(_term_chunks) else [])
    for i, api_key in enumerate(api_keys)
}

##Dates

In [12]:
def month_duration(d):
    """Return the number of days in the month that *d* falls in.

    Args:
        d: Object exposing ``month`` and ``year`` (e.g. ``datetime.date``).

    Returns:
        31, 30, 29 or 28.
    """
    if d.month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if d.month in (4, 6, 9, 11):
        return 30
    # February: leap years are divisible by 4, except centuries that are
    # not divisible by 400.
    year = d.year
    is_leap_year = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    return 29 if is_leap_year else 28

def n_days(d, n_months):
    """Return the day offset from *d* to the last day of an *n_months* span.

    The lengths of *n_months* successive months are summed, advancing *d* by
    each month's length in turn, and one is subtracted so that
    ``d + timedelta(result)`` is the final day of the span when *d* is the
    first day of a month.

    Args:
        d: Start date.
        n_months: Number of months to span; ``0`` yields ``-1``.

    Returns:
        Total number of days in the spanned months, minus one.
    """
    ndays = 0
    for _ in range(n_months):
        m_duration = month_duration(d)
        d += timedelta(m_duration)
        ndays += m_duration
    return ndays - 1

def date_ranges(begin_date, end_date, n_months=1):
    """Yield consecutive ``(start, end)`` date pairs covering a period.

    Both ends of every pair are inclusive: each range ends the day before the
    next one starts, and the last range is clipped to *end_date*.

    Args:
        begin_date: First day to cover.
        end_date: Last day to cover (inclusive).
        n_months: Number of months per range; must be at least 1.

    Yields:
        ``(start, end)`` tuples of dates.

    Raises:
        ValueError: If *n_months* is smaller than 1 (the loop could never
            advance in that case).
    """
    if n_months < 1:
        raise ValueError('n_months must be at least 1')
    aux_date = begin_date
    # "<=" because end_date is inclusive: a range that starts exactly on
    # end_date still has that single day to cover.
    while aux_date <= end_date:
        ndays = n_days(aux_date, n_months)
        yield (aux_date, min(aux_date + timedelta(ndays), end_date))
        aux_date += timedelta(ndays + 1)

##Timer

In [13]:
# Timestamp (seconds since the epoch) of the most recent throttled call;
# wait() reads and updates this module-level value.
last_request = time()

In [14]:
def wait(f, *args, t=9):
    """Call ``f(*args)`` no sooner than *t* seconds after the previous call.

    The module-level ``last_request`` timestamp is compared with the current
    time; if fewer than *t* seconds have passed, the function sleeps for the
    remainder. The timestamp is refreshed just before *f* is invoked.

    Args:
        f: Callable to invoke.
        *args: Positional arguments forwarded to *f*.
        t: Minimum spacing between calls, in seconds; keyword-only.

    Returns:
        Whatever ``f(*args)`` returns.
    """
    global last_request
    remaining = t - (time() - last_request)
    if remaining > 0:
        sleep(remaining)
    last_request = time()
    return f(*args)

##Downloader

In [15]:
def filter_fields_doc(doc):
    """Return a copy of *doc* restricted to the fields that are stored.

    The ``headline`` entry is flattened to its ``'main'`` value. The input
    mapping is not modified.

    Raises:
        KeyError: If *doc* lacks one of the expected fields, or its headline
            has no ``'main'`` entry.
    """
    wanted = (
        'web_url', 'snippet', 'lead_paragraph', 'abstract', 'source', 'headline',
        'keywords', 'pub_date', 'document_type', 'section_name', '_id',
    )
    kept = {}
    for field in wanted:
        kept[field] = doc[field]
    # Only the main headline text is kept.
    kept['headline'] = kept['headline']['main']
    return kept

def filter_fields(docs):
    """Apply ``filter_fields_doc`` to every document and return the results as a list."""
    return [filter_fields_doc(doc) for doc in docs]

In [16]:
def search(q, begin_date, end_date, page, api_key, timeout=30):
    """Run one Article Search request and return the decoded JSON body.

    Results are restricted to the source "The New York Times" and sorted
    oldest first.

    Args:
        q: Query string.
        begin_date: First day of the range, passed through as given.
        end_date: Last day of the range, passed through as given.
        page: Zero-based result page.
        api_key: API key sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
        ValueError: If the response body is not valid JSON.
    """
    base_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    payload = {
        'q': q,
        'begin_date': begin_date,
        'end_date': end_date,
        'page': page,
        'api-key': api_key,
        'sort': 'oldest',
        'fq': 'source:("The New York Times")',
    }
    response = requests.get(base_url, params=payload, timeout=timeout)
    return response.json()

In [17]:
def get_documents(term, begin_date, end_date, api_key='sample-key'):
    """Download and store all articles matching *term* in a date range.

    The first page is requested to learn the hit count. Up to 1010 hits are
    fetched page by page (10 per page) and written with ``insert_documents``.
    When there are more hits, the range is split in two halves that are
    processed recursively.

    Args:
        term: Query string.
        begin_date: First day of the range as a ``YYYYMMDD`` string.
        end_date: Last day of the range as a ``YYYYMMDD`` string.
        api_key: API key used for every request.

    Notes:
        * A range that cannot be split any further (begin == end) but still
          exceeds the limit is fetched up to the limit and the truncation is
          logged; it used to recurse forever on the same range.
        * The response already obtained for page 0 is reused instead of being
          requested a second time.
        * A failing page is logged with its own status rather than with the
          status of the first response.
    """
    page_size = 10
    max_results = 1010  # above this many hits the range is split instead of paged

    q = term
    response = wait(search, q, begin_date, end_date, 0, api_key)

    if response['status'] != 'OK':
        write_log(q, begin_date, end_date, api_key, status=response['status'])
        return

    total_results = response['response']['meta']['hits']
    write_log(q, begin_date, end_date, api_key, status='{} results found'.format(total_results))

    if total_results > max_results:
        bd = parser.parse(begin_date)
        ed = parser.parse(end_date)
        if bd < ed:
            half = (ed - bd) // 2

            # First half: begin .. begin + half (inclusive).
            end_date1 = (bd + timedelta(half.days)).strftime("%Y%m%d")
            get_documents(term, begin_date, end_date1, api_key)

            # Second half: the following day .. end.
            begin_date2 = (bd + timedelta(half.days + 1)).strftime("%Y%m%d")
            get_documents(term, begin_date2, end_date, api_key)
            return

        # Single-day range: splitting would repeat the same request forever.
        write_log(q, begin_date, end_date, api_key,
                  status='range cannot be split; only the first {} results are fetched'.format(max_results))
        total_results = max_results
    elif total_results == 0:
        write_log(q, begin_date, end_date, api_key, status='0')

    n_pages = math.ceil(total_results / page_size)
    for page in range(n_pages):
        # Page 0 was already downloaded by the initial request above.
        r = response if page == 0 else wait(search, q, begin_date, end_date, page, api_key)
        if r['status'] != 'OK':
            write_log(q, begin_date, end_date, page, api_key, status=r['status'])
        else:
            total_inserted = insert_documents(filter_fields(r['response']['docs']))
            write_log(q, begin_date, end_date, page, api_key, status='{} results written'.format(total_inserted))

In [18]:
def download_by_date_range(term, api_key, begin_date=date(1999, 1, 1),
                           end_date=date(2014, 12, 31), n_months=1):
    """Download the articles for *term* one date range at a time.

    The period ``begin_date .. end_date`` is cut into ranges of *n_months*
    months by ``date_ranges`` and ``get_documents`` is called for each one.
    A failure in one range is logged and does not stop the remaining ranges.

    Args:
        term: Query string.
        api_key: API key used for the requests.
        begin_date: First day of the period (``datetime.date``).
        end_date: Last day of the period (``datetime.date``).
        n_months: Length of each range in months.
    """
    for range_begin, range_end in date_ranges(begin_date, end_date, n_months):
        try:
            get_documents(
                term,
                begin_date=range_begin.strftime("%Y%m%d"),
                end_date=range_end.strftime("%Y%m%d"),
                api_key=api_key,
            )
        except Exception as e:
            write_log(e, status='EXCEPTION')

In [19]:
def download_documents(terms, api_key):
    """Download the articles for every term in *terms* with one API key.

    Each term is first probed over the whole 1999-01-01 .. 2014-12-31 period.
    Terms without any hit are only logged; the others are handed to
    ``download_by_date_range``. A probe that does not come back with status
    ``'OK'`` is now logged instead of being dropped silently. Exceptions are
    logged per term so that one failing term does not stop the rest.

    Args:
        terms: Iterable of query strings.
        api_key: API key used for all requests of these terms.
    """
    begin_date = '19990101'
    end_date = '20141231'
    for term in terms:
        q = term
        try:
            response = wait(search, q, begin_date, end_date, 0, api_key)
            if response['status'] != 'OK':
                write_log(q, begin_date, end_date, api_key, status=response['status'])
            elif response['response']['meta']['hits'] == 0:
                write_log(q, begin_date, end_date, api_key, status='0')
            else:
                download_by_date_range(term, api_key)
        except Exception as e:
            write_log(e, status='EXCEPTION')

In [20]:
def downloader(search_terms_by_api_key, api_keys):
    """Run ``download_documents`` for every API key, in parallel if possible.

    One joblib job is started per API key (the job count follows the number
    of keys instead of being fixed at 8). If the parallel run raises, the
    error is logged and the keys are processed one after another instead.

    Args:
        search_terms_by_api_key: Dict mapping an API key to its search terms.
            Keys without an entry are treated as having no terms instead of
            raising ``KeyError``.
        api_keys: Sequence of API keys to use.
    """
    try:
        Parallel(n_jobs=max(1, len(api_keys)))(
            delayed(download_documents)(search_terms_by_api_key.get(api_key, []), api_key)
            for api_key in api_keys
        )
    except Exception as e:
        write_log(e, status='EXCEPTION')
        # Sequential fallback.
        for api_key in api_keys:
            try:
                download_documents(search_terms_by_api_key.get(api_key, []), api_key)
            except Exception as e:
                write_log(e, status='EXCEPTION')

In [None]:
# Entry point: start the download for all API keys and their assigned terms.
downloader(search_terms_by_api_key, api_keys)