#Sentence extractor for The New York Times

In [1]:
import logging
import math
import os
from datetime import timedelta, date, datetime
from dateutil import parser
from time import sleep, time

import requests
from joblib import Parallel, delayed
from pymongo import MongoClient
from pymongo.errors import BulkWriteError, DuplicateKeyError

##Logging

In [2]:
# Start every run with a fresh log file. Failing to delete it (typically
# because it does not exist yet) is not worth aborting the run for.
try:
    os.remove('search2.log')
except OSError:
    # Best effort only. Narrowed from a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed here.
    pass

In [3]:
# Drop any handlers already attached to the root logger: basicConfig() below
# does nothing when the root logger already has handlers.
logging.getLogger().handlers = []
# Only let WARNING and above through from the urllib3 logger bundled with
# requests.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
# Write timestamped records of level INFO and above to search2.log.
logging.basicConfig(filename='search2.log', level=logging.INFO, format='%(asctime)s %(message)s')

In [4]:
def write_log(*args, status=None):
    """Emit one INFO record of the form ``<args> ==> <status>``.

    The positional arguments are rendered together as a tuple (a single
    argument therefore shows up as ``(value,)``); ``status`` is a
    keyword-only label appended after the arrow.
    """
    template = '{} ==> {}'
    logging.info(template.format(args, status))

##MongoDB

In [5]:
# Connect to MongoDB using MongoClient's default connection settings.
client = MongoClient()
# NOTE(review): this deletes the whole 'nytimes2' database on every run, so
# anything downloaded by a previous run is lost -- confirm this is intended.
client.drop_database('nytimes2')
# Database handle used by insert_documents() (collection: db.articles).
db = client.nytimes2

In [6]:
def insert_documents(docs, q):
    """Bulk-insert ``docs`` into ``db.articles`` and log the outcome.

    The insert is unordered, so one failing document does not stop the
    remaining ones from being written. Nothing is raised to the caller:
    every outcome is reported through ``write_log``.

    Args:
        docs: list of article dictionaries to store.
        q: the query that produced ``docs``; only used in the log record.
    """
    if not docs:
        # insert_many() rejects an empty batch with an exception. An empty
        # result page is not a failure, so report it as zero insertions
        # instead of a misleading "EXCEPTION OTHER" record.
        write_log(q, status='INSERTION OK {}'.format(0))
        return

    try:
        inserted = db.articles.insert_many(docs, ordered=False)
    except BulkWriteError as e:
        # Raised when at least one document of the batch could not be
        # written (the other documents of the batch may still be stored).
        write_log(e, status='INSERTION EXCEPTION BULKWRITE')
    except Exception as e:
        write_log(e, status='INSERTION EXCEPTION OTHER')
    else:
        total_inserted = len(inserted.inserted_ids)
        write_log(q, status='INSERTION OK {}'.format(total_inserted))

##Search terms

In [7]:
def preprocess_terms(term_list):
    """Expand raw search terms into quoted, lower-cased query variants.

    For every term:

    * a term containing ``-`` yields three variants: unchanged, with the
      hyphens removed, and with the hyphens replaced by spaces;
    * a term containing ``' & '`` or ``' and '`` yields a variant with that
      connector replaced by a single space;
    * any other term yields itself.

    Every variant is lower-cased. If a variant ends with one of the words
    ``corporation``, ``company``, ``inc.`` or ``inc`` (as a separate,
    space-delimited word), an additional variant without that word is
    emitted just before it.

    Args:
        term_list: iterable of raw term strings; blank entries are skipped.

    Returns:
        A list of ``(original_term, '"variant"')`` tuples, where the variant
        is wrapped in double quotes.
    """
    # The leading space is part of the suffix: without it, words that merely
    # end in these letters ("zinc", "accompany") or a bare "company" lost
    # characters they should keep and could collapse to an empty query.
    company_suffixes = (' corporation', ' company', ' inc.', ' inc')

    variants = []
    for term in term_list:
        if not term.strip():
            # A blank line would otherwise become the empty query '""'.
            continue
        has_connector = False
        if '-' in term:
            has_connector = True
            variants.append((term, term.lower()))
            variants.append((term, term.replace('-', '').lower()))
            variants.append((term, term.replace('-', ' ').lower()))
        if ' & ' in term:
            has_connector = True
            variants.append((term, term.replace(' & ', ' ').lower()))
        if ' and ' in term:
            has_connector = True
            variants.append((term, term.replace(' and ', ' ').lower()))
        if not has_connector:
            variants.append((term, term.lower()))

    search_terms = []
    for original, curated in variants:
        for suffix in company_suffixes:
            if curated.endswith(suffix):
                shortened = curated[:-len(suffix)]
                if shortened:
                    search_terms.append((original, shortened))
                break
        search_terms.append((original, curated))

    # Return a real list (not a one-shot ``map`` iterator) so the result can
    # be measured and iterated more than once.
    return [(original, '"{}"'.format(curated)) for original, curated in search_terms]

In [8]:
# Read one raw search term per line. The context manager closes the file as
# soon as the lines are read (the handle used to stay open for the whole run).
with open('search_terms.txt') as st_file:
    term_list = [line.strip() for line in st_file]
search_terms = preprocess_terms(term_list)
# NOTE(review): this hard-coded pair replaces everything computed from the
# file above -- it looks like a leftover debugging override; remove it to
# search for the terms listed in search_terms.txt.
search_terms = [('entrepreneur', 'entrepreneur'), ('executive', 'executive')]

##NYTimes API keys

In [9]:
# One API key for each of the cores
# The search terms are split into one chunk per active (uncommented) key.
# NOTE(review): these credentials are stored in plain text in the source;
# consider loading them from the environment or an untracked config file.
api_keys = [
    "3439a9084efa80c4f5fb1d290dfc1b44:11:70233981", # my api key
    "a5c709f3168b829711241b243457e9d6:13:70235641", # the other api key
#     "c7ba2eac72924572152e63f4516210d7:14:72380734", # my second api key
#     "7e692d35c7bd20618395859a3c4cbef6:15:72380785", # my third api key
#     "ba47374fd391c9bc5fd3ca51ff953a44:14:70229228",
#     "4557e02788189abb3642a33bca7469ff:11:69136863",
#     "2b3d39fd4c7836168a2a370c25ad6232:16:70235576",
#     "87d7b22c0feec4f3112d80b71d0b500a:1:69642501",
#     "d7655429355ab2df4621a10c01d04865:8:69135199",
#     "1944df13b86dd83e4a8c4ea82e767975:2:65092848",
#     "730e30f5220059551e666430644fbf87:11:69642501", # developer inactive
]

In [10]:
def next_multiple(n, m):
    """Round ``m`` up to a multiple of ``n`` (for positive ``n``).

    Example: ``next_multiple(4, 17)`` gives ``20``. A value of ``m`` that is
    already a multiple of ``n`` is returned unchanged.
    """
    remainder = m % n
    if remainder == 0:
        return m
    # Add what is missing to reach the following multiple.
    return m + n - remainder

def chunks(l, n_chunks):
    """Yield consecutive slices of ``l``, split into at most ``n_chunks`` pieces.

    The slice length is ``len(l)`` rounded up to a multiple of ``n_chunks``
    and divided by ``n_chunks``; the last slice may be shorter, and fewer
    than ``n_chunks`` slices are produced when the items run out early.

    Args:
        l: any iterable; it is materialised into a list first.
        n_chunks: the number of pieces to aim for.

    Yields:
        Lists holding consecutive items of ``l``. Nothing is yielded for an
        empty input.
    """
    items = list(l)
    size = len(items)
    if size == 0:
        # The slice length would be 0 and range() rejects a zero step, so an
        # empty input simply produces no chunks.
        return
    n = next_multiple(n_chunks, size) // n_chunks
    for i in range(0, size, n):
        yield items[i:i + n]

In [11]:
# Give every API key an entry up front. chunks() can yield fewer chunks than
# there are keys (e.g. a single term with two keys); zip() would then leave
# the remaining keys out of the mapping and looking them up later would
# raise KeyError. Keys without a chunk simply get no terms.
search_terms_by_api_key = {api_key: [] for api_key in api_keys}
for api_key, terms in zip(api_keys, chunks(search_terms, len(api_keys))):
    search_terms_by_api_key[api_key] = terms

##Dates

In [12]:
def month_duration(d):
    """Return the number of days in the month that contains ``d``.

    Args:
        d: an object with ``year`` and ``month`` attributes, such as a
            ``datetime.date``.

    Returns:
        28, 29, 30 or 31; leap years are taken into account for February.
    """
    # Imported locally because ``calendar`` is not among the imports at the
    # top of the file. monthrange() returns (weekday of day 1, days in month).
    from calendar import monthrange

    return monthrange(d.year, d.month)[1]

def n_days(d, n_months):
    """Return the day offset from ``d`` to the last day of an ``n_months`` window.

    Starting at ``d``, the length of the month containing the current date is
    added up ``n_months`` times, moving the date forward by that length after
    each step. One is subtracted from the total, so adding the result to
    ``d`` gives the last day *inside* the window rather than the first day
    after it.

    Args:
        d: the first day of the window (a ``datetime.date``).
        n_months: how many month lengths to accumulate.

    Returns:
        The accumulated number of days minus one (``-1`` when ``n_months``
        is 0).
    """
    ndays = 0
    for _ in range(n_months):
        m_duration = month_duration(d)
        d += timedelta(m_duration)
        ndays += m_duration
    return ndays - 1

def date_ranges(begin_date, end_date, n_months=1):
    """Yield consecutive ``(first_day, last_day)`` windows covering a period.

    Both ends of every window are inclusive. Each window spans ``n_months``
    month lengths as computed by ``n_days``; the last window is cut off at
    ``end_date``.

    Args:
        begin_date: first day to cover.
        end_date: last day to cover (inclusive).
        n_months: window length in months; must be at least 1.

    Yields:
        ``(first_day, last_day)`` tuples of dates.

    Raises:
        ValueError: if ``n_months`` is smaller than 1 (raised when the
            generator is first advanced).
    """
    if n_months < 1:
        # A zero-length window never advances and would loop forever.
        raise ValueError('n_months must be at least 1, got {!r}'.format(n_months))

    aux_date = begin_date
    # ``<=`` because end_date is inclusive: a window that starts exactly on
    # end_date (for instance when begin_date == end_date) must still be
    # yielded instead of being dropped.
    while aux_date <= end_date:
        ndays = n_days(aux_date, n_months)
        yield (aux_date, min(aux_date + timedelta(ndays), end_date))
        aux_date += timedelta(ndays + 1)

##Timer

In [13]:
# Timestamp (seconds since the epoch) of the most recent throttled call;
# read and updated by wait().
LAST_REQUEST = time()

In [14]:
def wait(f, *args, t=0):
    """Call ``f(*args)`` once at least ``t`` seconds have passed since the last call.

    The time of the previous call is kept in the module-level
    ``LAST_REQUEST``. If fewer than ``t`` seconds have elapsed since then,
    the function sleeps for the remainder before calling ``f``. With the
    default ``t=0`` no pause is ever made.

    Args:
        f: the callable to invoke.
        *args: positional arguments forwarded to ``f``.
        t: minimum spacing between calls, in seconds (keyword-only).

    Returns:
        Whatever ``f(*args)`` returns.
    """
    global LAST_REQUEST
    remaining = t - (time() - LAST_REQUEST)
    if remaining > 0:
        sleep(remaining)
    # Stamp the call time before invoking f, not after it returns.
    LAST_REQUEST = time()
    return f(*args)

##Query

In [15]:
class Query:
    """Plain record holding the parameters of one article-search request.

    The five constructor arguments are stored unchanged as instance
    attributes of the same name: ``term``, ``begin_date``, ``end_date``,
    ``page`` and ``api_key``.
    """

    def __init__(self, term, begin_date, end_date, page, api_key):
        self.term, self.begin_date, self.end_date = term, begin_date, end_date
        self.page, self.api_key = page, api_key

    def __repr__(self):
        # Compact form used in the log records: Q<term, begin, end, page, key>.
        fields = (self.term, self.begin_date, self.end_date, self.page, self.api_key)
        return 'Q<{}, {}, {}, {}, {}>'.format(*fields)

##Downloader

In [16]:
# Overall period to download; both bounds are used as inclusive dates.
# The commented-out pair below is a much wider alternative period
# (presumably the full production range -- TODO confirm).
# BEGIN_DATE = date(1999, 1, 1)
# END_DATE = date(2014, 12, 31)
BEGIN_DATE = date(2014, 1, 1)
END_DATE = date(2014, 2, 28)

In [18]:
def search(q, timeout=30):
    """Run one article-search request and return the decoded JSON body.

    The request is restricted to documents whose source is
    "The New York Times", sorted oldest first, and only asks for the fields
    listed in ``fl``.

    Args:
        q: a ``Query`` providing ``term``, ``begin_date``, ``end_date``,
            ``page`` and ``api_key``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the download forever.

    Returns:
        The parsed JSON response.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the timeout expires.
        ValueError: if the response body is not valid JSON.
    """
    base_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    fl = [
        'web_url', 'snippet', 'lead_paragraph', 'abstract', 'source', 'headline',
        'keywords', 'pub_date', 'document_type', 'section_name', '_id',
    ]
    payload = {
        'q': q.term,
        'begin_date': q.begin_date,
        'end_date': q.end_date,
        'page': q.page,
        'api-key': q.api_key,
        'sort': 'oldest',
        'fq': 'source:("The New York Times")',
        'fl': ','.join(fl),
    }
    response = requests.get(base_url, params=payload, timeout=timeout)
    return response.json()

In [19]:
def get_documents_by_page(q, original, total_results):
    """Fetch every result page of ``q`` and store the returned documents.

    Pages hold 10 results each, so ``ceil(total_results / 10)`` pages are
    requested, starting at page 0. ``q.page`` is overwritten for every
    request. Each returned document is tagged with the query attributes
    (key ``'q'``) and the original search term (key ``'original'``) before
    being handed to ``insert_documents``. A page whose status is not ``OK``
    is logged and skipped.

    Args:
        q: the ``Query`` to run; mutated (``page``) while paging.
        original: the unprocessed search term to record on each document.
        total_results: number of hits reported for ``q``.
    """
    page_count = math.ceil(total_results / 10)
    for page_number in range(page_count):
        q.page = page_number
        response = wait(search, q)
        if response['status'] != 'OK':
            write_log(q, status='SEARCH PAGE {}'.format(response['status']))
            continue
        page_docs = response['response']['docs']
        for document in page_docs:
            document['q'] = q.__dict__
            document['original'] = original
        insert_documents(page_docs, q)

In [20]:
def get_documents_by_date(q, original):
    """Download all documents for ``q``, splitting its date range when needed.

    The query is run once to learn the number of hits. Up to 1010 hits are
    fetched page by page. Larger result sets are handled by cutting the date
    range in two halves and recursing into each half.

    A range that consists of a single day cannot be cut any further; in that
    case the first 1010 results are downloaded and the truncation is logged.
    (Previously such a range recursed on itself without end.)

    Args:
        q: the ``Query`` to run; its dates are ``YYYYMMDD`` strings.
        original: the unprocessed search term to record on each document.
    """
    # Largest result count that is fetched by paging alone; presumably the
    # paging limit of the API (101 pages of 10 results) -- TODO confirm.
    max_paged_results = 1010

    response = wait(search, q)

    if response['status'] != 'OK':
        write_log(q, status='SEARCH DATERANGE {}'.format(response['status']))
        return

    total_results = response['response']['meta']['hits']
    write_log(q, status='SEARCH {} {}'.format(response['status'], total_results))

    if total_results <= max_paged_results:
        get_documents_by_page(q, original, total_results)
        return

    bd = parser.parse(q.begin_date)
    ed = parser.parse(q.end_date)

    if bd >= ed:
        # Splitting a one-day range gives the same range again, so stop here
        # and take what paging can deliver.
        write_log(q, status='SEARCH DATERANGE UNSPLITTABLE {}'.format(total_results))
        get_documents_by_page(q, original, max_paged_results)
        return

    half = (ed - bd) // 2

    # First half: begin .. begin + half (inclusive).
    begin_date1 = q.begin_date
    end_date1 = (bd + timedelta(half.days)).strftime("%Y%m%d")
    q1 = Query(q.term, begin_date1, end_date1, 0, q.api_key)
    get_documents_by_date(q1, original)

    # Second half: the following day .. end (inclusive).
    begin_date2 = (bd + timedelta(half.days + 1)).strftime("%Y%m%d")
    end_date2 = q.end_date
    q2 = Query(q.term, begin_date2, end_date2, 0, q.api_key)
    get_documents_by_date(q2, original)

In [21]:
def download_by_date_ranges(term, api_key):
    """Download the documents for ``term`` one monthly window at a time.

    The period from ``BEGIN_DATE`` to ``END_DATE`` is walked in one-month
    windows. A failure inside one window is logged and does not stop the
    remaining windows.

    Args:
        term: ``(original, curated)`` pair; the curated text is the query.
        api_key: the API key to send with every request.
    """
    date_format = "%Y%m%d"
    for window in date_ranges(BEGIN_DATE, END_DATE, 1):
        try:
            first_day = window[0].strftime(date_format)
            last_day = window[1].strftime(date_format)
            window_query = Query(term[1], first_day, last_day, 0, api_key)
            get_documents_by_date(window_query, term[0])
        except Exception as error:
            write_log(error, status='DOWNLOAD EXCEPTION DATERANGE')

In [22]:
def download_by_term(term, api_key):
    """Download every document matching ``term`` in the configured period.

    One query over the whole ``BEGIN_DATE``..``END_DATE`` period decides the
    strategy: no hits are only logged, up to 1010 hits are fetched page by
    page, and anything larger is downloaded window by window through
    ``download_by_date_ranges``. Any exception is logged, not propagated.

    Args:
        term: ``(original, curated)`` pair; the curated text is the query.
        api_key: the API key to send with every request.
    """
    first_day = BEGIN_DATE.strftime("%Y%m%d")
    last_day = END_DATE.strftime("%Y%m%d")
    try:
        q = Query(term[1], first_day, last_day, 0, api_key)
        response = wait(search, q)
        status = response['status']

        if status != 'OK':
            write_log(q, status='SEARCH TERM {}'.format(status))
            return

        hits = response['response']['meta']['hits']
        write_log(q, status='SEARCH TERM {} {}'.format(status, hits))

        if hits == 0:
            write_log(q, status='SEARCH BY TERM 0')
        elif hits <= 1010:
            get_documents_by_page(q, term[0], hits)
        else:
            download_by_date_ranges(term, api_key)
    except Exception as error:
        write_log(error, status='DOWNLOAD EXCEPTION TERM')

In [23]:
def download_by_key(search_terms, api_key):
    """Download every term of ``search_terms`` using a single API key.

    Terms are processed one after another. An exception escaping from one
    term is logged so that the remaining terms are still processed.

    Args:
        search_terms: iterable of ``(original, curated)`` term pairs.
        api_key: the API key used for all requests of this batch.
    """
    for entry in search_terms:
        try:
            download_by_term(entry, api_key)
        except Exception as error:
            write_log(error, status='DOWNLOAD EXCEPTION TERM')

In [24]:
def downloader(search_terms_by_api_key, api_keys, parallel=True):
    """Download the search terms assigned to every API key.

    Exactly one of the two strategies is run. (Previously the parallel
    version was always followed by the sequential one, so every term was
    downloaded twice.)

    Args:
        search_terms_by_api_key: mapping from API key to its list of terms;
            a key missing from the mapping is treated as having no terms.
        api_keys: the API keys to use.
        parallel: when true (default) one joblib worker per API key is
            started; when false the keys are processed one after another.
    """
    if not api_keys:
        # Nothing to do, and joblib rejects n_jobs=0.
        return

    if parallel:
        # Parallel version: one worker per API key.
        Parallel(n_jobs=len(api_keys))(
            delayed(download_by_key)(search_terms_by_api_key.get(api_key, []), api_key)
            for api_key in api_keys
        )
        return

    # Sequential version.
    for api_key in api_keys:
        try:
            download_by_key(search_terms_by_api_key.get(api_key, []), api_key)
        except Exception as e:
            write_log(e, status='DOWNLOAD EXCEPTION KEY')

In [25]:
# Entry point: start the download for all configured API keys and their
# assigned search terms.
downloader(search_terms_by_api_key, api_keys)