#Sentence extractor for The New York Times

In [54]:
import logging
import math
import os
from datetime import timedelta, date, datetime
from dateutil import parser
from time import sleep, time

import requests
from joblib import Parallel, delayed
from pymongo import MongoClient
from pymongo.errors import BulkWriteError, DuplicateKeyError

##Logging

In [56]:
# Start each run with a fresh log file. Removal is best-effort: a missing
# (or otherwise unremovable) file must not stop the notebook, but only
# filesystem errors are ignored so that interrupts still propagate.
try:
    os.remove('search.log')
except OSError:
    pass

In [52]:
# Clear handlers left on the root logger by a previous run of this cell;
# basicConfig() is a no-op while the root logger already has handlers.
root_logger = logging.getLogger()
root_logger.handlers = []
# Only warnings and above from the HTTP layer should reach the log file.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO, filename='search.log')

In [34]:
def write_log(*args, status=None):
    """Log one timestamped INFO entry.

    The entry reads ``<timestamp> -- <args tuple> ==> <status>``, where the
    timestamp is the current local time.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d @ %I:%M:%S %p')
    logging.info('{} -- {} ==> {}'.format(timestamp, args, status))

##MongoDB

In [35]:
# Connect using MongoClient's default connection settings.
client = MongoClient()
# WARNING: destructive. The whole 'nytimes' database is dropped every time
# this cell runs, so articles downloaded by a previous run are discarded.
client.drop_database('nytimes')
# Handle to the (now empty) database the downloader writes into.
db = client.nytimes

In [36]:
def insert_documents(docs):
    """Store *docs* in ``db.articles``, logging failures instead of raising.

    A single bulk insert is tried first. If it fails with ``BulkWriteError``
    the documents are inserted one by one, so that a rejected document
    (typically a duplicate ``_id``) does not keep the others from being
    stored. Every failure is recorded through ``write_log`` with status
    ``'EXCEPTION'``.

    An empty *docs* is a no-op.
    """
    if not docs:
        # insert_many() refuses an empty list; nothing to store, nothing to log.
        return

    try:
        db.articles.insert_many(docs)
    except BulkWriteError as e:
        write_log(e, status='EXCEPTION')
        for doc in docs:
            try:
                db.articles.insert_one(doc)
            except Exception as doc_error:
                # Catch broadly here: an unexpected error on one document
                # must neither escape this function nor abort the remaining
                # inserts (duplicates are the expected case).
                write_log(doc_error, status='EXCEPTION')
    except Exception as e:
        write_log(e, status='EXCEPTION')

##Search terms

In [37]:
# Read one search term per line; the context manager closes the file handle.
with open('search_terms.txt') as st_file:
    search_terms = [line.strip() for line in st_file]
# Override for a small test run: the terms read from the file are replaced
# by this short hard-coded list.
search_terms = ['executive', 'entrepreneur']

##NYTimes API keys

In [38]:
# One API key for each of the cores.
# Each active key gets its own share of the search terms further below.
# NOTE(review): these look like real credentials committed in source;
# consider loading them from the environment or an untracked file instead.
api_keys = [
    "3439a9084efa80c4f5fb1d290dfc1b44:11:70233981", # my api key
    "a5c709f3168b829711241b243457e9d6:13:70235641",
#     "ba47374fd391c9bc5fd3ca51ff953a44:14:70229228",
#     "4557e02788189abb3642a33bca7469ff:11:69136863",
#     "2b3d39fd4c7836168a2a370c25ad6232:16:70235576",
#     "87d7b22c0feec4f3112d80b71d0b500a:1:69642501",
#     "d7655429355ab2df4621a10c01d04865:8:69135199",
#     "1944df13b86dd83e4a8c4ea82e767975:2:65092848",
#     "730e30f5220059551e666430644fbf87:11:69642501", # developer inactive
]

In [39]:
def next_multiple(n, m):
    """Return the smallest multiple of *n* that is not below *m*.

    Example: ``next_multiple(4, 17)`` is ``20``; a value that is already a
    multiple of *n* is returned unchanged.
    """
    _, remainder = divmod(m, n)
    if remainder == 0:
        return m
    return m + n - remainder

def chunks(l, n_chunks):
    """Yield consecutive slices of *l*, split into at most *n_chunks* parts.

    Each slice holds ``ceil(len(l) / n_chunks)`` items, except the last one
    which may be shorter. Because the slice length is rounded up, fewer than
    *n_chunks* slices can be produced (5 items in 4 chunks gives slices of
    2, 2 and 1 items). An empty *l* yields nothing.

    *n_chunks* is expected to be a positive integer.
    """
    size = len(l)
    if size == 0:
        # A zero slice length would make range() raise; there is nothing to split.
        return
    chunk_len = -(-size // n_chunks)  # ceiling division
    for start in range(0, size, chunk_len):
        yield l[start:start + chunk_len]

In [40]:
# Give every API key its own slice of the search terms. zip() stops at the
# shorter input, so a key without a matching slice gets no entry.
search_terms_by_api_key = dict(
    zip(api_keys, chunks(list(search_terms), len(api_keys)))
)

##Dates

In [41]:
def month_duration(d):
    """Return how many days the month of date *d* has (leap years included)."""
    if d.month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if d.month in (4, 6, 9, 11):
        return 30
    # Only February is left: apply the Gregorian leap-year rule.
    year = d.year
    is_leap_year = year % 400 == 0 or (year % 4 == 0 and year % 100 != 0)
    return 29 if is_leap_year else 28

def n_days(d, n_months):
    """Return the day offset from *d* to the last day of an *n_months* span.

    The lengths of *n_months* successive months are added up, stepping
    forward from *d* by each month's length, and one is subtracted so that
    ``d + timedelta(result)`` is the inclusive end of the span.
    """
    total = 0
    cursor = d
    for _ in range(n_months):
        span = month_duration(cursor)
        cursor = cursor + timedelta(span)
        total += span
    return total - 1

def date_ranges(begin_date, end_date, n_months=1):
    """Yield consecutive ``(start, end)`` date pairs covering a period.

    The pairs do not overlap, both ends are inclusive, each pair spans
    *n_months* months as measured by ``n_days``, and the last pair is
    clipped to *end_date*.

    Raises:
        ValueError: if *n_months* is smaller than 1 (the cursor would never
            advance).
    """
    if n_months < 1:
        raise ValueError('n_months must be at least 1')

    aux_date = begin_date
    # `<=` rather than `<`: end_date is inclusive, so a final range that
    # starts exactly on end_date (a one-day range) must still be produced.
    while aux_date <= end_date:
        ndays = n_days(aux_date, n_months)
        yield (aux_date, min(aux_date + timedelta(ndays), end_date))
        aux_date += timedelta(ndays + 1)

##Timer

In [42]:
# Time of the most recent API request; wait() reads and updates it to
# space out successive calls.
last_request = time()

In [43]:
def wait(f, *args, t=9):
    """Call ``f(*args)`` no sooner than *t* seconds after the previous call.

    If less than *t* seconds have passed since ``last_request``, sleep for
    the remainder first. ``last_request`` is refreshed just before *f* is
    invoked, and the result of *f* is returned.
    """
    global last_request
    remaining = t - (time() - last_request)
    if remaining > 0:
        sleep(remaining)
    last_request = time()
    return f(*args)

##Downloader

In [44]:
def format_query(term):
    """Return the query string to send for *term*.

    The term is currently returned unchanged.
    """
    return term

In [45]:
def filter_fields_doc(doc):
    """Return a new dict holding only the fields of *doc* that are stored.

    Raises ``KeyError`` if *doc* lacks any of the expected fields.
    """
    wanted_fields = (
        'web_url', 'snippet', 'lead_paragraph', 'abstract', 'source',
        'headline', 'keywords', 'pub_date', 'document_type', 'section_name',
        '_id',
    )
    trimmed = {}
    for field in wanted_fields:
        trimmed[field] = doc[field]
    return trimmed

def filter_fields(docs):
    """Return a list with ``filter_fields_doc`` applied to every document in *docs*."""
    return [filter_fields_doc(doc) for doc in docs]

In [46]:
def search(q, begin_date, end_date, sort, page, api_key, timeout=30):
    """Run one Article Search request and return the decoded JSON body.

    Args:
        q: query string.
        begin_date, end_date: date limits, passed through as given.
        sort: sort order, passed through as given.
        page: number of the result page to fetch.
        api_key: key sent as the ``api-key`` query parameter.
        timeout: seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Raises:
        requests.exceptions.RequestException: on connection problems or
            when the timeout expires.
        ValueError: if the response body is not valid JSON.
    """
    base_url = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    payload = {
        'q': q,
        'begin_date': begin_date,
        'end_date': end_date,
        'sort': sort,
        'page': page,
        'api-key': api_key,
    }
    response = requests.get(base_url, params=payload, timeout=timeout)
    return response.json()

In [47]:
def get_documents(term, begin_date='19990101', end_date='20141231', sort='oldest', page=0, api_key='sample-key'):
    """Download and store every article matching *term* in a date range.

    A first request reports the number of hits. If there are more than
    1010 the range is halved and each half is handled recursively;
    otherwise the result pages are fetched and stored with
    ``insert_documents``. The limit presumably reflects how far the API
    lets a client page (10 documents per page) -- confirm against the API
    documentation.

    Args:
        term: search term; it is passed through ``format_query``.
        begin_date, end_date: inclusive limits as ``YYYYMMDD`` strings.
        sort: sort order forwarded to the API.
        page: page requested by the initial (counting) request.
        api_key: API key used for every request.

    Every page request is logged with its own status. A range of a single
    day that still exceeds the limit cannot be split any further: it is
    logged with status ``'TRUNCATED'`` and only the pages within the limit
    are downloaded.
    """
    max_results = 1010
    page_size = 10

    q = format_query(term)
    response = wait(search, q, begin_date, end_date, sort, page, api_key, t=0)

    if response['status'] != 'OK':
        print('Error')
        write_log(q, begin_date, end_date, page, api_key, status=response['status'])
        return

    total_results = response['response']['meta']['hits']
    print(q, begin_date, end_date, api_key, '==>', total_results, 'results found')

    if total_results > max_results:
        bd = parser.parse(begin_date)
        ed = parser.parse(end_date)
        if bd < ed:
            # Too many hits to page through: split the range in two halves.
            half = (ed - bd) // 2

            end_date1 = (bd + timedelta(half.days)).strftime("%Y%m%d")
            get_documents(term, begin_date, end_date1, sort, page, api_key)

            begin_date2 = (bd + timedelta(half.days + 1)).strftime("%Y%m%d")
            get_documents(term, begin_date2, end_date, sort, page, api_key)
            return

        # A single day cannot be halved; recursing would repeat this very
        # call forever. Download what paging allows and record the loss.
        write_log(q, begin_date, end_date, page, api_key, status='TRUNCATED')
        total_results = max_results

    if total_results == 0:
        write_log(q, begin_date, end_date, page, api_key, status='0')

    n_pages = math.ceil(total_results / page_size)
    for page_number in range(n_pages):
        if page_number == page:
            # This page was already returned by the counting request.
            r = response
        else:
            r = wait(search, q, begin_date, end_date, sort, page_number, api_key, t=0)

        if r['status'] != 'OK':
            print('Error')
        else:
            insert_documents(filter_fields(r['response']['docs']))
        # Log the status of this page, not that of the counting request.
        write_log(q, begin_date, end_date, page_number, api_key, status=r['status'])

In [48]:
def download_by_date_range(term, api_key, begin_date=date(2014, 1, 1), end_date=date(2014, 2, 28), n_months=1):
    """Download the articles for *term*, one date range at a time.

    The period from *begin_date* to *end_date* is cut into ranges of
    *n_months* months by ``date_ranges`` and each range is handed to
    ``get_documents``. A failure in one range is logged with status
    ``'EXCEPTION'`` and the remaining ranges are still processed.

    The defaults cover the two-month test window used so far; pass
    ``date(1999, 1, 1)`` and ``date(2014, 12, 31)`` for the full period.
    """
    for range_start, range_end in date_ranges(begin_date, end_date, n_months):
        try:
            get_documents(
                term,
                begin_date=range_start.strftime("%Y%m%d"),
                end_date=range_end.strftime("%Y%m%d"),
                api_key=api_key,
            )
        except Exception as e:
            write_log(e, status='EXCEPTION')

In [49]:
def download_documents(api_key, terms):
    """Download every term in *terms* using *api_key*.

    An exception raised for one term is logged with status ``'EXCEPTION'``
    and the next term is processed.
    """
    for search_term in terms:
        try:
            download_by_date_range(search_term, api_key)
        except Exception as exc:
            write_log(exc, status='EXCEPTION')

In [50]:
def downloader(api_keys, search_terms_by_api_key, n_jobs=2):
    """Download the terms assigned to every API key, in parallel if possible.

    Args:
        api_keys: keys to use, one download job per key.
        search_terms_by_api_key: mapping from API key to its search terms.
        n_jobs: number of parallel workers handed to joblib.

    Keys without an entry in *search_terms_by_api_key* are skipped. If the
    parallel run raises, the error is logged with status ``'EXCEPTION'``
    and the jobs are run again one after the other.
    """
    # Resolve the work list up front: a key with no assigned terms must not
    # raise KeyError in the middle of the parallel run and force the
    # sequential fallback to download everything a second time.
    jobs = [
        (api_key, search_terms_by_api_key[api_key])
        for api_key in api_keys
        if api_key in search_terms_by_api_key
    ]

    try:
        Parallel(n_jobs=n_jobs)(
            delayed(download_documents)(api_key, terms) for api_key, terms in jobs
        )
    except Exception as e:
        write_log(e, status='EXCEPTION')
        for api_key, terms in jobs:
            try:
                download_documents(api_key, terms)
            except Exception as job_error:
                write_log(job_error, status='EXCEPTION')

In [53]:
# Start the download: each API key processes the search terms assigned to it.
downloader(api_keys, search_terms_by_api_key)

executive 20140201 20140228 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 2532 results found
executive 20140201 20140214 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 1315 results found
executive 20140201 20140207 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 662 results found
executive 20140208 20140214 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 653 results found
executive 20140215 20140228 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 1217 results found
executive 20140215 20140221 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 558 results found
executive 20140222 20140228 3439a9084efa80c4f5fb1d290dfc1b44:11:70233981 ==> 659 results found


In [57]:
>>> from joblib import Parallel, delayed
>>> from math import sqrt

p = Parallel(n_jobs=2)(delayed(sqrt)(i**2) for i in range(10))

In [59]:
type(p)

list

In [14]:
len(documents)

5235

In [15]:
import pandas as pd

In [16]:
df = pd.DataFrame(documents)

In [17]:
len(df)

5235

In [502]:
len(df.drop_duplicates('_id'))

5233

In [431]:
df['_id'].nunique()

1937

In [466]:
d1=date(1999,1,1)
d2=date(1999,2,28)

In [469]:
(d2-d1).days // 2

29

In [472]:
parser.parse('19990101')

datetime.datetime(1999, 1, 1, 0, 0)

In [46]:
from collections import OrderedDict
d=OrderedDict({'b':1, 'a':2})
d.values()

ValuesView(OrderedDict([('a', 2), ('b', 1)]))

In [None]:
3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981
3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981

In [None]:
http://api.nytimes.com/svc/search/v2/articlesearch.json?sort=oldest&begin_date=19990101&api_key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981&end_date=20141231&q=entrepreneur&page=0
http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api-key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981
http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api_key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981

In [63]:
url1='http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api-key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981'

In [64]:
url2='http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api_key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981'

In [85]:
n=131
url1[:n]==url2[:n]

True

In [86]:
url1[:n]

'http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api'

In [75]:
len(url2)

184

In [67]:
url1

'http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api-key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981'

In [68]:
url2

'http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api_key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981'

In [30]:
response.url

'http://api.nytimes.com/svc/search/v2/articlesearch.json?sort=oldest&begin_date=19990101&api_key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981&end_date=20141231&q=entrepreneur&page=0'

In [57]:
response.url

'http://api.nytimes.com/svc/search/v2/articlesearch.json?q=entrepreneur&begin_date=19990101&end_date=20141231&sort=oldest&page=0&api_key=3439a9084efa80c4f5fb1d290dfc1b44%3A11%3A70233981'

In [92]:
len(list(search_terms))

421

In [35]:
from urllib.request import urlopen
URL='http://api.nytimes.com/svc/search/v2/articlesearch.json?q=new+startups&begin_date=20130101&end_date=20130201&sort=newest&api-key=sample-key'
r=urlopen(URL)

In [36]:
import ujson as json
d=json.load(r)

In [38]:
d.keys()

dict_keys(['status', 'response', 'copyright'])

In [None]:
pd.read_csv('data/files_new_york_times/new_york_times_no_text.csv', encoding='utf-8')

In [None]:
dataFrame = pd.read_csv('data/files_new_york_times/csv_url/startup.csv', encoding='utf-8')
# Count the rows whose url_works flag equals 1.
count = int((dataFrame['url_works'] == 1).sum())
print(count)

In [179]:
x=end_date - start_date

In [203]:
start_date += timedelta(1)

In [204]:
start_date

datetime.date(2013, 1, 2)

In [191]:
list(range(0, 365, 28))

[0, 28, 56, 84, 112, 140, 168, 196, 224, 252, 280, 308, 336, 364]

In [200]:
def f(a, b):
    """Yield ``(i, b)`` for i = a, a + 1, ... while i is below *b*."""
    current = a
    while current < b:
        yield current, b
        current += 1

In [201]:
for x in f(3, 10):
    print(x)

(3, 10)
(4, 10)
(5, 10)
(6, 10)
(7, 10)
(8, 10)
(9, 10)


##Dates

In [265]:
def month_duration(d):
    """Return how many days the month of date *d* has (leap years included)."""
    if d.month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if d.month in (4, 6, 9, 11):
        return 30
    # Only February is left: apply the Gregorian leap-year rule.
    year = d.year
    is_leap_year = year % 400 == 0 or (year % 4 == 0 and year % 100 != 0)
    return 29 if is_leap_year else 28

def n_days(d, n_months):
    """Return the day offset from *d* to the last day of an *n_months* span.

    The lengths of *n_months* successive months are added up, stepping
    forward from *d* by each month's length, and one is subtracted so that
    ``d + timedelta(result)`` is the inclusive end of the span.
    """
    total = 0
    cursor = d
    for _ in range(n_months):
        span = month_duration(cursor)
        cursor = cursor + timedelta(span)
        total += span
    return total - 1

def date_ranges(begin_date, end_date, n_months=1):
    """Yield consecutive ``(start, end)`` date pairs covering a period.

    The pairs do not overlap, both ends are inclusive, each pair spans
    *n_months* months as measured by ``n_days``, and the last pair is
    clipped to *end_date*.

    Raises:
        ValueError: if *n_months* is smaller than 1 (the cursor would never
            advance).
    """
    if n_months < 1:
        raise ValueError('n_months must be at least 1')

    aux_date = begin_date
    # `<=` rather than `<`: end_date is inclusive, so a final range that
    # starts exactly on end_date (a one-day range) must still be produced.
    while aux_date <= end_date:
        ndays = n_days(aux_date, n_months)
        yield (aux_date, min(aux_date + timedelta(ndays), end_date))
        aux_date += timedelta(ndays + 1)

# Demo: cut 2012-01-01 .. 2013-12-31 into 11-month ranges and print the
# start of each range next to its end formatted as YYYYMMDD.
start_date = date(2012, 1, 1)
end_date = date(2013, 12, 31)
for a,b in date_ranges(start_date, end_date, 11):
    print(a,b.strftime("%Y%m%d"))

2012-01-01 20121130
2012-12-01 20131031
2013-11-01 20131231


In [219]:
month_duration(date(2100, 2, 1))

28

In [233]:
def f(d):
    """Return the day after date *d*."""
    next_day = d + timedelta(1)
    return next_day

In [262]:
min(start_date, end_date)

datetime.date(2012, 1, 1)

In [241]:
start_date += timedelta(months=1)

TypeError: 'months' is an invalid keyword argument for this function

In [207]:
start_date.month

1