In [3]:
import logging
import os
import re
from datetime import datetime
from xml.etree import ElementTree
from xml.etree.ElementTree import ParseError

from pymongo import MongoClient
from pymongo.errors import BulkWriteError

## Logging

In [2]:
# Start every run of this cell with a fresh log file.
root_logger = logging.getLogger()
# Detach AND close handlers left over from a previous run, so the old log
# file handle is released before the file is deleted.
for handler in list(root_logger.handlers):
    root_logger.removeHandler(handler)
    handler.close()

remove_error = None
try:
    os.remove('new_ft_text.log')
except FileNotFoundError:
    # Nothing to clean up: no previous log file exists.
    pass
except OSError as ex:
    # Best effort: keep going, but report the problem once logging is set up.
    remove_error = ex

logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
logging.basicConfig(
    filename='new_ft_text.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)
if remove_error is not None:
    logging.warning('Could not remove previous log file: {}'.format(remove_error))

## MongoDB

In [4]:
# Connect with MongoClient's default connection settings.
client = MongoClient()
# Uncomment to wipe the target database before a fresh import:
# client.drop_database('new_ft_text')
db_text = client['new_ft_text']

In [4]:
def insert_rows(db, rows):
    """Bulk-insert ``rows`` into ``db.rows``, tolerating duplicate ``_id`` values.

    The insert is issued with ``ordered=False``. Write errors reported through
    ``BulkWriteError`` are logged instead of being raised.

    Args:
        db: Object exposing a ``rows`` collection with ``insert_many``
            (a pymongo database in this notebook).
        rows: List of documents (dicts) to insert. An empty list is a no-op.

    Side effects:
        Increments the module-level ``dups`` counter once per duplicate-key
        write error (code 11000). ``dups`` must already exist at module level.
    """
    global dups
    if not rows:
        # insert_many rejects an empty batch, so there is nothing to send.
        return
    try:
        db.rows.insert_many(rows, ordered=False)
    except BulkWriteError as ex:
        for err in ex.details['writeErrors']:
            if err['code'] == 11000:
                # Log the per-document message rather than str(ex), which is
                # identical for every error in the batch.
                logging.info('BulkWriteError: {} - {}'.format(
                    err.get('errmsg'), err['op']['_id']))
                dups += 1
            else:
                # Anything other than a duplicate key used to vanish silently.
                logging.error('BulkWriteError (code {}): {}'.format(
                    err['code'], err.get('errmsg')))

## Main

In [5]:
def format_date(date):
    """Convert a ``DD-Mon-YYYY`` string (e.g. ``'02-Jan-2003'``) to ``YYYYMMDD``.

    Raises:
        ValueError: if ``date`` does not match the ``%d-%b-%Y`` format.
        TypeError: if ``date`` is not a string.
    """
    parsed = datetime.strptime(date, '%d-%b-%Y')
    compact = parsed.strftime('%Y%m%d')
    return compact

def get_id(url):
    """Extract an article identifier from an article URL.

    The first hyphen-separated token run in the URL (e.g. a UUID-like
    ``xxxx-xxxx-xxxx``) wins. If the URL has none, the digits of an
    ``?id=<digits>`` query parameter are used instead.

    Args:
        url: URL string, or a falsy value (``None`` / ``''``).

    Returns:
        The identifier as a string, or ``None`` when ``url`` is falsy or
        neither pattern matches.
    """
    if not url:
        return None
    # Raw strings keep \w, \? and \d from being read as (invalid) string escapes.
    hyphenated = re.search(r'((\w+-)+\w+)', url)
    if hyphenated:
        return hyphenated.group(1)
    query_id = re.search(r'(\?id=(\d+))', url)
    if query_id:
        return query_id.group(2)
    return None

def get_text(text):
    """Join the text of an element's direct ``<p>`` children with single spaces.

    Only each ``<p>``'s own leading text (``p.text``) is used. Text inside
    nested child elements, and tail text, is not included.

    Args:
        text: An ``xml.etree.ElementTree.Element``, or ``None`` when the
            article has no such node.

    Returns:
        The joined paragraph text, or ``''`` when ``text`` is ``None`` or no
        ``<p>`` child carries text.
    """
    # Compare against None explicitly: an Element's truth value depends on
    # its child count, and testing it is deprecated.
    if text is None:
        return ''
    return ' '.join(p.text for p in text.findall('p') if p.text)

def parse_item(f, elem):
    """Build the MongoDB document for one ``<item>`` element.

    Args:
        f: Path of the file being parsed. Used only in log messages.
        elem: The ``<item>`` element.

    Returns:
        dict with keys ``url``, ``source``, ``_id``, ``date``, ``text`` and
        ``title``. ``date`` is the ``publicationdate`` text when present,
        otherwise ``datearticle`` converted by ``format_date``. It is ``None``
        when neither yields a usable date.
    """
    url = elem.findtext('urlofdoc')
    date = elem.findtext('publicationdate')
    if not date:
        raw_date = elem.findtext('datearticle')
        try:
            # A missing date must not abort the whole import: the caller
            # only catches ParseError.
            date = format_date(raw_date) if raw_date else None
        except ValueError:
            logging.warning('DateError: {} - {}'.format(raw_date, f))
            date = None

    result = {}
    result['url'] = url
    # Items that carry a URL are treated as ft.com articles, the rest as print.
    result['source'] = 'ftcom' if url else 'newspaper'
    result['_id'] = elem.findtext('uuid') or get_id(url)
    result['date'] = date
    result['text'] = get_text(elem.find('text'))
    result['title'] = elem.findtext('headline')
    return result

In [6]:
%%time
total = 0  # files visited
error = 0  # files whose parsing stopped on a ParseError
items = 0  # <item> elements parsed
dups = 0   # duplicate _id inserts; incremented inside insert_rows via `global dups`
# The data directory can be overridden with FT_DATA_DIR; the default is the original location.
path = os.environ.get('FT_DATA_DIR', '/home/antonio/git/nytimes/ft_data/all/')
# os.listdir returns entries in arbitrary order. Sorting makes the run
# reproducible, including which copy of a duplicated _id is stored.
for e in sorted(os.listdir(path)):
    f = os.path.join(path, e)
    if not os.path.isfile(f):
        # Sub-directories and other non-files cannot be parsed; skip them.
        continue
    iterparser = ElementTree.iterparse(f)
    rows = []
    try:
        for event, elem in iterparser:
            if elem.tag == 'item':
                rows.append(parse_item(f, elem))
                # Free the subtree once its data has been extracted.
                elem.clear()
                items += 1
            elif elem.tag in ('ftcom', 'ftnewspaper'):
                elem.clear()
    except ParseError as ex:
        # Items parsed before the error are kept and still inserted below.
        logging.info('ParseError: {} - {}'.format(ex, e))
        error += 1
    total += 1
    if rows:
        insert_rows(db_text, rows)
print('Total files:', total)
print('Error files:', error)
print('Total items:', items)
print('Total dups:', dups)

Total files: 281584
Error files: 1318
Total items: 1034401
Total dups: 102596
CPU times: user 6min 20s, sys: 12.9 s, total: 6min 33s
Wall time: 7min 36s


In [7]:
# Total number of stored documents. Collection.count() no longer exists in
# pymongo 4; count_documents with an empty filter replaces it.
db_text.rows.count_documents({})

931805

In [8]:
# Spot-check a single document by its _id.
sample_query = {'_id': '030102000137'}
db_text.rows.find_one(sample_query)

{'_id': '030102000137',
 'date': '20030102',
 'source': 'ftcom',
 'title': "OBSERVER: Travellin' man",
 'url': 'http://search.ft.com/search/article.html?id=030102000137'}

In [9]:
# Count the documents whose text is the empty string by walking the cursor.
c = sum(1 for _ in db_text.rows.find({'text': ''}))
c

4221

In [5]:
db_text = client.new_ft_text
# Server-side count of documents with empty text. Cursor.count() no longer
# exists in pymongo 4; count_documents replaces it.
db_text.rows.count_documents({'text': ''})

4221

In [10]:
# Count the documents matched by {'text': None} by walking the cursor.
c = sum(1 for _ in db_text.rows.find({'text': None}))
c

0

In [6]:
# Count documents per year. Dates are compared as strings in the query, and
# adding 10000 to the YYYYMMDD integer moves the window forward one year.
cur = 20030101
while cur < 20160101:
    nxt = cur + 10000
    # Cursor.count() no longer exists in pymongo 4; count_documents replaces it.
    c = db_text.rows.count_documents({'date': {
        '$gte': str(cur),
        '$lt': str(nxt)
    }})
    print('{} - {}: {}'.format(cur, nxt, c))
    cur = nxt

20030101 - 20040101: 58874
20040101 - 20050101: 102733
20050101 - 20060101: 108625
20060101 - 20070101: 111014
20070101 - 20080101: 118309
20080101 - 20090101: 115680
20090101 - 20100101: 99929
20100101 - 20110101: 50759
20110101 - 20120101: 48629
20120101 - 20130101: 48520
20130101 - 20140101: 44647
20140101 - 20150101: 24086
20150101 - 20160101: 0
