In [None]:
import logging
import os
import re
from datetime import datetime
from xml.etree import ElementTree
from xml.etree.ElementTree import ParseError

from pymongo import MongoClient
from pymongo.errors import BulkWriteError

## Logging

In [None]:
# Reset logging so that every run starts with a fresh 'new_ft_text.log'.
# Close and detach any handlers left over from a previous run of this cell
# *before* deleting the file: dropping them without close() leaks the open
# file handle, and an open log file cannot be removed on some platforms.
root_logger = logging.getLogger()
for old_handler in list(root_logger.handlers):
    root_logger.removeHandler(old_handler)
    old_handler.close()

try:
    os.remove('new_ft_text.log')
except FileNotFoundError:
    # Nothing to reset (e.g. first run).
    pass
except OSError as ex:
    # Still best-effort: keep going and append to the existing log, but say so
    # instead of silently swallowing the problem.
    print('Could not remove new_ft_text.log:', ex)

# Silence the chatty urllib3 logger bundled with requests.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
logging.basicConfig(
    filename='new_ft_text.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)

## MongoDB

In [None]:
# Connect with the driver's default settings and start from a clean slate:
# any existing 'new_ft_text' database is dropped before the handle is taken,
# so re-running the notebook rebuilds the collection from scratch.
client = MongoClient()
client.drop_database('new_ft_text')
db_text = client['new_ft_text']

In [None]:
# for x in DB.rows.find({'pub_date': {'$gt': '20030103', '$lt': '20030107'}}):
#     print(x)

In [None]:
def insert_rows(db, rows):
    """Bulk-insert ``rows`` into ``db.rows``, tolerating duplicate ``_id`` values.

    The insert is unordered (``ordered=False``). Write errors reported through
    ``BulkWriteError`` are handled here instead of being propagated:

    * duplicate-key errors (code 11000) are logged at INFO level and added to
      the module-level ``dups`` counter;
    * any other write error is logged at ERROR level so that it is no longer
      dropped silently.

    Args:
        db: database handle exposing a ``rows`` collection.
        rows: list of documents to insert.

    Returns:
        int: number of duplicate-key errors seen for this batch (0 when the
        insert succeeded without errors).
    """
    global dups
    try:
        db.rows.insert_many(rows, ordered=False)
    except BulkWriteError as ex:
        duplicates = 0
        for err in ex.details.get('writeErrors', []):
            code = err.get('code')
            if code == 11000:
                _id = err['op']['_id']
                logging.info('BulkWriteError: {} - {}'.format(ex, _id))
                duplicates += 1
            else:
                logging.error(
                    'BulkWriteError (code {}): {}'.format(code, err.get('errmsg'))
                )
        if duplicates:
            # `dups` is the notebook-wide counter printed after the import loop.
            dups += duplicates
        return duplicates
    return 0

## Main

In [None]:
def format_date(date):
    """Convert a ``DD-Mon-YYYY`` string (e.g. ``02-Jan-2003``) to ``YYYYMMDD``.

    Raises whatever ``datetime.strptime`` raises when ``date`` does not match
    the ``%d-%b-%Y`` format.
    """
    parsed = datetime.strptime(date, '%d-%b-%Y')
    return parsed.strftime('%Y%m%d')

def get_id(url):
    """Extract an article identifier from ``url``.

    Two URL shapes are recognised, tried in this order:

    1. the first run of hyphen-joined word characters (e.g. ``ab12-cd34-ef56``);
    2. the digits of an ``?id=<digits>`` query parameter.

    Args:
        url: article URL, or ``None``/empty when the item has no URL.

    Returns:
        The extracted identifier as a string, or ``None`` when ``url`` is
        empty or matches neither shape.
    """
    if not url:
        return None
    # Raw strings: '\w', '\d' and '\?' are invalid escapes in a plain literal.
    hyphenated = re.search(r'((\w+-)+\w+)', url)
    if hyphenated:
        return hyphenated.group(1)
    query_id = re.search(r'(\?id=(\d+))', url)
    if query_id:
        return query_id.group(2)
    return None

def get_text(text):
    """Join the text of the direct ``<p>`` children of ``text`` with spaces.

    Args:
        text: an ElementTree element (the item's ``<text>`` node) or ``None``.

    Returns:
        The space-joined paragraph texts; ``None`` when ``text`` is ``None``
        or has no child elements. Paragraphs without text are skipped.
    """
    # Explicit checks instead of `if text:` - truth-testing an Element is
    # deprecated; len() expresses the same "has children" condition directly.
    if text is None or len(text) == 0:
        return None
    return ' '.join(p.text for p in text.findall('p') if p.text)

def parse_item(f, elem):
    """Build the MongoDB document for one ``<item>`` element.

    Args:
        f: path of the file being parsed. Not used here; kept so existing
            callers keep working.
        elem: the ``<item>`` element.

    Returns:
        dict with the keys ``url``, ``source``, ``_id``, ``date``, ``text``
        and ``title``:

        * ``source`` is ``'ftcom'`` when the item has a ``urlofdoc``,
          otherwise ``'newspaper'``;
        * ``_id`` is the ``uuid`` when present, else derived from the URL;
        * ``date`` is ``publicationdate`` as-is when present, else
          ``datearticle`` converted by ``format_date``; ``None`` when the item
          carries neither, so one undated item no longer aborts the import.
    """
    url = elem.findtext('urlofdoc')

    date = elem.findtext('publicationdate')
    if not date:
        raw_date = elem.findtext('datearticle')
        # format_date(None) would raise TypeError, which the import loop does
        # not catch; a malformed (non-empty) date still raises ValueError.
        date = format_date(raw_date) if raw_date else None

    return {
        'url': url,
        'source': 'ftcom' if url else 'newspaper',
        '_id': elem.findtext('uuid') or get_id(url),
        'date': date,
        'text': get_text(elem.find('text')),
        'title': elem.findtext('headline'),
    }

In [None]:
%%time
# Parse every XML dump under `path` and bulk-insert its <item> elements.
total = 0   # files processed
error = 0   # files whose XML could not be parsed completely
items = 0   # <item> elements parsed
dups = 0    # duplicate _ids, incremented by insert_rows()
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
path = '/home/antonio/git/nytimes/ft_data/all/'
# Sorted so runs are reproducible: os.listdir() returns entries in arbitrary
# order, and which of two documents sharing an _id gets stored depends on the
# order in which they are inserted.
for e in sorted(os.listdir(path)):
    f = os.path.join(path, e)
    if not os.path.isfile(f):
        # iterparse() opens its source right away, so a sub-directory would
        # raise before the ParseError handler below could do anything.
        continue
    iterparser = ElementTree.iterparse(f)
    rows = []
    try:
        for event, elem in iterparser:
            if elem.tag == 'item':
                row = parse_item(f, elem)
                rows.append(row)
                # Release the element's content once it has been converted.
                elem.clear()
                items += 1
            elif elem.tag == 'ftcom' or elem.tag == 'ftnewspaper':
                elem.clear()
    except ParseError as ex:
        # Items parsed before the error are still inserted below.
        logging.info('ParseError: {} - {}'.format(ex, e))
        error += 1
    total += 1
    if rows:
        insert_rows(db_text, rows)
print('Total files:', total)
print('Error files:', error)
print('Total items:', items)
print('Total dups:', dups)

In [None]:
# Number of documents stored. Collection.count() is deprecated and no longer
# exists in PyMongo 4; count_documents({}) is the supported exact count.
db_text.rows.count_documents({})

In [None]:
# Spot-check: fetch a single stored document by its _id.
sample_filter = {'_id': '030102000137'}
db_text.rows.find_one(sample_filter)