In [1]:
import logging
import os
import re
from datetime import datetime
from xml.etree import ElementTree
from xml.etree.ElementTree import ParseError

from pymongo import MongoClient
from pymongo.errors import BulkWriteError

## Logging

In [2]:
# Detach and close any handlers left over from a previous run of this cell.
# basicConfig() does nothing while the root logger still has handlers, and
# closing them releases the old log file before it is deleted below.
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    root_logger.removeHandler(stale_handler)
    stale_handler.close()

# Start every run with a fresh log file. Only "file does not exist" is
# expected here; any other failure is allowed to surface.
try:
    os.remove('new_ft_to_db.log')
except FileNotFoundError:
    pass

logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
logging.basicConfig(
    filename='new_ft_to_db.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)

## MongoDB

In [3]:
# Connect with the driver's default settings.
client = MongoClient()

# WARNING: destructive. Any existing 'new_ft_text' database is dropped, so
# every run of this cell starts from an empty database.
client.drop_database('new_ft_text')

# Handle through which the rest of the notebook reads and writes.
DB = client['new_ft_text']

In [4]:
# for x in DB.rows.find({'pub_date': {'$gt': '20030103', '$lt': '20030107'}}):
#     print(x)

In [5]:
def insert_rows(rows):
    """Insert a batch of parsed article documents into ``DB.rows``.

    The insert is unordered (``ordered=False``). A failed bulk write is
    logged and swallowed so that one bad batch does not abort the caller.

    Args:
        rows: Sequence of documents (dicts) to insert. An empty batch is a
            no-op.

    Returns:
        None.
    """
    if not rows:
        # insert_many() rejects an empty document list; nothing to do.
        return
    try:
        DB.rows.insert_many(rows, ordered=False)
    except BulkWriteError as exc:
        # Log a summary only: the full details embed the failed documents.
        details = exc.details or {}
        logging.error(
            'DB INSERTION EXCEPTION: inserted=%s failed=%s',
            details.get('nInserted'),
            len(details.get('writeErrors', [])),
        )

## Main

In [6]:
def format_date(date):
    """Convert a ``DD-Mon-YYYY`` date string to ``YYYYMMDD``.

    Args:
        date: Date string in ``%d-%b-%Y`` format (e.g. ``'03-Jan-2003'``),
            or ``None`` / empty when the source has no date.

    Returns:
        The date as a ``YYYYMMDD`` string, or ``None`` when ``date`` is
        missing (the same convention ``get_id`` and ``get_text`` use).

    Raises:
        ValueError: If a non-empty ``date`` does not match ``%d-%b-%Y``.
    """
    if not date:
        return None
    return datetime.strptime(date, '%d-%b-%Y').strftime('%Y%m%d')

def get_id(url):
    """Extract an article identifier from an article URL.

    Two URL shapes are recognised, in this order:

    1. The first run of hyphen-joined word tokens (``aaa-bbb-ccc``).
    2. A numeric ``?id=<digits>`` query parameter.

    Args:
        url: Article URL, or ``None`` / empty.

    Returns:
        The identifier string, or ``None`` if ``url`` is missing or matches
        neither shape.
    """
    if not url:
        return None
    # Raw strings keep \w, \d and \? as regex escapes, not string escapes.
    hyphenated = re.search(r'(?:\w+-)+\w+', url)
    if hyphenated:
        return hyphenated.group(0)
    query_id = re.search(r'\?id=(\d+)', url)
    if query_id:
        return query_id.group(1)
    return None

def get_text(text):
    """Join the paragraph texts of a ``<text>`` element into one string.

    Args:
        text: The ``<text>`` element, or ``None`` if the item has none.

    Returns:
        The ``.text`` of each direct ``<p>`` child that has any, joined with
        single spaces. ``None`` if ``text`` is ``None`` or has no children.
    """
    # Explicit None/length checks replace the truth test on an Element,
    # which is deprecated; an element without children is still skipped.
    if text is None or len(text) == 0:
        return None
    return ' '.join(p.text for p in text.findall('p') if p.text)

def parse_item(f, elem):
    """Build the database document for one ``<item>`` element.

    Args:
        f: Path of the file being parsed. Not used by this function; kept
            so existing call sites keep working.
        elem: The ``<item>`` element.

    Returns:
        A dict with the keys ``url``, ``source``, ``article_id``,
        ``pub_date``, ``text`` and ``title``. ``source`` is ``'ftcom'`` when
        the item has a URL and ``'newspaper'`` otherwise. Fields missing
        from the XML are ``None``.
    """
    url = elem.findtext('urlofdoc')

    # Prefer <publicationdate>; otherwise convert <datearticle>. An item
    # with neither gets pub_date=None instead of crashing inside strptime.
    pub_date = elem.findtext('publicationdate')
    if not pub_date:
        article_date = elem.findtext('datearticle')
        pub_date = format_date(article_date) if article_date else None

    return {
        'url': url,
        'source': 'ftcom' if url else 'newspaper',
        'article_id': elem.findtext('uuid') or get_id(url),
        'pub_date': pub_date,
        'text': get_text(elem.find('text')),
        'title': elem.findtext('headline'),
    }

In [7]:
%%time
# Parse every file in the dump directory and load its <item> elements into
# MongoDB, one insert batch per file.
total = 0
error = 0
# NOTE(review): machine-specific absolute path; adjust before re-running.
path = '/home/antonio/git/nytimes/ft_data/all/'
wrapper_tags = ('ftcom', 'ftnewspaper')
for filename in os.listdir(path):
    filepath = os.path.join(path, filename)
    rows = []
    try:
        for _event, elem in ElementTree.iterparse(filepath):
            if elem.tag == 'item':
                rows.append(parse_item(filepath, elem))
                # Free the parsed subtree once it has been converted.
                elem.clear()
            elif elem.tag in wrapper_tags:
                elem.clear()
    except ParseError as ex:
        # Record which file failed and why, then continue with the next.
        logging.info('%s: %s', filename, ex)
        error += 1
    total += 1
    # Items parsed before a ParseError are still inserted.
    if rows:
        insert_rows(rows)
print('Total files:', total)
print('Error files:', error)

Total files: 281584
Error files: 1318
CPU times: user 6min 23s, sys: 19.2 s, total: 6min 42s
Wall time: 8min 54s


In [8]:
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
# count_documents({}) counts every document in the collection.
DB.rows.count_documents({})

1034401

In [10]:
# Spot check: print every stored document for one known article id.
article_query = {'article_id': '030102000137'}
for doc in DB.rows.find(article_query):
    print(doc)

