In [None]:
import logging
import os

import yaml
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
from slugify import slugify

## Logging

In [None]:
# Start every run with a fresh log file. Deleting it is best-effort: a missing
# file (first run) or a file that cannot be removed must not stop the notebook,
# but only filesystem errors are ignored -- a bare `except:` here previously hid
# the NameError caused by `os` not being imported, so the file was never removed.
try:
    os.remove('new_ft_sentences.log')
except OSError:
    pass

# logging.basicConfig() does nothing when the root logger already has handlers,
# so clear them first to make re-running this cell reconfigure logging.
logging.getLogger().handlers = []

# Keep urllib3's chatter (as vendored by requests) out of the log file.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)

logging.basicConfig(
    filename='new_ft_sentences.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)

## MongoDB

In [None]:
# Connect using MongoClient's defaults (no host/port given).
client = MongoClient()

# Database holding the article texts that the main loop reads from.
db_text = client['new_ft_text']

# The sentence database is rebuilt from scratch on every run, so drop any
# previous copy before taking a handle to it.
client.drop_database('new_ft_sentences')
db_sentences = client['new_ft_sentences']

In [None]:
def insert_rows(db, rows):
    """Bulk-insert `rows` into the `rows` collection of `db`.

    The insert is unordered, so one failing document does not prevent the
    remaining documents from being written. Write errors never propagate to
    the caller; they are recorded in the log instead.

    Args:
        db: A database handle exposing a `rows` collection.
        rows: List of documents to insert. An empty list is a no-op.
    """
    # insert_many() rejects an empty document list, so bail out early.
    if not rows:
        return

    try:
        db.rows.insert_many(rows, ordered=False)
    except BulkWriteError as ex:
        for err in ex.details.get('writeErrors', []):
            if err.get('code') == 11000:
                # 11000 = duplicate key: the document is already stored.
                _id = err['op']['_id']
                logging.info('BulkWriteError: {} - {}'.format(ex, _id))
            else:
                # Anything else used to be dropped silently; record it so
                # missing rows can be diagnosed from the log.
                logging.error(
                    'BulkWriteError (code {}): {}'.format(
                        err.get('code'), err.get('errmsg')
                    )
                )

## Search terms

In [None]:
# Use safe_load rather than bare yaml.load(): it only constructs plain Python
# types (dicts, lists, strings, numbers), which is all this file should
# contain, and it keeps working on PyYAML >= 6 where load() requires an
# explicit Loader argument.
with open('search_terms.yml') as search_term_file:
    term_yaml = yaml.safe_load(search_term_file)

In [None]:
# Flatten the nested YAML structure into a list of
# (search_term, original_term, term_type) tuples.
# Assumes term_yaml maps term_type -> {original_term -> [search_term, ...]}.
search_terms = [
    (variant, original_term, term_type)
    for term_type, originals in term_yaml.items()
    for original_term, variants in originals.items()
    for variant in variants
]

## Main

In [None]:
%%time
# Slugify every search term once up front instead of once per sentence.
# Each entry is (slugified_term, original_term, term_category).
slugged_terms = [
    (slugify(term), original_term, term_category)
    for term, original_term, term_category in search_terms
]

for article in db_text.rows.find():
    text = article['text']

    # Cheap pre-filter: skip sentence tokenisation entirely when no search
    # term occurs anywhere in the article.
    text_slug = slugify(text)
    if not any(term_slug in text_slug for term_slug, _, _ in slugged_terms):
        continue

    # `rows` is rebuilt for every article. Previously it was only reset inside
    # the matching branch, so a non-matching article either raised NameError
    # (first iteration) or re-submitted the previous article's rows.
    rows = []
    for sentence in sent_tokenize(text):
        sentence_slug = slugify(sentence)
        for term_slug, original_term, term_category in slugged_terms:
            if term_slug in sentence_slug:
                # One row per (sentence, matching term) pair.
                rows.append({
                    'article_id': article['_id'],
                    'date': article['date'],
                    'url': article['url'],
                    'source': article['source'],
                    'title': article['title'],
                    'term_category': term_category,
                    'term': original_term,
                    'sentence': sentence,
                })

    if rows:
        insert_rows(db_sentences, rows)

In [None]:
# Total number of sentence rows written. Collection.count() is deprecated and
# was removed in PyMongo 4; count_documents({}) is its replacement.
db_sentences.rows.count_documents({})