In [None]:
import logging

import yaml
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
from slugify import slugify

## Logging

In [None]:
try:
    os.remove('new_ft_sentences.log')
except:
    pass

logging.getLogger().handlers = []
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
logging.basicConfig(
    filename='new_ft_sentences.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)

## MongoDB

In [None]:
# for x in DB.rows.find({'pub_date': {'$gt': '20030103', '$lt': '20030107'}}):
#     print(x)

In [None]:
# Connect using pymongo's default connection settings.
client = MongoClient()

# The output database is rebuilt from scratch on every run, so drop whatever
# a previous run left behind before taking a handle to it.
client.drop_database('new_ft_sentences')

# Input: article texts. Output: one row per (sentence, matched term).
db_text = client['new_ft_text']
db_sentences = client['new_ft_sentences']

In [None]:
def insert_rows(db, rows):
    try:
        db.rows.insert_many(rows, ordered=False)
    except BulkWriteError as ex:
        for err in ex.details['writeErrors']:
            if err['code'] == 11000:
                _id = err['op']['_id']
                print('BulkWriteError: {} - {}'.format(ex, _id))
                global dups
                dups += 1
                print(err)
                print()

## Search terms

In [None]:
# Load the search-term definitions. safe_load only constructs plain Python
# objects (dicts, lists, strings, ...), unlike a bare yaml.load(), which can
# instantiate arbitrary objects and requires an explicit Loader on current
# PyYAML releases.
with open('search_terms.yml') as search_term_file:
    term_yaml = yaml.safe_load(search_term_file)

In [None]:
# Flatten the nested mapping {term_type: {original_term: [search_term, ...]}}
# into a list of (search_term, original_term, term_type) tuples.
search_terms_aux = [
    (variant, original_term, term_type)
    for term_type, originals in term_yaml.items()
    for original_term, variants in originals.items()
    for variant in variants
]

In [None]:
# Original terms (the second element of each search_terms_aux tuple) to leave
# out of the search. They are compared by exact, case-sensitive equality.
# NOTE(review): presumably excluded because they are too ambiguous as ordinary
# words - confirm.
remove_list = ['Become Inc', 'Indeed', 'at&t', 'Signal']

In [None]:
# Keep every (search_term, original_term, term_type) tuple whose original term
# is not listed in remove_list.
search_terms = [
    (search_text, original_term, term_type)
    for search_text, original_term, term_type in search_terms_aux
    if original_term not in remove_list
]

# Slug of each remaining search string, index-aligned with search_terms.
slugified_terms = [slugify(search_text) for search_text, _, _ in search_terms]

## Main

In [None]:
%%time
# For every stored article, split the text into sentences and store one row
# per (sentence, matching search term) in the sentences database.
for article in db_text.rows.find():
    rows = []
    for sentence in sent_tokenize(article['text']):
        slugified_sentence = slugify(sentence)
        # search_terms and slugified_terms are index-aligned, so zip pairs each
        # slug with its (search_term, original_term, term_type) tuple. The
        # earlier loop iterated over the slugs only and then read an unbound
        # `search_term`, raising NameError on the first match.
        for search_term, slugified_term in zip(search_terms, slugified_terms):
            # Plain substring test on the slugs: a term also matches when it
            # appears inside a longer slugified word.
            if slugified_term in slugified_sentence:
                rows.append({
                    'article_id': article['_id'],
                    'date': article['date'],
                    'url': article['url'],
                    'source': article['source'],
                    'title': article['title'],
                    'term_category': search_term[2],
                    'term': search_term[1],
                    'sentence': sentence,
                })
    # insert_many rejects an empty list, so only write when something matched.
    if rows:
        insert_rows(db_sentences, rows)

In [None]:
# Number of sentence rows written. Collection.count() is deprecated and was
# removed in PyMongo 4; count_documents({}) is its exact-count replacement.
db_sentences.rows.count_documents({})

In [None]:
# Spot-check one stored row. The earlier lookup used a hard-coded ObjectId
# from a previous run: `ObjectId` was never imported, and the sentences
# database is dropped and rebuilt above, so that id no longer exists.
db_sentences.rows.find_one()