In [1]:
import logging
import os

import yaml
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
from slugify import slugify

## Logging

In [2]:
# Start every run with a fresh log file. Only a missing file is ignored here;
# the previous bare `except:` also hid the NameError raised because `os` was
# never imported, so the old log was never actually deleted.
try:
    os.remove('new_ft_sentences.log')
except FileNotFoundError:
    pass

# Drop any handlers already attached to the root logger so that
# basicConfig() below takes effect even when this cell is re-run.
logging.getLogger().handlers = []
# Only let WARNING and above through from the urllib3 logger bundled with requests.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)
logging.basicConfig(
    filename='new_ft_sentences.log',
    level=logging.INFO,
    format='%(asctime)s %(message)s'
)

## MongoDB

In [3]:
# for x in DB.rows.find({'pub_date': {'$gt': '20030103', '$lt': '20030107'}}):
#     print(x)

In [4]:
# MongoClient() is called without arguments, so the driver's defaults apply.
client = MongoClient()
# client.drop_database('new_ft_sentences')
# Source articles are read from `new_ft_text`; matched sentences are written
# to `new_ft_sentences`.
db_text = client['new_ft_text']
db_sentences = client['new_ft_sentences']

In [5]:
def insert_rows(db, rows):
    """Bulk-insert ``rows`` into ``db.rows`` without aborting on write errors.

    The insert is unordered, so one failing document does not stop the rest
    of the batch from being attempted.

    Args:
        db: Database handle exposing a ``rows`` collection.
        rows: List of documents to insert.

    Side effects:
        Increments the module-level counter ``dups`` once per duplicate-key
        error (code 11000) and logs the offending ``_id``. The counter is
        created on first use, so callers do not need to initialise it.
        Any other write error is logged at ERROR level instead of being
        dropped silently.
    """
    global dups
    try:
        db.rows.insert_many(rows, ordered=False)
    except BulkWriteError as ex:
        for err in ex.details['writeErrors']:
            if err['code'] == 11000:
                _id = err['op']['_id']
                logging.info('BulkWriteError: {} - {}'.format(ex, _id))
                # `dups` may not exist yet; the original `dups += 1` raised
                # NameError on the first duplicate in that case.
                dups = globals().get('dups', 0) + 1
            else:
                logging.error(
                    'BulkWriteError (code {}): {}'.format(
                        err['code'], err.get('errmsg')
                    )
                )

## Search terms

In [6]:
# Load the search-term definitions. `yaml.safe_load` is used because calling
# `yaml.load` without an explicit Loader is deprecated in PyYAML 5.1 and
# raises a TypeError in PyYAML 6; it also refuses to build arbitrary Python
# objects from the file.
with open('search_terms.yml') as search_term_file:
    term_yaml = yaml.safe_load(search_term_file)

In [7]:
# Flatten the nested YAML structure (term_type -> original_term -> search
# terms) into a list of (search_term, original_term, term_type) tuples.
search_terms_aux = [
    (search_term, original_term, term_type)
    for term_type in term_yaml
    for original_term in term_yaml[term_type]
    for search_term in term_yaml[term_type][original_term]
]

In [8]:
# Original terms to exclude from the search (compared against the
# `original_term` element of each search-term tuple).
remove_list = [
    'Become Inc',
    'Indeed',
    'at&t',
    'Signal',
]

In [9]:
# Drop every tuple whose original term is in `remove_list`. This is built as a
# list (not a lazy `filter` object) so `search_terms` is still usable after
# `slugified_terms` has been computed from it.
search_terms = [term for term in search_terms_aux if term[1] not in remove_list]

# Same tuples, with the search term slugified so it can be compared against
# slugified sentences: (slugified_search_term, original_term, term_type).
slugified_terms = [
    (slugify(search_term), original_term, term_type)
    for search_term, original_term, term_type in search_terms
]

## Main

In [10]:
%%time
count = 0
for article in db_text.rows.find():
    rows = []
    text = article['text']
    sentences = sent_tokenize(text)
    for sentence in sentences:
        slugified_sentence = slugify(sentence)
        slugified_sentence_list = slugified_sentence.split('-')
        for slugified_term in slugified_terms:
            if slugified_term[0] in slugified_sentence_list:
                row = {
                    'article_id': article['_id'],
                    'date': article['date'],
                    'url': article['url'],
                    'source': article['source'],
                    'title': article['title'],
                    'term_category': slugified_term[2],
                    'original_term': slugified_term[1],
                    'search_term': slugified_term[0],
                    'sentence': sentence,
                }
                rows.append(row)
    if rows:
        insert_rows(db_sentences, rows)
    
    if count % 10000 == 0:
        print(count, 'articles processed.')
    count += 1

0 articles processed.
10000 articles processed.
20000 articles processed.
30000 articles processed.
40000 articles processed.
50000 articles processed.
60000 articles processed.
70000 articles processed.
80000 articles processed.
90000 articles processed.
100000 articles processed.
110000 articles processed.
120000 articles processed.
130000 articles processed.
140000 articles processed.
150000 articles processed.
160000 articles processed.
170000 articles processed.
180000 articles processed.
190000 articles processed.
200000 articles processed.
210000 articles processed.
220000 articles processed.
230000 articles processed.
240000 articles processed.
250000 articles processed.
260000 articles processed.
270000 articles processed.
280000 articles processed.
290000 articles processed.
300000 articles processed.
310000 articles processed.
320000 articles processed.
330000 articles processed.
340000 articles processed.
350000 articles processed.
360000 articles processed.
370000 articles

In [11]:
# Number of stored sentence rows. `Collection.count()` is deprecated since
# PyMongo 3.7 and removed in 4.0; `count_documents({})` is its replacement.
db_sentences.rows.count_documents({})

749553

In [12]:
# Preview the first stored rows, each followed by a blank line. The loop stops
# after printing the row at index 11, i.e. at most 12 rows are shown.
count = 0
for s in db_sentences.rows.find():
    print(s, end='\n\n')
    if count >= 11:
        break
    count = count + 1

{'date': '20040429', 'article_id': '040429008280', 'source': 'ftcom', 'term_category': 2, 'url': 'http://search.ft.com/search/article.html?id=040429008280', 'original_term': 'microsoft', 'sentence': 'and Microsoft.', '_id': ObjectId('5615738fa688eb2b8e26b299'), 'search_term': 'microsoft', 'title': "Google's rivals in search for supremacy"}

{'date': '20040429', 'article_id': '040429008280', 'source': 'ftcom', 'term_category': 2, 'url': 'http://search.ft.com/search/article.html?id=040429008280', 'original_term': 'Google', 'sentence': 'has pulled off a series of acquisitions, including the Inktomi search engine and the Overture advertising network, to emulate Google.', '_id': ObjectId('5615738fa688eb2b8e26b29a'), 'search_term': 'google', 'title': "Google's rivals in search for supremacy"}

{'date': '20040429', 'article_id': '040429008280', 'source': 'ftcom', 'term_category': 2, 'url': 'http://search.ft.com/search/article.html?id=040429008280', 'original_term': 'Google', 'sentence': "Earl

In [13]:
# for x in slugified_terms:
#     for y in slugified_terms:
#         if x[0] in y[0] and x[0] != y[0] and x[1] != y[1]:
#             print(x)
#             print(y)
#             print()