# CSV builder for The New York Times

In [1]:
import logging
import math
import os

from blaze import Data, DataFrame
from joblib import Parallel, delayed
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from pymongo.errors import BulkWriteError
from slugify import slugify

## Logging

In [2]:
# Start every run with a fresh log file.
try:
    os.remove('newdb.log')
except FileNotFoundError:
    # First run (or the log was already cleaned up): nothing to delete.
    pass
except OSError as exc:
    # Stay best-effort, but do not hide the reason the old log survived.
    print('Could not remove newdb.log: {}'.format(exc))

In [3]:
# Empty the root logger's handler list first: logging.basicConfig() is a
# no-op when the root logger already has handlers (e.g. on a cell re-run).
root_logger = logging.getLogger()
root_logger.handlers = []

# Only let WARNING and above through from the urllib3 logger bundled in requests.
logging.getLogger('requests.packages.urllib3').setLevel(logging.WARNING)

# Send INFO and above to newdb.log, each line prefixed with a timestamp.
logging.basicConfig(
    format='%(asctime)s %(message)s',
    level=logging.INFO,
    filename='newdb.log',
)

In [4]:
def write_log(*args, status=None):
    """Emit one INFO entry through the root logger.

    The entry consists of the tuple of positional arguments (rendered as a
    tuple), the separator ``' ==> '`` and finally ``status``.
    """
    entry = '{0} ==> {1}'.format(args, status)
    logging.log(logging.INFO, entry)

## MongoDB

In [5]:
# Connect using MongoClient's defaults (no host/port given).
client = MongoClient()

# Source database: its `articles` collection is read further down.
db = client['nytimes3']

# Output database: drop whatever a previous run left, then bind a handle to it.
client.drop_database('nytimes_csv')
db_csv = client['nytimes_csv']

In [6]:
def insert_rows(rows):
    """Bulk-insert ``rows`` into the ``rows`` collection of ``db_csv``.

    The insert is issued with ``ordered=False``. Nothing is raised to the
    caller: any failure is written to the log via ``write_log``.

    Args:
        rows: list of dicts, one per document to insert.
    """
    try:
        db_csv.rows.insert_many(rows, ordered=False)
    except BulkWriteError as e:
        # str(e) can be just a generic message; the per-document failures
        # are in e.details['writeErrors'], so log a short summary of them.
        write_errors = (e.details or {}).get('writeErrors', [])
        summary = [
            '{}: {}'.format(err.get('index'), err.get('errmsg'))
            for err in write_errors
        ]
        write_log('DB INSERTION EXCEPTION', status='{} {}'.format(e, summary))
    except Exception as e:
        write_log('DB INSERTION EXCEPTION', status='{}'.format(e))

In [7]:
def _build_rows(doc):
    """Return the list of CSV rows derived from one article document.

    For every search term under ``doc['q_info']`` the article texts
    (abstract, main headline, lead paragraph and that term's snippets) are
    split into sentences; each sentence whose slug contains the slug of one
    of the term's search strings yields one row.

    Raises whatever the lookups raise (e.g. ``KeyError`` for a missing
    required field); the caller logs it and moves on to the next document.
    """
    # Texts shared by every search term of this article.
    common_texts = []
    if doc.get('abstract'):
        common_texts.append(doc['abstract'])
    headline = doc.get('headline')
    if headline and isinstance(headline, dict) and headline.get('main'):
        common_texts.append(headline['main'])
    if doc.get('lead_paragraph'):
        common_texts.append(doc['lead_paragraph'])

    # Fixed fields: identical for every row of the article.
    article_id = doc['_id']
    pub_date = doc['pub_date']
    section_name = doc['section_name']
    web_url = doc['web_url']
    has_fixed_fields = bool(article_id and pub_date and section_name and web_url)

    rows = []
    # Variable fields: one group of rows per search term.
    for term, q_info in doc['q_info'].items():
        texts = list(common_texts)
        term_category = q_info[0]['term_category']
        search_terms = []
        for info in q_info:
            search_terms.append(info['q']['term'])
            if info['snippet']:
                texts.append(info['snippet'])

        # Loop-invariant work: slugify the search terms once per term
        # instead of once per sentence.
        term_slugs = [slugify(search_term) for search_term in search_terms]
        term_aux = term.replace('_', '.')

        for text in texts:
            for sentence in sent_tokenize(text):
                sentence_slug = slugify(sentence)
                if not any(term_slug in sentence_slug for term_slug in term_slugs):
                    continue
                # Drop highlight markup, and commas (CSV delimiter problems).
                sentence_aux = sentence.replace('<strong>', '').replace('</strong>', '').replace(',', '')
                if has_fixed_fields and term_category and term_aux and sentence_aux:
                    rows.append({
                        'article_id': article_id,
                        'pub_date': pub_date,
                        'section_name': section_name,
                        'web_url': web_url,
                        'term_category': term_category,
                        'term': term_aux,
                        'sentence': sentence_aux,
                    })
    return rows


total = db.articles.count()
# Number of documents between two progress reports (about 1% of the
# collection); never 0, so the modulo below cannot divide by zero.
percent = max(1, math.ceil(total / 100))
count = 0
print('Total documents:', total)
print()

for doc in db.articles.find():
    try:
        rows = _build_rows(doc)
        if rows:
            insert_rows(rows)
    except Exception as e:
        write_log('DOCUMENT {} PROCESS EXCEPTION'.format(doc), status='{}'.format(e))

    # Progress is reported outside the try block so that documents which
    # failed are still counted as handled.
    if count % percent == 0:
        # Derive the percentage from the real total: count // percent drifts
        # whenever total is not a multiple of 100.
        percentage = min(100, count * 100 // total) if total else 0
        print('{} out of {} processed.'.format(count, total))
        print('{}% completed.'.format(percentage))
        print()
    count += 1

print('{} out of {} processed.'.format(total, total))
print('100% completed.')

Total documents: 2965

0 out of 2965 processed.
0% completed.

30 out of 2965 processed.
1% completed.

60 out of 2965 processed.
2% completed.

90 out of 2965 processed.
3% completed.

120 out of 2965 processed.
4% completed.

150 out of 2965 processed.
5% completed.

180 out of 2965 processed.
6% completed.

210 out of 2965 processed.
7% completed.

240 out of 2965 processed.
8% completed.

270 out of 2965 processed.
9% completed.

300 out of 2965 processed.
10% completed.

330 out of 2965 processed.
11% completed.

360 out of 2965 processed.
12% completed.

390 out of 2965 processed.
13% completed.

420 out of 2965 processed.
14% completed.

450 out of 2965 processed.
15% completed.

480 out of 2965 processed.
16% completed.

510 out of 2965 processed.
17% completed.

540 out of 2965 processed.
18% completed.

570 out of 2965 processed.
19% completed.

600 out of 2965 processed.
20% completed.

630 out of 2965 processed.
21% completed.

660 out of 2965 processed.
22% completed.

690

In [8]:
# total_rows_df['term'] = total_rows_df['term'].apply(lambda x: x.replace('_', '.'))
# total_rows_df['sentence'] = total_rows_df['sentence'].apply(lambda x: x.replace('<strong>', '').replace('</strong>', ''))
# total_rows_df['sentence'] = total_rows_df['sentence'].apply(lambda x: x.replace(',', '')) # csv delimiter problems

In [9]:
# total_rows_df.to_csv('total_rows.csv', index=False)

In [10]:
# print(total_rows_df['term'].value_counts())

In [11]:
# Count the documents currently stored in the `rows` collection of db_csv.
db_csv.rows.count()

4108