# CSV builder for The New York Times

In [1]:
# import logging
# import math
# import os
# from collections import defaultdict
# from datetime import timedelta, date, datetime
# from dateutil import parser
# from time import sleep, time

# import requests
# import yaml
# from joblib import Parallel, delayed
# from pymongo import MongoClient
# from pymongo.errors import BulkWriteError, DuplicateKeyError

In [196]:
from blaze import Data, DataFrame
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from slugify import slugify

## MongoDB

In [3]:
# Open a client with pymongo's no-argument defaults and select the
# `nytimes3` database that the cells below read articles from.
client = MongoClient()
db = client['nytimes3']

In [4]:
# for d in db.articles.find():
#     if len(d['q_info']) > 1:
#         k = list(d['q_info'].keys())[0]
#         if len(d['q_info'][k]) > 1:
#             print(d['_id'])
#             print(d['q_info'])
#             break

# 52c8a3eb38f0d862ec32236f
# {
# 'amazon_com': [{
#     'q': {'page': 0, 'end_date': '20140228', 'term': '"amazon.com"', 'begin_date': '20140101', 'api_key': '<REDACTED>'},
#     'term_category': 3,
#     'snippet': 'Jeff Bezos, vacationing in the Galapagos Islands this week, felt some serious pain. And it wasn’t just because UPS had failed to make all of Amazon’s shipments by Christmas.'
#     }, {
#     'q': {'page': 0, 'begin_date': '20140101', 'end_date': '20140228', 'term': '"amazon"', 'api_key': '<REDACTED>'},
#     'term_category': 2,
#     'snippet': 'The Ecuadorean Navy offered Jeff Bezos same-day shipping on an <strong>Amazon</strong> Prime package: Mr. Bezos himself. Amazon’s chief executive suffered a kidney stone attack visiting the Galapagos Islands this week, according to the local'
#     }],
# 'executive': [{
#     'q': {'page': 10, 'end_date': '20140116', 'term': '"executive"', 'begin_date': '20140101', 'api_key': '<REDACTED>'},
#     'term_category': 1,
#     'snippet': 'The Ecuadorean Navy offered Jeff Bezos same-day shipping on an Amazon Prime package: Mr. Bezos himself. Amazon’s chief <strong>executive</strong> suffered a kidney stone attack visiting the Galapagos Islands this week, according to the local'
#     }]
# }

In [182]:
# db.articles.find_one({'_id':'52c8a3eb38f0d862ec32236f'})

In [375]:
# Build `total_rows_df`: one row per (article, search term, sentence) where the
# sentence -- taken from the article's abstract, main headline, lead paragraph
# or the per-query snippets -- contains at least one of the term's search strings.
#
# Rows are collected as plain lists and turned into a DataFrame once at the end.
# Appending single-row frames inside the loop copies the whole table on every
# append (quadratic), and DataFrame.append no longer exists in pandas >= 2.0.
# Building the frame with explicit columns also means the columns exist even
# when nothing matched, so later column look-ups do not raise KeyError.
row_columns = ['article_id', 'pub_date', 'section_name', 'web_url', 'term_category', 'term', 'sentence']
row_records = []

# NOTE(review): the same article id is fetched twice, so every row is produced
# twice. This looks like a stand-in for iterating over db.articles.find() --
# kept as-is so the result is unchanged; confirm the intended input.
for d in [db.articles.find_one({'_id':'52c8a3eb38f0d862ec32236f'}),
          db.articles.find_one({'_id':'52c8a3eb38f0d862ec32236f'})]:
    if d is None:
        # find_one returns None when no document matches; nothing to extract.
        continue

    # Texts shared by every search term of this article (empty/None ones skipped).
    common_texts = []
    if d['abstract']:
        common_texts.append(d['abstract'])
    if d['headline'] and d['headline']['main']:
        common_texts.append(d['headline']['main'])
    if d['lead_paragraph']:
        common_texts.append(d['lead_paragraph'])
    # Snippets are added per term below because they vary with the query.

    # Fields that are identical for every row of this article.
    article_id = d['_id']
    pub_date = d['pub_date']
    section_name = d['section_name']
    web_url = d['web_url']

    # Fields that vary per search term.
    for term in d['q_info']:
        texts = list(common_texts)
        q_info = d['q_info'][term]
        # The category is read from the first query entry only.
        term_category = q_info[0]['term_category']

        # Slugify each search string once instead of once per sentence.
        search_slugs = []
        for info in q_info:
            search_slugs.append(slugify(info['q']['term']))
            if info['snippet']:
                texts.append(info['snippet'])

        for text in texts:
            for sentence in sent_tokenize(text):
                # Compare slugified forms so case and punctuation differences
                # between the search string and the sentence do not matter.
                sentence_slug = slugify(sentence)
                if any(search_slug in sentence_slug for search_slug in search_slugs):
                    row_records.append(
                        [article_id, pub_date, section_name, web_url, term_category, term, sentence]
                    )

# Single construction; the frame gets a fresh 0..n-1 index.
total_rows_df = DataFrame(row_records, columns=row_columns)

In [376]:
def _strip_markup_and_commas(sentence):
    """Return `sentence` without <strong>/</strong> tags and without commas."""
    # Tags are removed first, then commas (dropped because of delimiter problems).
    for token in ('<strong>', '</strong>', ','):
        sentence = sentence.replace(token, '')
    return sentence


# Turn underscores in the term key into dots (e.g. 'amazon_com' -> 'amazon.com').
total_rows_df['term'] = total_rows_df['term'].map(lambda term: term.replace('_', '.'))
total_rows_df['sentence'] = total_rows_df['sentence'].map(_strip_markup_and_commas)

In [377]:
# Replace the current index with a fresh 0..n-1 one; drop=True discards the old
# index instead of keeping it as a column.
total_rows_df = total_rows_df.reset_index(drop=True)

In [394]:
# Write the table to total_rows.csv in the working directory. No index argument
# is passed, so the index is written too, as a first column with an empty header.
total_rows_df.to_csv('total_rows.csv')

In [426]:
# Load the CSV back through blaze and rename the 'Unnamed: 0' column to 'row_id'.
# 'Unnamed: 0' is presumably the label given to the header-less index column
# written by to_csv -- confirm against the CSV.
total_rows_data = Data('total_rows.csv').relabel({'Unnamed: 0': 'row_id'})

In [427]:
# Show the relabelled blaze expression as the cell output.
total_rows_data

Unnamed: 0,row_id,article_id,pub_date,section_name,web_url,term_category,term,sentence
0,0,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,And it wasn’t just because UPS had failed to m...
1,1,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,Ecuadorean Navy Delivers for Amazon Chief
2,2,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,And it wasn’t just because UPS had failed to m...
3,3,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,The Ecuadorean Navy offered Jeff Bezos same-da...
4,4,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,Amazon’s chief executive suffered a kidney sto...
5,5,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,1,executive,Amazon’s chief executive suffered a kidney sto...
6,6,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,And it wasn’t just because UPS had failed to m...
7,7,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,Ecuadorean Navy Delivers for Amazon Chief
8,8,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,And it wasn’t just because UPS had failed to m...
9,9,52c8a3eb38f0d862ec32236f,2014-01-04 19:11:42,Technology,http://bits.blogs.nytimes.com/2014/01/04/ecuad...,3,amazon.com,The Ecuadorean Navy offered Jeff Bezos same-da...


In [405]:
# List the column names of the blaze expression to check the relabel.
total_rows_data.columns

['row_id',
 'article_id',
 'pub_date',
 'section_name',
 'web_url',
 'term_category',
 'term',
 'sentence']

In [421]:
# Pull a nested element out of the expression's datashape for inspection.
# Presumably this is the record type holding the (field name, type) pairs -- confirm.
# NOTE(review): this rebinds `d`, which the row-building loop also uses as its
# loop variable for the current article document.
d=total_rows_data.dshape.info()[1][1]

In [423]:
# Show the (field name, type) pairs of the datashape element selected above.
d.fields

(('Unnamed: 0', ctype("int64")),
 ('article_id', ?string),
 ('pub_date', ?datetime),
 ('section_name', ?string),
 ('web_url', ?string),
 ('term_category', ctype("int64")),
 ('term', ?string),
 ('sentence', ?string))

In [388]:
# Load iris.csv (path relative to the working directory) through blaze.
# NOTE(review): appears to be a scratch check of how blaze reads a quoted field
# that contains a comma -- confirm whether this belongs in the notebook.
iris = Data('iris.csv')

In [389]:
# Show the loaded iris table as the cell output.
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,"Iris, setosa"
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
