#CSV builder for The New York Times

In [1]:
# import logging
# import math
# import os
# from collections import defaultdict
# from datetime import timedelta, date, datetime
# from dateutil import parser
# from time import sleep, time

# import requests
# import yaml
# from joblib import Parallel, delayed
# from pymongo import MongoClient
# from pymongo.errors import BulkWriteError, DuplicateKeyError

In [2]:
from blaze import Data, DataFrame, by, join, merge
from nltk.tokenize import sent_tokenize
from pymongo import MongoClient
from slugify import slugify

##MongoDB

In [3]:
client = MongoClient()
db = client.nytimes3

In [4]:
# for d in db.articles.find():
#     if len(d['q_info']) > 1:
#         k = list(d['q_info'].keys())[0]
#         if len(d['q_info'][k]) > 1:
#             print(d['_id'])
#             print(d['q_info'])
#             break

# 52c8a3eb38f0d862ec32236f
# {
# 'amazon_com': [{
#     'q': {'page': 0, 'end_date': '20140228', 'term': '"amazon.com"', 'begin_date': '20140101', 'api_key': '3439a9084efa80c4f5fb1d290dfc1b44:11:70233981'},
#     'term_category': 3,
#     'snippet': 'Jeff Bezos, vacationing in the Galapagos Islands this week, felt some serious pain. And it wasn’t just because UPS had failed to make all of Amazon’s shipments by Christmas.'
#     }, {
#     'q': {'page': 0, 'begin_date': '20140101', 'end_date': '20140228', 'term': '"amazon"', 'api_key': 'a5c709f3168b829711241b243457e9d6:13:70235641'},
#     'term_category': 2,
#     'snippet': 'The Ecuadorean Navy offered Jeff Bezos same-day shipping on an <strong>Amazon</strong> Prime package: Mr. Bezos himself. Amazon’s chief executive suffered a kidney stone attack visiting the Galapagos Islands this week, according to the local'
#     }],
# 'executive': [{
#     'q': {'page': 10, 'end_date': '20140116', 'term': '"executive"', 'begin_date': '20140101', 'api_key': 'a5c709f3168b829711241b243457e9d6:13:70235641'},
#     'term_category': 1,
#     'snippet': 'The Ecuadorean Navy offered Jeff Bezos same-day shipping on an Amazon Prime package: Mr. Bezos himself. Amazon’s chief <strong>executive</strong> suffered a kidney stone attack visiting the Galapagos Islands this week, according to the local'
#     }]
# }

In [5]:
# db.articles.find_one({'_id':'52c8a3eb38f0d862ec32236f'})

In [6]:
total_rows_df = DataFrame()
for doc in db.articles.find():
    rows_df = DataFrame()
    
    # Texts
    common_texts = []
    if doc['abstract']:
        common_texts.append(doc['abstract'])
    if doc['headline'] and doc['headline']['main']:
        common_texts.append(doc['headline']['main'])
    if doc['lead_paragraph']:
        common_texts.append(doc['lead_paragraph'])
    # add snippet as variable field
    
    # Fix fields
#     row_id = 'row_id'
    article_id = doc['_id']
    pub_date = doc['pub_date']
    section_name = doc['section_name']
    web_url = doc['web_url']
    
    # Variable fields
    for term in doc['q_info']:
        texts = list(common_texts)
        q_info = doc['q_info'][term]
        term_category = q_info[0]['term_category']
        search_terms = []
        for info in q_info:
            search_terms.append(info['q']['term'])
            if info['snippet']:
                texts.append(info['snippet'])
        
        for text in texts:
            sentences = sent_tokenize(text)
            for sentence in sentences:
                if any([slugify(search_term) in slugify(sentence) for search_term in search_terms]):
                    row_df = DataFrame(
                        [[article_id, pub_date, section_name, web_url, term_category, term, sentence]],
                        columns=['article_id', 'pub_date', 'section_name', 'web_url', 'term_category', 'term', 'sentence']
                    )
                    rows_df = rows_df.append(row_df)
    
    total_rows_df = total_rows_df.append(rows_df)

In [7]:
total_rows_df['term'] = total_rows_df['term'].apply(lambda x: x.replace('_', '.'))
total_rows_df['sentence'] = total_rows_df['sentence'].apply(lambda x: x.replace('<strong>', '').replace('</strong>', ''))
total_rows_df['sentence'] = total_rows_df['sentence'].apply(lambda x: x.replace(',', '')) # csv delimiter problems

In [8]:
total_rows_df.to_csv('total_rows.csv', index=False)

In [9]:
total_rows_data = Data('total_rows.csv')

In [10]:
total_rows_distinct = total_rows_data.distinct()

In [11]:
total_rows_data

Unnamed: 0,article_id,pub_date,section_name,web_url,term_category,term,sentence
0,52ce1f2c38f0d81799290366,2014-01-01 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,entrepreneur,An entrepreneur and connoisseur she was also p...
1,52ce1f2c38f0d81799290366,2014-01-01 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,entrepreneur,An entrepreneur and connoisseur she was also p...
2,52c4045238f0d82e35d8e45c,2014-01-01 07:00:45,Business Day,http://boss.blogs.nytimes.com/2014/01/01/the-b...,1,entrepreneur,Colleen DeBaise — and the Story Exchange — bro...
3,52c491cb38f0d82d0ea3260c,2014-01-01 17:08:02,U.S.,http://www.nytimes.com/video/us/10000000262887...,1,executive,Mike Duggan a former hospital executive and pr...
4,52c491cb38f0d82d0ea3260c,2014-01-01 17:08:02,U.S.,http://www.nytimes.com/video/us/10000000262887...,1,executive,Mike Duggan a former hospital executive and pr...
5,52e0941e38f0d87ee88528f8,2014-01-15 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,executive,Alisa R. Doctoroff President; Linda Mirels Cha...
6,52e0941e38f0d87ee88528f8,2014-01-15 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,executive,Alisa R. Doctoroff President; Linda Mirels Cha...
7,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,executive,analysts and industry executives say.
8,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,executive,“People were pulling back when they had to let...
9,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,entrepreneur,DETROIT — Matt Hlavin an entrepreneur in Cleve...


In [12]:
total_rows_distinct

Unnamed: 0,article_id,pub_date,section_name,web_url,term_category,term,sentence
0,52ce1f2c38f0d81799290366,2014-01-01 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,entrepreneur,An entrepreneur and connoisseur she was also p...
1,52c4045238f0d82e35d8e45c,2014-01-01 07:00:45,Business Day,http://boss.blogs.nytimes.com/2014/01/01/the-b...,1,entrepreneur,Colleen DeBaise — and the Story Exchange — bro...
2,52c491cb38f0d82d0ea3260c,2014-01-01 17:08:02,U.S.,http://www.nytimes.com/video/us/10000000262887...,1,executive,Mike Duggan a former hospital executive and pr...
3,52e0941e38f0d87ee88528f8,2014-01-15 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,executive,Alisa R. Doctoroff President; Linda Mirels Cha...
4,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,executive,analysts and industry executives say.
5,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,executive,“People were pulling back when they had to let...
6,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,entrepreneur,DETROIT — Matt Hlavin an entrepreneur in Cleve...
7,52c8a15c38f0d862ec32236d,2014-01-05 00:00:00,Real Estate,http://www.nytimes.com/2014/01/05/realestate/r...,1,entrepreneur,to the thrift and simplicity of our grandfathe...
8,52c6e22438f0d82ae415dd89,2014-01-05 00:00:00,Books,http://www.nytimes.com/2014/01/05/books/review...,1,entrepreneur,squander this thing we call ‘the day’ as quick...
9,52c6e23b38f0d82ae415dd8c,2014-01-05 00:00:00,Books,http://www.nytimes.com/2014/01/05/books/review...,1,entrepreneur,) an entrepreneur (superior service!).


In [13]:
total_rows_distinct.term.count_values()

Unnamed: 0,term,count
3,executive,2960
4,google,600
2,entrepreneur,186
0,amazon.com,185
1,e-bay,81


In [14]:
total_rows_data.count()

In [15]:
total_rows_distinct.count()

In [16]:
data = total_rows_distinct[total_rows_distinct.term != 'executive']

In [17]:
data.term.count_values()

Unnamed: 0,term,count
3,google,600
2,entrepreneur,186
0,amazon.com,185
1,e-bay,81


In [26]:
data[data.term == 'e-bay'].term_category.map(0)

In [27]:
data[data.term == 'e-bay'].term_category

Unnamed: 0,term_category
24,3
183,3
184,3
577,3
815,3
816,3
817,3
818,3
819,3
821,3


In [50]:
data.dshape

dshape("""var * {
  article_id: ?string,
  pub_date: ?datetime,
  section_name: ?string,
  web_url: ?string,
  term_category: int64,
  term: ?string,
  sentence: ?string
  }""")

In [55]:
print(data[3])

distinct(_1)[distinct(_1).term != 'executive'][3]


In [56]:
import pickle
with open('sentiment/be.pickle', 'rb') as be_file:
    be = pickle.load(be_file)

In [62]:
data.dshape

dshape("""var * {
  article_id: ?string,
  pub_date: ?datetime,
  section_name: ?string,
  web_url: ?string,
  term_category: int64,
  term: ?string,
  sentence: ?string
  }""")

In [63]:
ss = data.sentence.map(lambda x: be.predict([x])[0], '?string')

In [76]:
rr = ss.relabel({'sentence': 'sentiment'})

In [77]:
rr

Unnamed: 0,sentiment
0,pos
1,pos
6,pos
7,pos
8,pos
9,pos
11,pos
12,pos
13,pos
14,pos


In [74]:
data

Unnamed: 0,article_id,pub_date,section_name,web_url,term_category,term,sentence
0,52ce1f2c38f0d81799290366,2014-01-01 00:00:00,Paid Death Notices,http://query.nytimes.com/gst/fullpage.html?res...,1,entrepreneur,An entrepreneur and connoisseur she was also p...
1,52c4045238f0d82e35d8e45c,2014-01-01 07:00:45,Business Day,http://boss.blogs.nytimes.com/2014/01/01/the-b...,1,entrepreneur,Colleen DeBaise — and the Story Exchange — bro...
6,52c4b55038f0d82d0ea32638,2014-01-02 00:00:00,Business Day,http://www.nytimes.com/2014/01/02/business/hig...,1,entrepreneur,DETROIT — Matt Hlavin an entrepreneur in Cleve...
7,52c8a15c38f0d862ec32236d,2014-01-05 00:00:00,Real Estate,http://www.nytimes.com/2014/01/05/realestate/r...,1,entrepreneur,to the thrift and simplicity of our grandfathe...
8,52c6e22438f0d82ae415dd89,2014-01-05 00:00:00,Books,http://www.nytimes.com/2014/01/05/books/review...,1,entrepreneur,squander this thing we call ‘the day’ as quick...
9,52c6e23b38f0d82ae415dd8c,2014-01-05 00:00:00,Books,http://www.nytimes.com/2014/01/05/books/review...,1,entrepreneur,) an entrepreneur (superior service!).
11,52c6ea6e38f0d82ae415ddb2,2014-01-05 00:00:00,Real Estate,http://www.nytimes.com/2014/01/05/realestate/a...,1,entrepreneur,did exactly what I wanted in the apartment in ...
12,52cc1e9738f0d86264f604ec,2014-01-07 10:30:22,Business Day,http://boss.blogs.nytimes.com/2014/01/07/today...,1,entrepreneur,Entrepreneurs This entrepreneur is trying to c...
13,52cc414138f0d878bab47df0,2014-01-07 12:58:19,Technology,http://bits.blogs.nytimes.com/2014/01/07/wolfr...,1,entrepreneur,Stephen Wolfram a scientist and entrepreneur a...
14,52cc414138f0d878bab47df0,2014-01-07 12:58:19,Technology,http://bits.blogs.nytimes.com/2014/01/07/wolfr...,1,entrepreneur,unveiled at the International Consumer Electro...


In [67]:
import blaze

In [97]:
ext = blaze.join(data, data)

In [98]:
rr

Unnamed: 0,sentiment
0,pos
1,pos
6,pos
7,pos
8,pos
9,pos
11,pos
12,pos
13,pos
14,pos


In [100]:
sdf = blaze.odo(rr, blaze.DataFrame)

In [125]:
sdf[sdf.sentiment == 'pos'].count()

sentiment    906
dtype: int64

In [126]:
sdf[sdf.sentiment == 'neg'].count()

sentiment    146
dtype: int64

In [102]:
ddf = blaze.odo(data, blaze.DataFrame)

In [103]:
ddf['sent'] = sdf

In [106]:
data2 = Data(ddf)

In [110]:
data2.count()

In [132]:
data2[data2.sent == 'pos'].count()

In [136]:
data2.term_category.count()

In [137]:
100*data2[data2.sent == 'pos'].count()//data2.term_category.count()

In [138]:
by(data2.term_category,
   prop_pos=100*data2[data2.sent == 'pos'].count()//data2.term_category.count(),
   prop_neg=100*data2[data2.sent == 'neg'].count()//data2.term_category.count(),
)a

SyntaxError: invalid syntax (<ipython-input-138-6ee2a1550800>, line 4)

In [148]:
by(data2.term_category,
   total_amount=data2.sent.count(),
   amount_pos=data2[data2.sent == 'pos'].sent.count(),
   amount_neg=data2[data2.sent == 'neg'].sent.count()
)

Unnamed: 0,term_category,amount_neg,amount_pos,total_amount
0,1,13,173,186
1,2,106,620,726
2,3,27,113,140


In [121]:
x = by(data2.sentence, unique=data2.term.nunique() == 1)

In [123]:
x[x.unique == True].count()

In [124]:
x[x.unique == False].count()

In [44]:
repeat = x[x.unique == False]

In [47]:
repeat.count()

In [543]:
blaze.utils.

TypeError: 'module' object is not callable

In [530]:
data.like(sentence='In addition to Uber there is competition*')

Unnamed: 0,article_id,pub_date,section_name,web_url,term_category,term,sentence
3919,5303a06738f0d835dccb3278,2014-02-18 12:59:23,Technology,http://bits.blogs.nytimes.com/2014/02/18/postm...,2,google,In addition to Uber there is competition from ...
3921,5303a06738f0d835dccb3278,2014-02-18 12:59:23,Technology,http://bits.blogs.nytimes.com/2014/02/18/postm...,3,e-bay,In addition to Uber there is competition from ...
3922,5303a06738f0d835dccb3278,2014-02-18 12:59:23,Technology,http://bits.blogs.nytimes.com/2014/02/18/postm...,2,amazon.com,In addition to Uber there is competition from ...


In [523]:
x.unique.count_values()

Unnamed: 0,unique,count
0,1,996
1,2,13
2,3,1


In [494]:
data[data.term == 'e-bay']

Unnamed: 0,article_id,pub_date,section_name,web_url,term_category,term,sentence
24,52cf2fa138f0d8359e12cf29,2014-01-10 00:00:00,Business Day,http://www.nytimes.com/2014/01/10/business/med...,3,e-bay,Pierre Omidyar the founder of eBay has pledged...
183,52deac6238f0d80317840823,2014-01-21 12:17:14,Business Day,http://boss.blogs.nytimes.com/2014/01/21/today...,3,e-bay,Online Ebay is reportedly planning a new marke...
184,52eeff4238f0d82003e2538e,2014-02-03 00:00:00,Sports,http://www.nytimes.com/2014/02/03/sports/footb...,3,e-bay,As for parking passes they were sold for $150 ...
576,52cd55e038f0d878bab480c9,2014-01-08 08:39:33,Business Day,http://dealbook.nytimes.com/2014/01/08/morning...,3,e-bay,Get your own hedge fund fleece on eBay where s...
815,52dea67e38f0d808a7481088,2014-01-21 11:54:30,Business Day,http://dealbook.nytimes.com/2014/01/21/why-bit...,3,e-bay,Bitcoin shares this network effect property wi...
816,52e2c8ed38f0d87ecf265e38,2014-01-25 00:00:00,Automobiles,http://www.nytimes.com/2014/01/25/automobiles/...,3,e-bay,the Candelabra” popped up for sale on eBay thi...
817,52e99d9b38f0d84aab9e9a52,2014-01-30 00:00:00,Home & Garden,http://www.nytimes.com/2014/01/30/garden/extra...,3,e-bay,I use wrought-iron tools I bought on eBay and ...
818,52fe860938f0d826f8460d7c,2014-02-16 00:00:00,Fashion & Style,http://www.nytimes.com/2014/02/16/fashion/wint...,3,e-bay,I knew no suede.” Her Sorels were vintage sour...
819,52fe860938f0d826f8460d7c,2014-02-16 00:00:00,Fashion & Style,http://www.nytimes.com/2014/02/16/fashion/wint...,3,e-bay,She turned to eBay because she said the boots ...
820,530c153f38f0d82a1358d5cf,2014-02-25 00:00:00,Sports,http://www.nytimes.com/2014/02/25/sports/baske...,3,e-bay,Game-used Collins jerseys from his previous te...


In [388]:
iris = Data('iris.csv')

In [389]:
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,"Iris, setosa"
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa
