In [1]:
%matplotlib inline



In [2]:
# pandas options plus some more
import numpy as np
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels import discrete
import re
import pandas as pd
import math 
import csv
import time
import dateutil
from datetime import datetime
import seaborn as sns
import json
from IPython.core.display import HTML
HTML("<style>.container {width:50% !important; }</style>")

pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
pd.options.display.float_format = '{:,.2f}'.format
sns.set_style("whitegrid")
sns.set_context("poster")


In [3]:
# Matplotlib Formatting
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import ticker
millnames = ['',' Thousand',' Million',' Billion',' Trillion']
def millify(n, pos):
    n = float(n)
    millidx = max(0,min(len(millnames)-1,
                        int(math.floor(0 if n == 0 else math.log10(abs(n))/3))))
    thingtoreturn = n / 10**(3 * millidx)
    if thingtoreturn % 1 == 0:
        return '{:.0f}{}'.format(thingtoreturn, millnames[millidx])
    elif thingtoreturn % 0.1 == 0:
        return '{:.1f}{}'.format(thingtoreturn, millnames[millidx])
    else:
        return '{:.2f}{}'.format(thingtoreturn, millnames[millidx])
    
from eventregistry import *
er = EventRegistry()
er.login("rradovanovic@college.harvard.edu","ruski.EVENT1")

Event Registry host: http://eventregistry.org


{u'action': u'success', u'desc': u'Login successful'}

In [4]:
# load S&P data
spdf = pd.read_csv("../data/sp500.csv")
names = np.array(spdf.COMNAM.value_counts()[100:300].index)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
%%time
"""
# get EventRegistry uri-s

uridict = {}
i=0
for name in names:
    uridict[name] = er.getConceptUri(name, lang = "eng")
orig_uridf = pd.DataFrame.from_dict(uridict, orient='index')
orig_uridf.columns = ["uri"]
uridf = orig_uridf.dropna()
""";

In [7]:
#uridf.to_excel('../data/uridf.xlsx')
uridf = pd.read_excel('../data/uridf.xlsx')

## Articles

In [80]:
"""
%%time

# get all articles

q_dict_art = {}
res_dict_art = {}

for i in range(len(uridf)):
    # get concept and uri
    concept = uridf.index[i]
    uri = uridf.uri[i]
    
    # create article query
    q_dict_art[uri] = QueryArticles()
    q_dict_art[uri].addConcept(uri)
    q_dict_art[uri].addRequestedResult(RequestArticlesUriList(count=50000))
    
    # store results
    res_dict_art[uri] = er.execQuery(q_dict[uri])
""";

Wall time: 50.4 s


In [81]:
# save so no more requesting
#with open('../data/res_dict_art.json', 'w') as fp:
#    json.dump(res_dict_art, fp, indent=4)

with open('../data/res_dict_art.json', 'r') as fp:
    res_dict_art = json.load(fp)

In [86]:
# get dict of concepts and articles
conartdict = {}

for uri, res in res_dict_art.items():
    if 'uriList' in res_dict_art[uri]:
        if 'results' in res_dict_art[uri]['uriList']:
            conartdict[uri] = set(res_dict_art[uri]['uriList']['results'])
        else:
            conartdict[uri] = set()
    else:
        conartdict[uri] = set()

In [87]:
# check totals
for uri, articles in conartdict.items():
    print uri, len(articles)

http://en.wikipedia.org/wiki/Ledbetter_v._Goodyear_Tire_&_Rubber_Co. 28
http://en.wikipedia.org/wiki/The_J.M._Smucker_Company 1699
http://en.wikipedia.org/wiki/Vernor_v._Autodesk,_Inc. 5
http://en.wikipedia.org/wiki/Corning_Inc. 4629
http://en.wikipedia.org/wiki/United_States_v._Microsoft_Corp. 663
http://en.wikipedia.org/wiki/Genuine_Parts_Company 322
http://en.wikipedia.org/wiki/Vulcan_Materials_Company 396
http://en.wikipedia.org/wiki/Leonard_v._Pepsico,_Inc. 1
http://fr.wikipedia.org/wiki/Occidental_Petroleum_Corporation 9
http://en.wikipedia.org/wiki/ConocoPhillips 13482
http://en.wikipedia.org/wiki/Gap_Inc. 40785
http://en.wikipedia.org/wiki/Eastman_Chemical_Company 1616
http://en.wikipedia.org/wiki/Abbott_Laboratories 14512
http://it.wikipedia.org/wiki/NVIDIA_Corporation 2
http://en.wikipedia.org/wiki/Realty_Income_Corporation 514
http://en.wikipedia.org/wiki/Mascoma_Corporation 1
http://fr.wikipedia.org/wiki/Kimberly-Clark_Corporation 3
http://en.wikipedia.org/wiki/Whirlpool_Co

In [84]:
# check rough concept matrix
artcovdf = pd.DataFrame(index=uridf.index.unique(), columns=uridf.index.unique())

for i in uridf.index.unique():
    for j in uridf.index.unique():
        uri_i = uridf.loc[i,:].uri
        uri_j = uridf.loc[j,:].uri
        artcovdf.loc[i, j] = len(conartdict[uri_i].intersection(conartdict[uri_j])) / len(conartdict[uri_i].union(conartdict[uri_j]))

## Events

In [66]:
"""
%%time

# get all events

q_dict_evt = {}
res_dict_evt = {}

for i in range(len(uridf)):
    # get concept and uri
    concept = uridf.index[i]
    uri = uridf.uri[i]
    
    # create article query
    q_dict_evt[uri] = QueryEvents()
    q_dict_evt[uri].addConcept(uri)
    q_dict_evt[uri].addRequestedResult(RequestEventsUriList(count=50000))
    
    # store results
    res_dict_evt[uri] = er.execQuery(q_dict_evt[uri])
""";

Wall time: 36.8 s


In [67]:
#with open('../data/res_dict_evt.json', 'w') as fp:
#    json.dump(res_dict_evt, fp, indent=4)

with open('../data/res_dict_evt.json', 'r') as fp:
    res_dict_evt = json.load(fp)

In [78]:
# get dictionary of concepts and article tags
conevtdict = {}

for uri, res in res_dict_evt.items():
    if 'uriList' in res_dict_evt[uri]:
        if 'results' in res_dict_evt[uri]['uriList']:
            conevtdict[uri] = set(res_dict[uri]['uriList']['results'])
        else:
            conevtdict[uri] = set()
    else:
        conevtdict[uri] = set()

In [79]:
# check totals
for uri, events in conevtdict.items():
    print uri, len(events)

http://en.wikipedia.org/wiki/Ledbetter_v._Goodyear_Tire_&_Rubber_Co. 0
http://en.wikipedia.org/wiki/The_J.M._Smucker_Company 1699
http://en.wikipedia.org/wiki/Vernor_v._Autodesk,_Inc. 0
http://en.wikipedia.org/wiki/Corning_Inc. 4628
http://en.wikipedia.org/wiki/United_States_v._Microsoft_Corp. 662
http://en.wikipedia.org/wiki/Genuine_Parts_Company 322
http://en.wikipedia.org/wiki/Vulcan_Materials_Company 396
http://en.wikipedia.org/wiki/Leonard_v._Pepsico,_Inc. 0
http://fr.wikipedia.org/wiki/Occidental_Petroleum_Corporation 9
http://en.wikipedia.org/wiki/ConocoPhillips 13479
http://en.wikipedia.org/wiki/Gap_Inc. 40782
http://en.wikipedia.org/wiki/Eastman_Chemical_Company 1616
http://en.wikipedia.org/wiki/Abbott_Laboratories 14511
http://it.wikipedia.org/wiki/NVIDIA_Corporation 0
http://en.wikipedia.org/wiki/Realty_Income_Corporation 514
http://en.wikipedia.org/wiki/Mascoma_Corporation 0
http://fr.wikipedia.org/wiki/Kimberly-Clark_Corporation 0
http://en.wikipedia.org/wiki/Whirlpool_Cor

In [72]:
# compute event covariance matrix
newscovdf = pd.DataFrame(index=uridf.index.unique(), columns=uridf.index.unique())

for i in uridf.index.unique():
    for j in uridf.index.unique():
        uri_i = uridf.loc[i,:].uri
        uri_j = uridf.loc[j,:].uri
        newscovdf.loc[i, j] = len(conartdict[uri_i].intersection(conartdict[uri_j])) / np.max((1,len(conartdict[uri_i].union(conartdict[uri_j]))))

In [74]:
newscovdf.sum()

KIMBERLY CLARK CORP           0.00
UNITED TECHNOLOGIES CORP      1.00
OCCIDENTAL PETROLEUM CORP     1.00
DANAHER CORP                  1.00
CATERPILLAR INC               1.00
WHIRLPOOL CORP                1.00
NIKE INC                      1.00
MICROSOFT CORP                1.00
COCA COLA CO                  1.00
CORNING INC                   1.00
CARNIVAL CORP                 1.00
GOODYEAR TIRE & RUBBER CO     0.00
PEPSICO INC                   0.00
GAP INC                       1.00
SEMPRA ENERGY                 1.00
YAHOO INC                     0.00
MASCO CORP                    0.00
TARGET CORP                   1.00
COACH INC                     1.00
GENERAL ELECTRIC CO           1.00
ROYAL CARIBBEAN CRUISES LTD   1.00
NEWMONT MINING CORP           1.00
DISNEY WALT CO                1.00
INTERNATIONAL PAPER CO        0.00
NORFOLK SOUTHERN CORP         1.00
HARRIS CORP                   1.00
AUTODESK INC                  0.00
VORNADO REALTY TRUST          1.00
VULCAN MATERIALS CO 