## Imports & Liz's Newsroom Code

In [20]:
import pandas as pd
import string
import time
import sys
import requests
from lxml import etree
from newsroom_query import NovusSearchQuery
import newsroom_query as nq
import numpy as np
import novus_document

In [21]:
class NovusSearchQuery(object):

    def __init__(self, name, permids, languages, search_terms):
        self.name = name
        self.permids = permids
        self.languages = languages
        self.search_terms = search_terms
        if not self.permids and not any(search_terms.values()):
            raise ValueError("Cannot create an empty Query: please specify some search terms")

    def get_query_string(self):
        query_components = []
        if self.permids:
            permids_list = [str(permid) for permid in self.permids]
            query_components.append("OAID({})".format(" ".join(permids_list)))
        if self.languages:
            query_components.append("LANGUAGE({})".format(" ".join(self.languages)))
        for field, search_terms in self.search_terms.items():
            query_components.append("{}({})".format(field, " OR ".join(search_terms)))

        query = " & ".join(query_components)

        print 'NovusQuery: ', query
        return query

In [22]:
import sys

__author__ = 'eroman'

import requests
from lxml import etree
from newsroom_query import NovusSearchQuery
import newsroom_query as nq
import numpy as np
import novus_document


def make_prod_client(collection_set="NRALLNEWS"):
    return NewsRoomSearchClient("http://newsroom.ta.int.thomsonreuters.com/newsroom", "TESTUI", "KY3P_DIL_POC",
                                collection_set=collection_set)


class SearchHit(object):
    # A SearchHit represent the (q, D) pair -> (query, Document)

    def __init__(self, query, collection, guid, document_xml=None, headline=None):
        self.query = query
        self.collection = collection
        self.guid = guid
        self.document_xml = document_xml
        self.headline = headline

    def set_document_xml(self, document_xml):
        self.document_xml = document_xml

    def to_csv_row(self):
        # permid, article_snippet, article_date, source_url=(collection, doc_guid), risk_score, risk_type, relevance_score, confidence_score

        # permid
        permid = "{}".format(" ".join(str(p) for p in self.query.permids))

        root = etree.XML(self.document_xml)

        # date
        date = root.xpath("//sort-pub-date")[0].text

        # relevancy & confidence

        xpath_expr = "./n-docbody/document/indexing/extraction-terms/extr-company-block//norm-company-wrap/company-ids/oa-id[text()='{}']".format(permid)
        oaid = root.xpath(xpath_expr)
        if not oaid:
            raise ValueError("No norm_company_wrap elements found in this document, this result should be discarded")
        oaid = oaid[0]
        norm_company_wrap = oaid.getparent().getparent()

        relevance_score = np.nan
        relevancies = norm_company_wrap.xpath("./relevancy")
        if relevancies:
            relevance_score = float(relevancies[0].text)

        confidence = np.nan
        confidences = norm_company_wrap.xpath("./confidence")
        if confidences:
            confidence = float(confidences[0].text)

        return [permid,
                self.document_xml,
                date,
                self.collection,
                self.guid,
                0,
                self.query.name,
                relevance_score,
                confidence]


class NewsRoomSearchClient(object):
    def __init__(self, url, product_id, root_guid, collection_set="NRALLNEWS"):
        self.url = url
        self.product_id = product_id
        self.root_guid = root_guid
        self.collection_set = collection_set


    def get_publications(self, version="v1"):
        headers = {'Content-type':'application/xml',
                   'Accept':'application/json',
                   'x-trmr-product': self.product_id,
                   'x-trmr-rootguid': self.root_guid}
        request_url = self.url + "/{}/ui/publications".format(version)
        r = requests.get(request_url, headers=headers).json()
        return r


    def get_publication_languages(self):
        headers = {'Content-type':'application/json',
                   'Accept':'application/json',
                   'x-trmr-product': self.product_id,
                   'x-trmr-rootguid': self.root_guid}
        request_url = self.url + "/v2/publication/languages/{}".format(self.collection_set)
        r = requests.get(request_url, headers=headers).json()
        return [d['label'] for d in r['languages']]

    def get_document(self, hit):
        headers = {'Content-type':'application/xml',
                   'Accept':'application/xml',
                   'x-trmr-product': self.product_id,
                   'x-trmr-rootguid': self.root_guid}
        request_url = self.url + "/v1/document/{}/{}".format(hit.collection, hit.guid)
        r = requests.get(request_url, headers=headers)
        if hit:
            response_element = etree.fromstring(r.text)
            doc_xml_string = etree.tostring(response_element.xpath("./doc")[0])
            hit.set_document_xml(doc_xml_string)
        return r

    def get_document_with_guid(self, collection, guid):
        headers = {'Content-type':'application/xml',
                   'Accept':'application/xml',
                   'x-trmr-product': self.product_id,
                   'x-trmr-rootguid': self.root_guid}
        request_url = self.url + "/v1/document/{}/{}".format(collection, guid)
        r = requests.get(request_url, headers=headers)

        response_element = etree.fromstring(r.text)
        doc_xml_string = etree.tostring(response_element.xpath("./doc")[0])
        return doc_xml_string

    def search(self, query):
        print query.get_query_string()
        headers = {'Content-type' : 'application/xml',
                   'x-trmr-product': self.product_id,
                   'x-trmr-rootguid': self.root_guid}
        request_url = self.url + "/v1/search"
        xml_body = """<tns:SearchRequest xmlns:tns="http://west.newsroom.westlan.com/newsroom/v1" queryType="BOOLEAN" collectionSet="{}" exactMatch="false" fullWidthToHalfWidthConversionEnabled="false" querySyntax="NATIVE" useQueryWarnings="true" cacheSearchResults="true" maxResultsToReturn="10000" highlightWordRanges="true" idOnly="false" ignoreStopWords="true" returnLatestBookmark="false" filterDuplicates="true" returnSearchSummaryResultOnly="false">
                    <query>
                        <![CDATA[ {} ]]>
                    </query>
                 </tns:SearchRequest>""".format(self.collection_set, query.get_query_string())
        #print xml_body
        return requests.post(request_url, headers=headers, data=xml_body)

    def retrieve_hits(self, name, permid=None, languages=["en"], search_terms={}):
        permid_query_term = [permid] if permid else []
        query = NovusSearchQuery(name, permid_query_term, languages, search_terms)
        hits = []
        r = self.search(query)
        root = etree.fromstring(r.text)
        for element in root.iter("doc"):
            title = element.xpath("n-docbody/document/title-info")[0].xpath("string()")
            hits.append(SearchHit(query, element.get("collection"), element.get("guid"), headline=title))

        return hits




In [23]:
from datetime import datetime, timedelta
import calendar

def has_exact_date(row):
    """
    check the excistence of the exact date
    """
    answer = not(row["exact_date"] == "00")
    return answer
    
def generate_exact_dates(exact_date, delta_days_before = 8, delta_days_after = 45):
    """
    return a 14-day interval for an exact date
    """
    date_object = datetime.strptime(exact_date, '%d %B %Y')
    date_from = date_object - timedelta(days = delta_days_before)  
    date_to = date_object + timedelta(days = delta_days_after)  
    return date_from, date_to


def generate_rough_dates(rough_date_in):
    date_from = datetime.strptime(rough_date_in, '%B %Y').date()
    month = date_from.month
    year = date_from.year
    last_day =  calendar.monthrange(year,month)[1]
    date_to = datetime.strptime(str(year)+str(month)+str(last_day), '%Y%m%d').date()
    rough_date = date_from.strftime("%Y%m")
    return date_from, date_to, rough_date


# from http://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
from datetime import timedelta, date

def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

#start_date = date(2013, 1, 1)
#end_date = date(2015, 6, 2)


-------- Up until this point, it is identical to Amir's Code -------

## Class 2 Exact Date

In [24]:
## function of exact_date
def generate_class2_exact_dates(exact_date, delta_days_before = 8, delta_days_after = 45, class2_before_window = 120, class2_after_window = 120):
    """
    return a 14-day interval for an exact date
    """
    date_object = datetime.strptime(exact_date, '%d %B %Y')
    class2_preevent_end = date_object - timedelta(days = delta_days_before + 1)
    class2_preevent_start = class2_preevent_end - timedelta(days = class2_before_window)
    class2_postevent_start = date_object + timedelta(days = delta_days_after + 1)  
    class2_postevent_end = class2_postevent_start +  timedelta(days = class2_after_window) 
    
    return class2_preevent_start, class2_preevent_end, class2_postevent_start, class2_postevent_end

In [25]:
generate_class2_exact_dates('20 June 2010')

(datetime.datetime(2010, 2, 11, 0, 0),
 datetime.datetime(2010, 6, 11, 0, 0),
 datetime.datetime(2010, 8, 5, 0, 0),
 datetime.datetime(2010, 12, 3, 0, 0))

## Class 2 Rough Date

In [27]:
def generate_class2_rough_dates(rough_date_in, class2_preevent_window = 120, class2_postevent_window = 120):
    date_from, date_to, rough_date = generate_rough_dates(rough_date_in)
    class2_preevent_end = date_from - timedelta(days = 1)
    class2_preevent_start = class2_preevent_end - timedelta(days = class2_preevent_window)
    class2_postevent_start = date_to + timedelta(days = 1)  
    class2_postevent_end = class2_postevent_start +  timedelta(days = class2_postevent_window) 
    return class2_preevent_start, class2_preevent_end, class2_postevent_start, class2_postevent_end

In [28]:
generate_class2_rough_dates('December 2010')

(datetime.date(2010, 8, 2),
 datetime.date(2010, 11, 30),
 datetime.date(2011, 1, 1),
 datetime.date(2011, 5, 1))

In [31]:
## For all text, replace ; with ,
## Use ; as separator

client = make_prod_client()

# read events file
df = pd.read_csv('../data/usbank_permid.csv',index_col=0)

# output file:
a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
a.to_csv("./output_dir/usbank_class1.csv")

## Remove # before run. Otherwise, it won't save files

In [None]:
start_time = datetime.now()

for index, row in df[623:].iterrows():
    
    if (index % 10 == 0) & (index > 0):
        time.sleep(300)
        
    output_name = './output_dir/class2_' + str(index) + '.csv'
    
    a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
    a.to_csv(output_name)

    print index
    print row["id"]
    permid = string.split(row["id"], "-")[1]
    print row["exact_date"]
    class2_preevent_start, class2_preevent_end, class2_postevent_start, class2_postevent_end = generate_class2_exact_dates(row["exact_date"])
    print("found exact date", permid)
    class2_preevent_daterange = list(daterange(class2_preevent_start, class2_preevent_end))
    class2_postevent_daterange = list(daterange(class2_postevent_start, class2_postevent_end))
    for single_date in class2_preevent_daterange + class2_postevent_daterange:
        a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
        exact_date = single_date.strftime("%Y%m%d")
        search_terms = {'DATE':[exact_date]}
        hits = client.retrieve_hits("corning_query", permid, search_terms=search_terms)
        for hit in hits:
            client.get_document(hit)
            doc = novus_document.NewsroomNovusDocument(hit.document_xml)
            article = [permid, doc.date(), doc.headline_or_title(), doc.get_stripped_text().replace("\n"," ").replace(';',',')]
            a1 = pd.DataFrame(article, index=["permid", "date", "headline", "txt"]).T
            a = pd.concat([a,a1])
        a.to_csv("./output_dir/usbank_class2.csv", mode='a', header=False, encoding='utf-8')
        a.to_csv(output_name, mode='a',header=False, encoding='utf-8')

end_time = datetime.now()
print 'Elapsed:', end_time - start_time

In [14]:
## For all text, replace ; with ,
## Use ; as separator

client = make_prod_client()

# read events file
df_overlap = pd.read_csv('../data/usbank_overlap.csv')
#df = df.reset_index(drop=True)
#df['date'] = df['date'].map(lambda x: convert_rough_date(x))
#df['exact_date'] = df['exact_date'].map(lambda x: convert_exact_date(x) if x == x else "00")

# output file:
a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
a.to_csv("./class2_tagged.csv")

The server tends to get slow after a while. So I made it rest for 1 minute after each run of 10 queries.
Note that for Class 2, it takes 15 - 25 min per 10 queries.

1. Extracts time interval for two Class 2 windows using generate_class2_exact_date or generate_class2_rough_date
2. Add two 

In [None]:
start_time = datetime.now()

for index, row in df.iterrows():
    
    if (index % 10 == 0) & (index > 0):
        time.sleep(60)
        
    #output_name = './class2_article_event ' + str(index) + '.csv'
    
    #a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
    #a.to_csv(output_name)

    print index
    print row["id"]
    permid = string.split(row["id"], "-")[1]
    if has_exact_date(row):
        print row["exact_date"]
        class2_preevent_start, class2_preevent_end, class2_postevent_start, class2_postevent_end = generate_class2_exact_dates(row["exact_date"])
        print("found exact date", permid)
    else:
        print row['date']
        print("got a rough date", permid)
        class2_preevent_start, class2_preevent_end, class2_postevent_start, class2_postevent_end = generate_class2_rough_dates(row["date"])
    class2_preevent_daterange = list(daterange(class2_preevent_start, class2_preevent_end))
    class2_postevent_daterange = list(daterange(class2_postevent_start, class2_postevent_end))
    for single_date in class2_preevent_daterange + class2_postevent_daterange:
        a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
        exact_date = single_date.strftime("%Y%m%d")
        search_terms = {'DATE':[exact_date]}
        hits = client.retrieve_hits("corning_query", permid, search_terms=search_terms)
        for hit in hits:
            client.get_document(hit)
            doc = novus_document.NewsroomNovusDocument(hit.document_xml)
            article = [permid, doc.date(), doc.headline_or_title(), doc.get_stripped_text().replace("\n"," ").replace(';',',')]
            a1 = pd.DataFrame(article, index=["permid", "date", "headline", "txt"]).T
            a = pd.concat([a,a1])
        #a.to_csv("class2_tagged.csv", mode='a', header=False, encoding='utf-8')
        #a.to_csv(output_name, mode='a',header=False, encoding='utf-8')

end_time = datetime.now()
print 'Elapsed:', end_time - start_time

0
https://permid.org/1-4295903973
07 September 2008
('found exact date', '4295903973')
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080501)
OAID(4295903973) & LANGUAGE(en) & DATE(20080501)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080501)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080502)
OAID(4295903973) & LANGUAGE(en) & DATE(20080502)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080502)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080503)
OAID(4295903973) & LANGUAGE(en) & DATE(20080503)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080503)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080504)
OAID(4295903973) & LANGUAGE(en) & DATE(20080504)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080504)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080505)
OAID(4295903973) & LANGUAGE(en) & DATE(20080505)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080505)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & D

In [None]:
import time

start_time = datetime.now()

for index, row in df_eikon.iterrows():
    
    if (index % 10 == 0) & (index > 0):
        time.sleep(60)
    
    print index
    output_name = './output_dir/class1_eikon_' + str(index) + '.csv'
    
    a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
    a.to_csv(output_name)

    print row["id"]
    permid = string.split(row["id"], "-")[1]
    date_from, date_to = generate_exact_dates(row["exact_date"])
    print(permid, row["exact_date"], date_from, date_to)
    for single_date in daterange(date_from, date_to):
        a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
        exact_date = single_date.strftime("%Y%m%d")
        search_terms = {'DATE':[exact_date]}
        hits = client.retrieve_hits("corning_query", permid, search_terms=search_terms)
        for hit in hits:
            client.get_document(hit)
            doc = novus_document.NewsroomNovusDocument(hit.document_xml)
            article = [permid, doc.date(), doc.headline_or_title(), doc.get_stripped_text().replace("\n"," ").replace(';',',')]
            a1 = pd.DataFrame(article, index=["permid", "date", "headline", "txt"]).T
            a = pd.concat([a,a1])
        a.to_csv("./output_dir/usbank_class1_eikon.csv", mode='a', header=False, encoding='utf-8')
        a.to_csv(output_name,mode='a',header=False, encoding='utf-8')

    time.sleep(0.5)
    
end_time = datetime.now()
print 'Elapsed:', end_time - start_time

0
https://permid.org/1-4295903973
('4295903973', '07 September 2008', datetime.datetime(2008, 8, 30, 0, 0), datetime.datetime(2008, 10, 22, 0, 0))
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080830)
OAID(4295903973) & LANGUAGE(en) & DATE(20080830)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080830)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080831)
OAID(4295903973) & LANGUAGE(en) & DATE(20080831)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080831)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080901)
OAID(4295903973) & LANGUAGE(en) & DATE(20080901)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080901)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080902)
OAID(4295903973) & LANGUAGE(en) & DATE(20080902)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080902)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DATE(20080903)
OAID(4295903973) & LANGUAGE(en) & DATE(20080903)
NovusQuery:  OAID(4295903973) & LANGUAGE(en) & DAT

In [17]:
## For all text, replace ; with ,
## Use ; as separator

client = make_prod_client()

# read events file
df_eikon = pd.read_csv('../data/usbank_only_in_eikon.csv')
#df = df.reset_index(drop=True)
#df['date'] = df['date'].map(lambda x: convert_rough_date(x))
#df['exact_date'] = df['exact_date'].map(lambda x: convert_exact_date(x) if x == x else "00")

# output file:
a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
a.to_csv("./output_dir/usbank_class1_eikon.csv")

In [16]:
start_time = datetime.now()


a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
a.to_csv("./output_dir/usbank_class2_overlap.csv")

for index, row in df_overlap.iterrows():
    
    if (index % 10 == 0) & (index > 0):
        time.sleep(60)
        
    output_name = './output_dir/class2_overlap_' + str(index) + '.csv'
    
    a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
    a.to_csv(output_name)

    print index
    print row["id"]
    permid = string.split(row["id"], "-")[1]
    print row["exact_date"]
    class2_preevent_start, class2_preevent_end, class2_postevent_start, class2_postevent_end = generate_class2_exact_dates(row["exact_date"])
    print("found exact date", permid)
    class2_preevent_daterange = list(daterange(class2_preevent_start, class2_preevent_end))
    class2_postevent_daterange = list(daterange(class2_postevent_start, class2_postevent_end))
    for single_date in class2_preevent_daterange + class2_postevent_daterange:
        a = pd.DataFrame(index=["permid", "date", "headline", "txt"]).T
        exact_date = single_date.strftime("%Y%m%d")
        search_terms = {'DATE':[exact_date]}
        hits = client.retrieve_hits("corning_query", permid, search_terms=search_terms)
        for hit in hits:
            client.get_document(hit)
            doc = novus_document.NewsroomNovusDocument(hit.document_xml)
            article = [permid, doc.date(), doc.headline_or_title(), doc.get_stripped_text().replace("\n"," ").replace(';',',')]
            a1 = pd.DataFrame(article, index=["permid", "date", "headline", "txt"]).T
            a = pd.concat([a,a1])
        a.to_csv("./output_dir/usbank_class2_overlap.csv", mode='a', header=False, encoding='utf-8')
        a.to_csv(output_name, mode='a',header=False, encoding='utf-8')

end_time = datetime.now()
print 'Elapsed:', end_time - start_time

0
https://permid.org/1-4296525121
27 February 2009
('found exact date', '4296525121')
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081021)
OAID(4296525121) & LANGUAGE(en) & DATE(20081021)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081021)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081022)
OAID(4296525121) & LANGUAGE(en) & DATE(20081022)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081022)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081023)
OAID(4296525121) & LANGUAGE(en) & DATE(20081023)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081023)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081024)
OAID(4296525121) & LANGUAGE(en) & DATE(20081024)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081024)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081025)
OAID(4296525121) & LANGUAGE(en) & DATE(20081025)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DATE(20081025)
NovusQuery:  OAID(4296525121) & LANGUAGE(en) & DA