## Import Libraries

In [1]:
import sys
import requests
import os
import json
from nltk import tokenize
import re
import time
import pandas as pd
import time

## Production version url
calais_url = 'http://api.apigarden-qa.int.thomsonreuters.com/permid/calais'

You can only parse a text file smaller than 100KB to Calais API. So this function consumes string as an input, saves the file, and send the file to the server.

There are 4 possible Status Code:
Error 429: Either sending too many queries or reached daily quota
Error 500: Server side error
Error 413: File is too big
Code 200: Working

In [2]:
def get_json(input_text, access_token = 'YOUR_ACCESS_TOKEN'):
    ## You can only parse a text file to Calais API.
    ## Hence, write a dummy text file with the input string as its content.
    with open(r'./input.txt', 'w') as inputfile:
        inputfile.write(input_text)
    input_file = './input.txt' ## or input_file = inputfile
    
    headers = {'X-AG-Access-Token' : access_token, 'Content-Type' : 'text/raw', 'outputformat' : 'application/json'}
    
    files = {'file': open(input_file, 'rb')}
    
    try:
        response = requests.post(calais_url, files=files, headers=headers, timeout=80)
        print 'status code: %s' % response.status_code
        if (response.status_code == 429) | (response.status_code == 500):
            print "Retrying..."
            time.sleep(1.0)
            return get_json(input_text)
        else:
            content = response.content
            json_content = json.loads(content)
            return(json_content)
        
    ## When you get error 413, the code will break the string into a few pieces, 
    # send them to server separately, get result and merge the result
    except:
        print "Error 413"
        final_json = {}
        num_segment = (len(input_text) / 50000)+1
        for i in range(0,num_segment):
            txt = input_text[(50000 * i):(50000 * (i+1))]
            if i == num_segment:
                txt = input_text[(50000 * (i+1)):]
            json_new = get_json(txt)
            final_json.update(json_new)
            print str(i+1) + '/' + str(num_segment)
        return(final_json)

The output json file has multiple "comphash" tags. Each tag corresponds to one tagged entity.
"Instance" is where exact mentions of entity can be found
"Resolutions" is where PermID can be found

In [3]:
def get_entity_summary(input_json):
    if input_json == {}:
        return []
    
    key_list = input_json.keys()
    
    summary_list = []

    for key in key_list:
        key_dict = input_json[key]
        try:
            m = key_dict['instances']
            for j in m:
                mentioned_name = j['exact']
                for i in key_dict['resolutions']:
                    if i.has_key('topmostPublicParent') != True:
                        exact_name = i['name']
                        permid = int(i['permid'])
                        summary_list.append((permid, mentioned_name, exact_name))
        except:
            pass
    
    return(list(set(summary_list)))

From get_entity_summary, it retreives every instance where PermID matches input PermID

In [4]:
def returnEntities(input_text, permid):
    new_json = get_json(input_text)
    entities_summary = get_entity_summary(new_json)
    
    if new_json == {}:
        return([])
    
    if entities_summary == []:
        return([])
    
    relevant_entities = []
    
    for e in entities_summary:
        if e[0] == permid:
            relevant_entities.append(e[1])
    return(relevant_entities)

1. Identifies all instances where we have a period followed by no space.
2. Safely assume \d+.\d+ is ok.
3. Remove 2) from 1)
4. Replace "." with ". "
5. Convert string into a list of sentences

In [5]:
def clean_paragraph(input_para):
    ## Only loophole is when we have a sentence ending with a number and succeeding senetence begins with a number.
    ## Ex: "Michael is 23. 10 students are younger than Mike."
    possible_exceptions = set(re.findall('([0-9A-Za-z]+[.][0-9A-Za-z]+)',input_para))
    allowed_list = set(re.findall('\d[.]\d',input_para))
    
    exception_list = list(possible_exceptions.difference(allowed_list))
    
    new_list = [e.replace('.','. ') for e in exception_list]
    
    replace_dict = dict(zip(exception_list, new_list))
    
    new_para = input_para
    
    for old,new in replace_dict.items():
        new_para = new_para.replace(old,new)
        new_para = new_para.replace(';',',') ## Using ; as separator, so replacing any ; with ,
        
    return new_para

Check which sentence contains the entity of our interest

In [6]:
from nltk import tokenize

def returnSentences(input_text, permid):
    clean_para = clean_paragraph(input_text)
    sent_list= tokenize.sent_tokenize(clean_para.decode('utf_8'))
    clean_sent_list = [sent.strip() for sent in sent_list]
    
    entity_list = returnEntities(clean_para, permid) 
    
    if entity_list == []:
        return []
    
    relevant_sent_list = [sent for sent in clean_sent_list 
                                if any(entity.lower() in sent.lower() for entity in entity_list)]
    
    return relevant_sent_list

Returns the output from returnEntity and returnSentences.
This is particularly useful when we want to loop through each row of dataframe.
If we run returnEntity and returnSentences separately, we send 2 queries per row.

In [7]:
from nltk import tokenize

def returnSentences2(input_text, permid):
    clean_para = clean_paragraph(input_text)
    sent_list= tokenize.sent_tokenize(clean_para.decode('utf_8'))
    clean_sent_list = [sent.strip() for sent in sent_list]
    
    entity_list = returnEntities(clean_para, permid) 

    if entity_list == []:
        return [], []
    
    relevant_sent_list = [sent for sent in clean_sent_list 
                                if any(entity.lower() in sent.lower() for entity in entity_list)]
    
    return entity_list, relevant_sent_list

In [8]:
articles_df = pd.read_csv('../done data/American Bank/usbank_class1_full.csv',sep=';')
articles_df.tail()

Unnamed: 0,permid,date,headline,txt
9389,4295906733,"Saturday, December 13, 2008",Americas Structured Equity Issue,"Throughout the past year, financial instituti..."
9390,4295906733,"Tuesday, December 16, 2008",Harborfields beats Huntington in girls basketball,Dec. 16--Harborfields had the ball with ...
9391,4295906733,"Monday, December 22, 2008",Lending crunch adds to LI's economic troubles,Dec. 22--As banks and other mortgage len...
9392,4295906733,"Tuesday, December 23, 2008",Banks refuse to say how bailout money is being...,Where's the money going? But after receiving b...
9393,4295906733,"Saturday, December 27, 2008",Mountain of wood chips a headache for two towns,Dec. 27--A 30-foot mountain of wood chip...


In [9]:
articles_df['entities'] = None
articles_df['sentences'] = None

In [10]:
new_df = pd.DataFrame(index=[u'permid', u'date', u'headline', u'txt', u'entities',
       u'sentences']).T
new_df.to_csv('../done data/American Bank/class1_full.csv')

In [12]:
import math
import time
from datetime import datetime

start_time = datetime.now()

for index, row in articles_df[5000:].iterrows():
    print index
    
    if (index >0) & (index % 500 == 0):
        time.sleep(120)
        
    new_df = pd.DataFrame(index=[u'permid', u'date', u'headline', u'txt', u'entities',u'sentences']).T
    
    input_str = row['txt']
    input_permid = row['permid']
    
    if input_str != input_str: ## Check for NaN
        entity_list = []
        sent_list = []
        input_str = ''
        
    else:
        entity_list, sent_list = returnSentences2(input_str,input_permid)  

    articles_df['entities'][index] = entity_list
    articles_df['sentences'][index] = sent_list
    
    new_summary = [input_permid, row['date'], row['headline'], input_str.replace(';',','), entity_list, sent_list]
    new_df1 = pd.DataFrame(new_summary, index = [u'permid', u'date', u'headline', u'txt', u'entities',u'sentences']).T
    new_df = pd.concat([new_df,new_df1])
    
    new_df.to_csv('../done data/American Bank/class1_full.csv',mode='a',header=False,encoding='utf-8')
    
    time.sleep(0.5) 
    
end_time = datetime.now()
print 'Elapsed:', end_time - start_time

5000
status code: 200


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


5001
status code: 200
5002
status code: 200
5003
status code: 200
5004
status code: 200
5005
status code: 200
5006
status code: 200
5007
status code: 200
5008
status code: 200
5009
status code: 200
5010
status code: 200
5011
status code: 200
5012
status code: 200
5013
status code: 200
5014
status code: 200
5015
status code: 200
5016
status code: 200
5017
status code: 200
5018
status code: 200
5019
status code: 200
5020
status code: 200
5021
status code: 200
5022
status code: 200
5023
status code: 200
5024
status code: 200
5025
status code: 200
5026
status code: 200
5027
status code: 200
5028
status code: 200
5029
status code: 200
5030
status code: 200
5031
status code: 200
5032
status code: 200
5033
status code: 200
5034
status code: 200
5035
status code: 200
5036
status code: 200
5037
status code: 200
5038
status code: 200
5039
status code: 200
5040
status code: 500
Retrying...
status code: 200
5041
status code: 200
5042
status code: 200
5043
status code: 200
5044
status code: 200
504

In [13]:
articles_df.to_csv('../done data/usbank_class1_df.csv',sep=';',index=False,encoding='utf-8')

In [14]:
class2_articles_df = pd.read_csv('../done data/American Bank/usbank_class2_full.csv',sep=';')

In [15]:
newer_df = pd.DataFrame(index=[u'permid', u'date', u'headline', u'txt', u'entities',
       u'sentences']).T
newer_df.to_csv('../done data/American Bank/class2_full.csv')

In [16]:
class2_articles_df['entities'] = None
class2_articles_df['sentences'] = None

In [18]:
## import math
import time
from datetime import datetime

start_time = datetime.now()

for index, row in class2_articles_df[12785:15000].iterrows():
    print index
    
    if (index >0) & (index % 250 == 0):
        time.sleep(120)
    
    newer_df = pd.DataFrame(index=[u'permid', u'date', u'headline', u'txt', u'entities',u'sentences']).T
    
    input_str = row['txt']
    input_permid = row['permid']
    
    if input_str != input_str: ## Check for NaN
        entity_list = []
        sent_list = []
        input_str = ''
        
    else:
        entity_list, sent_list = returnSentences2(input_str,input_permid)  

    class2_articles_df['entities'][index] = entity_list
    class2_articles_df['sentences'][index] = sent_list
    
    new_summary = [input_permid, row['date'], row['headline'], input_str.replace(';',','), entity_list, sent_list]
    newer_df1 = pd.DataFrame(new_summary, index = [u'permid', u'date', u'headline', u'txt', u'entities',u'sentences']).T
    newer_df = pd.concat([newer_df,newer_df1])
    
    newer_df.to_csv('../done data/American Bank/class2_full.csv',mode='a',header=False,encoding='utf-8')
    
    time.sleep(0.5) 
    
end_time = datetime.now()
print 'Elapsed:', end_time - start_time

12785
status code: 500
Retrying...
status code: 200


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


12786
status code: 200
12787
status code: 200
12788
status code: 200
12789
status code: 200
12790
status code: 200
12791
status code: 200
12792
status code: 200
12793
status code: 200
12794
status code: 200
12795
status code: 200
12796
status code: 200
12797
status code: 200
12798
status code: 200
12799
status code: 200
12800
status code: 200
12801
status code: 200
12802
status code: 200
12803
status code: 200
12804
status code: 200
12805
status code: 200
12806
status code: 500
Retrying...
status code: 200
12807
status code: 200
12808
status code: 200
12809
status code: 200
12810
status code: 200
12811
status code: 500
Retrying...
status code: 200
12812
status code: 200
12813
status code: 200
12814
status code: 200
12815
status code: 200
12816
status code: 200
12817
status code: 200
12818
status code: 200
12819
status code: 200
12820
status code: 200
12821
status code: 500
Retrying...
status code: 500
Retrying...
status code: 200
12822
status code: 200
12823
status code: 200
12824
stat