# Named Entity Recognition

## Step 1. Reading JSON string file into a Python array of dictionaries

In [1]:
import json

In [2]:
with open("nyt_articles2.json", 'r') as f:
    newsfeeds = json.load(f)

len(newsfeeds)

1158

In [3]:
import pandas as pd
df = pd.DataFrame(newsfeeds)

## Step 2. Refinitiv Open Calais
Reference: https://developers.refinitiv.com/en/article-catalog/article/intelligent-tagging-extract-information-api-response

### 2.1 Fetch all article URLs, headlines, and abstract

In [15]:
import requests

In [69]:
df.keys()

Index(['Headline', 'Abstract', 'Lead_Paragraph', 'Snippet', 'Published_date',
       'Author', 'News_desk', 'URL', 'Source'],
      dtype='object')

In [75]:
all_URL = []
for index, row in df.iterrows():
    all_URL.append(row["URL"])

In [84]:
all_headlines = []
for index, row in df.iterrows():
    all_headlines.append(row["Headline"])

In [85]:
all_snippets = []
for index, row in df.iterrows():
    all_snippets.append(row["Snippet"])

In [110]:
# print(len(all_URL))
# print(len(all_headlines))
# print(len(all_snippets))

### 2.2 NER tagging for all articles

Replace with your own token! 

In [102]:
all_JsonResponses = [] 

for url in all_URL:
    url = url
    headers = {}
    HTMLResponse = requests.request('GET', url, headers=headers)
    contentText = HTMLResponse.text
    headType = 'text/html'
    
    token = your_own_token
    url = 'https://api-eit.refinitiv.com/permid/calais'
    payload = contentText.encode('utf8')
    headers = {
        'Content-Type': headType,
        'x-ag-access-token': token,
        #'x-calais-selectiveTags': 'Company,CompanyLocation, Movie, Organization, Person',
        'outputformat': 'application/json'
        }

    TRITResponse = requests.request('POST', url, data=payload, headers=headers)
    TRITTextResponse = TRITResponse.text
    
    TRITJsonResponse = json.loads(TRITTextResponse)

    all_JsonResponses.append(TRITJsonResponse)
        
with open("NER_Tagging.json", "w") as json_file:
    json.dump(all_JsonResponses, json_file, indent=4)    

In [117]:
all_headlines[2]

'The 25 Best Films of the 21st Century So Far.'

In [118]:
all_JsonResponses[2]

{'doc': {'info': {'id': 'http://id.opencalais.com/ENK1a2wRBU8VNS0-hgI2*A',
   'ontology': 'http://trit-us-east-1-sharedamd.int.refinitiv.com/owlschema/0.5.2/onecalais.owl.allmetadata.xml',
   'docId': 'http://d.opencalais.com/dochash-1/7b933617-6135-3bf1-a1c5-9713078245f1',
   'document': "<body> The 25 Best Films of the 21st Century So Far.\nJUNE 9, 2017\nA.O. Scott and Manohla Dargis rank the best films made since 2000.\nThe 25 Best Films of the 21st Century\nSo Far.\n查看简体中文版 (Read in Chinese) Leer en español (Read in Spanish)\nWe are now approximately one-sixth of the way through the 21st century, and thousands of movies have already been released. Which means that it’s high time for the sorting – and the fighting – to start. As the chief film critics of The Times, we decided to rank, with some help from cinema savants on Facebook, the top 25 movies that are destined to be the classics of the future. While we’re sure almost everyone will agree with our choices, we’re equally sure th

### 2.2 Create a list of all the recognized entities attached to article title

In [139]:
#Get Entities
entity_dict = {}

print('====Entities====')
print('Article_No, Type, Name, Article Title')
art_no = 0
dict_key = 0
for TRITJsonResponse in all_JsonResponses:
    art_no += 1
    for key in TRITJsonResponse:
        dict_key += 1
        if ('_typeGroup' in TRITJsonResponse[key]):
            if TRITJsonResponse[key]['_typeGroup'] == 'entities':
                entities = str(art_no) + ", " + TRITJsonResponse[key]['_type'] + ", " + TRITJsonResponse[key]['name'] + ", " + all_headlines[art_no]
                entity_dict[dict_key] = entities

====Entities====
Article_No, Type, Name, Article Title


In [144]:
entity_dict[14]

'1, Person, Ellen Burstyn, The 25 Greatest Actors of the 21st Century (So Far)'

In [127]:
#Get Entities
entity_list2 = []

print('====Entities====')
print('Article_No, Type, Name, Article Title')
art_no = 0
for TRITJsonResponse in all_JsonResponses:
    art_no += 1
    for key in TRITJsonResponse:
        if ('_typeGroup' in TRITJsonResponse[key]):
            if TRITJsonResponse[key]['_typeGroup'] == 'entities':
                entities = str(art_no) + ", " + TRITJsonResponse[key]['_type'] + ", " + TRITJsonResponse[key]['name'] + ", " + all_headlines[art_no]
                entity_list2.append(entities)

====Entities====
Article_No, Type, Name, Article Title


## 2.3 Search articles by movie name, company, or person

*Method 1*: Print articles that includes the search value (multiple definitions)

In [206]:
def search_by_value(dictionary, search_value):
    keys_with_search_value = []
    for key, value in dictionary.items():
        if search_value in value:
            keys_with_search_value.append(key)
    return keys_with_search_value

def list_of_output(dictionary, keys_with_search_value):
    output_list = []
    for key in keys_with_search_value:
        output_list.append(f"Key: {key}, Value: {dictionary[key]}")    
    return output_list

def print_output(output_list):
    for values in output_list:
        print(values)

In [208]:
value = 'warner'#input search value

keys_with_search_value = search_by_value(entity_dict, value)
output = list_of_output(entity_dict, keys_with_search_value)
print_output(output)

Key: 92, Value: 1, Company, warner bros, The 25 Greatest Actors of the 21st Century (So Far)
Key: 349, Value: 4, Company, warner bros, ‘Shaun the Sheep Movie’: Designing the Characters
Key: 736, Value: 29, Company, warner bros, Hollywood as Biographer
Key: 886, Value: 32, Company, warner bros, The 25 Greatest Actors of the 21st Century (So Far)
Key: 948, Value: 34, Company, warner bros, Film Series Listings
Key: 1186, Value: 36, Company, warner bros, The 25 Best Films of the 21st Century So Far.
Key: 1620, Value: 42, Company, warner bros, 10 Sundance Movies With Heat
Key: 2867, Value: 111, Company, warner bros, 2018 Academy Awards
Key: 5067, Value: 178, Company, warner bros, The 25 Best Films of the 21st Century So Far.
Key: 6791, Value: 233, Company, warner bros, 2018 Golden Globe Awards
Key: 7097, Value: 237, Company, warner bros, Marlon Brando, Oscar-Winning Actor, Is Dead at 80
Key: 8557, Value: 286, Company, warner bros, Golden Globes: The Projectionist’s Takeaways
Key: 8735, Valu

*Method 2*: Print articles that includes the search value (one definition):

In [209]:
def print_output2 (dictionary, search_value):
    keys_with_search_value = []
    for key, value in dictionary.items():
        if search_value in value:
            keys_with_search_value.append(key)

    output_list = []
    for key in keys_with_search_value:
        output_list.append(f"Key: {key}, Value: {dictionary[key]}")    

    for values in output_list:
        print(values)

In [212]:
value = 'Tan'

print_output2(entity_dict, value)

Key: 540, Value: 22, Movie, Black and Tan, From the BBC, a Watery World
Key: 2099, Value: 84, Movie, Black and Tan, The 25 Best Films of the 21st Century So Far.
Key: 2943, Value: 113, Movie, Black and Tan, 2018 Golden Globe Awards
Key: 5368, Value: 181, Person, Richard Tanne, Aging Stars Don’t Fade Away, They Make More Movies
Key: 6158, Value: 221, Movie, Black and Tan, The 25 Best Films of the 21st Century So Far.
Key: 6506, Value: 227, Person, Richard Tanne, A Flurry of Horrors, Real and Imagined
Key: 6954, Value: 236, Person, Richard Tanne, The Holiday Movie Season in Trailers
Key: 8181, Value: 279, Movie, Black and Tan, The 25 Best Films of the 21st Century So Far.
Key: 8961, Value: 292, Movie, Black and Tan, The 25 Best Films of the 21st Century So Far.
Key: 10794, Value: 339, Movie, Black and Tan, The 25 Best Films of the 21st Century So Far.
Key: 13940, Value: 492, Movie, Black and Tan, The 25 Best Films of the 21st Century So Far.
Key: 14408, Value: 496, Person, Richard Tanne,