# Named Entity Recognition

## Step 1. Reading JSON string file into a Python array of dictionaries

In [8]:
import json

In [9]:
with open("nyt_articles2.json", 'r') as f:
    newsfeeds = json.load(f)

len(newsfeeds)

1158

In [10]:
import pandas as pd
df = pd.DataFrame(newsfeeds)

## Step 2. Refinitiv Open Calais
Reference: https://developers.refinitiv.com/en/article-catalog/article/intelligent-tagging-extract-information-api-response

### 2.1 Fetch all article URLs, headlines, and abstract

In [11]:
import requests

In [12]:
df.keys()

Index(['Headline', 'Abstract', 'Lead_Paragraph', 'Snippet', 'Published_date',
       'Author', 'News_desk', 'URL', 'Source'],
      dtype='object')

In [13]:
all_URL = []
for index, row in df.iterrows():
    all_URL.append(row["URL"])

In [14]:
all_headlines = []
for index, row in df.iterrows():
    all_headlines.append(row["Headline"])

In [15]:
all_snippets = []
for index, row in df.iterrows():
    all_snippets.append(row["Snippet"])

In [110]:
# print(len(all_URL))
# print(len(all_headlines))
# print(len(all_snippets))

### 2.2 NER tagging for all articles

Replace with your own token! 

In [102]:
all_JsonResponses = [] 

for url in all_URL:
    url = url
    headers = {}
    HTMLResponse = requests.request('GET', url, headers=headers)
    contentText = HTMLResponse.text
    headType = 'text/html'
    
    token = your_own_token
    url = 'https://api-eit.refinitiv.com/permid/calais'
    payload = contentText.encode('utf8')
    headers = {
        'Content-Type': headType,
        'x-ag-access-token': token,
        #'x-calais-selectiveTags': 'Company,CompanyLocation, Movie, Organization, Person',
        'outputformat': 'application/json'
        }

    TRITResponse = requests.request('POST', url, data=payload, headers=headers)
    TRITTextResponse = TRITResponse.text
    
    TRITJsonResponse = json.loads(TRITTextResponse)

    all_JsonResponses.append(TRITJsonResponse)
        
with open("NER_Tagging.json", "w") as json_file:
    json.dump(all_JsonResponses, json_file, indent=4)    

In [117]:
all_headlines[2]

'The 25 Best Films of the 21st Century So Far.'

In [118]:
all_JsonResponses[2]

{'doc': {'info': {'id': 'http://id.opencalais.com/ENK1a2wRBU8VNS0-hgI2*A',
   'ontology': 'http://trit-us-east-1-sharedamd.int.refinitiv.com/owlschema/0.5.2/onecalais.owl.allmetadata.xml',
   'docId': 'http://d.opencalais.com/dochash-1/7b933617-6135-3bf1-a1c5-9713078245f1',
   'document': "<body> The 25 Best Films of the 21st Century So Far.\nJUNE 9, 2017\nA.O. Scott and Manohla Dargis rank the best films made since 2000.\nThe 25 Best Films of the 21st Century\nSo Far.\n查看简体中文版 (Read in Chinese) Leer en español (Read in Spanish)\nWe are now approximately one-sixth of the way through the 21st century, and thousands of movies have already been released. Which means that it’s high time for the sorting – and the fighting – to start. As the chief film critics of The Times, we decided to rank, with some help from cinema savants on Facebook, the top 25 movies that are destined to be the classics of the future. While we’re sure almost everyone will agree with our choices, we’re equally sure th

### 2.2 Create a list of all the recognized entities attached to article title

In [16]:
#Get Entities
entity_dict = {}

print('====Entities====')
print('Article_No, Type, Name, Article Title')
art_no = 0
dict_key = 0
for TRITJsonResponse in all_JsonResponses:
    art_no += 1
    for key in TRITJsonResponse:
        dict_key += 1
        if ('_typeGroup' in TRITJsonResponse[key]):
            if TRITJsonResponse[key]['_typeGroup'] == 'entities':
                entities = str(art_no) + ", " + TRITJsonResponse[key]['_type'] + ", " + TRITJsonResponse[key]['name'] + ", " + all_headlines[art_no]
                entity_dict[dict_key] = entities

====Entities====
Article_No, Type, Name, Article Title


In [68]:
entity_dict

{14: '1, Person, Ellen Burstyn, The 25 Greatest Actors of the 21st Century (So Far)',
 15: '1, Person, Dwayne Johnson, The 25 Greatest Actors of the 21st Century (So Far)',
 16: '1, Person, Eugene Levy, The 25 Greatest Actors of the 21st Century (So Far)',
 17: '1, Person, Bill Pullman, The 25 Greatest Actors of the 21st Century (So Far)',
 18: '1, Person, Todd Solondz, The 25 Greatest Actors of the 21st Century (So Far)',
 19: '1, Company, Amazon, The 25 Greatest Actors of the 21st Century (So Far)',
 20: '1, Position, farmer, The 25 Greatest Actors of the 21st Century (So Far)',
 21: '1, Position, president, The 25 Greatest Actors of the 21st Century (So Far)',
 22: '1, ProvinceOrState, Jones County, The 25 Greatest Actors of the 21st Century (So Far)',
 23: '1, NaturalFeature, Lonely Island, The 25 Greatest Actors of the 21st Century (So Far)',
 24: '1, Person, Ben Foster, The 25 Greatest Actors of the 21st Century (So Far)',
 25: '1, Person, Diane Keaton, The 25 Greatest Actors of 

## 2.3 Search articles by movie name, company, or person

Print all articles that includes the search value (input your own search value): 

In [57]:
def print_output (dictionary, search_value):
    keys_with_search_value = []
    for key, value in dictionary.items():
        if search_value in value:
            keys_with_search_value.append(key)

    output_list = []
    for key in keys_with_search_value:
        output_list.append(f"Dict_Key: {key}, Article: {dictionary[key]}")    

    print('=====Article Titles=====')
    for values in output_list:
        result = values.split(',')[-1].strip()
        print(result)

In [58]:
value = 'warner'

print_output(entity_dict, value)

=====Article Titles=====
The 25 Greatest Actors of the 21st Century (So Far)
‘Shaun the Sheep Movie’: Designing the Characters
Hollywood as Biographer
The 25 Greatest Actors of the 21st Century (So Far)
Film Series Listings
The 25 Best Films of the 21st Century So Far.
10 Sundance Movies With Heat
2018 Academy Awards
The 25 Best Films of the 21st Century So Far.
2018 Golden Globe Awards
Is Dead at 80
Golden Globes: The Projectionist’s Takeaways
What the Movies Taught Me About Being a Woman
2018 Academy Awards
How Will Movies Survive the Next 10 Years?
28 Films for Black History Month
New Books Look at the ‘Peanuts’ Gang
‘Shaun the Sheep Movie’: Designing the Characters
’ in Two Minutes
You Know These 20 Movies. Now Meet the Women Behind Them
28 Films for Black History Month
David Bowie in the Movies
How Will Movies Survive the Next 10 Years?
The 25 Best Films of the 21st Century So Far.
What the Movies Taught Me About Being a Woman
The 25 Best Films of the 21st Century So Far.
The 25 G

Print everything tagged based on movie title: 

In [39]:
# Create a new dictionary to store the structured articles
structured_articles = {}

for article_id, article_content in entity_dict.items():
    tag_info = article_content.split(', ')
    article_number = int(tag_info[0])
    tag_type = tag_info[1]
    value = tag_info[2]
    title = tag_info[3]

    # If the article number already exists in the structured_articles dictionary, update the 'Movies' key
    if article_number in structured_articles:
        if tag_type == 'Movie':
            structured_articles[article_number]['Movies'].append(value)
    else:
        # If the article number doesn't exist, create a new entry with the 'Movies' key
        structured_articles[article_number] = {
            'Article': article_number,
            'Title': title,
            'Movies': [] if tag_type == 'Movie' else [],
        }

# Now, structured_articles is a dictionary with article numbers as keys and their associated information, including movie tags in the 'Movies' key
#print(structured_articles)

In [66]:
def search_movie_in_articles(articles_dict, movie_name):
    found_articles = []
    for article_info in articles_dict.values():
        if 'Movies' in article_info and movie_name in article_info['Movies']:
            found_articles.append(article_info)
    return found_articles

In [70]:
def print_tags(entity_dict, article_number):
    tags_by_article = {}
    for key, value in entity_dict.items():
        tag_info = value.split(', ')
        current_article_number = int(tag_info[0])
        if current_article_number == article_number:
            tag_type = tag_info[1]
            tag_value = tag_info[2]
            tags_by_article.setdefault(tag_type, []).append(tag_value)

    # Print the tags for the specified article number
    print(f"Tags for Article Number {article_number}:")
    for tag_type, tag_values in tags_by_article.items():
        print(f"{tag_type}s: {', '.join(tag_values)}")

In [76]:
movie_name = 'Toy Story 3'

# Print the related articles and their tags
for article_info in search_movie_in_articles(structured_articles, movie_name):
    print(f"Article Number: {article_info['Article']}")
    print(f"Title: {article_info['Title']}")
    if 'Movies' in article_info:
        print(f"Movies: {', '.join(article_info['Movies'])}")
    print('-' * 50)
    print_tags(entity_dict, article_info['Article'])
    print('=' * 50) 

Article Number: 3
Title: Summer Movies Preview: 10 Trailers for June
Movies: Toy Story 3, Boyhood, Wall-E, The Death of Mr. Lazarescu, A Touch of Zen, Before Sunrise, My Neighbor Totoro, Million Dollar Baby, Puss in Boots, The Big Sleep
--------------------------------------------------
Tags for Article Number 3:
Persons: Yi Yi, Nelson, Margarete Weathers, Carl, Barb Beaser-Konschak, Dina Dasom Anya, Leia Becker, Ava DuVernay, Frankie Dunn, Barry Jenkins, Daniel Plainview, Kyle Hambor, Yang-Yang, Dante Lazarescu, Morgan Freeman, Donald J. Trump, Oscar, Scott Yang-Yang, Cristi Puiu, Faust, Maggie Fitzgerald, Ellie, Manohla Dargis Paul Thomas Anderson, Tim Riley, Andy, Daniel Day-Lewis, Otsuka Yasuo, Martin Scorsese, Edward Yang, Kathryn Bigelow, Ronnie del Carmen, Hilary Swank, Michelle Williams, Tsai Ming-liang, Scott More, Francis Ford Coppola, Hayao Miyazaki, Andres Ollarvez, Jia Zhangke, Cristian Mungiu, Robert Elswit, Be, Jian, Corneliu Porumboiu, Hu, Chihiro, Guillermo del Toro, S