## 1. Import the libraries and extract data from selected news articles

Credits: [@ArunDhaJ](https://www.arundhaj.com/blog/news-article-summarization-with-NLTK.html)

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from string import punctuation
from bs4 import BeautifulSoup
from heapq import nlargest
from collections import defaultdict
import requests
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk

# Download the NLTK resources this notebook relies on.
# 'punkt' is what older NLTK releases load for sent_tokenize/word_tokenize;
# recent releases look up 'punkt_tab' instead, so both are fetched to keep the
# notebook runnable on a fresh kernel regardless of the installed version.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Tokens ignored when counting words: the English stopword list plus every
# single-character ASCII punctuation mark.
sw = set(stopwords.words('english') + list(punctuation))

def extract_summarized_contents(url, top_words_n=20, top_sent_n=10, timeout=30):
    """Download a web page and return an extractive summary of its paragraphs.

    The text of every ``<p>`` element under ``<body>`` is joined and split into
    sentences. Each sentence is scored by summing the document-wide frequency
    of every occurrence of one of the ``top_words_n`` most frequent
    (non-stopword) words, and the ``top_sent_n`` best-scoring sentences are
    returned in their original order.

    Args:
        url: Address of the article to fetch.
        top_words_n: How many of the most frequent words may contribute to a
            sentence's score.
        top_sent_n: Maximum number of sentences kept in the summary.
        timeout: Seconds to wait for the HTTP response; ``requests`` raises
            if the server does not answer in time.

    Returns:
        The selected sentences, lowercased and joined with single spaces.
        An empty string when the page contains no paragraph text.
    """
    # Extract contents. A timeout is passed so a stalled server cannot hang
    # the notebook indefinitely.
    soup = BeautifulSoup(requests.get(url, timeout=timeout).text, features='lxml')
    body = soup.find('body')
    # A response without a <body> yields an empty summary instead of an
    # AttributeError on None.
    paragraphs = body.find_all('p') if body is not None else []
    contents = ' '.join(p.text for p in paragraphs)

    # Tokenize by sentences.
    # NOTE(review): sentences are lowercased here and therefore returned
    # lowercased; case-sensitive consumers may prefer the original casing.
    sent_tokens = [sent.lower() for sent in sent_tokenize(contents)]

    # Tokenize by words. Tokens are lowercased so that (a) capitalised
    # stopwords such as "The" are filtered and (b) the frequency table uses the
    # same keys as the lowercased sentences scored below. Tokens with no
    # letter or digit (multi-character quote marks, dashes, ...) are dropped
    # because the single-character punctuation in `sw` does not cover them.
    word_tokens = [
        word
        for word in (token.lower() for token in word_tokenize(contents))
        if word not in sw and any(ch.isalnum() for ch in word)
    ]

    freq = FreqDist(word_tokens)
    # Only the most frequent words are allowed to influence the ranking.
    top_words = set(nlargest(top_words_n, freq, key=freq.get))
    ranking = defaultdict(int)

    # Score each sentence by the frequencies of the top words it contains.
    for i, sent in enumerate(sent_tokens):
        for w in word_tokenize(sent):
            if w in top_words:
                ranking[i] += freq[w]

    # Keep the best sentences, then restore document order via sorted indices.
    best_indices = sorted(nlargest(top_sent_n, ranking, key=ranking.get))
    return ' '.join(sent_tokens[j] for j in best_indices)
    

# Articles to summarize; `corpus` holds one summary string per URL, in this order.
news_urls = (
    'https://www.thehindu.com/opinion/editorial/purifying-water-the-hindu-editorial-on-draft-notification-on-ro-systems/article30745293.ece',
    'https://www.foxnews.com/health/wisconsin-covid-19-deaths-non-fully-vaccinated',
    'https://edition.cnn.com/2021/06/25/football/england-germany-euro-2020-rivarly-cmd-spt-intl/index.html',
    'https://www.nbcnews.com/media/buzzfeed-announces-plans-go-public-rcna1266',
)

corpus = list(map(extract_summarized_contents, news_urls))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leticiachoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leticiachoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 2. Using the Stanford NER tagger to determine entities

Credits: [@obahareth](https://gist.github.com/obahareth/27db373e0cc12cc4cff59df1befb6179)

In [2]:
from nltk.internals import find_jars_within_path
from nltk.tag.stanford import StanfordNERTagger
import os

# Directory of the unpacked Stanford NER distribution.
# Change this to point at your own local copy.
models_dir_path = "stanford-ner-2020-11-17"

# Export the locations of the Stanford NER directory and its classifier
# models through the environment before the tagger is constructed.
os.environ.update({
    "CLASSPATH": models_dir_path,
    "STANFORD_MODELS": models_dir_path + '/classifiers/',
})

# Tagger backed by the 7-class MUC classifier file named below.
tagger = StanfordNERTagger('english.muc.7class.distsim.crf.ser.gz')

def tag_sentences(sentences):
    """Print the named entities the Stanford NER tagger finds in each text.

    Args:
        sentences: Iterable of strings. Each string is word-tokenized and
            tagged as one unit (it may contain several actual sentences).

    Returns:
        None. For every input text, the text itself is printed followed by a
        table of the tokens whose tag is not ``'O'``.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to tag; avoid invoking the tagger on an empty batch.
        return

    # Tag the whole batch with a single tag_sents call rather than one call
    # per text, so the tagger's start-up cost is paid once.
    tagged_batch = tagger.tag_sents([word_tokenize(text) for text in sentences])

    for text, tagged in zip(sentences, tagged_batch):
        # 'O' marks tokens outside any entity; keep only real entities.
        entities = [(word, label) for word, label in tagged if label != 'O']
        print(f'Sentence:\n\n"{text}"\n')
        print(pd.DataFrame(entities, columns=['Word', 'Entity']), '\n\n')
    

# Hand-picked texts used to sanity-check the tagger before running it on the news summaries.
example_texts = [
    'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.',
    "Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, and the Apple TV digital media player. Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites. Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud. Apple was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne on April 1, 1976, to develop and sell personal computers. It was incorporated as Apple Computer, Inc. on January 3, 1977, and was renamed as Apple Inc. on January 9, 2007, to reflect its shifted focus toward consumer electronics. Apple (NASDAQ: AAPL ) joined the Dow Jones Industrial Average on March 19, 2015.",
    "Samuel Patterson Smyth \"Sam\" Pollock, OC, CQ (December 25, 1925 – August 15, 2007) was sports executive who was general manager of the National Hockey League's Montreal Canadiens for 14 years where they won 9 Stanley Cups. Pollock also served as Chairman and CEO of the Toronto Blue Jays baseball club.",
]

print('===== Example data =====')

tag_sentences(example_texts)

# Same tagging pass over the article summaries built in section 1.
print('===== News identification =====')

tag_sentences(corpus)

===== Example data =====
Sentence:

"While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal."

        Word    Entity
0     France  LOCATION
1  Christine    PERSON
2    Lagarde    PERSON 


Sentence:

"Apple Inc. is an American multinational technology company headquartered in Cupertino, California, that designs, develops, and sells consumer electronics, computer software, and online services. Its hardware products include the iPhone smartphone, the iPad tablet computer, the Mac personal computer, the iPod portable media player, the Apple Watch smartwatch, and the Apple TV digital media player. Apple's consumer software includes the OS X and iOS operating systems, the iTunes media player, the Safari web browser, and the iLife and iWork creativity and productivity suites. Its online services include the iTunes Store, the iOS App Store and Mac App Store, and iCloud. Apple was founded by Steve Jobs, Steve Wozniak, and Ro