# Parsing News Articles

## Imports and news sources

In [1]:
import os
import pandas as pd
import panel as pn
import requests
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env

pn.extension('tabulator')

newsapi_url = 'https://newsapi.org/v2/everything?'\
    'q={queryString}&from={fromDate}&to={toDate}&searchIn=title,description&'\
    'page={page}&sources={sources}&sortBy=popularity&language=en&apiKey={apiKey}'

# Sources limited to 20 at a time
sources1 = 'abc-news,al-jazeera-english,associated-press,axios,bbc-news,bloomberg,breitbart-news,'\
           'business-insider,cbc-news,cbs-news,cnn,fox-news,google-news,independent,msnbc,national-review,'\
           'nbc-news,newsweek,new-york-magazine'
sources2 = 'politico,reuters,the-american-conservative,the-globe-and-mail,the-hill,the-hindu,'\
           'the-huffington-post,the-irish-times,the-jerusalem-post,the-times-of-india,the-wall-street-journal,'\
           'the-washington-post,the-washington-times,time,usa-today'

french_sources = 'google-news-fr,le-monde'
german_sources = 'bild,der-tagesspiegel,die-zeit,handelsblatt,spiegel-online'
spanish_sources = 'cnn-es,el-mundo,google-news-ar,infobae,la-gaceta,la-nacion'
russian_sources = 'google-news-ru,lenta,rbc,rt'

excluded_urls = ['/video', '/live', '/tv', 'bbc.co.uk/programmes', 'www.youtube.com']

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

## Retrieve news

In [2]:
topic = '+Trump+indictment'
from_date = '2023-08-09'
to_date = '2023-08-10'
api_key = os.environ.get("NEWS_API_KEY")

curr_request = newsapi_url.replace('{queryString}', topic).replace('{apiKey}', api_key) \
                          .replace('{fromDate}', from_date).replace('{toDate}', to_date)
    
article_dict = {
    'titles': [],
    'dates': [],
    'sources': [],
    'urls': [],
    'lengths': []
}


def append_to_list(key_name: str, new_value: str) -> list:
    curr_list = article_dict[key_name]
    curr_list.append(new_value)
    return curr_list


def get_articles(source_list: list, page_number: int):
    resp = requests.get(curr_request.replace('{sources}', source_list).replace('{page}', str(page_number)))
    if resp.status_code == 200:
        json_response = resp.json()
        number_results = json_response['totalResults']
        if number_results > 0:
            process_articles(json_response)
        if page_number == 1:
            print(f'Found {number_results} articles for the topic {topic}')
            if number_results > 100:
                number_pages = int(number_results/100)
                if number_results%100 > 0:
                    number_pages += 1
                for i in range(2,number_pages+1):
                    get_articles(source_list, i)
    else:
        print(f'** Failed to retrieve news articles for {curr_request}, response code {resp.status_code}')
        print(resp)

        
def process_articles(json_response): 
    list_articles = json_response['articles']
    for article in list_articles:
        article_dict['sources'] = append_to_list('sources', article['source']['name'])
        article_dict['titles'] = append_to_list('titles', article['title'])
        article_dict['dates'] = append_to_list('dates', article['publishedAt'])
        article_dict['urls'] = append_to_list('urls', 
                                              article['url'].replace('https://consent.google.com/ml?continue=', ''))


get_articles(sources1, 1)
get_articles(sources2, 1)

Found 25 articles for the topic +Trump+indictment
Found 3 articles for the topic +Trump+indictment


In [3]:
# Remove duplicate titles if in Google News and another source 
remove_indices = []
article_sources = article_dict['sources']
article_urls = article_dict['urls']
article_titles = article_dict['titles']
article_dates = article_dict['dates']
for i in range(0, len(article_sources)):
    for exclude_url in excluded_urls:
        if exclude_url in article_urls[i]:
            remove_indices.append(i)
            break
    if article_sources[i] == 'Google News': 
        gn_sd_title = article_titles[i][:30]
        index = 0
        for title in article_titles:
            if title.startswith(gn_sd_title) and index != i:
                remove_indices.append(i)
            index += 1
print(f'Removing {len(remove_indices)} articles as duplicate or excluded')
        
updated_sources = []
updated_authors = []
updated_titles = []
updated_descriptions = []
updated_dates = []
updated_urls = []
for i in range(0, len(article_sources)):
    if i in remove_indices:
        continue
    updated_sources.append(article_sources[i])
    updated_titles.append(article_titles[i])
    updated_dates.append(article_dates[i])
    updated_urls.append(article_urls[i])

Removing 3 articles as duplicate or excluded


In [4]:
article_display = {
    'id': list(range(0, len(updated_sources))),
    'source': updated_sources,
    'publishedAt': updated_dates,
    'title': updated_titles
}
article_df = pd.DataFrame(data=article_display)

In [5]:
df_widget = pn.widgets.Tabulator(article_df, selectable='checkbox', show_index=False, pagination=None)
df_widget

## Get article text

In [6]:
from bs4 import BeautifulSoup

# Some of the sources below (such as WSJ) need Selenium log-in processing
extraction_details = {
    'abcnews.go.': ['article', 'p'],
    '.afr.': ['article', 'p'],
    '.aljazeera.': ['div&class&wysiwyg--all-content', 'p'],
    '.apnews.': ['', 'p&class&Component-root-0-2-59'],
    '.bbc.': ['', 'p&class&ssrcss-1q0x1qg-Paragraph'],
    '.bloomberg.': ['div&class&body-content', 'p'],
    '.breitbart.': ['div&class&entry-content', 'p', 'p&class&rmoreabt'],  # Test
    '.businessinsider.': ['schemaorg', 'articleBody'],
    '.cbc.ca': ['div&class&story', 'p'],
    '.cbsnews.': ['schemaorg', 'articleBody'],
    '.ctvnews.': ['div&class&c-text', 'p'],
    '.cnn.': ['schemaorg', 'articleBody'],
    'economictimes.indiatimes.': ['', 'article'],
    '.foxnews.': ['schemaorg', 'articleBody'],
    '.france24.': ['', '<p><span>'], 
    '.ft.': ['div&class&article__content-body', 'p'],
    'globalnews.ca': ['article', 'p'],
    '.hindustantimes.': ['div&class&storyDetails', 'p'],
    '.independent.': ['div&class&sc-cvxyxr-6', 'p'],
    '.indiatoday.': ['main&class&main__content', 'p'],
    '.jpost.': ['schemaorg', 'articleBody'],
    '.moneycontrol.': ['schemaorg', 'articleBody'],
    '.nbcnews.': ['schemaorg', 'articleBody'],
    '.ndtv.': ['div&itemprop&articleBody', 'p'],
    '.news.yahoo.': ['div&class&caas-body||div&class&caas-readmore', 'p'],
    '.newsweek.': ['schemaorg', 'articleBody'],
    'nypost.': ['', 'p'],
    '.nytimes.': ['', 'p&class&css-at9mc1'],
    '.politico.': ['', 'p&class&story-text__paragraph'],
    '.reuters.': ['', 'p&data-testid&*paragraph-'],
    '.rt.': ['div&class&article__text', 'p', 'div&class&article__share article__share_bottom'],
    '.thedailybeast.': ['article&class&Body hpCCr', 'p', 'p&class'],
    '.theglobeandmail.': ['', 'p&class&c-article-body__text'],
    '.theguardian.': ['div&class&article-body-commercial-selector', 'p&class&dcr-94xsh'],
    'thehill.': ['article', 'p'],
    'timesofindia.indiatimes': ['schemaorg', 'articleBody'],
    '.vnexpress.': ['div&class&fck_detail', 'p'],
    '.washingtonpost.': ['schemaorg', 'hasPart/cssSelector=".article-body"/value'],
    '.washingtontimes.': ['div&class&storyareawrapper', 'p'],
    '.wsj.': [],     
    '.usatoday.': ['article', 'p']
}

excluded = ['author', 'publisher', 'Here\'s', 'Developments']

In [7]:
def check_excluded(url) -> bool:
    for exclude_url in excluded_urls:
        if exclude_url in url:
            print(f'Skipping excluded URL ({url})')
            return True
    return False


def find_element(soup, start) -> list:
    if '||' in start:
        starts = start.split('||')
        elements = []
        for alt_start in starts:
            elements.extend(find_element(soup, alt_start))
            return elements
    else:
        if '&' in start:
            strings = start.split('&')
            tag = strings[0]
            attrib_name = strings[1]
            name = strings[2]
            return soup.find(tag, {attrib_name: name})
        else:
            return soup.find(start)

    
def get_text(soup_element, text_element: str, stop: str):
    article_text = ''
    if soup_element:
        if text_element.startswith('<'):
            text_elements = text_element.split('>')
            first_element = text_elements[0][1:]
            second_element = text_elements[1][1:]
            first_elements = soup_element.find_all(first_element)
            for first in first_elements:
                first_child = str(first.select_one(":nth-child(1)"))
                if first_child.startswith(f'<{second_element}'):
                    new_text = first.text.strip()
                    if new_text and len(new_text) > 75:
                        article_text += new_text + '\n'
                else:
                    continue
        else:
            if '&' in text_element:
                strings = text_element.split('&')
                tag = strings[0]
                attrib_name = strings[1]
                name = strings[2]
                art_texts = soup_element.find_all(tag, {attrib_name: name})
            else:
                art_texts = soup_element.find_all(text_element)
            for art_text in art_texts:
                new_text = art_text.text.strip()
                if new_text:
                    article_text += new_text + '\n'
    return article_text
    
    
def process_extraction(soup, value):
    if len(value) == 0:
        print(f'** Cannot parse source; Needs manual creation of "Index##.txt" file')
        return ''
    start = value[0]
    text_at = value[1]
    if len(value) == 3:
        stop = value[2]
    else:
        stop = ''
    if start == 'schemaorg':
        return process_json(soup)
    elif start == '':
        element_detail = soup
    else:
        element_detail = find_element(soup, start)
    return get_text(element_detail, text_at, stop)  


def process_html(url: str, index: int, manual_array: list=None) -> str:
    text = ''
    web_page = requests.get(url, headers=headers)
    if web_page.status_code == 200:
        soup = BeautifulSoup(web_page.text, 'html.parser')
        for key,value in extraction_details.items():
            found = False
            if key in url:
                found = True
                text = process_extraction(soup, value)
                break
        if not found:
            print(f'** Text extraction algorithm not defined for publication')
            if manual_array:
                manual_array.append(index)
        elif found and not text:
            print(f'** No article text; Review {url} and provide manual text if desired')
            if manual_array:
                manual_array.append(index)
    else:
        print(f'** Error in getting referenced page ({url}), {web_page.status_code}')
        if manual_array:
            manual_array.append(index)
    return text

        
def process_json(soup):
    json_texts = soup.find_all('script', type='application/ld+json')
    for json_text in json_texts:
        json_string = str(json_text)
        try:
            json_detail = json.loads(json_string[json_string.index('>') + 1:json_string.index('</script')])
            if 'articleBody' in json_detail:
                return json_detail['articleBody']
            # TODO: Calculate from value
            elif 'hasPart' in json_detail and 'cssSelector' in json_detail['hasPart'] and \
                '.article-body' == json_detail['hasPart']['cssSelector']:
                return json_detail['hasPart']['value']
        except Exception as e:
            # Line feeds/carriage returns in articleBody are invalid
            if '"articleBody":' in json_string:   
                index_body = json_string.index('"articleBody":') + 14
                return json_string[json_string.index('"', index_body):json_string.index('",', index_body + 25)]
            print(f'** Exception in JSON-LD handling, {str(e)}')
    return ''

In [8]:
article_texts = ['' for x in range(len(updated_titles))]
manual_retrieval = []
for index in sorted(df_widget.selection): 
    url = updated_urls[index]
    print(f'Processing {index}, {url}')
    if url.startswith('https://news.google.com/'):
        interim_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        # TODO: FinancialTimes url ref not in og:url but = www.ft.com/content/id,
        #   where id in <script> LD, trackingData->pageDescription->rootContentId
        new_url = interim_soup.find_all('meta', {'property': 'og:url'})
        new_url0 = ''
        if len(new_url) == 1:
            new_url0 = new_url[0]['content']
        if new_url0 and '404' not in new_url0:
            url = new_url0
            print(f'Processing referenced page, {url}')
            updated_urls[index] = url
        else:
            # Unable to access Bloomberg, France24, WSJ, NYT, Hindustan Times
            print(f'** Unable to access source page for article, {index}, {updated_titles[index]}')
            print()
            article_texts[index] = ''
            manual_retrieval.append(f'{index},unable')
            continue
    skip = False
    if check_excluded(url):
        print(f'Skipping excluded URL ({url}) for article, {updated_titles[index]}')
        print()
        article_texts[index] = ''
        continue
    article_texts[index] = process_html(url, index, manual_retrieval)
    print()

Processing 0, https://apnews.com/article/donald-trump-indictment-campaign-election-interference-11cc4d1015c36e6ba078c00805d99838
** Text extraction algorithm not defined for publication

Processing 1, https://news.google.com/rss/articles/CCAiC0pkbVlTaE9YUzFvmAEB?oc%3D5&gl=FR&hl=en-US&cm=2&pc=n&src=1
Processing referenced page, https://www.youtube.com/watch?v=JdmYShOXS1o
Skipping excluded URL (https://www.youtube.com/watch?v=JdmYShOXS1o)
Skipping excluded URL (https://www.youtube.com/watch?v=JdmYShOXS1o) for article, Trump's prison fears mount as Jack Smith drops new subpoenas after 2 indictments - MSNBC

Processing 2, https://news.google.com/rss/articles/CCAiC3FTQXdkSFpfTm40mAEB?oc%3D5%26hl%3Den-CA%26gl%3DCA%26ceid%3DCA:en&gl=FR&hl=en-CA&cm=2&pc=n&src=1
Processing referenced page, https://www.youtube.com/watch?v=qSAwdHZ_Nn4
Skipping excluded URL (https://www.youtube.com/watch?v=qSAwdHZ_Nn4)
Skipping excluded URL (https://www.youtube.com/watch?v=qSAwdHZ_Nn4) for article, CNN legal analy

## Fix up suggested manual retrievals and add other articles

In [9]:
print('Known problems resolving Google News links to Bloomberg, Financial Times, France24, Hindustan Times, NYTimes and WSJ')
print('For Financial Times, France24, Hindustan Times and NYT, click on the GNews link and get the URL')
print('For WSJ and Bloomberg, need Selenium processing to log-in; For now, retrieve text manually and add to "Index##.txt" file')
print()
for i in range(0,len(manual_retrieval)):
    str_index = str(manual_retrieval[i])
    ref_index = int(str_index.split(",")[0])
    if ',unable' in str_index:
        print(f'Unable to resolve Google News link for index: {ref_index}')
        print(f'Article title: {updated_titles[ref_index]}')
    else:
        print(f'Unable to find relevant text for index: {ref_index}')
    print(f'URL: {updated_urls[ref_index]}')
    print()

Known problems resolving Google News links to Bloomberg, Financial Times, France24, Hindustan Times, NYTimes and WSJ
For Financial Times, France24, Hindustan Times and NYT, click on the GNews link and get the URL
For WSJ and Bloomberg, need Selenium processing to log-in; For now, retrieve text manually and add to "Index##.txt" file



In [29]:
# For example ...
references_dict = {
    9: 'https://www.france24.com/en/europe/20230531-drone-attack-draws-moscow-closer-to-ukraine-s-front-line',
    11: 'https://www.ft.com/content/cdde0e44-0e90-41c0-a3be-e5f6849b706b',
    17: 'Index17.txt',   # WSJ requires manual downloading of text, until Selenium processing complete
    21: 'https://www.hindustantimes.com/world-news/drone-attack-on-russian-capital-several-buildings-hit-moscow-mayor-says-101685422545056.html',
    44: 'https://www.nytimes.com/live/2023/06/01/world/russia-ukraine-news',
    61: 'https://www.bloomberg.com/news/articles/2023-05-30/several-moscow-buildings-damaged-in-drone-attack-mayor-says'
}

for key, value in references_dict.items():
    text = ''
    print(f'Processing index {key} and url or file, {value}')
    if value.startswith('Index'):
        with open(value, 'r') as text_file:
            text = text_file.read()
    elif not check_excluded(value):
        text = process_html(value, key)
    if text:
        article_texts[key] = text
    print()
        
additional_articles = {
    'wsj1': {'source': 'The Wall Street Journal', 'author': 'Matthew Luxmoore', 
             'title': 'Drones Hit Russian Oil Refineries as Moscow Shores Up Front Line', 
             'url': 'https://www.wsj.com/articles/drones-hit-russian-oil-refineries-as-moscow-shores-up-front-line-f365abfb',
             'description': 'Such attacks come as Russia loses the initiative in the conflict, say Western officials, with Moscow increasingly reacting to Ukrainian offensives that are setting the agenda on the battlefield.',
             'publishedAt': '2023-05-31T10:47:00.000Z', 'text': 'Indexwsj1.txt'},
    'nyt1': {'source': 'The New York Times', 'author': 'Anatoly Kurmanaev', 
             'title': 'Russia Denounces West Over Drone Strike on Moscow', 
             'url': 'https://www.nytimes.com/2023/05/31/world/europe/russia-moscow-drone-attack.html', 
             'description': 'Kremlin officials said Ukraine’s allies should have condemned the strike, and argued again that its real war was with the West.',
             'publishedAt': '2023-05-31T22:00:12.000Z'},
    'nyt2': {'source': 'The New York Times', 'author': 'Ivan Nechepurenko', 'title': 'meta&property&og:title', 
             'url': 'https://www.nytimes.com/2023/05/30/world/europe/moscow-drone-attack.html', 
             'description': 'meta&property&og:description', 'publishedAt': '2023-05-30T10:18:43.000Z'}
}

for key, dict_value in additional_articles.items():
    url = dict_value['url']
    print(f'Processing url, {url}')
    updated_authors.append(dict_value['author'])
    updated_sources.append(dict_value['source'])
    updated_dates.append(dict_value['publishedAt'])
    updated_titles.append(dict_value['title'])
    updated_descriptions.append(dict_value['description'])
    updated_urls.append(url)
    if 'text' in dict_value:
        with open(dict_value['text'], 'r') as text_file:
            article_texts.append(text_file.read())
    else:
        article_texts.append(process_html(url, len(updated_authors)))
    print()


Processing index 9 and url or file, https://www.france24.com/en/europe/20230531-drone-attack-draws-moscow-closer-to-ukraine-s-front-line

Processing index 11 and url or file, https://www.ft.com/content/cdde0e44-0e90-41c0-a3be-e5f6849b706b
** No article text; Review https://www.ft.com/content/cdde0e44-0e90-41c0-a3be-e5f6849b706b and provide manual text if desired

Processing index 17 and url or file, Index17.txt

Processing index 21 and url or file, https://www.hindustantimes.com/world-news/drone-attack-on-russian-capital-several-buildings-hit-moscow-mayor-says-101685422545056.html

Processing index 44 and url or file, https://www.nytimes.com/live/2023/06/01/world/russia-ukraine-news
Skipping excluded URL (https://www.nytimes.com/live/2023/06/01/world/russia-ukraine-news)

Processing index 61 and url or file, https://www.bloomberg.com/news/articles/2023-05-30/several-moscow-buildings-damaged-in-drone-attack-mayor-says
** No article text; Review https://www.bloomberg.com/news/articles/20