# Parsing News Articles

## Imports and news sources

In [1]:
import os
import pandas as pd
import panel as pn
import requests
from dotenv import load_dotenv

load_dotenv()  # take environment variables from .env

pn.extension('tabulator')

newsapi_url = 'https://newsapi.org/v2/everything?'\
    'q={queryString}&from={fromDate}&to={toDate}&searchIn=title,description&'\
    'page={page}&sources={sources}&sortBy=popularity&language=en&apiKey={apiKey}'

# Sources limited to 20 at a time
sources1 = 'abc-news,al-jazeera-english,associated-press,axios,bbc-news,bloomberg,breitbart-news,'\
           'business-insider,cbc-news,cbs-news,cnn,fox-news,google-news,independent,msnbc,national-review,'\
           'nbc-news,newsweek,new-york-magazine'
sources2 = 'politico,reuters,the-american-conservative,the-globe-and-mail,the-hill,the-hindu,'\
           'the-huffington-post,the-irish-times,the-jerusalem-post,the-times-of-india,the-wall-street-journal,'\
           'the-washington-post,the-washington-times,time,usa-today'

french_sources = 'google-news-fr,le-monde'
german_sources = 'bild,der-tagesspiegel,die-zeit,handelsblatt,spiegel-online'
spanish_sources = 'cnn-es,el-mundo,google-news-ar,infobae,la-gaceta,la-nacion'
russian_sources = 'google-news-ru,lenta,rbc,rt'

excluded_urls = ['/video', '/live', '/tv', 'bbc.co.uk/programmes', 'www.youtube.com']

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

## Retrieve news

In [2]:
topic = '+Hamas'
from_date = '2023-10-15'
to_date = '2023-10-16'
api_key = os.environ.get("NEWS_API_KEY")

curr_request = newsapi_url.replace('{queryString}', topic).replace('{apiKey}', api_key) \
                          .replace('{fromDate}', from_date).replace('{toDate}', to_date)
    
article_dict = {
    'titles': [],
    'dates': [],
    'sources': [],
    'urls': [],
    'lengths': []
}


def append_to_list(key_name: str, new_value: str) -> list:
    curr_list = article_dict[key_name]
    curr_list.append(new_value)
    return curr_list


def get_articles(source_list: list, page_number: int):
    resp = requests.get(curr_request.replace('{sources}', source_list).replace('{page}', str(page_number)))
    if resp.status_code == 200:
        json_response = resp.json()
        number_results = json_response['totalResults']
        if number_results > 0:
            process_articles(json_response)
        if page_number == 1:
            print(f'Found {number_results} articles for the topic {topic}')
            if number_results > 100:
                number_pages = int(number_results/100)
                if number_results%100 > 0:
                    number_pages += 1
                for i in range(2,number_pages+1):
                    get_articles(source_list, i)
    else:
        print(f'** Failed to retrieve news articles for {curr_request}, response code {resp.status_code}')
        print(resp)

        
def process_articles(json_response): 
    list_articles = json_response['articles']
    for article in list_articles:
        article_dict['sources'] = append_to_list('sources', article['source']['name'])
        article_dict['titles'] = append_to_list('titles', article['title'])
        article_dict['dates'] = append_to_list('dates', article['publishedAt'])
        article_dict['urls'] = append_to_list('urls', 
                                              article['url'].replace('https://consent.google.com/ml?continue=', ''))


get_articles(sources1, 1)
get_articles(sources2, 1)

Found 338 articles for the topic +Hamas
Found 237 articles for the topic +Hamas


In [3]:
# Remove duplicate titles if in Google News and another source 
remove_indices = []
article_sources = article_dict['sources']
article_urls = article_dict['urls']
article_titles = article_dict['titles']
article_dates = article_dict['dates']
for i in range(0, len(article_sources)):
    excluded = False
    if article_titles[i] == '[Removed]':
        remove_indices.append(i)
        excluded = True
    if not excluded:
        for exclude_url in excluded_urls:
            if exclude_url in article_urls[i]:
                remove_indices.append(i)
                excluded = True
                break
    if not excluded and article_sources[i] == 'Google News': 
        gn_sd_title = article_titles[i][:30]
        index = 0
        for title in article_titles:
            if title.startswith(gn_sd_title) and index != i:
                remove_indices.append(i)
            index += 1
print(f'Removing {len(remove_indices)} articles as duplicate or excluded')
        
updated_sources = []
updated_authors = []
updated_titles = []
updated_descriptions = []
updated_dates = []
updated_urls = []
for i in range(0, len(article_sources)):
    if i in remove_indices:
        continue
    updated_sources.append(article_sources[i])
    updated_titles.append(article_titles[i])
    updated_dates.append(article_dates[i])
    updated_urls.append(article_urls[i])

Removing 153 articles as duplicate or excluded


In [4]:
article_display = {
    'id': list(range(0, len(updated_sources))),
    'source': updated_sources,
    'publishedAt': updated_dates,
    'title': updated_titles
}
article_df = pd.DataFrame(data=article_display)

In [5]:
df_widget = pn.widgets.Tabulator(article_df, selectable='checkbox', show_index=False, pagination=None)
df_widget

## Get article text

In [6]:
from bs4 import BeautifulSoup

# Some of the sources below (such as WSJ) need Selenium log-in processing
extraction_details = {
    'abcnews.go.': ['article', 'p'],
    '.afr.': ['article', 'p'],
    '.aljazeera.': ['div&class&wysiwyg--all-content', 'p'],
    'apnews.com': ['div&class&RichTextStoryBody', 'p'],
    '.bbc.': ['', 'p&class&ssrcss-1q0x1qg-Paragraph'],
    '.bloomberg.': [],       # ['div&class&body-content', 'p'],
    '.breitbart.': ['div&class&entry-content', 'p', 'p&class&rmoreabt'],  
    '.businessinsider.': ['schemaorg', 'articleBody'],
    '.cbc.ca': ['div&class&story', 'p'],
    '.cbsnews.': ['schemaorg', 'articleBody'],
    '.ctvnews.': ['div&class&c-text', 'p'],
    '.cnn.': ['schemaorg', 'articleBody'],
    'economictimes.indiatimes.': ['', 'article'],
    '.foxnews.': ['schemaorg', 'articleBody'],
    '.france24.': [],        # ['', '<p><span>'],  
    '.ft.': [],              # ['div&class&article__content-body', 'p'],
    'globalnews.ca': ['article', 'p'],
    '.hindustantimes.': [],  # ['div&class&storyDetails', 'p'],
    '.independent.': ['div&class&sc-cvxyxr-6', 'p'],
    '.indiatoday.': ['main&class&main__content', 'p'],
    'irishtimes.': ['article&class&article-body-wrapper', 'p'],
    '.jpost.': ['schemaorg', 'articleBody'],
    '.moneycontrol.': ['schemaorg', 'articleBody'],
    '.nbcnews.': ['schemaorg', 'articleBody'],
    '.ndtv.': ['div&itemprop&articleBody', 'p'],
    '.news.yahoo.': ['div&class&caas-body||div&class&caas-readmore', 'p'],
    '.newsweek.': ['schemaorg', 'articleBody'],
    'nypost.': ['', 'p'],
    '.nytimes.': [],         # ['', 'p&class&css-at9mc1'],
    '.politico.': ['', 'p&class&story-text__paragraph'],
    '.reuters.': ['', 'p&data-testid&*paragraph-'],
    '.rt.': ['div&class&article__text', 'p', 'div&class&article__share article__share_bottom'],
    '.thedailybeast.': ['article&class&Body hpCCr', 'p', 'p&class'],
    '.theglobeandmail.': ['', 'p&class&c-article-body__text'],
    '.theguardian.': ['div&class&article-body-commercial-selector', 'p&class&dcr-94xsh'],
    'thehill.': ['article', 'p'],
    'timesofindia.indiatimes': ['schemaorg', 'articleBody'],
    '.usatoday.': ['article', 'p'],
    '.vnexpress.': ['div&class&fck_detail', 'p'],
    '.washingtonpost.': ['schemaorg', 'hasPart/cssSelector=".article-body"/value'],
    '.washingtontimes.': ['div&class&storyareawrapper', 'p'],
    '.wsj.': []
}

excluded = ['author', 'publisher', 'Here\'s', 'Developments']

In [7]:
def check_excluded(url) -> bool:
    for exclude_url in excluded_urls:
        if exclude_url in url:
            return True
    for key in extraction_details:
        if key in url:
            return False
    return True


def find_element(soup, start) -> list:
    if '||' in start:
        starts = start.split('||')
        elements = []
        for alt_start in starts:
            elements.extend(find_element(soup, alt_start))
            return elements
    else:
        if '&' in start:
            strings = start.split('&')
            tag = strings[0]
            attrib_name = strings[1]
            name = strings[2]
            return soup.find(tag, {attrib_name: name})
        else:
            return soup.find(start)

    
def get_text(soup_element, text_element: str, stop: str):
    article_text = ''
    if soup_element:
        if text_element.startswith('<'):
            text_elements = text_element.split('>')
            first_element = text_elements[0][1:]
            second_element = text_elements[1][1:]
            first_elements = soup_element.find_all(first_element)
            for first in first_elements:
                first_child = str(first.select_one(":nth-child(1)"))
                if first_child.startswith(f'<{second_element}'):
                    new_text = first.text.strip()
                    if new_text and len(new_text) > 75:
                        article_text += new_text + '\n'
                else:
                    continue
        else:
            if '&' in text_element:
                strings = text_element.split('&')
                tag = strings[0]
                attrib_name = strings[1]
                name = strings[2]
                art_texts = soup_element.find_all(tag, {attrib_name: name})
            else:
                art_texts = soup_element.find_all(text_element)
            for art_text in art_texts:
                new_text = art_text.text.strip()
                if new_text:
                    article_text += new_text + '\n'
    return article_text
    
    
def process_extraction(soup, value):
    if len(value) == 0:
        return ''
    start = value[0]
    text_at = value[1]
    if len(value) == 3:
        stop = value[2]
    else:
        stop = ''
    if start == 'schemaorg':
        return process_json(soup)
    elif start == '':
        element_detail = soup
    else:
        element_detail = find_element(soup, start)
    return get_text(element_detail, text_at, stop)  


def process_html(url: str, index: int, manual_array: list) -> str:
    text = ''
    web_page = requests.get(url, headers=headers)
    if web_page.status_code == 200:
        soup = BeautifulSoup(web_page.text, 'html.parser')
        found = False
        for key,value in extraction_details.items():
            if key in url:
                if len(value) > 0:
                    found = True
                    text = process_extraction(soup, value)
                break
        if not found:
            if len(value) == 0:
                manual_array.append(f'{index}, paywall')
            else:
                manual_array.append(f'{index}, no algorithm')
        elif found and not text:
            manual_array.append(f'{index}, no text')
    else:
        manual_array.append(f'{index}, error getting ref')
    return text

        
def process_json(soup):
    json_texts = soup.find_all('script', type='application/ld+json')
    for json_text in json_texts:
        json_string = str(json_text)
        try:
            json_detail = json.loads(json_string[json_string.index('>') + 1:json_string.index('</script')])
            if 'articleBody' in json_detail:
                return json_detail['articleBody']
            # TODO: Calculate from value
            elif 'hasPart' in json_detail and 'cssSelector' in json_detail['hasPart'] and \
                '.article-body' == json_detail['hasPart']['cssSelector']:
                return json_detail['hasPart']['value']
        except Exception as e:
            # Line feeds/carriage returns in articleBody are invalid
            if '"articleBody":' in json_string:   
                index_body = json_string.index('"articleBody":') + 14
                return json_string[json_string.index('"', index_body):json_string.index('",', index_body + 25)]
            print(f'** Exception in JSON-LD handling, {str(e)}')
    return ''

In [8]:
article_texts = ['' for x in range(len(updated_titles))]
manual_retrieval = []
skipped = []
for index in sorted(df_widget.selection): 
    url = updated_urls[index]
    skip = False
    if 'removed.com' in url:
        manual_retrieval.append(f'{index}, no url')
        skip = True
    elif url.startswith('https://news.google.com/'):
        interim_soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        # TODO: FinancialTimes url ref not in og:url but = www.ft.com/content/id,
        #   where id in <script> LD, trackingData->pageDescription->rootContentId
        new_url = interim_soup.find_all('meta', {'property': 'og:url'})
        new_url0 = ''
        if len(new_url) == 1:
            new_url0 = new_url[0]['content']
        if new_url0 and '404' not in new_url0:
            url = new_url0
            updated_urls[index] = url
        else:
            # Unable to access Bloomberg, France24, WSJ, NYT, Hindustan Times
            manual_retrieval.append(f'{index}, unable to access source')
            skip = True
            continue
    if skip or check_excluded(url):
        skipped.append(index)
        continue
    article_text = process_html(url, index, manual_retrieval)
    if article_text and len(article_text) > 500:
        article_texts[index] = article_text
    else:
        manual_retrieval.append(f'{index}, insufficient text')
        skipped.append(index)

In [9]:
print(manual_retrieval)

['31, insufficient text', '32, insufficient text', '34, error getting ref', '34, insufficient text', '118, insufficient text', '142, error getting ref', '142, insufficient text', '156, insufficient text', '275, insufficient text', '282, insufficient text', '287, insufficient text', '303, insufficient text', '412, error getting ref', '412, insufficient text', '421, error getting ref', '421, insufficient text']


In [10]:
print(skipped)

[31, 32, 34, 41, 73, 118, 142, 156, 223, 224, 237, 275, 282, 287, 303, 318, 412, 421]


In [11]:
print(article_texts[143])

RAFAH, Gaza Strip (AP) — Truckloads of aid idled at Egypt’s border with Gaza as residents and humanitarian groups pleaded Monday for water, food and fuel for dying generators, saying the tiny Palestinian territory sealed off by Israel after last week’s rampage by Hamas was near total collapse.
U.S. President Joe Biden planned to travel to Israel on Wednesday to signal White House support for the country and to Jordan to meet with Arab leaders. U.S. Secretary of State Antony Blinken announced the trip early Tuesday in Tel Aviv during his second visit to Israel in less than a week amid fears that the fighting could expand into a broader regional conflict.
In Gaza, hospitals were on the verge of losing electricity, threatening the lives of thousands of patients, and hundreds of thousands of Palestinians displaced from their homes searched for bread. Israel maintained punishing airstrikes across Gaza as a ground invasion loomed, while Hamas militants kept up a barrage of rocket attacks, an

In [12]:
print(updated_urls[143])

https://apnews.com/article/israel-palestinians-gaza-hamas-war-781b3c63af4ae6e51c313a68f314e66d


## Fix up suggested manual retrievals and add other articles

In [12]:
print('Known problems resolving Google News links to Bloomberg, Financial Times, France24, Hindustan Times, NYTimes and WSJ')
print('Possibly due to URL resolution or a paywall')
print('Use manual addition of text')

Known problems resolving Google News links to Bloomberg, Financial Times, France24, Hindustan Times, NYTimes and WSJ
Possibly due to URL resolution or a paywall
Use manual addition of text
