In [1]:
import urllib
from newsapi import NewsApiClient
from bs4 import BeautifulSoup as soup
import pandas as pd

In [2]:
base_url = 'https://www.straitstimes.com'
country = 'singapore' # world 
relevance = 'latest'

def get_url(base_url, country = None, relevance = None, start_page = 1, end_page = 5):
    main_url = []
    if country == None:
        main_url.append(base_url)
    elif country != None and relevance == None:
        main_url.append(base_url + '/' + country)
    else:
        for page_number in range(start_page, end_page+1):
            sub_url = base_url + '/' + country + '/' + relevance + '?page='+ str(page_number)
            main_url.append(sub_url)
    return main_url

main_url = get_url(base_url, country, relevance, 1, 2)
main_url

['https://www.straitstimes.com/singapore/latest?page=1',
 'https://www.straitstimes.com/singapore/latest?page=2']

In [3]:
def get_page_content(main_url):
    # create empty df
    df = pd.DataFrame(columns = ['title', 'url'])
    # query info
    for sub_url in main_url:
        # get page content
        with urllib.request.urlopen(sub_url) as url:
            txt = url.read()
        # format content using BeautifulSoup
        page_content = soup(txt.decode("utf-8"))
        # find titles
        containers = page_content.findAll('span',{'class':'story-headline'})
        # clean titles (URL & Heading)
        for item in containers:
            try:
                title = item.text.replace('\n','')
            except:
                title = None
            try:
                url = item.find(href=True)['href']
            except:
                url = None
            # attach final results
            df = df.append({'title':title,'url': url}, ignore_index=True)
    return df

df = get_page_content(main_url)
df.head()

Unnamed: 0,title,url
0,I can't smell the orange: NUS don tested posit...,/singapore/health/coronavirus-i-feel-extremely...
1,This week's top reads from The Straits Times,/singapore/this-weeks-top-reads-from-the-strai...
2,Coronavirus: Stay-home notices clearly state p...,/singapore/coronavirus-stay-home-notices-clear...
3,Elderly couple's decade-long romance ends with...,/singapore/elderly-couples-decade-long-romance...
4,Green Pulse Podcast: How NEA works with cleani...,/singapore/environment/green-pulse-podcast-how...


In [4]:
def get_article_content(base_url, main_df):
    
    # create empty df
    df_2 = pd.DataFrame(columns = ['url', 'author', 'email', 'datepublish', 'datemodified', 'keywords'])

    for i_url in main_df['url']:
        # get page content
        sub_url = base_url+i_url
        with urllib.request.urlopen(sub_url) as url:
            txt = url.read()
        # format content using BeautifulSoup
        page_content = soup(txt.decode("utf-8"))
        # find author
        try:
            author = page_content.find_all('meta',attrs={'name':'author'})[0]['content']
        except:
            author = None
        # find author email
        try:
            email = str(page_content.decode).split('a href="mailto:')[1].split(' ')[0].replace('"', '')
        except:
            email = None
        # find datepublished
        try:
            datepublish = str(page_content.select('time[itemprop="datePublished"]')[0]).split('datetime=')[1].split(' ')[0].replace('"', '')
        except:
            datepublish = None
        # find datemodified
        try:
            datemodified = str(page_content.select('time[itemprop="dateModified"]')[0]).split('datetime=')[1].split(' ')[0].replace('"', '')
        except:
            datemodified = None
        # find keywords
        try:
            keywords = page_content.find_all('meta',attrs={'name':'news_keywords'})[0]['content'].split(', ')
        except:
            keywords = []
        # attach final results
        df_2 = df_2.append({'url': i_url, 'author': author, 'email': email,
                            'datepublish': datepublish, 'datemodified': datemodified, 'keywords': keywords}, 
                           ignore_index=True)
    df_2['datepublish'] = pd.to_datetime(df_2['datepublish'])
    df_2['datemodified'] = pd.to_datetime(df_2['datemodified'])
    main_df = main_df.merge(df_2, how='left', on='url')
    
    return main_df

final_df = get_article_content(base_url, df[0:5])
final_df.head()

Unnamed: 0,title,url,author,email,datepublish,datemodified,keywords
0,I can't smell the orange: NUS don tested posit...,/singapore/health/coronavirus-i-feel-extremely...,TIMOTHY GOH,timgoh@sph.com.sg,2020-03-28 19:01:33+08:00,2020-03-29 07:19:14+08:00,"[CORONAVIRUS, COVID-19, SINGAPORE GENERAL HOSP..."
1,This week's top reads from The Straits Times,/singapore/this-weeks-top-reads-from-the-strai...,,,2020-03-28 18:58:00+08:00,2020-03-28 18:58:01+08:00,[NEWSLETTER]
2,Coronavirus: Stay-home notices clearly state p...,/singapore/coronavirus-stay-home-notices-clear...,TAN TAM MEI,tammei@sph.com.sg,2020-03-28 18:42:00+08:00,2020-03-30 11:30:23+08:00,"[COVID-19, ICA, CORONAVIRUS]"
3,Elderly couple's decade-long romance ends with...,/singapore/elderly-couples-decade-long-romance...,CHERYL TEH,tienli@sph.com.sg,2020-03-28 18:31:27+08:00,2020-03-29 07:27:20+08:00,"[WEDDINGS AND ENGAGEMENTS, HOSPITALS]"
4,Green Pulse Podcast: How NEA works with cleani...,/singapore/environment/green-pulse-podcast-how...,,audreyt@sph.com.sg>audreyt@sph.com.sg</a>),2020-03-28 18:00:00+08:00,2020-03-28 18:00:02+08:00,"[ST PODCASTS, GREEN PULSE, COVID-19, CORONAVIRUS]"


In [5]:
# run actual
base_url = 'https://www.straitstimes.com'
country = 'singapore' # world 
relevance = 'latest'
main_url = get_url(base_url, country, relevance, 1, 20)
df = get_page_content(main_url)
final_df = get_article_content(base_url, df)

In [7]:
final_df.to_csv('straitstimes_'+country+'_'+relevance+'.csv')