### Libraries

In [100]:
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from random import uniform
from datetime import datetime, timedelta

### Main Function

In [142]:
def get_news(start_date=None,
             end_date=None,
             max_num_articles=100,
             get_title=True,
             get_url=True,
             get_thumbnail=False,
             get_content=False,
             get_author=False,
             get_publish_date=False,
             news_type='all',
             set_sleep=1,
             debug=True):
    """Collect news articles from the tribunnews.com news index.

    The index pages are walked from end_date backwards to start_date, page
    by page, until a page returns no entries or max_num_articles is
    reached. The requested fields are then extracted from every collected
    entry; content, author and thumbnail require one extra request per
    article.

    Arguments description:
    - start_date
        Oldest publish date to scan (inclusive)
        Must be a string written in 'YYYY-MM-DD' format
        If None, automatically set to be equal to end_date
    - end_date
        Newest publish date to scan (inclusive)
        Must be a string written in 'YYYY-MM-DD' format
        If None, it is read from the index page (first 10 characters of the
        'title' attribute of the element with class 'fbo2 grey ar')
    - max_num_articles
        Maximum number of articles to be collected
        Must be positive integer, or -1 to collect all available news
        (0 is accepted and yields an empty result)
    - get_title
        Get article title?
        Must be boolean
    - get_url
        Get article URL?
        Must be boolean
    - get_thumbnail
        Get article thumbnail URL?
        Must be boolean
    - get_content
        Get article content? (text of the article body container)
        Must be boolean
    - get_author
        Get article author?
        Must be boolean
    - get_publish_date
        Get article publish date?
        Must be boolean
    - news_type
        Article types
        Can be 'all', 'bisnis', 'regional', 'matalokal', 'lestari',
        'superskor', 'seleb', 'lifestyle', 'new-economy', 'otomotif',
        'techno', 'sport', 'kesehatan'
    - set_sleep
        Lower bound, in seconds, of the random pause after each request
        (the pause is drawn uniformly from [set_sleep, set_sleep + 1])
        Must be non negative integer
    - debug
        Must be boolean
        Currently not being used

    Returns:
        A pandas DataFrame with one column per enabled get_* flag
        ('title', 'url', 'thumbnail', 'content', 'author', 'publish_date')
        and one row per collected article. A field that cannot be found is
        stored as an empty string so that all columns stay aligned.
        None is returned (after printing a message) when an argument is
        invalid, when end_date is earlier than start_date, or when the
        latest publish date cannot be read from the index page.

    Raises:
        ValueError: if start_date or end_date is not in 'YYYY-MM-DD' format.
        requests.exceptions.RequestException: if the request used to look up
            the latest date (end_date=None) fails.
    """

    # Argument Check
    # Every get_* flag is validated, not just the first four.
    bool_flags = (get_title, get_url, get_thumbnail, get_content,
                  get_author, get_publish_date)
    if not all(isinstance(flag, bool) for flag in bool_flags):
        print('argument get_ must be boolean!')
        return
    # bool is a subclass of int, so it has to be rejected explicitly.
    if (isinstance(set_sleep, bool) or not isinstance(set_sleep, int)
            or set_sleep < 0):
        print('argument set_sleep must be non-negative integer!')
        return
    if (isinstance(max_num_articles, bool)
            or not isinstance(max_num_articles, int)
            or max_num_articles < -1):
        print('argument max_num_articles must be a non-negative integer, or -1!')
        return
    allowable_news_type = ['all', 'bisnis', 'regional', 'matalokal', 'lestari', 'superskor', 'seleb',
                           'lifestyle', 'new-economy', 'otomotif', 'techno', 'sport', 'kesehatan']
    if news_type not in allowable_news_type:
        print('argument news_type must be one of the following:', allowable_news_type)
        return

    # Adjust news_type url
    main_url = 'https://www.tribunnews.com/index-news'
    if news_type != 'all':
        main_url = 'https://www.tribunnews.com/index-news/' + news_type

    # Set Headers
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    # Without a timeout a stalled connection would block the scraper forever.
    request_timeout = 30

    # Manage Start and End Dates
    if end_date is None:
        index_html = req.get(main_url, headers=header, timeout=request_timeout).text
        sleep(uniform(set_sleep, set_sleep + 1))
        latest_tag = bs(index_html, features='html.parser').find(class_='fbo2 grey ar')
        if latest_tag is None or not latest_tag.get('title'):
            print('Error! could not determine the latest publish date from', main_url)
            return
        # NOTE(review): assumes the title attribute starts with a
        # 'YYYY-MM-DD' date; strptime below raises ValueError otherwise.
        end_date = latest_tag['title'][0:10]
    if start_date is None:
        start_date = end_date
    start_date = datetime.strptime(start_date, '%Y-%m-%d').date()
    end_date = datetime.strptime(end_date, '%Y-%m-%d').date()
    if start_date > end_date:
        print('Error! end_date cannot be earlier than start_date!')
        return

    # Loop through dates (newest first), collect articles.
    # Each index entry is stored in 'list_news'.
    list_news = []
    limit_reached = False
    for offset in range((end_date - start_date).days + 1):
        day = end_date - timedelta(days=offset)
        # Zero-padded month, unpadded day: this is what the original
        # Windows-only '%Y-%m-%#d' directive produced, built portably.
        current_date = '{:04d}-{:02d}-{}'.format(day.year, day.month, day.day)

        page_index = 1
        while True:
            current_url = main_url + '?date=' + current_date + '&page=' + str(page_index)
            page_index += 1
            print('Accessing', current_date, ' - ', current_url)
            try:
                page_html = req.get(current_url, headers=header, timeout=request_timeout).text
            except req.exceptions.RequestException as err:
                # Best effort: give up on this date but keep what was collected.
                print('Request failed for', current_url, '-', err)
                break
            sleep(uniform(set_sleep, set_sleep + 1))

            new_news = bs(page_html, features='html.parser').find_all(class_='ptb15')
            if not new_news:
                print('All articles scanned for', current_date, '!')
                break
            list_news.extend(new_news)

            # Check if max_num_articles is reached ('>=' so that no extra
            # page is requested once exactly enough entries were collected).
            if max_num_articles != -1 and len(list_news) >= max_num_articles:
                print('num_articles reached!')
                del list_news[max_num_articles:]
                limit_reached = True
                break
        if limit_reached:
            break

    # Loop through articles, collect title, url, etc.
    # Every enabled list receives exactly one value per article.
    list_article_title = []
    list_article_url = []
    list_article_thumbnail = []
    list_article_content = []
    list_article_author = []
    list_article_publish_date = []
    need_article_page = get_content or get_author or get_thumbnail
    missing_element_errors = (AttributeError, TypeError, KeyError)

    for news in list_news:
        headline = news.find('h3', class_='f16 fbo')
        link = headline.find('a', href=True) if headline is not None else None
        # The URL is resolved even when get_url is False, because the
        # article page request below depends on it.
        url = link['href'] if link is not None else ''

        if get_title:
            list_article_title.append(headline.text.strip() if headline is not None else '')
        if get_url:
            list_article_url.append(url)
        if get_publish_date:
            date_tag = news.find(class_='grey')
            list_article_publish_date.append(date_tag.text if date_tag is not None else '')

        if not need_article_page:
            continue

        article_soup = None
        if url:
            print('Accessing article -', url)
            try:
                article_html = req.get(url, headers=header, timeout=request_timeout).text
                article_soup = bs(article_html, features='html.parser')
            except req.exceptions.RequestException as err:
                print('Request failed for', url, '-', err)
            sleep(uniform(set_sleep, set_sleep + 1))

        # When the page could not be fetched article_soup is None, so each
        # lookup below raises AttributeError and falls back to ''.
        if get_content:
            try:
                content = article_soup.find('div', class_='side-article txt-article multi-fontsize').text.strip()
            except missing_element_errors:
                content = ''
            list_article_content.append(content)
        if get_author:
            try:
                author = article_soup.find('div', class_='credit mt10').find('div', id='editor').text.strip()
            except missing_element_errors:
                author = ''
            list_article_author.append(author)
        if get_thumbnail:
            try:
                thumbnail = article_soup.find('img', class_='imgfull')['src']
            except missing_element_errors:
                thumbnail = ''
            list_article_thumbnail.append(thumbnail)

    print('Done!')

    # Store the result as a dataframe, keeping the original column order.
    columns = (
        (get_title, 'title', list_article_title),
        (get_url, 'url', list_article_url),
        (get_thumbnail, 'thumbnail', list_article_thumbnail),
        (get_content, 'content', list_article_content),
        (get_author, 'author', list_article_author),
        (get_publish_date, 'publish_date', list_article_publish_date),
    )
    result = {}
    for wanted, column, values in columns:
        if wanted:
            result[column] = values
    return pd.DataFrame(result)

### Usage

In [None]:
# Example: collect at most 5 'kesehatan' articles published between
# 2023-09-06 and 2023-09-09, including thumbnail, author, content and
# publish date in addition to the default title and URL columns.
get_news(
    news_type='kesehatan',
    max_num_articles=5,
    start_date='2023-09-06',
    end_date='2023-09-09',
    get_thumbnail=True,
    get_author=True,
    get_content=True,
    get_publish_date=True,
)