### Libraries

In [3]:
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from random import randint

### Main Function

In [84]:
# Seconds to wait for the server before giving up on a single HTTP request.
_REQUEST_TIMEOUT_SECONDS = 30


def _is_plain_int(value):
    """Return True when *value* is an int but not a bool (bool subclasses int)."""
    return isinstance(value, int) and not isinstance(value, bool)


def _extract_text(soup, class_name, tag_name=None):
    """Return the stripped text of the first matching element, or "" if absent."""
    if soup is None:
        return ""
    node = soup.find(tag_name, class_=class_name)
    return node.text.strip() if node is not None else ""


def get_news(num_pages=1,
             num_articles=-1,
             get_title=True,
             get_url=True,
             get_thumbnail=False,
             get_content=False,
             get_author=False,
             get_publish_date=False,
             news_type='newest',
             set_sleep=1,
             debug=True
):
    """Scrape article listings from the suarajatimpost.com index pages.

    Index pages are requested one by one and the article cards found on them
    are collected. For each card the requested fields are extracted; content,
    author and publish date require one extra request per article.

    Args:
        num_pages: Number of index pages to request; each page may contain
            many articles. Must be a positive integer.
        num_articles: Maximum number of articles to collect across all pages.
            May not be reached when the requested pages hold fewer articles.
            Must be a positive integer, or -1 to keep every article found.
        get_title: Include the article title. Must be boolean.
        get_url: Include the article URL. Must be boolean.
        get_thumbnail: Include the thumbnail URL. Must be boolean.
        get_content: Include the article text (requests the article page).
            Must be boolean.
        get_author: Include the author (requests the article page).
            Must be boolean.
        get_publish_date: Include the publish date (requests the article
            page). Must be boolean.
        news_type: Either 'newest' or 'popular'. The value is validated but
            is not otherwise used by this function.
        set_sleep: Base pause in seconds between requests; the actual pause
            is a random integer in [set_sleep, set_sleep + 1]. Must be a
            non-negative integer.
        debug: Currently not used.

    Returns:
        A pandas DataFrame with one row per article and one column per
        requested field ('title', 'url', 'thumbnail', 'content', 'author',
        'publish_date'). Fields that cannot be found are stored as "".
        Returns None (after printing a message) when an argument is invalid.
    """

    # Argument check: print the problem and return None, without raising.
    if not _is_plain_int(num_pages) or num_pages < 1:
        print('argument num_pages must be positive integer!')
        return
    if not _is_plain_int(num_articles) or (num_articles != -1 and num_articles < 1):
        print('argument num_articles must be positive integer or -1!')
        return
    get_flags = (get_title, get_url, get_thumbnail, get_content,
                 get_author, get_publish_date)
    if not all(isinstance(flag, bool) for flag in get_flags):
        print('argument get_ must be boolean!')
        return
    if not _is_plain_int(set_sleep) or set_sleep < 0:
        print('argument set_sleep must be non-negative integer!')
        return
    allowable_news_type = ['newest', 'popular']
    if news_type not in allowable_news_type:
        print('argument news_type must be one of the following:', allowable_news_type)
        return

    # Prepare lists to contain each element
    main_url = 'https://suarajatimpost.com/indeks'
    list_news = []
    list_article_title = []
    list_article_url = []
    list_article_thumbnail = []
    list_article_content = []
    list_article_author = []
    list_article_publish_date = []
    result = {}

    # Loop through pages and collect the article cards into 'list_news'.
    # NOTE(review): page numbering starts at 0 here, as in the original code;
    # confirm that '?page=0' and '?page=1' are distinct pages on the site.
    for i in range(num_pages):
        # Pause between consecutive page requests so the site is not spammed.
        if i > 0:
            sleep(randint(set_sleep, set_sleep + 1))

        current_url = main_url + '?page=' + str(i)
        print('Accessing page', i, ' - ', current_url)
        try:
            response = req.get(current_url, timeout=_REQUEST_TIMEOUT_SECONDS)
            soup = bs(response.text, features='html.parser')
            container = soup.find("div", class_="col-sm-12 col-md-12 col-lg-8")
            # AttributeError here means the expected container is missing.
            list_news.extend(container.find_all("div", class_="col-sm-12 col-md-6"))
        except (req.RequestException, AttributeError) as err:
            print('Warning: error in page', i, '-', err)
            break

        # Stop as soon as enough articles were gathered; 'list_news' is
        # cumulative, so its length is the running total.
        if num_articles != -1 and len(list_news) >= num_articles:
            print('num_articles reached!')
            list_news = list_news[:num_articles]
            break
    else:
        # Only reported when every requested page was processed.
        print('num_pages reached!')

    # Loop through articles and collect the requested fields. Every enabled
    # list receives exactly one entry per article so the columns stay aligned.
    needs_article_page = get_content or get_author or get_publish_date
    for news in list_news:
        title_tag = news.find("h3", class_="title")

        # The URL is resolved even when get_url is False because the article
        # page request below depends on it.
        link_tag = title_tag.find("a", href=True) if title_tag is not None else None
        url = link_tag['href'] if link_tag is not None else ""

        if get_title:
            list_article_title.append(title_tag.text if title_tag is not None else "")
        if get_url:
            list_article_url.append(url)
        if get_thumbnail:
            img_tag = news.find("img", class_="img-fluid lazyload")
            list_article_thumbnail.append(
                img_tag.get('data-src', "") if img_tag is not None else "")

        if needs_article_page:
            article_soup = None
            if url:
                print('Accessing article -', url)
                try:
                    article_response = req.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
                    article_soup = bs(article_response.text, features='html.parser')
                except req.RequestException as err:
                    # Keep going: the fields of this article become "".
                    print('Warning: error in article', url, '-', err)
                sleep(randint(set_sleep, set_sleep + 1))
            if get_content:
                list_article_content.append(
                    _extract_text(article_soup, 'post-text mt-4', 'div'))
            if get_author:
                list_article_author.append(
                    _extract_text(article_soup, 'item-meta item-meta-author'))
            if get_publish_date:
                list_article_publish_date.append(
                    _extract_text(article_soup, 'item-meta item-meta-date'))
    print('Done!')

    # Store the result as a dataframe
    if get_title:
        result['title'] = list_article_title
    if get_url:
        result['url'] = list_article_url
    if get_thumbnail:
        result['thumbnail'] = list_article_thumbnail
    if get_content:
        result['content'] = list_article_content
    if get_author:
        result['author'] = list_article_author
    if get_publish_date:
        result['publish_date'] = list_article_publish_date
    return pd.DataFrame(result)

### Usage

In [None]:
# Example: scrape the first two index pages and, besides the default title
# and URL, also collect thumbnail, content, author and publish date.
get_news(
    num_pages=2,
    get_thumbnail=True,
    get_content=True,
    get_author=True,
    get_publish_date=True,
)