In [None]:
# Import packages
from datetime import datetime
import pandas as pd
import newspaper
from newspaper import news_pool

# Create downloader function
# From newspaper3k documentation: https://newspaper.readthedocs.io/en/latest/
def news_downloader(lists, number, save_csv=True):
    """
    Download all articles found on the given news sites and collect them in one table.

    Every url in ``lists`` is built into a ``newspaper`` source, the sources are
    downloaded through ``news_pool``, and each article is parsed and passed
    through ``nlp()`` to obtain its summary and keywords. An article that fails
    at any of these steps is skipped; the number of skipped articles is printed.

    :param lists: iterable of newspaper urls
    :param number: code of the source; ``str(number)`` is the prefix of the file name
    :param save_csv: when True (default) and at least one article was collected,
        write the table to ``<number>_<mm-dd-yy_HH_MM>.csv`` in the current
        working directory
    :return: ``pandas.DataFrame`` with the columns 'Timestamp', 'Source',
        'Summary', 'Authors', 'Text' and 'Keywords', one row per article
        (rows numbered 0..n-1); an empty frame with the same columns when no
        url was given or no article could be collected
    """
    # NOTE(review): art.publish_date is not exported; add a 'Date Published'
    # column here if consumers of the CSV can accept a schema change.
    columns = ['Timestamp', 'Source', 'Summary', 'Authors', 'Text', 'Keywords']
    num = str(number)

    urls = list(lists)
    if not urls:
        # Nothing to build or download; hand back an empty table instead of failing.
        print('No urls given for News Downloader:', num)
        return pd.DataFrame(columns=columns)

    news_build = [newspaper.build(url, memoize_articles=False) for url in urls]
    news_pool.set(news_build)
    news_pool.join()

    rows = []
    skipped = 0
    for source in news_build:
        for art in source.articles:
            try:
                # The pool has normally fetched the html already; only download
                # again when it is missing, so articles are not fetched twice.
                if not art.html:
                    art.download()
                art.parse()
                art.nlp()

                date = datetime.now().strftime("%m-%d-%y_%H_%M")
                rows.append({
                    # The 'Timestamp' column holds the stamped file name.
                    'Timestamp': num + "_" + date + ".csv",
                    'Source': source.brand,
                    'Summary': art.summary,
                    'Authors': "|".join(art.authors),
                    'Text': art.text,
                    'Keywords': "|".join(art.keywords),
                })
            except Exception:
                # Best effort: one broken article must not abort the whole run.
                # KeyboardInterrupt / SystemExit are deliberately not caught.
                skipped += 1

    if skipped:
        print('Skipped articles:', skipped)

    merged_url_data = pd.DataFrame(rows, columns=columns)
    if not rows:
        print('No articles collected for News Downloader:', num)
        return merged_url_data

    # The output file is named after the stamp of the last collected article.
    filename = rows[-1]['Timestamp']
    print('Created News Downloader:', filename)
    if save_csv:
        merged_url_data.to_csv(filename, index=False)
    return merged_url_data

In [None]:
# Import time packages
import schedule
import time

# Create function to collect news from pre-defined url list
def collect_news():
    """
    Download the news for every source listed in ``code_url.csv``.

    The file is read from the current working directory without a header row;
    its two columns are taken as the source code and the source url. Each url
    is passed to ``news_downloader`` on its own, with the code as the file-name
    prefix. A source that fails is reported together with its error and
    skipped, so the remaining sources are still processed.

    :return: None
    """
    doc = pd.read_csv("code_url.csv", header=None)
    doc.columns = ["codes", "urls"]
    print(doc)

    for _, row in doc.iterrows():
        code = row["codes"]
        # news_downloader expects a list of urls, so wrap the single url.
        url = [str(row["urls"])]
        print(code, url)

        try:
            news_downloader(url, code)
        except Exception as exc:
            # Only ordinary errors are handled here; KeyboardInterrupt and
            # SystemExit propagate so the run can still be stopped by hand.
            print("ERROR:", url, type(exc).__name__, exc)
        else:
            print("Ok")

    print("The end!")
    return

# Register collect_news as a daily job at 08:00
daily_job = schedule.every().day.at("08:00")
daily_job.do(collect_news)

# Poll forever: run whatever job is due, then wait 60 seconds before checking again
while True:
    schedule.run_pending()
    time.sleep(60)