In [1]:
# Sample article URL; the next cell passes it to download_article as a quick check.
url = 'https://www.gpb.org/news/2020/08/03/fulton-countys-distribution-of-covid-19-relief-funds-unconscionable-local-mayor'
from tqdm.auto import tqdm

def download_article(url):
    """Download and parse one article with the ``newspaper`` package.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.

    Returns
    -------
    dict or None
        A dict with the keys ``"title"``, ``"text"``, ``"top_image"`` and
        ``"url"`` when the article was downloaded and parsed; ``None`` when
        either step raised an exception (a message is printed in that case).
    """
    from newspaper import Article
    article = Article(url)
    try:
        article.download()
        article.parse()
    except Exception:
        # Best-effort scraping: report the failure and let the caller skip it.
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long download loop.
        print(f"{url} cannot be downloaded or parsed.")
        return None
    return {
        "title": article.title,
        "text": article.text,
        "top_image": article.top_image,
        "url": url,
    }

In [2]:
# Try the downloader on the sample `url`; `result` is a dict, or None if the fetch/parse failed.
result = download_article(url)

In [3]:
def get_urls_from_api(query,language,startdatetime,enddatetime,maxrecords=250):
    """Return de-duplicated article URLs from the GDELT doc API (``artlist`` mode).

    Parameters
    ----------
    query : str
        Search keywords. Characters that are not URL-safe (e.g. spaces) are
        percent-encoded; ``%`` is left untouched so an already-encoded query
        is not encoded twice.
    language : str
        Value placed after ``sourcelang:`` in the request.
    startdatetime, enddatetime : str
        Sent as ``STARTDATETIME`` / ``ENDDATETIME``; the notebook passes
        14-digit strings such as ``"20201001000000"``.
    maxrecords : int, optional
        Sent as ``MAXRECORDS`` (default 250).

    Returns
    -------
    list of str
        URLs in the order the API listed them, each appearing once. Empty
        when the response is empty or carries no ``"articles"`` entry.

    Raises
    ------
    urllib.error.URLError
        If the HTTP request fails.
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    import json
    import urllib.parse
    # `import urllib` alone does not load the `request` submodule, so import it explicitly.
    import urllib.request

    encoded_query = urllib.parse.quote(str(query), safe="%:")
    url = f"https://api.gdeltproject.org/api/v2/doc/doc?query={encoded_query}%20sourcelang:{language}&mode=artlist&STARTDATETIME={startdatetime}&ENDDATETIME={enddatetime}&MAXRECORDS={maxrecords}&format=json"
    # Context manager closes the HTTP response even if reading/decoding fails.
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read().decode())
    if not data:
        return []
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(article['url'] for article in data.get('articles', [])))

In [4]:
# Example lookup: keyword "BOTULISM", source language SWAHILI, over 2017.
query = "BOTULISM"
language = "SWAHILI"
startdatetime = "20170101000000"  # looks like YYYYMMDDHHMMSS — confirm against the API docs
enddatetime = "20171231235959"
# Keep only the first three URLs for a quick look.
urls = get_urls_from_api(query,language,startdatetime,enddatetime)[:3]
urls

[]

In [5]:
def get_articles(query,language,startdatetime,enddatetime):
    """Fetch every downloadable article matching a query in one language.

    Looks up candidate URLs with ``get_urls_from_api``, downloads each with
    ``download_article`` (showing a tqdm progress bar), and tags every
    successfully downloaded article dict with a ``"language"`` key.

    Returns a list of article dicts; URLs whose download yielded a falsy
    result are skipped.
    """
    candidate_urls = get_urls_from_api(query,language,startdatetime,enddatetime)
    print(f"Find {len(candidate_urls)} urls for query [{query}] in {language}.")
    collected = []
    for link in tqdm(candidate_urls):
        fetched = download_article(link)
        if not fetched:
            # Download/parse failed for this link; move on to the next one.
            continue
        fetched["language"] = language
        collected.append(fetched)
    return collected

In [11]:
import pandas as pd
from tqdm.auto import tqdm

# Source languages to crawl; each entry is passed to get_articles as `language`.
# NOTE(review): 'chinese' is lowercase unlike the other entries — confirm the API
# matches language names case-insensitively.
languages = ['ENGLISH', 'chinese', 'SPANISH', 'ARABIC', 'FRENCH', 'PORTUGUESE', 'RUSSIAN','INDONESIAN', 'PERSIAN']
query = "coronavirus"
startdatetime = "20201001000000"
enddatetime = "20201014000000"
# 'SWAHILI' is not supported, so it is left out of `languages`.
articles = []
# Gather the per-language article dicts into one flat list (outer tqdm tracks languages).
for language in tqdm(languages):
    articles += get_articles(query,language,startdatetime,enddatetime)
articles_df = pd.DataFrame(articles)
# Output file name records the query and the time window; written to the working directory.
articles_df.to_csv(f"query_{query}_{startdatetime}_{enddatetime}.csv")

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

Find 250 urls for query [coronavirus] in ENGLISH.


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

https://www.11alive.com/article/news/health/coronavirus/coronavirus-covid-19-maryland-dc-virginia-vaccine-reopening/65-b1bcce21-f1e6-4e01-a928-8501af45d657 cannot be downloaded or parsed.
https://www.wltx.com/article/news/health/coronavirus/coronavirus-covid-19-maryland-dc-virginia-vaccine-reopening/65-b1bcce21-f1e6-4e01-a928-8501af45d657 cannot be downloaded or parsed.
https://www.wwltv.com/article/news/health/coronavirus/coronavirus-covid-19-maryland-dc-virginia-vaccine-reopening/65-b1bcce21-f1e6-4e01-a928-8501af45d657 cannot be downloaded or parsed.
https://patch.com/michigan/troy/covid-19/2020-10-13 cannot be downloaded or parsed.
https://patch.com/rhode-island/cranston/covid-19/2020-10-02 cannot be downloaded or parsed.
https://www.wthr.com/article/news/health/coronavirus/latest-indiana-coronavirus-updates-saturday-october-3-2020/531-c85c80ef-5fb6-469e-9463-a2ef5f6cb381 cannot be downloaded or parsed.
https://www.wcvb.com/article/pfizer-to-start-testing-its-covid-19-vaccine-in-chi

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Find 250 urls for query [coronavirus] in SPANISH.


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

https://www.elcomercio.es/asturias/coronavirus-asturias-hospital-cabuenes-cruz-roja-pacientes-20201010211514-nt.html cannot be downloaded or parsed.
https://www.lasprovincias.es/comunitat/nueve-brotes-nuevos-20201013190117-nt.html cannot be downloaded or parsed.
https://www.telecinco.es/informativos/internacional/trump-minimizado-coronavirus-antes-contagiarse_18_3021120061.html cannot be downloaded or parsed.
https://www.ideal.es/sociedad/lavarse-dientes-puede-evitar-coronavirus-20201014134304-nt.html cannot be downloaded or parsed.
https://www.elcomercio.es/asturias/coronavirus-asturias-positivo-colegio-montevil-gijon-20201013005142-nt.html cannot be downloaded or parsed.
https://www.elcorreo.com/alava/araba/alava-cierra-semana-casos-coronavirus-20201012124249-nt.html cannot be downloaded or parsed.
https://www.ideal.es/granada/cierran-iglesia-granada-positivo-coronavirus-fraile-20201007131747-nt.html cannot be downloaded or parsed.
https://www.ideal.es/sociedad/llega-sindemia-covid19

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

https://www.almasryalyoum.com/news/details/2059529 cannot be downloaded or parsed.

Find 250 urls for query [coronavirus] in FRENCH.


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

https://www.ouest-france.fr/monde/coree-du-nord/kim-jong-un/coronavirus-le-dirigeant-nord-coreen-kim-jong-un-affirme-qu-il-n-y-a-aucun-cas-dans-son-pays-7009177 cannot be downloaded or parsed.
https://www.challenges.fr/monde/coronavirus-seize-cas-supplementaires-en-chine-continentale_730817 cannot be downloaded or parsed.
https://www.challenges.fr/monde/coronavirus-madrid-sur-le-point-de-se-reconfiner_730629 cannot be downloaded or parsed.
https://fr.allafrica.com/stories/202010141105.html cannot be downloaded or parsed.

Find 250 urls for query [coronavirus] in PORTUGUESE.


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))


Find 250 urls for query [coronavirus] in RUSSIAN.


HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))

http://news.bigmir.net/world/2034822-Y-vice-prezidenta-SShA-soobshili-itog-testa-na-COVID-19- cannot be downloaded or parsed.
https://www.dw.com/ru/jekonomika-frg-iz-za-koronavirusa-obvalitsja-silnee-chem-ozhidalos/a-55272193 cannot be downloaded or parsed.
http://ivona.bigmir.net/showbiz/news/491326-Donal-d-i-Melanija-Tramp-zarazilis--COVID-19 cannot be downloaded or parsed.

Find 0 urls for query [coronavirus] in INDONESIAN.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Find 4 urls for query [coronavirus] in PERSIAN.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))



