import requests


def get_text_from_wikipedia(title, wiki_domain='en.wikipedia.org', timeout=30):
    """
    Retrieve Wikipedia article content as plain-text

    The current revision of the article is requested as wikitext from the
    MediaWiki query API of ``wiki_domain`` and the markup is then removed
    with gensim's ``filter_wiki``.

    :param title: Title of Wikipedia article
    :param wiki_domain: API domain (e.g., en.wikipedia.org)
    :param timeout: Seconds to wait for the API before the request is aborted
    :return: Article content as plain-text
    :raises ValueError: If the API returns no article with content for the
        title, or returns more than one article
    :raises requests.RequestException: If the HTTP request fails, times out
        or the API answers with an error status code
    """
    # Function-scope import: gensim is only loaded when the function is called.
    from gensim.corpora.wikicorpus import filter_wiki

    res = requests.get(
        f'https://{wiki_domain}/w/api.php',
        params={
            "action": "query",
            "prop": "revisions",
            "rvprop": "content",
            "format": "json",
            "titles": title,
            "rvslots": "main",
        },
        timeout=timeout,
    )
    # Fail on HTTP errors here instead of on a confusing JSON/key error below.
    res.raise_for_status()

    # The 'query' key can be absent (e.g. for an empty title); treat that the
    # same way as an empty result.
    pages = list(res.json().get('query', {}).get('pages', {}).values())

    if not pages:
        raise ValueError(f'Cannot find Wikipedia article: {title}')
    if len(pages) > 1:
        raise ValueError(f'Wikipedia article title is ambiguous. Multiple articles returned from API: {title}')

    page = pages[0]

    # Unknown or invalid titles still yield a page entry, but one that carries
    # no revisions, so the emptiness check above does not catch them.
    revisions = page.get('revisions')
    if not revisions:
        raise ValueError(f'Cannot find Wikipedia article: {title}')

    # With rvslots=main and the default JSON format the wikitext of the
    # revision is stored under slots -> main -> '*'.
    wikitext = revisions[0]['slots']['main']['*']

    return filter_wiki(wikitext).strip()