From c7b8a8af4f4727157a11ff79ee744d8ceaad1d9e Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 13 Jul 2017 20:28:14 +0500 Subject: [PATCH 1/8] add segment wiki script --- gensim/scripts/segment_wiki.py | 214 +++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100755 gensim/scripts/segment_wiki.py diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py new file mode 100755 index 0000000000..0bfb830e6c --- /dev/null +++ b/gensim/scripts/segment_wiki.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Jayant Jain +# Copyright (C) 2016 RaRe Technologies + +""" +Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it + +If you have the `pattern` package installed, this module will use a fancy +lemmatization to get a lemma of each token (instead of plain alphabetic +tokenizer). The package is available at https://github.com/clips/pattern . + +""" + +import argparse +import json +import logging +import multiprocessing +import os +import re +import sys +from xml.etree import cElementTree + +from gensim.corpora.wikicorpus import ARTICLE_MIN_WORDS, IGNORED_NAMESPACES, WikiCorpus, \ + filter_wiki, get_namespace, tokenize, utils +from smart_open import smart_open + + +def segment_all_articles(file_path): + """ + Extract article titles and sections from a MediaWiki bz2 database dump. + + Return an iterable over (str, list) which generates + (title, [(section_heading, section_content)]) 2-tuples. + + """ + with smart_open(file_path, 'rb') as xml_fileobj: + wiki_sections_corpus = WikiSectionsCorpus(xml_fileobj) + wiki_sections_corpus.metadata = True + wiki_sections_text = wiki_sections_corpus.get_texts_with_sections() + for article_title, article_sections in wiki_sections_text: + yield article_title, article_sections + + +def segment_and_print_all_articles(file_path): + """ + Prints article title and sections to stdout, tab-separated + article_titlesection_headingsection_contentsection_headingsection_content + + """ + for article_title, article_sections in segment_all_articles(file_path): + printed_components = [json.dumps(article_title)] + for section_heading, section_content in article_sections: + printed_components.append(json.dumps(section_heading)) + printed_components.append(json.dumps(section_content)) + os.write(sys.stdout.fileno(), u"\t".join(printed_components).encode('utf-8') + b"\n") + + +# noinspection PyUnresolvedReferences +def extract_page_xmls(f): + """ + Extract pages from a MediaWiki database dump = open file-like object `f`. + + Return an iterable which generates xml strings for page tags. + + """ + elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",))) + + elem = next(elems) + namespace = get_namespace(elem.tag) + ns_mapping = {"ns": namespace} + page_tag = "{%(ns)s}page" % ns_mapping + + for elem in elems: + if elem.tag == page_tag: + yield cElementTree.tostring(elem) + # Prune the element tree, as per + # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + # except that we don't need to prune backlinks from the parent + # because we don't use LXML. + # We do this only for s, since we need to inspect the + # ./revision/text element. The pages comprise the bulk of the + # file, so in practice we prune away enough. + elem.clear() + + +# noinspection PyUnresolvedReferences +def segment(page_xml): + """ + Parse the content inside a page tag, returning its content as a list of tokens + (utf8-encoded strings). 
+ + Returns a 2-tuple (str, list) - + (title, [(section_heading, section_content)]) + + """ + elem = cElementTree.fromstring(page_xml) + filter_namespaces = ('0',) + namespace = get_namespace(elem.tag) + ns_mapping = {"ns": namespace} + text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping + title_path = "./{%(ns)s}title" % ns_mapping + ns_path = "./{%(ns)s}ns" % ns_mapping + lead_section_heading = "Introduction" + top_level_heading_regex = r"\n==[^=].*[^=]==\n" + top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n" + + title = elem.find(title_path).text + text = elem.find(text_path).text + ns = elem.find(ns_path).text + if ns not in filter_namespaces: + text = None + + if text is not None: + section_contents = re.split(top_level_heading_regex, text) + section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) + assert(len(section_contents) == len(section_headings)) + else: + section_contents = [] + section_headings = [] + + section_contents = [filter_wiki(section_content) for section_content in section_contents] + sections = list(zip(section_headings, section_contents)) + return title, sections + + +# noinspection PyUnresolvedReferences,PyMissingConstructor,PyAttributeOutsideInit,PyAbstractClass,PyUnusedLocal +class WikiSectionsCorpus(WikiCorpus): + """ + Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. + + The documents are extracted on-the-fly, so that the whole (massive) dump + can stay compressed on disk. + + >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h + >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word + + """ + def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + """ + Initialize the corpus. Unless a dictionary is provided, this scans the + corpus once, to determine its vocabulary. + + If `pattern` package is installed, use fancier shallow parsing to get + token lemmas. Otherwise, use simple regexp tokenization. You can override + this automatic logic by forcing the `lemmatize` parameter explicitly. + + """ + self.fileobj = fileobj + self.filter_namespaces = filter_namespaces + self.metadata = False + if processes is None: + processes = max(1, multiprocessing.cpu_count() - 1) + self.processes = processes + self.lemmatize = lemmatize + + def get_texts_with_sections(self): + """ + Iterate over the dump, returning titles and text versions of all sections of articles as a list + of 2-tuples [(article_title, [(section_heading, section_content)]]. + + Only articles of sufficient length are returned (short articles & redirects + etc are ignored). + + Note that this iterates over the **texts**; if you want vectors, just use + the standard corpus interface instead of this function:: + + >>> for vec in wiki_corpus: + >>> print(vec) + """ + articles = 0 + page_xmls = extract_page_xmls(self.fileobj) + pool = multiprocessing.Pool(self.processes) + # process the corpus in smaller chunks of docs, because multiprocessing.Pool + # is dumb and would load the entire input into RAM at once... 
+ for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1): + for article_title, sections in pool.imap(segment, group): # chunksize=10): + # article redirects and short stubs are pruned here + num_total_tokens = 0 + for section_title, section_content in sections: + if self.lemmatize: + num_total_tokens += len(utils.lemmatize(section_content)) + else: + num_total_tokens += len(tokenize(section_content)) + if num_total_tokens < ARTICLE_MIN_WORDS or any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + continue + articles += 1 + yield (article_title, sections) + pool.terminate() + self.length = articles # cache corpus length +# endclass WikiSectionsCorpus + + +logger = logging.getLogger(__name__) + +if __name__ == "__main__": + logging_format = '%(asctime)s : %(processName)s : %(levelname)s : %(message)s' + logging_level = logging.INFO + logging.basicConfig(format=logging_format, level=logging_level) + logger.info("running %s", " ".join(sys.argv)) + + program = os.path.basename(sys.argv[0]) + parser = argparse.ArgumentParser( + prog=program, + formatter_class=argparse.RawTextHelpFormatter, + description=globals()['__doc__']) + parser.add_argument( + '-f', '--file', + help='path to mediawiki database dump') + args = parser.parse_args() + segment_and_print_all_articles(args.file) + + logger.info("finished running %s", program) From 11691bb3a3088a7a9becf088b20fc896a0cacc3a Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 13 Jul 2017 22:43:41 +0500 Subject: [PATCH 2/8] fix indentation error --- gensim/scripts/segment_wiki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 0bfb830e6c..8b426b61d7 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -184,7 +184,7 @@ def get_texts_with_sections(self): else: num_total_tokens += len(tokenize(section_content)) if num_total_tokens < ARTICLE_MIN_WORDS or any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): - continue + continue articles += 1 yield (article_title, sections) pool.terminate() From fb83ef2e518edc0b64c667fcbc3b6f0c81055274 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 6 Oct 2017 16:56:28 +0500 Subject: [PATCH 3/8] Add output file and logging + small fixes --- gensim/scripts/segment_wiki.py | 48 ++++++++++++++-------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 8b426b61d7..e2ba1f5ab5 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -17,7 +17,6 @@ import json import logging import multiprocessing -import os import re import sys from xml.etree import cElementTree @@ -27,6 +26,9 @@ from smart_open import smart_open +logger = logging.getLogger(__name__) + + def segment_all_articles(file_path): """ Extract article titles and sections from a MediaWiki bz2 database dump. 
@@ -43,21 +45,23 @@ def segment_all_articles(file_path): yield article_title, article_sections -def segment_and_print_all_articles(file_path): +def segment_and_print_all_articles(file_path, output_file): """ Prints article title and sections to stdout, tab-separated article_titlesection_headingsection_contentsection_headingsection_content """ - for article_title, article_sections in segment_all_articles(file_path): - printed_components = [json.dumps(article_title)] - for section_heading, section_content in article_sections: - printed_components.append(json.dumps(section_heading)) - printed_components.append(json.dumps(section_content)) - os.write(sys.stdout.fileno(), u"\t".join(printed_components).encode('utf-8') + b"\n") + with open(output_file, 'wb') as outfile: + for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)): + printed_components = [json.dumps(article_title)] + for section_heading, section_content in article_sections: + printed_components.append(json.dumps(section_heading)) + printed_components.append(json.dumps(section_content)) + if (idx + 1) % 100000 == 0: + logger.info("Processed #%d articles", idx + 1) + outfile.write(u"\t".join(printed_components).encode('utf-8') + "\n") -# noinspection PyUnresolvedReferences def extract_page_xmls(f): """ Extract pages from a MediaWiki database dump = open file-like object `f`. @@ -85,7 +89,6 @@ def extract_page_xmls(f): elem.clear() -# noinspection PyUnresolvedReferences def segment(page_xml): """ Parse the content inside a page tag, returning its content as a list of tokens @@ -125,7 +128,6 @@ def segment(page_xml): return title, sections -# noinspection PyUnresolvedReferences,PyMissingConstructor,PyAttributeOutsideInit,PyAbstractClass,PyUnusedLocal class WikiSectionsCorpus(WikiCorpus): """ Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. @@ -137,7 +139,7 @@ class WikiSectionsCorpus(WikiCorpus): >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word """ - def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. 
@@ -189,26 +191,16 @@ def get_texts_with_sections(self): yield (article_title, sections) pool.terminate() self.length = articles # cache corpus length -# endclass WikiSectionsCorpus -logger = logging.getLogger(__name__) - if __name__ == "__main__": - logging_format = '%(asctime)s : %(processName)s : %(levelname)s : %(message)s' - logging_level = logging.INFO - logging.basicConfig(format=logging_format, level=logging_level) + logging.basicConfig(format='%(asctime)s : %(processName)s : %(levelname)s : %(message)s', level=logging.INFO) logger.info("running %s", " ".join(sys.argv)) - program = os.path.basename(sys.argv[0]) - parser = argparse.ArgumentParser( - prog=program, - formatter_class=argparse.RawTextHelpFormatter, - description=globals()['__doc__']) - parser.add_argument( - '-f', '--file', - help='path to mediawiki database dump') + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) + parser.add_argument('-f', '--file', help='path to mediawiki database dump', required=True) + parser.add_argument('-o', '--output', help='path to output file', required=True) args = parser.parse_args() - segment_and_print_all_articles(args.file) + segment_and_print_all_articles(args.file, args.output) - logger.info("finished running %s", program) + logger.info("finished running %s", sys.argv[0]) From 102c0df7468d9a9f4d73d01fd9dcc672a5b636e3 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 6 Oct 2017 18:19:10 +0500 Subject: [PATCH 4/8] add smart_open --- gensim/scripts/segment_wiki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index e2ba1f5ab5..60239e0f2d 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -51,7 +51,7 @@ def segment_and_print_all_articles(file_path, output_file): article_titlesection_headingsection_contentsection_headingsection_content """ - with open(output_file, 'wb') as outfile: + with smart_open(output_file, 'wb') as outfile: for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)): printed_components = [json.dumps(article_title)] for section_heading, section_content in article_sections: From ef3b094d9422c4c35d46c03e7b8682dd46a71890 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 13:55:53 +0500 Subject: [PATCH 5/8] Add numpy-style docstrings & fix .rst --- docs/src/apiref.rst | 1 + docs/src/scripts/segment_wiki.rst | 9 +++ gensim/scripts/segment_wiki.py | 106 ++++++++++++++++++++---------- 3 files changed, 83 insertions(+), 33 deletions(-) create mode 100644 docs/src/scripts/segment_wiki.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 80bfd8547a..3538dca954 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -79,6 +79,7 @@ Modules: scripts/make_wiki_online_lemma scripts/make_wiki_online_nodebug scripts/word2vec2tensor + scripts/segment_wiki parsing/porter parsing/preprocessing summarization/bm25 diff --git a/docs/src/scripts/segment_wiki.rst b/docs/src/scripts/segment_wiki.rst new file mode 100644 index 0000000000..b5e7070aaa --- /dev/null +++ b/docs/src/scripts/segment_wiki.rst @@ -0,0 +1,9 @@ +:mod:`scripts.segment_wiki` -- Convert wikipedia dump to plain text format +========================================================================== + +.. automodule:: gensim.scripts.segment_wiki + :synopsis: Convert wikipedia dump to plain text format. 
+ :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 60239e0f2d..2c163d9555 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -30,11 +30,18 @@ def segment_all_articles(file_path): - """ - Extract article titles and sections from a MediaWiki bz2 database dump. + """Extract article titles and sections from a MediaWiki bz2 database dump. + + Parameters + ---------- + file_path : str + Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2. - Return an iterable over (str, list) which generates - (title, [(section_heading, section_content)]) 2-tuples. + Yields + ------ + tuple(str, list of tuple(str, str)) + Structure contains (title, [(section_heading, section_content), ...]). """ with smart_open(file_path, 'rb') as xml_fileobj: @@ -46,9 +53,17 @@ def segment_all_articles(file_path): def segment_and_print_all_articles(file_path, output_file): - """ - Prints article title and sections to stdout, tab-separated - article_titlesection_headingsection_contentsection_headingsection_content + """Write article title and sections to output_file, + tab-separated article_titlesection_headingsection_contentsection_headingsection_content. + + Parameters + ---------- + file_path : str + Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2. + + output_file : str + Path to output file. """ with smart_open(output_file, 'wb') as outfile: @@ -63,10 +78,17 @@ def segment_and_print_all_articles(file_path, output_file): def extract_page_xmls(f): - """ - Extract pages from a MediaWiki database dump = open file-like object `f`. + """Extract pages from a MediaWiki database dump. - Return an iterable which generates xml strings for page tags. + Parameters + ---------- + f : file + File descriptor of MediaWiki dump. + + Yields + ------ + str + XML strings for page tags. """ elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",))) @@ -90,12 +112,17 @@ def extract_page_xmls(f): def segment(page_xml): - """ - Parse the content inside a page tag, returning its content as a list of tokens - (utf8-encoded strings). + """Parse the content inside a page tag - Returns a 2-tuple (str, list) - - (title, [(section_heading, section_content)]) + Parameters + ---------- + page_xml : str + Content from page tag. + + Returns + ------- + tuple(str, list of tuple(str, str)) + Structure contains (title, [(section_heading, section_content)]). """ elem = cElementTree.fromstring(page_xml) @@ -118,7 +145,7 @@ def segment(page_xml): if text is not None: section_contents = re.split(top_level_heading_regex, text) section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) - assert(len(section_contents) == len(section_headings)) + assert len(section_contents) == len(section_headings) else: section_contents = [] section_headings = [] @@ -129,25 +156,31 @@ def segment(page_xml): class WikiSectionsCorpus(WikiCorpus): - """ - Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. + """Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. - The documents are extracted on-the-fly, so that the whole (massive) dump - can stay compressed on disk. 
- - >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h - >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. """ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)): - """ - Initialize the corpus. Unless a dictionary is provided, this scans the + """Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. - If `pattern` package is installed, use fancier shallow parsing to get - token lemmas. Otherwise, use simple regexp tokenization. You can override + . You can override this automatic logic by forcing the `lemmatize` parameter explicitly. + Parameters + ---------- + fileobj : file + File descriptor of MediaWiki dump. + processes : int + Number of processes, max(1, multiprocessing.cpu_count() - 1) if None. + lemmatize : bool + If `pattern` package is installed, use fancier shallow parsing to get token lemmas. + Otherwise, use simple regexp tokenization. + filter_namespaces : tuple(int) + Enumeration of namespaces that will be ignored. + """ self.fileobj = fileobj self.filter_namespaces = filter_namespaces @@ -158,10 +191,10 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte self.lemmatize = lemmatize def get_texts_with_sections(self): - """ - Iterate over the dump, returning titles and text versions of all sections of articles as a list - of 2-tuples [(article_title, [(section_heading, section_content)]]. + """Iterate over the dump, returning titles and text versions of all sections of articles. + Notes + ----- Only articles of sufficient length are returned (short articles & redirects etc are ignored). @@ -170,6 +203,12 @@ def get_texts_with_sections(self): >>> for vec in wiki_corpus: >>> print(vec) + + Yields + ------ + tuple(str, list of tuple(str, str)) + Structure contains (title, [(section_heading, section_content), ...]). 
+ """ articles = 0 page_xmls = extract_page_xmls(self.fileobj) @@ -185,7 +224,8 @@ def get_texts_with_sections(self): num_total_tokens += len(utils.lemmatize(section_content)) else: num_total_tokens += len(tokenize(section_content)) - if num_total_tokens < ARTICLE_MIN_WORDS or any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + if num_total_tokens < ARTICLE_MIN_WORDS or \ + any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): continue articles += 1 yield (article_title, sections) @@ -198,8 +238,8 @@ def get_texts_with_sections(self): logger.info("running %s", " ".join(sys.argv)) parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) - parser.add_argument('-f', '--file', help='path to mediawiki database dump', required=True) - parser.add_argument('-o', '--output', help='path to output file', required=True) + parser.add_argument('-f', '--file', help='Path to mediawiki database dump', required=True) + parser.add_argument('-o', '--output', help='Path to output file', required=True) args = parser.parse_args() segment_and_print_all_articles(args.file, args.output) From 8eda36b23bd78ede2374904a5157f4f5db9d6f18 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 15:24:23 +0500 Subject: [PATCH 6/8] Fix types --- gensim/scripts/segment_wiki.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 2c163d9555..4c8af2faee 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -40,7 +40,7 @@ def segment_all_articles(file_path): Yields ------ - tuple(str, list of tuple(str, str)) + (str, list of (str, str)) Structure contains (title, [(section_heading, section_content), ...]). """ @@ -121,7 +121,7 @@ def segment(page_xml): Returns ------- - tuple(str, list of tuple(str, str)) + (str, list of (str, str)) Structure contains (title, [(section_heading, section_content)]). """ @@ -163,12 +163,7 @@ class WikiSectionsCorpus(WikiCorpus): """ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)): - """Initialize the corpus. Unless a dictionary is provided, this scans the - corpus once, to determine its vocabulary. - - . You can override - this automatic logic by forcing the `lemmatize` parameter explicitly. - + """ Parameters ---------- fileobj : file @@ -178,7 +173,7 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte lemmatize : bool If `pattern` package is installed, use fancier shallow parsing to get token lemmas. Otherwise, use simple regexp tokenization. - filter_namespaces : tuple(int) + filter_namespaces : tuple of int Enumeration of namespaces that will be ignored. """ @@ -206,7 +201,7 @@ def get_texts_with_sections(self): Yields ------ - tuple(str, list of tuple(str, str)) + (str, list of (str, str)) Structure contains (title, [(section_heading, section_content), ...]). 
""" From e40f8c9d4dc50d7bdbea94cfc3cabbb9981afd95 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 16:43:42 +0500 Subject: [PATCH 7/8] Fix docstrings + output file format (json-lines) --- gensim/scripts/segment_wiki.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 4c8af2faee..132907d7b0 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -6,6 +6,7 @@ """ Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it +and save to json-line format. If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic @@ -35,7 +36,7 @@ def segment_all_articles(file_path): Parameters ---------- file_path : str - Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2. Yields @@ -54,27 +55,31 @@ def segment_all_articles(file_path): def segment_and_print_all_articles(file_path, output_file): """Write article title and sections to output_file, - tab-separated article_titlesection_headingsection_contentsection_headingsection_content. + output_file is json-line file with 3 fields:: + + 'tl' - title of article, + 'st' - list of titles of sections, + 'sc' - list of content from sections. Parameters ---------- file_path : str - Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2. output_file : str Path to output file. """ - with smart_open(output_file, 'wb') as outfile: + with smart_open(output_file, 'w') as outfile: for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)): - printed_components = [json.dumps(article_title)] + output_data = {"tl": article_title, "st": [], "sc": []} for section_heading, section_content in article_sections: - printed_components.append(json.dumps(section_heading)) - printed_components.append(json.dumps(section_content)) + output_data["st"].append(section_heading) + output_data["sc"].append(section_content) if (idx + 1) % 100000 == 0: logger.info("Processed #%d articles", idx + 1) - outfile.write(u"\t".join(printed_components).encode('utf-8') + "\n") + outfile.write(json.dumps(output_data) + "\n") def extract_page_xmls(f): @@ -233,7 +238,7 @@ def get_texts_with_sections(self): logger.info("running %s", " ".join(sys.argv)) parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) - parser.add_argument('-f', '--file', help='Path to mediawiki database dump', required=True) + parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True) parser.add_argument('-o', '--output', help='Path to output file', required=True) args = parser.parse_args() segment_and_print_all_articles(args.file, args.output) From 1f923e2c1d2e722c2e71f1223f71fb980d866a67 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 16:49:45 +0500 Subject: [PATCH 8/8] Upd .rst --- docs/src/scripts/segment_wiki.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/scripts/segment_wiki.rst b/docs/src/scripts/segment_wiki.rst index b5e7070aaa..1c1846c8e5 100644 --- a/docs/src/scripts/segment_wiki.rst +++ 
b/docs/src/scripts/segment_wiki.rst @@ -1,8 +1,8 @@ -:mod:`scripts.segment_wiki` -- Convert wikipedia dump to plain text format -========================================================================== +:mod:`scripts.segment_wiki` -- Convert wikipedia dump to json-line format +========================================================================= .. automodule:: gensim.scripts.segment_wiki - :synopsis: Convert wikipedia dump to plain text format. + :synopsis: Convert wikipedia dump to json-line format. :members: :inherited-members: :undoc-members:
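
For reference, a minimal sketch of driving the final version of the script and consuming its json-lines output (the 'tl' / 'st' / 'sc' fields introduced in PATCH 7/8). The dump and output file names below are placeholders rather than files referenced by the patches, and the invocation assumes you run the script from a gensim checkout:

    # Produce the output first, e.g.:
    #   python gensim/scripts/segment_wiki.py -f enwiki-latest-pages-articles.xml.bz2 -o enwiki.json.gz
    import json

    from smart_open import smart_open  # same helper the script itself uses for I/O

    with smart_open('enwiki.json.gz', 'rb') as fin:
        for line in fin:
            article = json.loads(line.decode('utf-8'))
            # 'tl' is the article title; 'st' and 'sc' are parallel lists of
            # section headings and section contents.
            print(article['tl'])
            for heading, content in zip(article['st'], article['sc']):
                print('    %s (%d chars)' % (heading, len(content)))

Because every output line is a single self-contained article, the file can be streamed this way without loading the whole corpus into memory, and a .gz or .bz2 output path is handled transparently by smart_open.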