From c7b8a8af4f4727157a11ff79ee744d8ceaad1d9e Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 13 Jul 2017 20:28:14 +0500 Subject: [PATCH 1/8] add segment wiki script --- gensim/scripts/segment_wiki.py | 214 +++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100755 gensim/scripts/segment_wiki.py diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py new file mode 100755 index 0000000000..0bfb830e6c --- /dev/null +++ b/gensim/scripts/segment_wiki.py @@ -0,0 +1,214 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Author: Jayant Jain +# Copyright (C) 2016 RaRe Technologies + +""" +Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it + +If you have the `pattern` package installed, this module will use a fancy +lemmatization to get a lemma of each token (instead of plain alphabetic +tokenizer). The package is available at https://github.com/clips/pattern . + +""" + +import argparse +import json +import logging +import multiprocessing +import os +import re +import sys +from xml.etree import cElementTree + +from gensim.corpora.wikicorpus import ARTICLE_MIN_WORDS, IGNORED_NAMESPACES, WikiCorpus, \ + filter_wiki, get_namespace, tokenize, utils +from smart_open import smart_open + + +def segment_all_articles(file_path): + """ + Extract article titles and sections from a MediaWiki bz2 database dump. + + Return an iterable over (str, list) which generates + (title, [(section_heading, section_content)]) 2-tuples. + + """ + with smart_open(file_path, 'rb') as xml_fileobj: + wiki_sections_corpus = WikiSectionsCorpus(xml_fileobj) + wiki_sections_corpus.metadata = True + wiki_sections_text = wiki_sections_corpus.get_texts_with_sections() + for article_title, article_sections in wiki_sections_text: + yield article_title, article_sections + + +def segment_and_print_all_articles(file_path): + """ + Prints article title and sections to stdout, tab-separated + article_titlesection_headingsection_contentsection_headingsection_content + + """ + for article_title, article_sections in segment_all_articles(file_path): + printed_components = [json.dumps(article_title)] + for section_heading, section_content in article_sections: + printed_components.append(json.dumps(section_heading)) + printed_components.append(json.dumps(section_content)) + os.write(sys.stdout.fileno(), u"\t".join(printed_components).encode('utf-8') + b"\n") + + +# noinspection PyUnresolvedReferences +def extract_page_xmls(f): + """ + Extract pages from a MediaWiki database dump = open file-like object `f`. + + Return an iterable which generates xml strings for page tags. + + """ + elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",))) + + elem = next(elems) + namespace = get_namespace(elem.tag) + ns_mapping = {"ns": namespace} + page_tag = "{%(ns)s}page" % ns_mapping + + for elem in elems: + if elem.tag == page_tag: + yield cElementTree.tostring(elem) + # Prune the element tree, as per + # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ + # except that we don't need to prune backlinks from the parent + # because we don't use LXML. + # We do this only for s, since we need to inspect the + # ./revision/text element. The pages comprise the bulk of the + # file, so in practice we prune away enough. + elem.clear() + + +# noinspection PyUnresolvedReferences +def segment(page_xml): + """ + Parse the content inside a page tag, returning its content as a list of tokens + (utf8-encoded strings). 
+ + Returns a 2-tuple (str, list) - + (title, [(section_heading, section_content)]) + + """ + elem = cElementTree.fromstring(page_xml) + filter_namespaces = ('0',) + namespace = get_namespace(elem.tag) + ns_mapping = {"ns": namespace} + text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping + title_path = "./{%(ns)s}title" % ns_mapping + ns_path = "./{%(ns)s}ns" % ns_mapping + lead_section_heading = "Introduction" + top_level_heading_regex = r"\n==[^=].*[^=]==\n" + top_level_heading_regex_capture = r"\n==([^=].*[^=])==\n" + + title = elem.find(title_path).text + text = elem.find(text_path).text + ns = elem.find(ns_path).text + if ns not in filter_namespaces: + text = None + + if text is not None: + section_contents = re.split(top_level_heading_regex, text) + section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) + assert(len(section_contents) == len(section_headings)) + else: + section_contents = [] + section_headings = [] + + section_contents = [filter_wiki(section_content) for section_content in section_contents] + sections = list(zip(section_headings, section_contents)) + return title, sections + + +# noinspection PyUnresolvedReferences,PyMissingConstructor,PyAttributeOutsideInit,PyAbstractClass,PyUnusedLocal +class WikiSectionsCorpus(WikiCorpus): + """ + Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. + + The documents are extracted on-the-fly, so that the whole (massive) dump + can stay compressed on disk. + + >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h + >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word + + """ + def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + """ + Initialize the corpus. Unless a dictionary is provided, this scans the + corpus once, to determine its vocabulary. + + If `pattern` package is installed, use fancier shallow parsing to get + token lemmas. Otherwise, use simple regexp tokenization. You can override + this automatic logic by forcing the `lemmatize` parameter explicitly. + + """ + self.fileobj = fileobj + self.filter_namespaces = filter_namespaces + self.metadata = False + if processes is None: + processes = max(1, multiprocessing.cpu_count() - 1) + self.processes = processes + self.lemmatize = lemmatize + + def get_texts_with_sections(self): + """ + Iterate over the dump, returning titles and text versions of all sections of articles as a list + of 2-tuples [(article_title, [(section_heading, section_content)]]. + + Only articles of sufficient length are returned (short articles & redirects + etc are ignored). + + Note that this iterates over the **texts**; if you want vectors, just use + the standard corpus interface instead of this function:: + + >>> for vec in wiki_corpus: + >>> print(vec) + """ + articles = 0 + page_xmls = extract_page_xmls(self.fileobj) + pool = multiprocessing.Pool(self.processes) + # process the corpus in smaller chunks of docs, because multiprocessing.Pool + # is dumb and would load the entire input into RAM at once... 
+ for group in utils.chunkize(page_xmls, chunksize=10 * self.processes, maxsize=1): + for article_title, sections in pool.imap(segment, group): # chunksize=10): + # article redirects and short stubs are pruned here + num_total_tokens = 0 + for section_title, section_content in sections: + if self.lemmatize: + num_total_tokens += len(utils.lemmatize(section_content)) + else: + num_total_tokens += len(tokenize(section_content)) + if num_total_tokens < ARTICLE_MIN_WORDS or any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + continue + articles += 1 + yield (article_title, sections) + pool.terminate() + self.length = articles # cache corpus length +# endclass WikiSectionsCorpus + + +logger = logging.getLogger(__name__) + +if __name__ == "__main__": + logging_format = '%(asctime)s : %(processName)s : %(levelname)s : %(message)s' + logging_level = logging.INFO + logging.basicConfig(format=logging_format, level=logging_level) + logger.info("running %s", " ".join(sys.argv)) + + program = os.path.basename(sys.argv[0]) + parser = argparse.ArgumentParser( + prog=program, + formatter_class=argparse.RawTextHelpFormatter, + description=globals()['__doc__']) + parser.add_argument( + '-f', '--file', + help='path to mediawiki database dump') + args = parser.parse_args() + segment_and_print_all_articles(args.file) + + logger.info("finished running %s", program) From 11691bb3a3088a7a9becf088b20fc896a0cacc3a Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 13 Jul 2017 22:43:41 +0500 Subject: [PATCH 2/8] fix indentation error --- gensim/scripts/segment_wiki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 0bfb830e6c..8b426b61d7 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -184,7 +184,7 @@ def get_texts_with_sections(self): else: num_total_tokens += len(tokenize(section_content)) if num_total_tokens < ARTICLE_MIN_WORDS or any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): - continue + continue articles += 1 yield (article_title, sections) pool.terminate() From fb83ef2e518edc0b64c667fcbc3b6f0c81055274 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 6 Oct 2017 16:56:28 +0500 Subject: [PATCH 3/8] Add output file and logging + small fixes --- gensim/scripts/segment_wiki.py | 48 ++++++++++++++-------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 8b426b61d7..e2ba1f5ab5 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -17,7 +17,6 @@ import json import logging import multiprocessing -import os import re import sys from xml.etree import cElementTree @@ -27,6 +26,9 @@ from smart_open import smart_open +logger = logging.getLogger(__name__) + + def segment_all_articles(file_path): """ Extract article titles and sections from a MediaWiki bz2 database dump. 
@@ -43,21 +45,23 @@ def segment_all_articles(file_path): yield article_title, article_sections -def segment_and_print_all_articles(file_path): +def segment_and_print_all_articles(file_path, output_file): """ Prints article title and sections to stdout, tab-separated article_titlesection_headingsection_contentsection_headingsection_content """ - for article_title, article_sections in segment_all_articles(file_path): - printed_components = [json.dumps(article_title)] - for section_heading, section_content in article_sections: - printed_components.append(json.dumps(section_heading)) - printed_components.append(json.dumps(section_content)) - os.write(sys.stdout.fileno(), u"\t".join(printed_components).encode('utf-8') + b"\n") + with open(output_file, 'wb') as outfile: + for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)): + printed_components = [json.dumps(article_title)] + for section_heading, section_content in article_sections: + printed_components.append(json.dumps(section_heading)) + printed_components.append(json.dumps(section_content)) + if (idx + 1) % 100000 == 0: + logger.info("Processed #%d articles", idx + 1) + outfile.write(u"\t".join(printed_components).encode('utf-8') + "\n") -# noinspection PyUnresolvedReferences def extract_page_xmls(f): """ Extract pages from a MediaWiki database dump = open file-like object `f`. @@ -85,7 +89,6 @@ def extract_page_xmls(f): elem.clear() -# noinspection PyUnresolvedReferences def segment(page_xml): """ Parse the content inside a page tag, returning its content as a list of tokens @@ -125,7 +128,6 @@ def segment(page_xml): return title, sections -# noinspection PyUnresolvedReferences,PyMissingConstructor,PyAttributeOutsideInit,PyAbstractClass,PyUnusedLocal class WikiSectionsCorpus(WikiCorpus): """ Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. @@ -137,7 +139,7 @@ class WikiSectionsCorpus(WikiCorpus): >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word """ - def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)): + def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)): """ Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. 
@@ -189,26 +191,16 @@ def get_texts_with_sections(self): yield (article_title, sections) pool.terminate() self.length = articles # cache corpus length -# endclass WikiSectionsCorpus -logger = logging.getLogger(__name__) - if __name__ == "__main__": - logging_format = '%(asctime)s : %(processName)s : %(levelname)s : %(message)s' - logging_level = logging.INFO - logging.basicConfig(format=logging_format, level=logging_level) + logging.basicConfig(format='%(asctime)s : %(processName)s : %(levelname)s : %(message)s', level=logging.INFO) logger.info("running %s", " ".join(sys.argv)) - program = os.path.basename(sys.argv[0]) - parser = argparse.ArgumentParser( - prog=program, - formatter_class=argparse.RawTextHelpFormatter, - description=globals()['__doc__']) - parser.add_argument( - '-f', '--file', - help='path to mediawiki database dump') + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) + parser.add_argument('-f', '--file', help='path to mediawiki database dump', required=True) + parser.add_argument('-o', '--output', help='path to output file', required=True) args = parser.parse_args() - segment_and_print_all_articles(args.file) + segment_and_print_all_articles(args.file, args.output) - logger.info("finished running %s", program) + logger.info("finished running %s", sys.argv[0]) From 102c0df7468d9a9f4d73d01fd9dcc672a5b636e3 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 6 Oct 2017 18:19:10 +0500 Subject: [PATCH 4/8] add smart_open --- gensim/scripts/segment_wiki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index e2ba1f5ab5..60239e0f2d 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -51,7 +51,7 @@ def segment_and_print_all_articles(file_path, output_file): article_titlesection_headingsection_contentsection_headingsection_content """ - with open(output_file, 'wb') as outfile: + with smart_open(output_file, 'wb') as outfile: for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)): printed_components = [json.dumps(article_title)] for section_heading, section_content in article_sections: From ef3b094d9422c4c35d46c03e7b8682dd46a71890 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 13:55:53 +0500 Subject: [PATCH 5/8] Add numpy-style docstrings & fix .rst --- docs/src/apiref.rst | 1 + docs/src/scripts/segment_wiki.rst | 9 +++ gensim/scripts/segment_wiki.py | 106 ++++++++++++++++++++---------- 3 files changed, 83 insertions(+), 33 deletions(-) create mode 100644 docs/src/scripts/segment_wiki.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 80bfd8547a..3538dca954 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -79,6 +79,7 @@ Modules: scripts/make_wiki_online_lemma scripts/make_wiki_online_nodebug scripts/word2vec2tensor + scripts/segment_wiki parsing/porter parsing/preprocessing summarization/bm25 diff --git a/docs/src/scripts/segment_wiki.rst b/docs/src/scripts/segment_wiki.rst new file mode 100644 index 0000000000..b5e7070aaa --- /dev/null +++ b/docs/src/scripts/segment_wiki.rst @@ -0,0 +1,9 @@ +:mod:`scripts.segment_wiki` -- Convert wikipedia dump to plain text format +========================================================================== + +.. automodule:: gensim.scripts.segment_wiki + :synopsis: Convert wikipedia dump to plain text format. 
+ :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 60239e0f2d..2c163d9555 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -30,11 +30,18 @@ def segment_all_articles(file_path): - """ - Extract article titles and sections from a MediaWiki bz2 database dump. + """Extract article titles and sections from a MediaWiki bz2 database dump. + + Parameters + ---------- + file_path : str + Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2. - Return an iterable over (str, list) which generates - (title, [(section_heading, section_content)]) 2-tuples. + Yields + ------ + tuple(str, list of tuple(str, str)) + Structure contains (title, [(section_heading, section_content), ...]). """ with smart_open(file_path, 'rb') as xml_fileobj: @@ -46,9 +53,17 @@ def segment_all_articles(file_path): def segment_and_print_all_articles(file_path, output_file): - """ - Prints article title and sections to stdout, tab-separated - article_titlesection_headingsection_contentsection_headingsection_content + """Write article title and sections to output_file, + tab-separated article_titlesection_headingsection_contentsection_headingsection_content. + + Parameters + ---------- + file_path : str + Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2. + + output_file : str + Path to output file. """ with smart_open(output_file, 'wb') as outfile: @@ -63,10 +78,17 @@ def segment_and_print_all_articles(file_path, output_file): def extract_page_xmls(f): - """ - Extract pages from a MediaWiki database dump = open file-like object `f`. + """Extract pages from a MediaWiki database dump. - Return an iterable which generates xml strings for page tags. + Parameters + ---------- + f : file + File descriptor of MediaWiki dump. + + Yields + ------ + str + XML strings for page tags. """ elems = (elem for _, elem in cElementTree.iterparse(f, events=("end",))) @@ -90,12 +112,17 @@ def extract_page_xmls(f): def segment(page_xml): - """ - Parse the content inside a page tag, returning its content as a list of tokens - (utf8-encoded strings). + """Parse the content inside a page tag - Returns a 2-tuple (str, list) - - (title, [(section_heading, section_content)]) + Parameters + ---------- + page_xml : str + Content from page tag. + + Returns + ------- + tuple(str, list of tuple(str, str)) + Structure contains (title, [(section_heading, section_content)]). """ elem = cElementTree.fromstring(page_xml) @@ -118,7 +145,7 @@ def segment(page_xml): if text is not None: section_contents = re.split(top_level_heading_regex, text) section_headings = [lead_section_heading] + re.findall(top_level_heading_regex_capture, text) - assert(len(section_contents) == len(section_headings)) + assert len(section_contents) == len(section_headings) else: section_contents = [] section_headings = [] @@ -129,25 +156,31 @@ def segment(page_xml): class WikiSectionsCorpus(WikiCorpus): - """ - Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus. + """Treat a wikipedia articles dump (wiki--pages-articles.xml.bz2 + or wiki-latest-pages-articles.xml.bz2) as a (read-only) corpus. - The documents are extracted on-the-fly, so that the whole (massive) dump - can stay compressed on disk. 
- - >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h - >>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word + The documents are extracted on-the-fly, so that the whole (massive) dump can stay compressed on disk. """ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)): - """ - Initialize the corpus. Unless a dictionary is provided, this scans the + """Initialize the corpus. Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary. - If `pattern` package is installed, use fancier shallow parsing to get - token lemmas. Otherwise, use simple regexp tokenization. You can override + . You can override this automatic logic by forcing the `lemmatize` parameter explicitly. + Parameters + ---------- + fileobj : file + File descriptor of MediaWiki dump. + processes : int + Number of processes, max(1, multiprocessing.cpu_count() - 1) if None. + lemmatize : bool + If `pattern` package is installed, use fancier shallow parsing to get token lemmas. + Otherwise, use simple regexp tokenization. + filter_namespaces : tuple(int) + Enumeration of namespaces that will be ignored. + """ self.fileobj = fileobj self.filter_namespaces = filter_namespaces @@ -158,10 +191,10 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte self.lemmatize = lemmatize def get_texts_with_sections(self): - """ - Iterate over the dump, returning titles and text versions of all sections of articles as a list - of 2-tuples [(article_title, [(section_heading, section_content)]]. + """Iterate over the dump, returning titles and text versions of all sections of articles. + Notes + ----- Only articles of sufficient length are returned (short articles & redirects etc are ignored). @@ -170,6 +203,12 @@ def get_texts_with_sections(self): >>> for vec in wiki_corpus: >>> print(vec) + + Yields + ------ + tuple(str, list of tuple(str, str)) + Structure contains (title, [(section_heading, section_content), ...]). 
+ """ articles = 0 page_xmls = extract_page_xmls(self.fileobj) @@ -185,7 +224,8 @@ def get_texts_with_sections(self): num_total_tokens += len(utils.lemmatize(section_content)) else: num_total_tokens += len(tokenize(section_content)) - if num_total_tokens < ARTICLE_MIN_WORDS or any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): + if num_total_tokens < ARTICLE_MIN_WORDS or \ + any(article_title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES): continue articles += 1 yield (article_title, sections) @@ -198,8 +238,8 @@ def get_texts_with_sections(self): logger.info("running %s", " ".join(sys.argv)) parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) - parser.add_argument('-f', '--file', help='path to mediawiki database dump', required=True) - parser.add_argument('-o', '--output', help='path to output file', required=True) + parser.add_argument('-f', '--file', help='Path to mediawiki database dump', required=True) + parser.add_argument('-o', '--output', help='Path to output file', required=True) args = parser.parse_args() segment_and_print_all_articles(args.file, args.output) From 8eda36b23bd78ede2374904a5157f4f5db9d6f18 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 15:24:23 +0500 Subject: [PATCH 6/8] Fix types --- gensim/scripts/segment_wiki.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 2c163d9555..4c8af2faee 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -40,7 +40,7 @@ def segment_all_articles(file_path): Yields ------ - tuple(str, list of tuple(str, str)) + (str, list of (str, str)) Structure contains (title, [(section_heading, section_content), ...]). """ @@ -121,7 +121,7 @@ def segment(page_xml): Returns ------- - tuple(str, list of tuple(str, str)) + (str, list of (str, str)) Structure contains (title, [(section_heading, section_content)]). """ @@ -163,12 +163,7 @@ class WikiSectionsCorpus(WikiCorpus): """ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0',)): - """Initialize the corpus. Unless a dictionary is provided, this scans the - corpus once, to determine its vocabulary. - - . You can override - this automatic logic by forcing the `lemmatize` parameter explicitly. - + """ Parameters ---------- fileobj : file @@ -178,7 +173,7 @@ def __init__(self, fileobj, processes=None, lemmatize=utils.has_pattern(), filte lemmatize : bool If `pattern` package is installed, use fancier shallow parsing to get token lemmas. Otherwise, use simple regexp tokenization. - filter_namespaces : tuple(int) + filter_namespaces : tuple of int Enumeration of namespaces that will be ignored. """ @@ -206,7 +201,7 @@ def get_texts_with_sections(self): Yields ------ - tuple(str, list of tuple(str, str)) + (str, list of (str, str)) Structure contains (title, [(section_heading, section_content), ...]). 
""" From e40f8c9d4dc50d7bdbea94cfc3cabbb9981afd95 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 16:43:42 +0500 Subject: [PATCH 7/8] Fix docstrings + output file format (json-lines) --- gensim/scripts/segment_wiki.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/gensim/scripts/segment_wiki.py b/gensim/scripts/segment_wiki.py index 4c8af2faee..132907d7b0 100755 --- a/gensim/scripts/segment_wiki.py +++ b/gensim/scripts/segment_wiki.py @@ -6,6 +6,7 @@ """ Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump and extract sections of pages from it +and save to json-line format. If you have the `pattern` package installed, this module will use a fancy lemmatization to get a lemma of each token (instead of plain alphabetic @@ -35,7 +36,7 @@ def segment_all_articles(file_path): Parameters ---------- file_path : str - Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2. Yields @@ -54,27 +55,31 @@ def segment_all_articles(file_path): def segment_and_print_all_articles(file_path, output_file): """Write article title and sections to output_file, - tab-separated article_titlesection_headingsection_contentsection_headingsection_content. + output_file is json-line file with 3 fields:: + + 'tl' - title of article, + 'st' - list of titles of sections, + 'sc' - list of content from sections. Parameters ---------- file_path : str - Path to mediawiki dump, typical filename is wiki--pages-articles.xml.bz2 + Path to MediaWiki dump, typical filename is wiki--pages-articles.xml.bz2 or wiki-latest-pages-articles.xml.bz2. output_file : str Path to output file. """ - with smart_open(output_file, 'wb') as outfile: + with smart_open(output_file, 'w') as outfile: for idx, (article_title, article_sections) in enumerate(segment_all_articles(file_path)): - printed_components = [json.dumps(article_title)] + output_data = {"tl": article_title, "st": [], "sc": []} for section_heading, section_content in article_sections: - printed_components.append(json.dumps(section_heading)) - printed_components.append(json.dumps(section_content)) + output_data["st"].append(section_heading) + output_data["sc"].append(section_content) if (idx + 1) % 100000 == 0: logger.info("Processed #%d articles", idx + 1) - outfile.write(u"\t".join(printed_components).encode('utf-8') + "\n") + outfile.write(json.dumps(output_data) + "\n") def extract_page_xmls(f): @@ -233,7 +238,7 @@ def get_texts_with_sections(self): logger.info("running %s", " ".join(sys.argv)) parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, description=globals()['__doc__']) - parser.add_argument('-f', '--file', help='Path to mediawiki database dump', required=True) + parser.add_argument('-f', '--file', help='Path to MediaWiki database dump', required=True) parser.add_argument('-o', '--output', help='Path to output file', required=True) args = parser.parse_args() segment_and_print_all_articles(args.file, args.output) From 1f923e2c1d2e722c2e71f1223f71fb980d866a67 Mon Sep 17 00:00:00 2001 From: ivan Date: Fri, 27 Oct 2017 16:49:45 +0500 Subject: [PATCH 8/8] Upd .rst --- docs/src/scripts/segment_wiki.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/scripts/segment_wiki.rst b/docs/src/scripts/segment_wiki.rst index b5e7070aaa..1c1846c8e5 100644 --- a/docs/src/scripts/segment_wiki.rst +++ 
b/docs/src/scripts/segment_wiki.rst @@ -1,8 +1,8 @@ -:mod:`scripts.segment_wiki` -- Convert wikipedia dump to plain text format -========================================================================== +:mod:`scripts.segment_wiki` -- Convert wikipedia dump to json-line format +========================================================================= .. automodule:: gensim.scripts.segment_wiki - :synopsis: Convert wikipedia dump to plain text format. + :synopsis: Convert wikipedia dump to json-line format. :members: :inherited-members: :undoc-members:
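
For reference, a minimal sketch of driving the final version of the script and consuming its json-lines output (the 'tl' / 'st' / 'sc' fields introduced in PATCH 7/8). The dump and output file names below are placeholders rather than files referenced by the patches, and the invocation assumes you run the script from a gensim checkout:

    # Produce the output first, e.g.:
    #   python gensim/scripts/segment_wiki.py -f enwiki-latest-pages-articles.xml.bz2 -o enwiki.json.gz
    import json

    from smart_open import smart_open  # same helper the script itself uses for I/O

    with smart_open('enwiki.json.gz', 'rb') as fin:
        for line in fin:
            article = json.loads(line.decode('utf-8'))
            # 'tl' is the article title; 'st' and 'sc' are parallel lists of
            # section headings and section contents.
            print(article['tl'])
            for heading, content in zip(article['st'], article['sc']):
                print('    %s (%d chars)' % (heading, len(content)))

Because every output line is a single self-contained article, the file can be streamed this way without loading the whole corpus into memory, and a .gz or .bz2 output path is handled transparently by smart_open.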