Dataset:
https://dumps.wikimedia.org/backup-index.html

Before running the parser below, run the WikipediaExtractor script on the downloaded dump:

python2 WikipediaExtractor.py --json --bytes 10G <wikipedia-file-path>

Download:
https://github.com/attardi/wikiextractor

Source:
https://www.mediawiki.org/wiki/Alternative_parsers

# Load Libraries

In [1]:
from collections import Counter

import _pickle as cPickle
import io
import json
import os.path
import re
import string
import unidecode

# Convert corpus to token list

In [2]:
# Characters deleted before sentence splitting: every digit and every ASCII
# punctuation mark except the sentence terminators '!', '?' and '.'.
numbers = '0123456789'
punctuation = re.sub(r'[!?.]', '', string.punctuation)
translator = str.maketrans('', '', punctuation + numbers)

# Patterns are compiled once and written as raw strings so that '\s' and
# '\n' reach the regex engine without relying on invalid string escapes.
_SENTENCE_BOUNDARY = re.compile(r'[!?.\n]')
_WHITESPACE = re.compile(r'\s+')


def get_sentences(corpus):
    """Split a raw text into tokenized sentences.

    Steps:
      1. Delete digits and ASCII punctuation (except '!', '?' and '.').
      2. Split on '!', '?', '.' and newlines.
      3. Strip each piece and keep only those containing at least one space
         (i.e. two or more words).
      4. Lower-case the first character of each kept sentence.
      5. Split each sentence on runs of whitespace.

    Args:
        corpus: Text to process (str).

    Returns:
        A list of sentences, each sentence being a list of word strings.
        Returns an empty list when no piece has two or more words.
    """
    # Remove punctuation and numbers
    preprocessed = corpus.translate(translator)

    # Optional normalisation steps, currently disabled:
    #   preprocessed = preprocessed.upper()                 # all upper case
    #   preprocessed = unidecode.unidecode(preprocessed)    # strip accents

    # Split text into sentences and trim surrounding whitespace
    candidates = (piece.strip() for piece in _SENTENCE_BOUNDARY.split(preprocessed))

    # Keep sentences with two or more words; lower-case their first character
    # only, so capitalised words inside the sentence are left untouched.
    sentences = [sentence[:1].lower() + sentence[1:]
                 for sentence in candidates
                 if sentence and ' ' in sentence]

    # Split each sentence into tokens
    return [_WHITESPACE.split(sentence) for sentence in sentences]

# Convert corpora to sentences of indexes

In [3]:
def sentence_to_index(corpora):
    """Encode every sentence of every corpus as a list of integer word ids.

    Ids are assigned in order of first appearance, starting at 0.

    Args:
        corpora: Iterable of raw text strings; each is split with
            ``get_sentences``.

    Returns:
        A tuple ``(sentences, indices, indice_freq, word_count)`` where
        ``sentences`` is a list of id lists, ``indices`` maps id -> word,
        ``indice_freq`` is a Counter of id -> number of occurrences, and
        ``word_count`` is the total number of tokens seen.
    """
    word_to_id = {}
    indices = {}
    indice_freq = Counter()
    sentences = []
    word_count = 0

    for corpus in corpora:
        for raw_sentence in get_sentences(corpus):
            encoded = []
            for word in raw_sentence:
                token_id = word_to_id.get(word)
                if token_id is None:
                    # First time this word is seen: give it the next free id
                    token_id = len(word_to_id)
                    word_to_id[word] = token_id
                    indices[token_id] = word
                indice_freq[token_id] += 1
                encoded.append(token_id)
            sentences.append(encoded)
            word_count += len(encoded)

    return sentences, indices, indice_freq, word_count

# Filter vocabulary to most frequent words

In [4]:
def filter_vocabulary(sentences, indices, indice_freq, max_tokens):
    """Restrict the vocabulary to the ``max_tokens`` most frequent ids.

    Surviving ids are renumbered contiguously from 0 in descending frequency
    order, and ids outside the kept vocabulary are dropped from every
    sentence (sentences themselves are kept, even if they become empty).

    Args:
        sentences: List of sentences, each a list of word ids.
        indices: Mapping id -> word.
        indice_freq: Counter mapping id -> number of occurrences.
        max_tokens: Number of most frequent ids to keep.

    Returns:
        A tuple ``(sentences, indices, indice_freq, word_count)`` expressed
        in the new ids; ``word_count`` is the sum of the kept frequencies.
    """
    # (old_id, frequency) pairs for the most frequent words
    kept = indice_freq.most_common(max_tokens)

    # old id -> new contiguous id
    remap = {old_id: new_id for new_id, (old_id, _) in enumerate(kept)}

    # Rebuild every structure in terms of the new ids
    remapped_indices = {remap[old_id]: indices[old_id] for old_id, _ in kept}
    remapped_indice_freq = Counter({remap[old_id]: freq for old_id, freq in kept})

    remapped_sentences = []
    for sentence in sentences:
        remapped_sentences.append([remap[old_id]
                                   for old_id in sentence
                                   if old_id in remap])

    word_count = sum(freq for _, freq in kept)

    return remapped_sentences, remapped_indices, remapped_indice_freq, word_count

# Declare variables

In [5]:
# Input: parsed Wikipedia dump, read line by line with one JSON object per line
data_path = '../datasets/ptwiki-20170801-pages-articles-parsed.json'
# Pickle cache of the unfiltered result (indices, indice_freq, sentences)
output_path = '../datasets/ptwiki-20170801-sentences-doc100k.pickle'
# Pickle cache of the result after vocabulary filtering (same three objects)
filtered_output_path = '../datasets/ptwiki-20170801-sentences-doc100k-vocab30k.pickle'

# Declare hyperparameters

In [6]:
# Number of most frequent words kept by filter_vocabulary
MAX_WORDS_IN_VOCABULARY = 30000
# Only the first MAX_ARTICLES_SAMPLE articles of the dump are converted
MAX_ARTICLES_SAMPLE = 100000

# Data structures to be used

In [7]:
# Empty defaults so these names exist even when the conversion cell is
# skipped (it does nothing when both cache files are already present).
sentences = []           # list of sentences, each a list of word ids
indices = {}             # word id -> word
indice_freq = Counter()  # word id -> number of occurrences
word_count = 0           # total number of tokens

# Convert all articles to indexes

In [8]:
# Three cases:
#   * unfiltered cache exists, filtered cache does not -> reload the unfiltered cache
#   * unfiltered cache missing                         -> build it from the dump
#   * both caches exist                                -> nothing to do here; the
#     filtering cell loads the filtered cache directly
if os.path.isfile(output_path) and not os.path.isfile(filtered_output_path):
    # NOTE: unpickling can execute arbitrary code; only load cache files
    # that were produced by this notebook.
    with open(output_path, 'rb') as fp:
        # Objects are read back in the order they were dumped
        indices = cPickle.load(fp)
        indice_freq = cPickle.load(fp)
        sentences = cPickle.load(fp)
    word_count = sum(indice_freq.values())
elif not os.path.isfile(output_path):
    # Stream the dump and parse only the first MAX_ARTICLES_SAMPLE lines
    # instead of loading every article before slicing. zip() stops as soon
    # as range() is exhausted, so no further lines are read or parsed.
    # The context manager closes the file even if a line fails to parse.
    # Temporary sampling - remove once word embedding algorithms are ready
    with io.open(data_path, mode="r", encoding="utf-8") as data_reader:
        articles = [json.loads(line)['text']
                    for _, line in zip(range(MAX_ARTICLES_SAMPLE), data_reader)]

    # Run
    sentences, indices, indice_freq, word_count = sentence_to_index(articles)

    # Delete articles - memory constraint
    del articles

    # Save indexes
    with open(output_path, 'wb') as fp:
        cPickle.dump(indices, fp)
        cPickle.dump(indice_freq, fp)
        cPickle.dump(sentences, fp)

# Filter tokens to be used

In [10]:
if not os.path.isfile(filtered_output_path):
    # No filtered cache yet: reduce the vocabulary of the structures
    # currently in memory
    sentences, indices, indice_freq, word_count = filter_vocabulary(
        sentences, indices, indice_freq, MAX_WORDS_IN_VOCABULARY)

    # Persist the filtered structures for later runs
    with open(filtered_output_path, 'wb') as fp:
        cPickle.dump(indices, fp)
        cPickle.dump(indice_freq, fp)
        cPickle.dump(sentences, fp)
else:
    # Filtered cache available: read the objects back in dump order
    with open(filtered_output_path, 'rb') as fp:
        indices = cPickle.load(fp)
        indice_freq = cPickle.load(fp)
        sentences = cPickle.load(fp)
        word_count = sum(indice_freq.values())

# Debugging...

## Check length

In [11]:
# Number of sentences, vocabulary size and total token count, one per line
print(len(sentences), len(indices), word_count, sep='\n')

2256783
30000
43174486


## Check conversion

In [12]:
# Decode the first sentence back into words
line = list(map(indices.__getitem__, sentences[0]))
print(line)

['astronomia', 'é', 'uma', 'ciência', 'natural', 'que', 'estuda', 'corpos', 'celestes', 'como', 'estrelas', 'planetas', 'cometas', 'nebulosas', 'aglomerados', 'de', 'estrelas', 'galáxias', 'e', 'fenômenos', 'que', 'se', 'originam', 'fora', 'da', 'atmosfera', 'da', 'Terra', 'como', 'a', 'radiação', 'cósmica', 'de', 'fundo', 'em', 'microondas']


## Counters

In [13]:
# (word, frequency) pairs in the Counter's own order; show the first 100
counter = [(indices[token_id], indice_freq[token_id])
           for token_id in indice_freq]
print(counter[:100])

[('de', 2792970), ('a', 1647884), ('e', 1405845), ('o', 1306982), ('do', 895945), ('em', 883814), ('da', 811221), ('que', 763086), ('com', 487890), ('um', 473180), ('uma', 469441), ('é', 428970), ('no', 424465), ('para', 408176), ('os', 396595), ('na', 378046), ('por', 342822), ('como', 294468), ('foi', 281065), ('as', 259474), ('dos', 253503), ('se', 215549), ('mais', 204121), ('ao', 201763), ('sua', 181005), ('não', 160158), ('das', 157043), ('ou', 145174), ('seu', 143600), ('à', 139964), ('são', 121997), ('também', 120958), ('entre', 107762), ('pela', 107606), ('ser', 107142), ('pelo', 105017), ('era', 87153), ('mas', 79502), ('cidade', 79066), ('anos', 78784), ('ele', 73945), ('até', 69869), ('foram', 67802), ('nos', 67576), ('seus', 65028), ('quando', 60903), ('onde', 57482), ('sendo', 57388), ('tem', 56150), ('área', 56114), ('nas', 55895), ('município', 55766), ('região', 55454), ('parte', 53235), ('sobre', 53172), ('durante', 53141), ('ainda', 52843), ('mesmo', 51896), ('ano', 