Dataset:
https://dumps.wikimedia.org/backup-index.html

Before running the parser below, please run the WikipediaExtractor script on the downloaded dump:

python2 WikipediaExtractor.py --json --bytes 10G <wikipedia-file-path>

Download:
https://github.com/attardi/wikiextractor

Source:
https://www.mediawiki.org/wiki/Alternative_parsers

# Load Libraries

In [1]:
from collections import Counter

import _pickle as cPickle
import io
import json
import re
import string
import unidecode

# Declare variables

In [2]:
# Input: the parsed Wikipedia dump that is read line by line as JSON.
data_path = "../datasets/ptwiki-20170820-pages-articles-parsed.json"
# Output: the pickle file that receives the vocabulary and encoded sentences.
output_path = "../datasets/ptwiki-20170820-sentences-vocab30k.pickle"

# Declare hyperparameters

In [3]:
# Upper bound on the vocabulary size passed to preprocess_corpora().
MAX_WORDS_IN_VOCABULARY = 30_000
# Number of articles kept from the start of the dump.
MAX_ARTICLES_SAMPLE = 1_000

# Load All Data

In [4]:
# Text-mode UTF-8 handle on the parsed dump; it is left open here and closed
# by the cell that parses the articles.
data_reader = open(data_path, mode="r", encoding="utf-8")

# Parse all articles from JSON

In [5]:
# Re-open the dump when this cell is re-run after the reader was already
# closed, so the cell does not fail with "I/O operation on closed file".
if data_reader.closed:
    data_reader = io.open(data_path, mode="r", encoding="utf-8")

# `with` guarantees the reader is closed even if a line fails to parse.
with data_reader:
    # Go to beginning
    data_reader.seek(0)

    # Temporary - Remove once word embedding algorithms are ready:
    # only the first MAX_ARTICLES_SAMPLE articles are kept, so stop reading
    # after that many lines instead of parsing the whole dump into memory.
    if MAX_ARTICLES_SAMPLE is not None and MAX_ARTICLES_SAMPLE >= 0:
        # zip() pulls from range() first, so no extra line is consumed
        # once the limit is reached.
        lines = (line for _, line in zip(range(MAX_ARTICLES_SAMPLE), data_reader))
    else:
        # None (no limit) or a negative limit needs every line; the slice
        # below then applies the usual Python slicing semantics.
        lines = data_reader

    # Parse the article text from each JSON line
    articles = [json.loads(line)['text']
                for line in lines]

# No-op for a non-negative limit; keeps None / negative limits behaving
# exactly like plain slicing.
articles = articles[:MAX_ARTICLES_SAMPLE]

# Convert corpus to token list

In [6]:
numbers = '0123456789'
# Every ASCII punctuation mark except the sentence terminators (! ? .),
# which get_sentences() still needs in order to find sentence boundaries.
punctuation = re.sub(r'[!?.]', '', string.punctuation)
translator = str.maketrans('', '', punctuation + numbers)

# Patterns are raw strings (no invalid "\." / "\s" escapes in normal string
# literals) and compiled once instead of on every call.
_SENTENCE_BOUNDARY = re.compile(r'[!?.\n]')
_WHITESPACE = re.compile(r'\s+')


def get_sentences(corpus):
    """Split a raw text into tokenised sentences.

    Steps:
      1. Delete digits and all ASCII punctuation except ``!``, ``?`` and ``.``.
      2. Split the text at ``!``, ``?``, ``.`` and newlines.
      3. Strip each piece and drop pieces that do not contain a space
         (empty pieces and single-word pieces).
      4. Lower-case only the first character of each remaining sentence.
      5. Split each sentence into tokens on runs of whitespace.

    Apart from that first character, case and accents are left untouched.

    Args:
        corpus (str): Text of one document.

    Returns:
        list[list[str]]: One list of tokens per sentence, in document order.
    """
    # Remove punctuation and numbers
    cleaned = corpus.translate(translator)

    sentences = []
    for chunk in _SENTENCE_BOUNDARY.split(cleaned):
        chunk = chunk.strip()

        # A stripped chunk without a space is empty or a single word; each
        # sentence must have two or more words.
        if ' ' not in chunk:
            continue

        # First character of each sentence to lower case
        chunk = chunk[:1].lower() + chunk[1:]

        # Split sentence in tokens
        sentences.append(_WHITESPACE.split(chunk))

    return sentences

# Convert corpora to sentences of indexes

In [13]:
def preprocess_corpora(corpora, max_tokens=-1, sentence_splitter=None):
    """Encode documents as sentences of integer word ids.

    Every document is split into tokenised sentences and each distinct word
    receives an integer id in order of first appearance. If the vocabulary
    ends up larger than ``max_tokens``, only the ``max_tokens`` most frequent
    words are kept, the ids are renumbered contiguously from 0 in decreasing
    frequency order, and dropped words are removed from the sentences.

    Note that removing words can leave sentences with a single word or with
    no words at all; such sentences are still returned.

    Args:
        corpora: Iterable of document texts.
        max_tokens: Maximum vocabulary size. ``-1`` (the default), any other
            negative number, or ``None`` means "no limit".
        sentence_splitter: Callable mapping one document to a list of token
            lists. Defaults to ``get_sentences``.

    Returns:
        tuple: ``(sentences, indices, indice_freq, word_count)`` where
            * ``sentences`` is a list of lists of word ids,
            * ``indices`` maps word id -> word,
            * ``indice_freq`` is a ``Counter`` mapping word id -> occurrences,
            * ``word_count`` is the total number of encoded words
              (after filtering, only the kept words are counted).
    """
    if sentence_splitter is None:
        sentence_splitter = get_sentences

    sentences = []
    word_to_index = {}
    indices = {}
    indice_freq = Counter()
    word_count = 0

    # Preprocess all corpora
    for corpus in corpora:
        for raw_sentence in sentence_splitter(corpus):
            sentence = []
            for word in raw_sentence:
                index = word_to_index.get(word)
                if index is None:
                    # First time this word is seen: give it the next free id.
                    index = len(word_to_index)
                    word_to_index[word] = index
                    indices[index] = word
                indice_freq[index] += 1
                sentence.append(index)
            word_count += len(sentence)
            sentences.append(sentence)

    # Check if we need to filter the vocabulary
    unlimited = max_tokens is None or max_tokens < 0
    if unlimited or len(indices) <= max_tokens:
        return sentences, indices, indice_freq, word_count

    # Most frequent words first
    kept = indice_freq.most_common(max_tokens)

    # Remap the surviving ids so they stay contiguous (0 = most frequent)
    remap = {old: new for new, (old, _) in enumerate(kept)}

    # Apply the remap to all data structures
    remapped_indices = {new: indices[old] for old, new in remap.items()}
    remapped_indice_freq = Counter({remap[old]: freq for old, freq in kept})
    remapped_sentences = [[remap[index] for index in sentence if index in remap]
                          for sentence in sentences]
    word_count = sum(remapped_indice_freq.values())

    return remapped_sentences, remapped_indices, remapped_indice_freq, word_count

# Run

In [14]:
# Encode the sampled articles, capping the vocabulary size.
sentences, indices, indice_freq, word_count = preprocess_corpora(
    articles,
    max_tokens=MAX_WORDS_IN_VOCABULARY,
)

# Dump all info

In [15]:
# The three objects are pickled back to back into one file, so a reader has
# to call load() three times in this same order.
with open(output_path, 'wb') as fp:
    for artifact in (indices, indice_freq, sentences):
        cPickle.dump(artifact, fp)

# Debugging...

## Check length

In [16]:
# Number of sentences, vocabulary size and total word count, one per line.
print(len(sentences), len(indices), word_count, sep='\n')

98001
30000
2148044


## Check conversion

In [17]:
# Decode the first encoded sentence back into words.
line = [indices[word_id] for word_id in sentences[0]]
print(line)

['astronomia', 'é', 'uma', 'ciência', 'natural', 'que', 'estuda', 'corpos', 'celestes', 'como', 'estrelas', 'planetas', 'cometas', 'nebulosas', 'aglomerados', 'de', 'estrelas', 'galáxias', 'e', 'fenômenos', 'que', 'se', 'originam', 'fora', 'da', 'atmosfera', 'da', 'Terra', 'como', 'a', 'radiação', 'cósmica', 'de', 'fundo', 'em', 'microondas']


## Counters

In [18]:
# Pair each vocabulary word with its frequency, in the Counter's own
# iteration order, and show the first 100 pairs.
counter = [(indices[word_id], occurrences)
           for word_id, occurrences in indice_freq.items()]
print(counter[:100])

[('de', 125948), ('a', 85477), ('e', 72004), ('o', 64932), ('do', 44767), ('em', 40505), ('da', 39805), ('que', 39705), ('os', 22714), ('um', 21970), ('uma', 20443), ('com', 20357), ('é', 20278), ('no', 18914), ('para', 18841), ('na', 16996), ('por', 16435), ('como', 16334), ('as', 15278), ('dos', 13619), ('mais', 11441), ('se', 10813), ('foi', 10579), ('ao', 9536), ('das', 8556), ('não', 8219), ('ou', 8171), ('são', 7790), ('sua', 7141), ('à', 7052), ('entre', 6581), ('também', 5912), ('ser', 5602), ('seu', 5518), ('pelo', 4930), ('pela', 4834), ('mas', 3831), ('anos', 3749), ('maior', 3699), ('era', 3529), ('foram', 3429), ('nos', 3325), ('até', 3311), ('sobre', 3059), ('parte', 3055), ('tem', 3055), ('país', 3006), ('outros', 2995), ('seus', 2857), ('nas', 2839), ('sendo', 2822), ('estado', 2783), ('cidade', 2722), ('grande', 2697), ('durante', 2637), ('quando', 2597), ('ainda', 2589), ('ele', 2577), ('Brasil', 2536), ('forma', 2527), ('pode', 2456), ('mesmo', 2344), ('século', 2340