Dataset:
https://dumps.wikimedia.org/backup-index.html

Before running the parser below, please run the WikipediaExtractor script on the downloaded dump:

python2 WikipediaExtractor.py --json --bytes 10G <wikipedia-file-path>

Download:
https://github.com/attardi/wikiextractor

Source:
https://www.mediawiki.org/wiki/Alternative_parsers

# Load Libraries

In [32]:
import _pickle as cPickle
import itertools
import json
import re
import string

import unidecode

# Declare variables

In [2]:
# Input file: parsed as one JSON document per line, each providing a 'text' field.
data_path = '../datasets/ptwiki-20170820-pages-articles-parsed.json'
# Output file: receives the pickled vocabulary, frequencies and encoded sentences.
output_path = '../datasets/ptwiki-20170820-sentences.pickle'

# Load All Data

In [3]:
# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which would corrupt (or fail on) accented Portuguese text on
# systems whose locale is not UTF-8.
# NOTE(review): assumes the extractor output is UTF-8 encoded -- confirm.
# The handle is intentionally left open: the following cells iterate over it.
data_reader = open(data_path, 'r', encoding='utf-8')

# Parse all articles from JSON

In [8]:
# Rewind so the cell can be re-run without reopening the file
data_reader.seek(0)

# Temporary - Remove the limit once word embedding algorithms are ready
MAX_ARTICLES = 100

# Parse the 'text' field of each JSON line. Only the first MAX_ARTICLES
# lines are read, so the rest of the dump is never parsed nor kept in
# memory just to be sliced away afterwards.
articles = [json.loads(line)['text']
            for line in itertools.islice(data_reader, MAX_ARTICLES)]

# Convert corpus to token list

In [9]:
numbers = '0123456789'
# '!', '?' and '.' are kept out of the removal set because they are still
# needed as sentence delimiters when the text is split.
punctuation = re.sub(r'[!?.]', '', string.punctuation)
translator = str.maketrans('', '', punctuation + numbers)

# Sentence boundaries: terminal punctuation or a line break.
# Compiled once instead of on every call.
_SENTENCE_DELIMITERS = re.compile(r'[!?.\n]')


def get_sentences(corpus):
    """Split a raw text into tokenized sentences.

    Digits and all ASCII punctuation except '!', '?' and '.' are removed,
    the text is split into sentences on '!', '?', '.' and newlines, and
    each sentence is split into tokens on runs of whitespace. The first
    character of each sentence is lower-cased; every other character keeps
    its case. Sentences with fewer than two tokens are discarded.

    Upper-casing and accent removal (unidecode) are intentionally not
    applied.

    Args:
        corpus: The text to process, as a single string.

    Returns:
        A list of sentences, each one a list of token strings.
    """
    # Remove punctuation and numbers
    preprocessed = corpus.translate(translator)

    sentences = []
    for raw_sentence in _SENTENCE_DELIMITERS.split(preprocessed):
        # str.split() with no argument drops leading/trailing whitespace
        # and splits on any whitespace run, so no separate strip is needed.
        tokens = raw_sentence.split()

        # Each sentence must have two or more words. Counting tokens (rather
        # than looking for a literal ' ') also keeps sentences whose words
        # are separated by tabs or non-breaking spaces.
        if len(tokens) < 2:
            continue

        # First character of each sentence to lower case
        tokens[0] = tokens[0][:1].lower() + tokens[0][1:]
        sentences.append(tokens)

    return sentences

# Convert corpora to sentences of indexes

In [23]:
def preprocess_corpora(corpora):
    """Encode every sentence of every corpus as a list of integer word ids.

    Ids are assigned incrementally, starting at 0, in order of first
    appearance of each distinct word.

    Args:
        corpora: Iterable of texts; each one is passed to get_sentences.

    Returns:
        A tuple (sentences, indices, indice_freq, word_count) where
        sentences is a list of lists of word ids, indices maps id -> word,
        indice_freq maps id -> number of occurrences, and word_count is the
        total number of tokens seen.
    """
    word_to_id = {}
    indices = {}
    indice_freq = {}
    sentences = []
    word_count = 0

    for corpus in corpora:
        for words in get_sentences(corpus):
            encoded = []
            for word in words:
                word_id = word_to_id.get(word)
                if word_id is None:
                    # First occurrence: register the word under the next free id
                    word_id = len(word_to_id)
                    word_to_id[word] = word_id
                    indices[word_id] = word
                    indice_freq[word_id] = 0
                indice_freq[word_id] += 1
                encoded.append(word_id)
            word_count += len(words)
            sentences.append(encoded)

    return sentences, indices, indice_freq, word_count

# Run

In [24]:
# Encode the loaded articles: sentences of word ids, id -> word map,
# id -> frequency map and total token count.
sentences, indices, indice_freq, word_count = preprocess_corpora(articles)

# Dump all info

In [35]:
# The objects are written back to back into the same file, so a reader
# must call load() three times in this exact order:
# indices, indice_freq, sentences.
with open(output_path, 'wb') as fp:
    for payload in (indices, indice_freq, sentences):
        cPickle.dump(payload, fp)

# Debugging...

## Check length

In [25]:
# Number of sentences, vocabulary size and total token count, one per line
print(len(sentences), len(indices), word_count, sep='\n')

11188
30740
264366


## Check conversion

In [27]:
# Decode the first sentence back into words to eyeball the id mapping
line = [indices[word_id] for word_id in sentences[0]]
print(line)

['astronomia', 'é', 'uma', 'ciência', 'natural', 'que', 'estuda', 'corpos', 'celestes', 'como', 'estrelas', 'planetas', 'cometas', 'nebulosas', 'aglomerados', 'de', 'estrelas', 'galáxias', 'e', 'fenômenos', 'que', 'se', 'originam', 'fora', 'da', 'atmosfera', 'da', 'Terra', 'como', 'a', 'radiação', 'cósmica', 'de', 'fundo', 'em', 'microondas']


## Counters

In [38]:
# Pair every word with its frequency, in id (first appearance) order
counter = [(indices[word_id], count)
           for word_id, count in indice_freq.items()]
print(counter[:100])

[('astronomia', 41), ('é', 2119), ('uma', 2358), ('ciência', 64), ('natural', 92), ('que', 4733), ('estuda', 17), ('corpos', 11), ('celestes', 11), ('como', 1932), ('estrelas', 25), ('planetas', 14), ('cometas', 2), ('nebulosas', 1), ('aglomerados', 4), ('de', 14112), ('galáxias', 13), ('e', 9234), ('fenômenos', 23), ('se', 1257), ('originam', 4), ('fora', 57), ('da', 4622), ('atmosfera', 51), ('Terra', 86), ('a', 9918), ('radiação', 38), ('cósmica', 3), ('fundo', 14), ('em', 4784), ('microondas', 4), ('preocupada', 2), ('com', 2268), ('evolução', 38), ('física', 51), ('química', 15), ('o', 7552), ('movimento', 142), ('objetos', 47), ('bem', 157), ('formação', 48), ('desenvolvimento', 110), ('do', 5110), ('universo', 24), ('das', 1179), ('mais', 1332), ('antigas', 18), ('ciências', 32), ('culturas', 39), ('préhistóricas', 1), ('deixaram', 6), ('registrados', 10), ('vários', 157), ('artefatos', 2), ('astronômicos', 7), ('Stonehenge', 2), ('os', 3081), ('montes', 2), ('Newgrange', 1), ('