Dataset:
https://dumps.wikimedia.org/backup-index.html

Before running the parser below, please run the WikipediaExtractor script:

python2 WikipediaExtractor.py --json --bytes 10G <wikipedia-file-path>

Download:
https://github.com/attardi/wikiextractor

Source:
https://www.mediawiki.org/wiki/Alternative_parsers

# Load Libraries

In [1]:
import _pickle as cPickle
import io
import itertools
import json
import re
import string

import unidecode

# Declare variables

In [2]:
# Input file, read line by line with one JSON document per line
# (presumably the output of the WikipediaExtractor run described above).
data_path = '../datasets/ptwiki-20170820-pages-articles-parsed.json'
# Output file that receives the pickled vocabulary, frequencies and sentences.
output_path = '../datasets/ptwiki-20170820-sentences.pickle'

# Load All Data

In [3]:
# Text-mode, UTF-8 handle on the parsed dump; it is closed by the parsing cell.
data_reader = open(data_path, "r", encoding="utf-8")

# Parse all articles from JSON

In [4]:
# Go to beginning
data_reader.seek(0)

# Temporary - Remove once word embedding algorithms are ready
# (set to None to load every article)
max_articles = 1000

# Parse all text from json: each line is one JSON document and only its
# 'text' field is kept. islice stops reading after max_articles lines, so
# the rest of the dump is never parsed just to be thrown away, and the
# try/finally closes the handle even when a line fails to parse.
try:
    articles = [json.loads(line)['text']
                for line in itertools.islice(data_reader, max_articles)]
finally:
    data_reader.close()

# Convert corpus to token list

In [5]:
numbers = '0123456789'
# '!', '?' and '.' are kept out of the strip set because they are needed
# below as sentence boundaries.
punctuation = re.sub(r'[!?.]', '', string.punctuation)
translator = str.maketrans('', '', punctuation + numbers)

# Sentence boundaries: terminal punctuation or a line break.
# Compiled once because get_sentences runs for every article.
_SENTENCE_BOUNDARY = re.compile(r'[!?.\n]')


def get_sentences(corpus):
    """Split a text into sentences, each returned as a list of word tokens.

    Steps:
      1. Remove ASCII digits and all ASCII punctuation except '!', '?'
         and '.'.
      2. Split on '!', '?', '.' and newlines.
      3. Strip each sentence and lower-case its first character only.
      4. Split each sentence on whitespace and keep it only if it has two
         or more words.

    NOTE: upper-casing and accent removal (unidecode) are not applied.

    Args:
        corpus: the text of one article, as a str.

    Returns:
        A list of sentences; each sentence is a list of str tokens.
        An empty list when no sentence has at least two words.
    """
    # Remove punctuation and numbers
    preprocessed = corpus.translate(translator)

    sentences = []
    for raw_sentence in _SENTENCE_BOUNDARY.split(preprocessed):
        # Remove spaces from start and end of sentence
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        # First character of each sentence to lower
        sentence = sentence[:1].lower() + sentence[1:]

        # Split sentence in tokens (on any run of whitespace)
        words = sentence.split()

        # Each sentence must have two or more words. Counting tokens,
        # rather than looking for a literal ' ', also keeps sentences whose
        # words are separated only by tabs or non-breaking spaces.
        if len(words) >= 2:
            sentences.append(words)

    return sentences

# Convert corpora to sentences of indexes

In [6]:
def preprocess_corpora(corpora):
    """Encode every corpus as sentences of integer word ids.

    Each corpus is split with get_sentences; every distinct word receives
    the next free integer id, in order of first appearance.

    Args:
        corpora: iterable of texts (one str per corpus).

    Returns:
        A 4-tuple (sentences, indices, indice_freq, word_count):
          sentences   -- list of sentences, each a list of word ids
          indices     -- dict mapping word id -> word
          indice_freq -- dict mapping word id -> number of occurrences
          word_count  -- total number of tokens seen
    """
    word_to_id = {}
    id_to_word = {}
    id_counts = {}
    encoded_sentences = []
    total_words = 0

    for corpus in corpora:
        for words in get_sentences(corpus):
            encoded = []
            for word in words:
                word_id = word_to_id.get(word)
                if word_id is None:
                    # First occurrence: ids are handed out consecutively,
                    # so the vocabulary size is the next free id.
                    word_id = len(word_to_id)
                    word_to_id[word] = word_id
                    id_to_word[word_id] = word
                    id_counts[word_id] = 0
                id_counts[word_id] += 1
                encoded.append(word_id)
            total_words += len(encoded)
            encoded_sentences.append(encoded)

    return encoded_sentences, id_to_word, id_counts, total_words

# Run

In [7]:
# sentences:   one list of integer word ids per sentence
# indices:     word id -> word
# indice_freq: word id -> number of occurrences
# word_count:  total number of tokens processed
sentences, indices, indice_freq, word_count = preprocess_corpora(articles)

# Dump all info

In [8]:
# Three objects are pickled back to back into one file; a reader has to
# call load three times, in this same order: indices, indice_freq, sentences.
with open(output_path, 'wb') as pickle_file:
    for payload in (indices, indice_freq, sentences):
        cPickle.dump(payload, pickle_file)

# Debugging...

## Check length

In [9]:
# Number of sentences, vocabulary size and total token count, one per line.
print(len(sentences), len(indices), word_count, sep='\n')

98001
105405
2259375


## Check conversion

In [10]:
# Map the ids of the first sentence back to words to eyeball the encoding.
first_sentence = sentences[0]
line = [indices[word_id] for word_id in first_sentence]
print(line)

['astronomia', 'é', 'uma', 'ciência', 'natural', 'que', 'estuda', 'corpos', 'celestes', 'como', 'estrelas', 'planetas', 'cometas', 'nebulosas', 'aglomerados', 'de', 'estrelas', 'galáxias', 'e', 'fenômenos', 'que', 'se', 'originam', 'fora', 'da', 'atmosfera', 'da', 'Terra', 'como', 'a', 'radiação', 'cósmica', 'de', 'fundo', 'em', 'microondas']


## Counters

In [11]:
# (word, occurrences) pairs in the dict's iteration order (not sorted by
# frequency); only the first 100 are printed.
counter = [(indices[word_id], occurrences)
           for word_id, occurrences in indice_freq.items()]
print(counter[:100])

[('astronomia', 109), ('é', 20278), ('uma', 20443), ('ciência', 805), ('natural', 559), ('que', 39705), ('estuda', 140), ('corpos', 310), ('celestes', 44), ('como', 16334), ('estrelas', 216), ('planetas', 376), ('cometas', 189), ('nebulosas', 12), ('aglomerados', 38), ('de', 125948), ('galáxias', 155), ('e', 72004), ('fenômenos', 225), ('se', 10813), ('originam', 11), ('fora', 494), ('da', 39805), ('atmosfera', 304), ('Terra', 798), ('a', 85477), ('radiação', 183), ('cósmica', 14), ('fundo', 113), ('em', 40505), ('microondas', 19), ('preocupada', 19), ('com', 20357), ('evolução', 444), ('física', 534), ('química', 317), ('o', 64932), ('movimento', 1003), ('objetos', 362), ('bem', 1276), ('formação', 504), ('desenvolvimento', 1212), ('do', 44767), ('universo', 165), ('das', 8556), ('mais', 11441), ('antigas', 203), ('ciências', 354), ('culturas', 239), ('préhistóricas', 5), ('deixaram', 76), ('registrados', 65), ('vários', 1304), ('artefatos', 26), ('astronômicos', 15), ('Stonehenge', 5