Dataset:
https://dumps.wikimedia.org/backup-index.html

Before running the parser below, run the WikipediaExtractor script on the downloaded dump:

python2 WikipediaExtractor.py --json --bytes 10G <wikipedia-file-path>

Download:
https://github.com/attardi/wikiextractor

Source:
https://www.mediawiki.org/wiki/Alternative_parsers

# Load Libraries

In [1]:
import _pickle as cPickle
import itertools
import json
import re
import string

import numpy as np
import unidecode

# Declare variables

In [2]:
# Input file: opened as UTF-8 text and parsed as one JSON document per line further down
data_path = '../datasets/ptwiki-20170820-pages-articles-parsed.json'
# Output file: receives the pickled vocabulary, frequencies and encoded sentences
output_path = '../datasets/ptwiki-20170820-sentences.pickle'

# Load All Data

In [22]:
import io
# Open the parsed dump as UTF-8 text. The handle is left open on purpose:
# the parsing cell rewinds it, iterates over it line by line and closes it.
data_reader = io.open(data_path, mode="r", encoding="utf-8")

# Parse all articles from JSON

In [24]:
# Go to beginning
data_reader.seek(0)

# Temporary cap - Remove once word embedding algorithms are ready
max_articles = 100

try:
    # Each line is one JSON document; only its 'text' field is kept.
    # islice stops reading after `max_articles` lines, so the rest of the
    # (potentially multi-gigabyte) dump is never parsed or held in memory.
    # json.loads takes the decoded str directly - no re-encoding needed.
    articles = [json.loads(line)['text']
                for line in itertools.islice(data_reader, max_articles)]
finally:
    # Release the file handle even when a line fails to parse
    data_reader.close()

# Convert corpus to token list

In [27]:
numbers = '0123456789'
# Keep the sentence terminators (! ? .) out of the removal set: they are
# still needed to split the text into sentences.
punctuation = re.sub(r'[!?\.]', '', string.punctuation)
translator = str.maketrans('', '', punctuation + numbers)

def get_sentences(corpus):
    """Split raw text into sentences, each returned as a list of word tokens.

    Processing steps:
      1. Delete ASCII digits and every ASCII punctuation mark except ``!``,
         ``?`` and ``.`` (characters are removed, not replaced by a space).
      2. Split on ``!``, ``?``, ``.`` and line breaks.
      3. Strip surrounding whitespace and lower-case only the first character
         of each sentence; the remaining characters keep their case.
      4. Tokenise on runs of whitespace and keep sentences with two or more
         tokens.

    Upper-casing and accent removal (unidecode) are currently not applied.

    Args:
        corpus: text of one document (str).

    Returns:
        list[list[str]]: one token list per retained sentence.
    """
    # Remove punctuation and numbers
    preprocessed = corpus.translate(translator)

    sentences = []
    # Split text into sentences
    for raw_sentence in re.split(r'[!?\.\n]', preprocessed):
        # Remove spaces from start and end of sentence
        sentence = raw_sentence.strip()
        if not sentence:
            continue

        # First character of each sentence to lower case
        sentence = sentence[:1].lower() + sentence[1:]

        # Split sentence in tokens
        tokens = re.split(r'\s+', sentence)

        # Each sentence must have two or more words. Counting tokens (instead
        # of looking for a literal ' ') also keeps sentences whose words are
        # separated by tabs or non-breaking spaces.
        if len(tokens) >= 2:
            sentences.append(tokens)

    return sentences

# Convert corpora to sentences of indexes

In [28]:
def preprocess_corpora(corpora):
    """Encode every sentence of every corpus as a list of vocabulary indices.

    Indices are handed out in order of first appearance, starting at 0.

    Args:
        corpora: iterable of texts; each one is split with ``get_sentences``.

    Returns:
        tuple ``(sentences, indices, indice_freq, word_count)`` where
        ``sentences`` is a list of index lists, ``indices`` maps index -> word,
        ``indice_freq`` maps index -> number of occurrences and ``word_count``
        is the total number of tokens processed.
    """
    word_to_index = {}
    indices = {}
    indice_freq = {}
    sentences = []
    word_count = 0

    for corpus in corpora:
        for raw_sentence in get_sentences(corpus):
            encoded = []
            for word in raw_sentence:
                word_id = word_to_index.get(word)
                if word_id is None:
                    # Unseen word: the next free index equals the current vocabulary size
                    word_id = len(word_to_index)
                    word_to_index[word] = word_id
                    indices[word_id] = word
                    indice_freq[word_id] = 0
                indice_freq[word_id] += 1
                encoded.append(word_id)
            word_count += len(encoded)
            sentences.append(encoded)

    return sentences, indices, indice_freq, word_count

# Run

In [29]:
# Encode the loaded articles: index-encoded sentences, index -> word map,
# index -> frequency map and the total token count.
sentences, indices, indice_freq, word_count = preprocess_corpora(articles)

# Dump all info

In [30]:
# Three pickles are written back-to-back into the same file, in this order:
# indices, indice_freq, sentences. Reading them back takes three successive
# load() calls in the same order.
with open(output_path, 'wb') as fp:
    for payload in (indices, indice_freq, sentences):
        cPickle.dump(payload, fp)

# Debugging...

## Check length

In [31]:
# One value per line: number of sentences, vocabulary size, total token count
print(len(sentences), len(indices), word_count, sep='\n')

11298
30956
266924


## Check conversion

In [32]:
# Decode the first encoded sentence back into words to eyeball the mapping
line = [indices[word_id] for word_id in sentences[0]]
print(line)

['astronomia', 'é', 'uma', 'ciência', 'natural', 'que', 'estuda', 'corpos', 'celestes', 'como', 'estrelas', 'planetas', 'cometas', 'nebulosas', 'aglomerados', 'de', 'estrelas', 'galáxias', 'e', 'fenômenos', 'que', 'se', 'originam', 'fora', 'da', 'atmosfera', 'da', 'Terra', 'como', 'a', 'radiação', 'cósmica', 'de', 'fundo', 'em', 'microondas']


## Counters

In [33]:
# Pair every vocabulary word with its frequency, in index order of insertion
counter = [(indices[word_id], occurrences)
           for word_id, occurrences in indice_freq.items()]
# Only show the first 100 pairs to keep the output readable
print(counter[:100])

[('astronomia', 41), ('é', 2149), ('uma', 2390), ('ciência', 64), ('natural', 93), ('que', 4816), ('estuda', 17), ('corpos', 11), ('celestes', 11), ('como', 1964), ('estrelas', 25), ('planetas', 14), ('cometas', 2), ('nebulosas', 1), ('aglomerados', 4), ('de', 14224), ('galáxias', 13), ('e', 9302), ('fenômenos', 23), ('se', 1262), ('originam', 4), ('fora', 57), ('da', 4679), ('atmosfera', 51), ('Terra', 86), ('a', 10011), ('radiação', 39), ('cósmica', 3), ('fundo', 14), ('em', 4813), ('microondas', 4), ('preocupada', 2), ('com', 2310), ('evolução', 38), ('física', 52), ('química', 15), ('o', 7632), ('movimento', 142), ('objetos', 47), ('bem', 158), ('formação', 49), ('desenvolvimento', 108), ('do', 5144), ('universo', 24), ('das', 1187), ('mais', 1353), ('antigas', 18), ('ciências', 33), ('culturas', 39), ('préhistóricas', 1), ('deixaram', 6), ('registrados', 10), ('vários', 158), ('artefatos', 2), ('astronômicos', 7), ('Stonehenge', 2), ('os', 3081), ('montes', 2), ('Newgrange', 1), (

In [35]:
# One (word, frequency) pair per entry of indice_freq
len(counter)

30956

# Remove stopwords

In [1]:
import nltk

In [9]:
# Make sure the stopword corpus is available before using it, so this cell
# also works on a fresh kernel / fresh environment (Restart & Run All).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords', quiet=True)

stopwords = nltk.corpus.stopwords.words('portuguese')
len(stopwords)

203

In [8]:
# Download only the stopword corpus. A bare nltk.download() opens the
# interactive downloader, which blocks non-interactive runs of the notebook.
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
# `sentences` stores vocabulary indices, not words, so every index is mapped
# back to its word before the stopword test. The set gives O(1) membership
# checks instead of scanning the stopword list for every token.
# NOTE(review): matching is case-sensitive; only the first character of each
# sentence was lower-cased during preprocessing - confirm this is intended.
stopword_set = set(stopwords)
sentences_filter = [[idx for idx in sentence if indices[idx] not in stopword_set]
                    for sentence in sentences]

# Get Pairs from sentences

In [None]:
# Context window size; presumably meant to be passed as `w_size` to
# word_pairs - TODO confirm, it is not used anywhere in the notebook yet.
window_size = 10

In [None]:
def vectorize(word, vocabulary=None):
    """Return the one-hot vector of a vocabulary entry.

    Args:
        word: either a vocabulary index (int, as stored in the encoded
            sentences) or the word itself (str). A str is resolved with a
            linear scan over the vocabulary, so prefer indices in hot loops.
        vocabulary: index -> word mapping; defaults to the notebook-level
            ``indices``. Its keys are assumed to be the contiguous integers
            ``0 .. len(vocabulary) - 1``.

    Returns:
        numpy.ndarray of length ``len(vocabulary)`` filled with zeros except
        for a single 1 at the position of ``word``.

    Raises:
        KeyError: if ``word`` is not part of the vocabulary.
    """
    if vocabulary is None:
        vocabulary = indices

    if isinstance(word, str):
        # Reverse lookup: the mapping goes index -> word, not word -> index
        position = next((idx for idx, token in vocabulary.items() if token == word), None)
        if position is None:
            raise KeyError(word)
    else:
        if word not in vocabulary:
            raise KeyError(word)
        position = word

    # The vector spans the vocabulary (distinct words), not the token count
    vector = np.zeros(len(vocabulary))
    vector[position] = 1
    return vector

In [None]:
def word_pairs(sentence, w_size, vectorizer=None):
    """Build (center, neighbor) training pairs from one sentence.

    Every token is paired with each other token located at most ``w_size``
    positions to its left or right.

    Args:
        sentence: a sequence of tokens (e.g. a list of vocabulary indices) or
            a whitespace-separated string.
        w_size: number of neighbors considered on each side of the center.
        vectorizer: callable applied to both members of every pair; defaults
            to the notebook-level ``vectorize``.

    Returns:
        list of ``(vectorizer(center), vectorizer(neighbor))`` tuples, ordered
        by center position and then by neighbor position.
    """
    if vectorizer is None:
        vectorizer = vectorize

    # Accept both raw strings and already tokenised / index-encoded sentences
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)

    pairs = []
    for center_idx, center in enumerate(tokens):
        first = max(0, center_idx - w_size)
        # +1 because range() excludes its upper bound; without it the
        # right-hand side of the window would be one token short.
        last = min(len(tokens), center_idx + w_size + 1)
        for neighbor_idx in range(first, last):
            if neighbor_idx == center_idx:
                continue
            pairs.append((vectorizer(center), vectorizer(tokens[neighbor_idx])))
    return pairs
