Dataset:
https://dumps.wikimedia.org/backup-index.html

Before Running the parser below, please run WikipediaExtractor script:

python2 WikipediaExtractor.py --json --bytes 10G <wikipedia-file-path>

Download:
https://github.com/attardi/wikiextractor

Source:
https://www.mediawiki.org/wiki/Alternative_parsers

# Load Libraries

In [1]:
from collections import Counter

import _pickle as cPickle
import io
import json
import nltk.corpus
import os.path
import re
import string
import unidecode

# Convert corpus to token list

In [2]:
numbers = '0123456789'
punctuation = re.sub('[!?\.]', '', string.punctuation)
translator = str.maketrans('', '', punctuation + numbers) 

def get_sentences(corpus):
    # Remove punctuation and numbers
    preprocessed = corpus.translate(translator)
    
    # All words to Upper case
    #preprocessed = preprocessed.upper()
    
    # Remove accentuation
    #preprocessed = unidecode.unidecode(preprocessed)
    
    # Split text into sentences
    sentences = re.split('[!?\.\\n]', preprocessed)
    
    # Remove spaces from start and end of sentence
    sentences = [sentence.strip()
                 for sentence in sentences]
    
    # First caracter of each sentence to lower
    # Each sentence must have two or more words
    sentences = [sentence[:1].lower() + sentence[1:]
                 for sentence in sentences
                 if sentence and ' '  in sentence]
    
    # Split sentence in tokens
    sentences = [' '.join(re.split('\s+', sentence))
                 for sentence in sentences]
    
    return sentences

# Learn phrases 

In [3]:
def learn_phrases(sentences, min_count=5, threshold=10, token_limit=10e7, 
                  delimiter='_', token_delimiter=' ', stopwords=frozenset()):
    # All variables
    idx = 0
    tokens = {}
    token_freq = Counter()
    phrase_freq = Counter()
    word_count = 0
    unique_count = 0
    
    phrase_format = '%s' + delimiter + '%s'
    
    # Get counts
    for sentence in sentences:
        previous_word = None
        for word in sentence.split(token_delimiter):
            if unique_count > token_limit:
                break
                
            # Update token frequency
            token_freq[word] += 1

            # Update tokens
            if word not in tokens:
                tokens[word] = idx
                unique_count += 1
                idx += 1

            # Update phrases
            if previous_word:
                phrase = phrase_format % (previous_word, word)
                phrase_freq[phrase] += 1
                unique_count += 1

            # Next
            previous_word = word
            word_count += 1
        if unique_count > token_limit:
            print('Limit reached')
            break
                
    # Find valid phrases
    valid_phrases = Counter()
    for phrase, count in phrase_freq.items():        
        word_a, word_b = phrase.split(delimiter)
        
        # Check for stopwords
        if word_a in stopwords or word_b in stopwords:
            continue
        
        # Get counts
        count_a, count_b = token_freq[word_a], token_freq[word_b]
        
        # Calculate score
        score = word_count * (count - min_count)/ (count_a * count_b)
        
        # Append if valid
        if score > threshold:
            valid_phrases[phrase] = count
            
    return valid_phrases

# Replace all tokens to their respectives phrases

In [4]:
def replace_tokens_to_phrases(sentences, phrases, delimiter='_', token_delimiter=' '):
    phrase_format = '%s' + delimiter + '%s'
    
    # Update sentences
    for i in range(len(sentences)):
        # Get sentence
        sentence = sentences[i].split(token_delimiter)
        
        new_sentence = ''
        concat_phrase = None
        previous_word = sentence[0]
        for word in sentence[1:]:
            phrase = phrase_format % (previous_word, word)
            if phrase in phrases:
                concat_phrase = phrase if not concat_phrase else phrase_format % (concat_phrase, word)
            else:
                new_sentence += ((concat_phrase if concat_phrase else previous_word) + token_delimiter)
                concat_phrase = None
            previous_word = word
        new_sentence += (concat_phrase if concat_phrase else previous_word)
        sentences[i] = new_sentence
    return sentences

# Convert corpora to sentences of indexes

In [5]:
def corpora_to_sentences(corpora, token_delimiter=' '):
    # All variables
    sentences = []
    idx = 0
    tokens = {}
    token_freq = Counter()
    word_count = 0
    
    # Preprocess all corpora
    for corpus in corpora:
        sentences += get_sentences(corpus)
    
    # Load stopwords
    additional = set(['é', 'deste', 'destes', 'nossa', 'nossas', 'nosso', 'nossos', 'vez', 'cada', 'outras', 'outros', 
                      'certa', 'pode', 'podem', 'após', 'neste', 'nestes', 'nesse', 'nesses', 'ter', 'tido', 'tendo',
                      'deveriam', 'muito', 'muitos', 'muitas', 'º'])
    exception = set(['de', 'do', 'dos', 'da', 'das'])
    stopwords = set(nltk.corpus.stopwords.words('portuguese')) | additional - exception
    stopwords = frozenset(stopwords)
    
    # Get phrases
    phrases_freq = learn_phrases(sentences, stopwords=stopwords)
    
    # Update sentences with new phrases
    sentences = replace_tokens_to_phrases(sentences, phrases_freq)
    
    # Get counters
    for sentence in sentences:
        for word in sentence.split(token_delimiter):                
            if word not in tokens:
                tokens[word] = idx
                token_freq[word] = 1
                idx += 1
            else:
                token_freq[word] += 1
            word_count += 1
    
    return sentences, tokens, token_freq, phrases_freq, word_count

# Filter vocabulary to most frequent words

In [33]:
def filter_vocabulary(sentences, tokens, token_freq, max_tokens):
    
    # Get most frequent words
    token_freq = Counter(dict(token_freq.most_common(max_tokens)))
    
    # Remove all other words from indices
    tokens = {key:i for i, key in enumerate(token_freq.keys())}
    
    # Apply remmap in all data structures
    sentences = [' '.join([token 
                           for token in sentence
                           if token in tokens])
                 for sentence in sentences]
    word_count = sum(token_freq.values())
    
    return sentences, tokens, token_freq, word_count

# Declare variables

In [7]:
data_path = '../datasets/ptwiki-20170820-pages-articles-parsed.json'
output_path = '../datasets/ptwiki-20170820-sentences-doc100k.pickle'
filtered_output_path = '../datasets/ptwiki-20170820-sentences-doc100k-vocab30k.pickle'

# Declare hyperparameters

In [8]:
MAX_WORDS_IN_VOCABULARY = 30000
MAX_ARTICLES_SAMPLE = 100000

# Data structures to be used

In [9]:
sentences = []
tokens = {}
token_freq = Counter()
phrase_freq = Counter()
word_count = 0

# Convert all articles to indexes

In [10]:
# Check if already converted
if os.path.isfile(output_path) and not os.path.isfile(filtered_output_path):
    with open(output_path, 'rb') as fp:
        tokens = cPickle.load(fp) 
        token_freq = cPickle.load(fp)
        sentences = cPickle.load(fp)
        word_count = sum(token_freq.values())
elif not os.path.isfile(output_path):
    # Load data
    data_reader = io.open(data_path, mode="r", encoding="utf-8")

    # Go to beginning
    data_reader.seek(0)

    # Parse all text from json
    articles = [json.loads(line)['text']
                for line in data_reader]

    data_reader.close()

    # Temporary - Remove once word embedding algorithms are ready
    articles = articles[:MAX_ARTICLES_SAMPLE]

    # Run
    sentences, tokens, token_freq, phrases_freq, word_count = corpora_to_sentences(articles)
    
    # Delete articles - memory restraint
    del articles
    
    # Save indexes
    with open(output_path, 'wb') as fp:
        cPickle.dump(tokens, fp) 
        cPickle.dump(token_freq, fp)
        cPickle.dump(sentences, fp) 

# Filter tokens to be used

In [34]:
if os.path.isfile(filtered_output_path):
    with open(filtered_output_path, 'rb') as fp:
        tokens = cPickle.load(fp) 
        token_freq = cPickle.load(fp)
        sentences = cPickle.load(fp)
        word_count = sum(token_freq.values())
else:
    # Run
    sentences, tokens, token_freq, word_count = filter_vocabulary(sentences,
                                                                  tokens,
                                                                  token_freq,
                                                                  MAX_WORDS_IN_VOCABULARY)
    
    # Save indexes
    with open(filtered_output_path, 'wb') as fp:
        cPickle.dump(tokens, fp) 
        cPickle.dump(token_freq, fp)
        cPickle.dump(sentences, fp)    

# Debugging...

## Check length

In [35]:
print(len(sentences))
print(len(tokens))
print(word_count)

2261716
30000
37371204


## Check conversion

In [None]:
line = [tokens[word]
        for word in sentences[0]]
print(sentences[0])
print(line)

## Counters

In [None]:
counter = [(token,freq)
           for token, freq in token_freq.items()]
print(counter[:100])