In [1]:
import pickle
import random
import re
import time

from collections import Counter
from heapq import nlargest
from lxml import etree
from sys import getsizeof

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer



# Namespace prefix in the `{uri}localname` form; prepended to every tag name below
NAMESPACE = '{http://www.mediawiki.org/xml/export-0.10/}'
# Fully-qualified tag names passed to lxml (iterparse / iterchildren / iterdescendants)
PAGE_TAG = NAMESPACE + 'page'
TITLE_TAG = NAMESPACE + 'title'
REDIRECT_TAG = NAMESPACE + 'redirect'
TEXT_TAG = NAMESPACE + 'text'
# Same dump path that the cells below pass to etree.iterparse
SOURCE = './wiki-dump/enwiki-1gb'
# Directory prefix under which the cells below write their output files
DEST = './wiki-dump/'


-----
Utils:
-----

In [2]:
def fast_iter(context, func, dest):
    """Walk an iterparse-style stream while releasing nodes already handled.

    context -- iterable yielding (event, element) pairs
    func    -- callable invoked as func(element, dest); a truthy return value
               ends the walk after the current element has been cleaned up
    dest    -- object handed through to func unchanged
    """
    for _event, node in context:
        # Let the callback consume the node first
        should_stop = func(node, dest)

        # Drop the node's content, then remove every earlier sibling
        # from the parent so processed nodes do not accumulate
        node.clear()
        while True:
            if node.getprevious() is None:
                break
            del node.getparent()[0]

        if should_stop:
            break

    del context


--------------
Split the Wikipedia XML dump:
---------------

In [None]:
# Split the dump into n part files, assigning each <page> to a part uniformly at random.
# Each part is wrapped in the <mediawiki> header/tail so it is a well-formed document.
start_time = time.time()

# Stream only the closing event of <page> elements
context = etree.iterparse(SOURCE, tag=PAGE_TAG, events=('end',))

xml_header = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n'
xml_tail = '</mediawiki>\n'

n = 2
name = DEST + 'enwiki-part'

# Open every part once, truncating any previous content: the original appended,
# so re-running the cell produced files with a second header and extra pages.
part_files = [open(name + str(i), 'wb') for i in range(1, n + 1)]
try:
    for part_file in part_files:
        part_file.write(xml_header.encode('utf-8'))

    for event, elem in context:
        # `random` is the module here (see `import random`), so it cannot be
        # called directly; randrange(n) picks one of the n parts uniformly.
        part_files[random.randrange(n)].write(etree.tostring(elem))

        # Clear data read
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    for part_file in part_files:
        part_file.write(xml_tail.encode('utf-8'))
finally:
    # Close the parts even if parsing fails midway
    for part_file in part_files:
        part_file.close()

del context

print("--- %s seconds ---" % (time.time() - start_time))

--------------
Extract keyphrases:
-------------------

In [None]:
# Build the keyphrase vocabulary: one count per page title (or redirect target)
# plus one count per [[wiki link]] found in the page text.

# clean_pattern = '\[\[(image:|file:|category:)(.+)\]\]'
# clean_pattern2 = '\[\[(image:|file:|category:)(.*(<br>\n)*)*]]'
# Raw string: '\[' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape warning on current interpreters.
extract_pattern = r'\[\[([^][]+)\]\]'

# Links containing any of these substrings are skipped
ignore = ['Image:', 'File:', 'Category:', 'Q:', 'image:', 'file:', 'category:', 'q:']

# clean_regex = re.compile(clean_pattern, re.IGNORECASE)
extract_regex = re.compile(extract_pattern, re.IGNORECASE)

context = etree.iterparse(SOURCE, tag=PAGE_TAG, events=('end',))

start_time = time.time()

vocabulary = Counter()
for event, elem in context:
    # Extract titles of pages: a redirect page contributes the title it points to,
    # any other page contributes its own title.
    redirect_child = next(elem.iterchildren(tag=REDIRECT_TAG), None)
    if redirect_child is not None:
        page_title = redirect_child.attrib['title'].lower()
    else:
        title_child = next(elem.iterchildren(tag=TITLE_TAG))
        page_title = title_child.text.lower()

    new_keys = [page_title]

    # Extract keyphrases in other morphological form.
    # A page without a <text> node (or with an empty one) contributes only its title
    # instead of aborting the whole run with StopIteration.
    text_child = next(elem.iterdescendants(tag=TEXT_TAG), None)
    if text_child is not None and text_child.text is not None:
#         keyphrases = clean_regex.sub('', text_child.text)
        for key in extract_regex.findall(text_child.text):
            # The pattern requires at least one character, so key is never empty
            if not any(x in key for x in ignore):
                # For [[target|label]] keep the text after the last '|'
                new_keys.append(key.split('|')[-1].lower())

    vocabulary.update(new_keys)

    # Release the processed page and its earlier siblings
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context


# getsizeof reports only the Counter object itself, not the key strings it references
print('Vocabulary: {} ngrams --- {} MB'.format(len(vocabulary), getsizeof(vocabulary) / 1024**2))
print("--- %s seconds ---" % (time.time() - start_time))


# Save vocabulary
with open('./wiki-dump/vocabulary_dict', 'wb') as f:
    pickle.dump(vocabulary, f)


In [None]:
# with open('./wiki-dump/vocabulary_dict', 'rb') as f:
#     vocabulary = pickle.load(f)

# Number of distinct keyphrases in the vocabulary
n_keyphrases = len(vocabulary)
print(n_keyphrases)

In [None]:
def _word_count(phrase):
    """Number of whitespace-separated words in a keyphrase."""
    return len(phrase.split())


# The 500 keyphrases with the most words, followed by their word counts
top_500 = nlargest(500, vocabulary, key=_word_count)
print(list(map(_word_count, top_500)))
# sum([len(i.split()) for i in vocabulary.keys()]) / len(vocabulary)  # ==> 2.5


------------------
Count N-grams in articles
-------------------------

In [4]:
# Count how often each vocabulary keyphrase occurs in the article texts, then add
# the number of times it was seen as a title/link (the value stored in `vocabulary`).
start_time = time.time()

# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook
with open('./wiki-dump/vocabulary_dict', 'rb') as f:
    vocabulary = pickle.load(f)

# Raw string for the token pattern: '\S' is not a valid Python string escape
vec = CountVectorizer(lowercase=True, ngram_range=(1, 16), vocabulary=vocabulary.keys(), token_pattern=r'[\S]+')

context = etree.iterparse(SOURCE, tag=TEXT_TAG, events=('end',))
batch_size = 100     # articles vectorized per transform() call
max_articles = 500   # only the first articles of the dump are processed
articles = []
keys_count = np.zeros((1, len(vocabulary)))

for i, (event, elem) in enumerate(context):
    if i >= max_articles:
        break

    # Keep every article with text; the original loop silently dropped each
    # article whose index was a multiple of the batch size and queued None texts.
    if elem.text is not None:
        articles.append(elem.text)

    if len(articles) >= batch_size:
        # Column sums of the document-term matrix = per-keyphrase counts for the batch
        keys_count = np.add(keys_count, vec.transform(articles).sum(axis=0))
        articles = []

    # Clear data read
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# Flush the last, possibly partial, batch
if articles:
    keys_count = np.add(keys_count, vec.transform(articles).sum(axis=0))
    articles = []

# get_feature_names() was removed from newer scikit-learn releases; prefer the
# replacement when it exists and fall back for older installations.
feature_names = (vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out')
                 else vec.get_feature_names())
for index, keyword in enumerate(feature_names):
    keys_count[0, index] += vocabulary[keyword]

print("--- %s seconds ---" % (time.time() - start_time))

with open('wiki-dump/keys_count', 'wb') as f:
    pickle.dump(keys_count, f)


--- 3443.727946996689 seconds ---


In [94]:
# Spot-check one keyphrase: its text, its column index, and its accumulated count.
# `ft` was never defined in this notebook (it only existed in the kernel that produced
# the saved output); presumably it was the vectorizer's feature-name list - TODO confirm.
ft = list(vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out')
          else vec.get_feature_names())
keyword = ft[-2]
# Look the index up instead of hard-coding it, so the cell works for any vocabulary
index = vec.vocabulary_[keyword]
print(keyword)
print(index)
print(keys_count[0, index])

caught in the act
1561442
18.0


In [87]:
# For every keyphrase, divide its count in `vocabulary` (times seen as a title/link)
# by its entry in `keys_count` (filled by the counting cell above).
keyphraseness = np.zeros((1, len(vocabulary)))

# get_feature_names() was removed from newer scikit-learn releases; prefer the
# replacement when it exists and fall back for older installations.
feature_names = (vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out')
                 else vec.get_feature_names())
for index, keyword in enumerate(feature_names):
    keyphraseness[0, index] = vocabulary[keyword] / keys_count[0, index]


In [88]:
# Show the keyphraseness array; its repr is the cell output
keyphraseness

array([[ 1.        ,  0.88888889,  1.        , ...,  1.        ,
         0.16666667,  1.        ]])

In [None]:
def get_text(elem, dest):
    """Append elem.text to dest and report whether collection should stop.

    Returns True once dest holds at least 10000 items, False otherwise.
    """
    dest.append(elem.text)
    return len(dest) >= 10000

# Gather article texts into `articles`; get_text decides when fast_iter stops
articles = list()
context = etree.iterparse(
    './wiki-dump/enwiki-1gb',
    events=('end',),
    tag=TEXT_TAG,
)
fast_iter(context, get_text, articles)


In [None]:
# Time a single CountVectorizer.transform call over the collected `articles`.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook
with open('./wiki-dump/vocabulary_dict', 'rb') as f:
    vocabulary = pickle.load(f)

# Raw string for the token pattern: '\S' is not a valid Python string escape
vec = CountVectorizer(lowercase=True, ngram_range=(1, 16), vocabulary=vocabulary.keys(), token_pattern=r'[\S]+')


start_time = time.time()

m = vec.transform(articles)

print("--- %s seconds ---" % (time.time() - start_time))

# Recorded timings (seconds, ngram_range, number of articles):
# 114.47 sec, ngram=(1, 20) 1,000
# 86.39 sec, ngram=(1, 16) 1,000
# 47.87 sec, ngram=(1, 10) 1,000
# 406.82 sec, ngram=(1, 16) 5,000
# 843.35 sec, ngram=(1, 16) 10,000
# 3443.72 sec, ngram=(1, 16) iterating in batches of 10,000