In [1]:
import pickle
import random
import re
import time

from collections import Counter
from heapq import nlargest
from lxml import etree
from sys import getsizeof

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer



# XML namespace prefix (Clark notation) prepended to every tag name below
NAMESPACE = '{http://www.mediawiki.org/xml/export-0.10/}'
# Value of a page's <ns> element that the cells below treat as "article"
ARTICLE_ID = '0'
# Fully-qualified tag names used when filtering / iterating parsed elements
NAMESPACE_TAG = NAMESPACE + 'ns'
PAGE_TAG = NAMESPACE + 'page'
TITLE_TAG = NAMESPACE + 'title'
REDIRECT_TAG = NAMESPACE + 'redirect'
TEXT_TAG = NAMESPACE + 'text'
# Presumably the input dump file and the output directory -- TODO confirm usage
SOURCE = './wiki-dump/enwiki-1gb'
DEST = './wiki-dump/'


-----
Utils:
-----

In [2]:
def fast_iter(context, func, dest):
    """Walk over parsed XML nodes while keeping memory consumption low.

    Args:
        context: iterable yielding ``(event, element)`` pairs.
        func: callable invoked as ``func(element, dest)``; a truthy return
            value ends the iteration once the current element has been
            cleaned up.
        dest: value forwarded unchanged to ``func`` on every call.
    """
    for _event, node in context:
        # Let the callback handle the node and report whether to stop
        halt = func(node, dest)

        # Release the node's own content, then drop every sibling before it
        node.clear()
        previous = node.getprevious()
        while previous is not None:
            del node.getparent()[0]
            previous = node.getprevious()

        if halt:
            break

    del context


--------------
Split Wikipedia dump XML:
---------------

In [None]:
# Split the Wikipedia dump into `n` well-formed XML files, assigning every
# <page> element to one of the parts uniformly at random.
start_time = time.time()

# Stream <page> elements one at a time so the dump never has to fit in memory
context = etree.iterparse(SOURCE, tag=PAGE_TAG, events=('end',))

xml_header = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n'
xml_tail = '</mediawiki>\n'

n = 2
name = DEST + 'enwiki-part'

# Open every part exactly once. 'wb' truncates output left by a previous run;
# appending would add a second header/tail and produce malformed XML.
part_files = [open(name + str(i), 'wb') for i in range(1, n + 1)]
try:
    for part_file in part_files:
        part_file.write(xml_header.encode('utf-8'))

    # Splits wikipedia dump into n parts choosing randomly the articles of each part
    for event, elem in context:
        # randrange(n) selects each part with probability 1/n
        # (the `random` module itself is not callable)
        part_files[random.randrange(n)].write(etree.tostring(elem))

        # Clear data read
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    for part_file in part_files:
        part_file.write(xml_tail.encode('utf-8'))
finally:
    # Always release the file handles, even if parsing fails midway
    for part_file in part_files:
        part_file.close()

del context

print("--- %s seconds ---" % (time.time() - start_time))

--------------
Extract keyphrases:
-------------------

In [4]:
# Build the keyphrase vocabulary: count article titles (or redirect targets)
# and the anchor text of every [[wiki link]] found in article bodies.

# Raw string so the regex escapes are not treated as (invalid) string escapes.
# Matches the content of an innermost [[...]] link (no nested brackets).
extract_pattern = r'\[\[([^][]+)\]\]'

# Links containing any of these prefixes are not keyphrases and are skipped
ignore = ['Image:', 'File:', 'Category:', 'Q:', 'image:', 'file:', 'category:', 'q:']

extract_regex = re.compile(extract_pattern, re.IGNORECASE)

context = etree.iterparse('./wiki-dump/mini/enwiki-test1gb', tag=PAGE_TAG , events=('end',))

start_time = time.time()

vocabulary = Counter()
for event, elem in context:
    namespace_id = next(elem.iterchildren(tag=NAMESPACE_TAG)).text

    # Extract data only over articles, exclude: talks, mediawiki, portal, user, etc
    if namespace_id == ARTICLE_ID:
        # Use the redirect target as the title when the page is a redirect,
        # otherwise the page's own title
        redirect_child = next(elem.iterchildren(tag=REDIRECT_TAG), None)
        if redirect_child is not None:
            page_title = redirect_child.attrib['title'].lower()
        else:
            page_title = next(elem.iterchildren(tag=TITLE_TAG)).text.lower()
        new_keys = [page_title]

        # Extract keyphrases in other morphological form (link anchor text).
        # A page may have a missing or empty <text> element, in which case
        # there is nothing to scan.
        text_node = next(elem.iterdescendants(tag=TEXT_TAG), None)
        text = text_node.text if text_node is not None else None
        if text:
            for key in extract_regex.findall(text):
                if not any(x in key for x in ignore):
                    # For [[target|anchor]] keep only the anchor part
                    new_keys.append(key.split('|')[-1].lower())

        vocabulary.update(new_keys)

    # Clear data read
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# Delete keyphrases that occurs less than 5 times
for key, value in list(vocabulary.items()):
    if value < 5:
        del vocabulary[key]


# NOTE: getsizeof reports only the size of the Counter object itself,
# not of the strings it references
print('Vocabulary: {} ngrams --- {} MB'.format(len(vocabulary), getsizeof(vocabulary) / 1024**2))
print("--- %s seconds ---" % (time.time() - start_time))


# Save vocabulary
with open('./wiki-dump/vocabulary_dict', 'wb') as f:
    pickle.dump(vocabulary, f)

# Vocabulary: 204163 ngrams - 97.37 sec - enwiki-test1gb

Vocabulary: 204163 ngrams --- 96.00009155273438 MB
--- 89.53760933876038 seconds ---


In [5]:
# Word counts of the longest keyphrases in the vocabulary.
# NOTE: despite its name, top_10 holds the 100 longest entries.
top_10 = nlargest(100, vocabulary, key=lambda phrase: len(phrase.split()))
print(list(map(lambda phrase: len(phrase.split()), top_10)))
# print(sum([len(i.split()) for i in vocabulary.keys()]) / len(vocabulary))  # ==> 2.09


[21, 18, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]


------------------
Count N-grams in articles:
--------------------------

In [None]:
# Count how often every vocabulary keyphrase occurs as an n-gram in the
# article texts, vectorizing the articles in fixed-size batches.
start_time = time.time()

# NOTE: pickle.load can execute arbitrary code; only load a file produced by
# this notebook
with open('./wiki-dump/vocabulary_dict', 'rb') as f:
    vocabulary = pickle.load(f)

# Explicit keyphrase -> column mapping, so the column of each keyphrase is
# known without relying on version-dependent CountVectorizer accessors
feature_index = {keyword: index for index, keyword in enumerate(vocabulary)}

vec = CountVectorizer(lowercase=True, ngram_range=(1, 21), vocabulary=feature_index, token_pattern=r'[\S]+')

context = etree.iterparse('./wiki-dump/mini/enwiki-test1gb', tag=PAGE_TAG , events=('end',))
batch_size = 10000  # number of articles vectorized per transform() call
articles = []
keys_count = np.zeros((1, len(feature_index)))

for event, elem in context:
    namespace_id = next(elem.iterchildren(tag=NAMESPACE_TAG)).text

    if namespace_id == ARTICLE_ID:
        text_node = next(elem.iterdescendants(tag=TEXT_TAG), None)
        text = text_node.text if text_node is not None else None
        # Pages without text contribute nothing and would break transform()
        if text:
            articles.append(text)

        # Flush on the number of buffered articles so that no article is
        # skipped and the batch size stays bounded
        if len(articles) >= batch_size:
            keys_count += np.asarray(vec.transform(articles).sum(axis=0))
            articles = []

    # Clear data read
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# Count the final, partially filled batch as well
if articles:
    keys_count += np.asarray(vec.transform(articles).sum(axis=0))
    articles = []


# Leave this or not? Adds the title/link occurrences stored in the vocabulary
# on top of the n-gram counts measured above.
for keyword, index in feature_index.items():
    keys_count[0, index] += vocabulary[keyword]


print("--- %s seconds ---" % (time.time() - start_time))

# Save data
with open('wiki-dump/keys_count', 'wb') as f:
    pickle.dump(keys_count, f)

# 114.47 sec, ngram=(1, 20) 1,000
# 86.39 sec, ngram=(1, 16) 1,000
# 47.87 sec, ngram=(1, 10) 1,000
# 406.82 sec, ngram=(1, 16) 5,000
# 843.35 sec, ngram=(1, 16) 10,000
# 3443.72 sec, ngram=(1, 16) iterating in batches of 10,000

In [None]:
# Spot-check the counts for one keyphrase.
# NOTE(review): `ft` was not defined anywhere in the notebook; it is presumably
# the list of feature names ordered by column index -- TODO confirm.
ft = sorted(vec.vocabulary_, key=vec.vocabulary_.get)
print(ft[-2])

# Look the column up instead of hard-coding it: the previous literal index
# (1561442) lies outside a vocabulary of ~204k entries.
sample_index = vec.vocabulary_['caught in the act']
print(sample_index)
print(keys_count[0, sample_index])

-----------
Ranking method, Keyphraseness:
------------------------------

In [None]:
# Keyphraseness of a phrase = (occurrences recorded in the vocabulary)
#                             / (total occurrences counted in keys_count)

# Vocabulary counts laid out in the vectorizer's column order
link_counts = np.zeros((1, len(vocabulary)))
for keyword, index in vec.vocabulary_.items():
    link_counts[0, index] = vocabulary[keyword]

# Plain float ndarray view of the totals (keys_count may be an np.matrix)
total_counts = np.asarray(keys_count, dtype=float)

# Element-wise ratio; phrases with a zero total keep a score of 0 instead of
# producing inf/nan
keyphraseness = np.zeros((1, len(vocabulary)))
np.divide(link_counts, total_counts, out=keyphraseness, where=total_counts > 0)

# Save data
with open('wiki-dump/keyphraseness', 'wb') as f:
    pickle.dump(keyphraseness, f)

In [None]:
# Bare expression: the notebook renders the keyphraseness array as cell output
keyphraseness