In [1]:
import pickle
import random
import re
import time

from collections import Counter
from heapq import nlargest
from lxml import etree
from sys import getsizeof

import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from string import punctuation

from nltk.tokenize import word_tokenize




# Prefix put in front of every tag name used to filter the dump
NAMESPACE = '{*}'
# Value of the <ns> child that marks a page as a regular article
ARTICLE_ID = '0'

# Tag filters for the XML elements read by the cells below
NAMESPACE_TAG, PAGE_TAG, TITLE_TAG, REDIRECT_TAG, TEXT_TAG = (
    NAMESPACE + local_name
    for local_name in ('ns', 'page', 'title', 'redirect', 'text')
)

# Input dump and output directory
SOURCE = './wiki-dump/enwiki-1gb'
DEST = './wiki-dump/'


-----
Utils:
-----

In [None]:
def fast_iter(context, func, dest):
    """Walk an iterparse-style stream, releasing each element once it is handled.

    Args:
        context: iterable of ``(event, element)`` pairs.
        func: callable invoked as ``func(element, dest)``; a truthy return
            value ends the iteration after the current element is released.
        dest: object passed through to ``func`` unchanged.
    """
    for _, node in context:
        should_stop = func(node, dest)

        # Drop the element's content, then remove the siblings that precede it
        # from their parent so the already-read part of the tree can be freed.
        node.clear()
        previous = node.getprevious()
        while previous is not None:
            del node.getparent()[0]
            previous = node.getprevious()

        if should_stop:
            return


--------------
Split wikipedia dump xml:
---------------

In [None]:
# Split the dump into n smaller XML files, sending each <page> to a part chosen
# uniformly at random.
start_time = time.time()

context = etree.iterparse('./wiki-dump/enwiki-1gb', tag=PAGE_TAG , events=('end',))

xml_header = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n'
xml_tail = '</mediawiki>\n'

# Number of parts and common prefix of their file names
n = 2
name = './wiki-dump/enwiki-part'

# Start every part from scratch ('w', not 'a'): appending would stack a second
# header and a second copy of the pages onto the output of a previous run.
for i in range(1, n + 1):
    filename = name + str(i)
    with open(filename, 'w') as f:
        f.write(xml_header)

# Splits wikipedia dump into n parts choosing randomly the articles of each part
for event, elem in context:
    # `random` is the module (imported with `import random`), so calling
    # `random()` raises TypeError. randint(1, n) picks a part with equal probability.
    part = random.randint(1, n)
    filename = name + str(part)
    with open(filename, 'ab') as f:
        # tostring() returns bytes, hence the binary append mode
        f.write(etree.tostring(elem))

    # Clear data read
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# Close the root element of every part
for i in range(1, n + 1):
    filename = name + str(i)
    with open(filename, 'a') as f:
        f.write(xml_tail)

print("--- %s seconds ---" % (time.time() - start_time))

--------------
Extract keywords:
-------------------

In [None]:
# clean_pattern = '\[\[(image:|file:|category:)(.+)\]\]'
# clean_pattern2 = '\[\[(image:|file:|category:)(.*(<br>\n)*)*]]'
# Captures the content of every innermost [[ ... ]] link.
# Raw string: '\[' and '\]' are not valid string escapes and newer Python
# versions warn about them; the compiled pattern is unchanged.
extract_pattern = r'\[\[([^][]+)\]\]'

# A link containing any of these markers is not counted as a keyphrase
ignore = ['Image:', 'File:', 'Category:', 'Q:', 'Wikipedia:', 'image:', 'file:', 'category:', 'q:', 'wikipedia:']
          #, '<span style=', 'talk:', 'Talk:', '<font style=']

# clean_regex = re.compile(clean_pattern, re.IGNORECASE)
extract_regex = re.compile(extract_pattern, re.IGNORECASE)

context = etree.iterparse('./wiki-dump/mini/enwiki-test1gb', tag=PAGE_TAG , events=('end',))

start_time = time.time()

# Lower-cased keyphrase -> number of times it was seen as a title or link label
vocabulary = Counter()
for event, elem in context:
    namespace_id = next(elem.iterchildren(tag=NAMESPACE_TAG)).text

    # Extract data only over articles, exclude: talks, mediawiki, portal, user, etc
    if namespace_id == ARTICLE_ID:
        new_keys = []

        # Extract titles of pages: a redirect contributes the title it points
        # to, any other page contributes its own title.
        redirect_child = next(elem.iterchildren(tag=REDIRECT_TAG), None)
        if redirect_child is not None:
            page_title = redirect_child.attrib['title'].lower()
        else:
            title_child = next(elem.iterchildren(tag=TITLE_TAG))
            # .text is None for an empty element
            page_title = (title_child.text or '').lower()

        if page_title:
            new_keys.append(page_title)

        # Extract keyphrases in other morphological form
        text_child = next(elem.iterdescendants(tag=TEXT_TAG), None)
        text = text_child.text if text_child is not None else None
        # A page without text has no links; findall(None) would raise TypeError
        keyphrases = extract_regex.findall(text) if text else []
        for key in keyphrases:
            if any(x in key for x in ignore):
                continue
            # The label is what follows the last '|'. It can be empty
            # (e.g. '[[Foo|]]') even though the link itself never is, so the
            # emptiness check has to look at the label.
            label = key.split('|')[-1].lower()
            if label:
                new_keys.append(label)

        vocabulary.update(new_keys)

    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# Delete keyphrases that occur 5 times or fewer
for key, value in list(vocabulary.items()):
    if value <= 5:
        del vocabulary[key]


# getsizeof() measures only the Counter object itself, not the key strings it holds
print('Vocabulary: {} ngrams --- {} MB'.format(len(vocabulary), getsizeof(vocabulary) / 1024**2))
print("--- %s seconds ---" % (time.time() - start_time))


# Save vocabulary
with open('./wiki-dump/vocabulary_dict', 'wb') as f:
    pickle.dump(vocabulary, f)

# Vocabulary: 204163 ngrams - 97.37 sec - enwiki-test1gb

In [None]:
# The ten vocabulary entries made of the most whitespace-separated words
top_10 = nlargest(10, vocabulary.keys(), key=lambda x: len(x.split()))
# print(top_100)
# print([len(i.split()) for i in top_10])
# print(sum([len(i.split()) for i in vocabulary.keys()]) / len(vocabulary))  # ==> 2.09
# NOTE(review): this is not the last expression of the cell, so its value is
# computed and discarded without being displayed.
top_10[-5:]
# Cell output: the whole vocabulary Counter
vocabulary

------------------
Count N-grams in articles:
--------------------------

In [None]:
# Count how often every vocabulary entry occurs in the text of the articles.
start_time = time.time()

with open('./wiki-dump/vocabulary_dict', 'rb') as f:
    vocabulary = pickle.load(f)

# Fixed vocabulary: the column order follows the iteration order of vocabulary.keys().
# Raw string for the token pattern: '\S' is not a valid string escape.
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 16), vocabulary=vocabulary.keys(), token_pattern=r'[\S]+')

context = etree.iterparse('./wiki-dump/mini/enwiki-test1gb', tag=PAGE_TAG , events=('end',))
# Number of articles vectorized at once
batch_size = 10000
articles = []
keys_count = np.zeros((1, len(vocabulary.keys())))

for event, elem in context:
    namespace_id = next(elem.iterchildren(tag=NAMESPACE_TAG)).text

    if namespace_id == ARTICLE_ID:
        text = next(elem.iterdescendants(tag=TEXT_TAG)).text
        # .text is None for an empty element; there is nothing to count then
        if text:
            articles.append(text)

        # Flush on the number of collected articles. Flushing on the page index
        # dropped the article seen at every flush and skipped the flush whenever
        # that page was not an article.
        if len(articles) >= batch_size:
            m = vectorizer.transform(articles)
            m = m.sum(axis=0)
            keys_count = np.add(keys_count, m)
            articles = []

    # Clear data read
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# Count the last, partially filled batch, which would otherwise be lost
if articles:
    m = vectorizer.transform(articles)
    m = m.sum(axis=0)
    keys_count = np.add(keys_count, m)
    articles = []


# Leave this or not?
# I take advantage of this loop to build a dict from vocabulary keys to their index in the array
key_to_index = dict()
for index, keyword in enumerate(vectorizer.get_feature_names()):
    # Adds the count stored in the vocabulary on top of the count found in the text
    keys_count[0, index] += vocabulary[keyword]
    key_to_index[keyword] = index


print("--- %s seconds ---" % (time.time() - start_time))

# Save data
# with open('wiki-dump/vectorizer', 'wb') as f:
#     pickle.dump(vectorizer, f)

with open('wiki-dump/keys_count', 'wb') as f:
    pickle.dump(keys_count, f)
    
with open('wiki-dump/key_to_index', 'wb') as f:
    pickle.dump(key_to_index, f)

# 114.47 sec, ngram=(1, 20) 1,000
# 86.39 sec, ngram=(1, 16) 1,000
# 47.87 sec, ngram=(1, 10) 1,000
# 406.82 sec, ngram=(1, 16) 5,000
# 843.35 sec, ngram=(1, 16) 10,000
# 3443.72 sec, ngram=(1, 16) iterating in batches of 10,000
# 4310.21 sec, ngram=(1, 21) iterating in batches of 10,000

In [None]:
# Spot checks on the results of the counting cell.
# NOTE(review): 52156 and 2 are hard-coded positions; what they point to depends
# on the vocabulary that was loaded.
print(keys_count[0, 52156])
print(vectorizer.get_feature_names()[2])
# Cell output: every (keyword, index) pair of the mapping
key_to_index.items()

In [None]:
# NOTE(review): `ft` is not defined in any visible cell, so this line raises
# NameError on a fresh kernel. Presumably it held a list of feature names —
# confirm and define it, or drop the line.
print(ft[-2])
# Column index assigned to one specific keyphrase
print(vectorizer.vocabulary_['caught in the act'])
# NOTE(review): hard-coded position; verify it is within the current vocabulary size.
print(keys_count[0, 1561442])

-----------
Ranking method, Keyphraseness:
------------------------------

In [None]:
# One score per vocabulary entry, stored at the entry's position in the
# vectorizer's feature list: the count kept in the vocabulary divided by the
# total kept in keys_count.
keyphraseness = np.zeros((1, len(vocabulary.keys())))

for position, term in enumerate(vectorizer.get_feature_names()):
    link_count = vocabulary[term]
    total_count = keys_count[0, position]
    keyphraseness[0, position] = link_count / total_count

# Save data
with open('wiki-dump/keyphraseness', 'wb') as out_file:
    pickle.dump(keyphraseness, out_file)

------------------------
Evaluation of Keyphraseness:
----------------------------

In [None]:
# Load the saved vocabulary and scores, then collect the lower-cased text of
# the first `count` non-redirect articles of the training dump.
with open('wiki-dump/vocabulary_dict', 'rb') as f:
    vocabulary = pickle.load(f)
    
with open('wiki-dump/keyphraseness', 'rb') as f:
    keyphraseness = pickle.load(f)

context = etree.iterparse('./wiki-dump/mini/enwiki-train', tag=PAGE_TAG , events=('end',))

# Number of articles to collect
count = 1000
articles = []
for event, elem in context:
    if len(articles) >= count:
        break

    namespace_id = next(elem.iterchildren(tag=NAMESPACE_TAG)).text

    # Extract data only over articles, exclude: talks, mediawiki, portal, user, etc
    if namespace_id == ARTICLE_ID:
        # Pages carrying a <redirect> child are skipped
        is_redirect = next(elem.iterchildren(tag=REDIRECT_TAG), None) is not None
        if not is_redirect:
            text = next(elem.iterdescendants(tag=TEXT_TAG)).text
            # .text is None for an empty element; calling .lower() on it would raise
            if text:
                articles.append(text.lower())

    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context


In [None]:
# Raw string: '\[' and '\]' are not valid string escapes; the pattern itself is unchanged
extract_pattern = r'\[\[([^][]+)\]\]'
extract_regex = re.compile(extract_pattern, re.IGNORECASE)
ignore = ['Image:', 'File:', 'Category:', 'Q:', 'image:', 'file:', 'category:', 'q:']

# Built once: its configuration does not depend on the article being processed
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 16),
                             vocabulary=vocabulary.keys(), token_pattern=r'[\S]+')

for article in articles:
    # Gold keyphrases extracted for wikipedia articles
    gold_keyphrases = set([key.split('|')[-1].lower() for key in extract_regex.findall(article)
                           if not any(x in key for x in ignore) and len(key) > 0])
    
    # Predicted keyphrases
    m = vectorizer.transform([article])
    
    
    

In [None]:
# Vectorize only the first collected article against the fixed vocabulary.
# NOTE(review): unlike the other CountVectorizer calls in this notebook, this one
# does not pass token_pattern='[\S]+', so the default tokenization is used —
# confirm that this is intended.
article = articles[0]
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 16), vocabulary=vocabulary.keys())
m = vectorizer.transform([article])

In [None]:
# Positions of the non-zero entries of m; `cols` is used below to index keyphraseness
rows, cols = m.nonzero()

In [None]:
# Pair every non-zero column index with the score stored for it in keyphraseness
zipped = list(zip(cols, keyphraseness[0, cols]))

In [None]:
# 6% of the number of space-separated chunks in the article, truncated to an integer
percent_6 = int(len(article.split(' ')) * .06)
percent_6

In [None]:
# Keep the percent_6 (column index, score) pairs with the highest score
top = nlargest(percent_6, zipped, key=lambda t: t[1])
# top

In [None]:
# Gold keyphrases of the current article: the lower-cased labels of its [[ ]] links.
# Raw string: '\[' and '\]' are not valid string escapes; the pattern itself is unchanged.
extract_pattern = r'\[\[([^][]+)\]\]'
extract_regex = re.compile(extract_pattern, re.IGNORECASE)
# Links containing any of these markers are left out
ignore = ['Image:', 'File:', 'Category:', 'Q:', 'image:', 'file:', 'category:', 'q:']

gold_keyphrases = set([key.split('|')[-1].lower() for key in extract_regex.findall(article)
                       if not any(x in key for x in ignore) and len(key) > 0])

In [None]:
# Translate the selected column indices back into their vocabulary entries
features_names = vectorizer.get_feature_names()
predicted_keyphrases = [features_names[i] for i, prob in top]

In [None]:
# Keyphrases that appear both in the gold set and among the predictions
gold_keyphrases.intersection(set(predicted_keyphrases))

In [None]:
# NOTE(review): hard-coded position; which entry it shows depends on the loaded vocabulary
features_names[140033]

In [None]:
# Display the predicted keyphrases of the article
predicted_keyphrases

In [None]:
# Count stored in the vocabulary for the entry '22'
vocabulary['22']

In [None]:
from functools import partial
from nltk import regexp_tokenize

# pattern = r'''(?ix)    # set flag to allow verbose regexps
#      ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
#    | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
#    | \w+(-\w+)*        # words with optional internal hyphens
#    | \.\.\.            # ellipsis
#    | [][.,;"'?():-_`]  # these are separate tokens; includes ], [
# '''
# Two fixes compared with the first version of this pattern:
#  * (?x) is the very first thing in the string: a global inline flag placed
#    after other characters (here a leading space) is rejected by Python 3.11+.
#  * every group is non-capturing (?:...): with capturing groups a findall-style
#    match returns the contents of the groups instead of the whole token.
pattern = r'''(?x)            # set flag to allow verbose regexps
    (?:[A-Z]\.)+            # abbreviations (e.g. U.S.A.)
    | \w+(?:-\w+)*          # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?    # currency & percentages
    | \.\.\.                # ellipses '''

# The tokenizer is used as the vectorizer's analyzer
vectorizer = CountVectorizer(analyzer=partial(regexp_tokenize, pattern=pattern))
# m = vectorizer.fit_transform('I love N.Y.C. 100% even with all of its traffic-ridden streets...')
# m = vectorizer.transform(['sale la flòr. Y [[el-gato]]', 'estaba el-gato'])

# print(vectorizer.get_feature_names())
# print(m.toarray())
vectorizer.build_analyzer()('I love N.Y.C. 100% even with all of its traffic-ridden streets...')

In [None]:
# Experiment: use word_tokenize as the vectorizer's tokenizer and see which
# entries of a tiny fixed vocabulary are found in text with wiki markup.
# (word_tokenize is already imported in the first cell of the notebook.)
from nltk.tokenize import word_tokenize

# Fixed vocabulary of two multi-word entries
voc = ['el gato', "blink 's"]
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 6), vocabulary=voc, tokenizer=word_tokenize)
# One document with a [[ ]] link and one with ''' ''' markup
text = [
    'estaba [[el gato]].',
    "toco '''blink's'''",
]

m = vectorizer.transform(text)
print(vectorizer.get_feature_names())
# Dense view: one row per document, one column per vocabulary entry
print(m.toarray())


In [None]:
# Check how word_tokenize splits a short string that ends in a colon
word_tokenize("El GATO:")

In [None]:
# Display the characters of string.punctuation
# (already imported in the first cell of the notebook)
from string import punctuation
punctuation

In [5]:
# Ignore keywords starting with this names
_ignored_keywords = ['image:', 'file:', 'category:', 'wikipedia:']
# Regular expression used to extract keywords inside '[[ ]]'
_extract_regex = re.compile('\[\[([^][]+)\]\]', re.IGNORECASE)


def clean_text(text):
    """Lower-case ``text``, tokenize it and drop the punctuation tokens.

    Args:
        text: string to normalize.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # `punctuation` is a string, so `token in punctuation` is a substring test:
    # it only removes tokens that happen to be a contiguous slice of it and
    # keeps multi-character punctuation tokens such as '...'. Checking every
    # character removes any token made of punctuation only.
    punctuation_chars = set(punctuation)
    tokens = [token for token in word_tokenize(text.lower())
              if not all(char in punctuation_chars for char in token)]
    return ' '.join(tokens)


def fast_xml_iter(context, func, dest):
    """Apply ``func`` to every element of an iterparse-style stream.

    Args:
        context: iterable of ``(event, element)`` pairs.
        func: callable invoked as ``func(element, dest)``; its return value is ignored.
        dest: object passed through to ``func`` unchanged.
    """
    for _, node in context:
        func(node, dest)

        # Drop the element's content, then remove the siblings that precede it
        # from their parent so the already-read part of the tree can be freed.
        node.clear()
        previous = node.getprevious()
        while previous is not None:
            del node.getparent()[0]
            previous = node.getprevious()
            

def extract_keywords(elem, dest):
    """Add the link labels found in an article page to ``dest``.

    Pages whose <ns> value differs from ARTICLE_ID are ignored.

    Args:
        elem: page element of the dump.
        dest: counter-like object; its ``update`` method receives the list of
            cleaned keywords of the page.
    """
    namespace_id = next(elem.iterchildren(tag=NAMESPACE_TAG)).text
    if namespace_id != ARTICLE_ID:
        return

    # Text in the article; .text is None for an empty element
    text = next(elem.iterdescendants(tag=TEXT_TAG)).text
    keywords = []

    # Find words inside '[[ ]]'
    for link in (_extract_regex.findall(text) if text else []):
        # The filter looks at the whole link, lower-cased: the label after '|'
        # no longer carries the 'file:' / 'category:' part of the target, and
        # the ignored names are written in lower case.
        lowered_link = link.lower()
        if any(name in lowered_link for name in _ignored_keywords):
            continue

        word = clean_text(link.split('|')[-1])
        if len(word) > 0:
            keywords.append(word)

    dest.update(keywords)


# Iterates over xml and extract keywords
start_time = time.time()

xml_iterator = etree.iterparse('./wiki-dump/mini/enwiki-test1gb', tag=PAGE_TAG)
# Vocabulary of keywords with their occurrence count
_vocabulary = Counter()
# fast_xml_iter(xml_iterator, extract_keywords, _vocabulary)
for event, elem in xml_iterator:
    iterator = elem.iterchildren(tag=NAMESPACE_TAG)
    namespace_id = next(iterator).text

    if namespace_id == ARTICLE_ID:
        keywords = []
        # Text in the article
        iterator = elem.iterdescendants(tag=TEXT_TAG)
        text = next(iterator).text
        # Find words inside '[[ ]]'
        words = _extract_regex.findall(text)

        for word in words:
#             word = clean_text(word.split('|')[-1])
            word = word.split('|')[-1]
            if not any(x in word for x in _ignored_keywords) and len(word) > 0:
                keywords.append(word)

        _vocabulary.update(keywords)
        
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]
del xml_iterator
print("--- %s sec: extract_keywords ---" % (time.time() - start_time))


--- 79.41121578216553 sec: extract_keywords ---
