In [None]:
from lxml import etree
from collections import Counter
from io import StringIO, BytesIO
import json
import time
import re
import sys
from sys import getsizeof

try:
    import cPickle as pickle
except ImportError:
    import pickle


# XML namespace of the MediaWiki export schema, in the '{uri}' form that lxml
# prepends to the local name of every qualified tag.
NAMESPACE = '{http://www.mediawiki.org/xml/export-0.10/}'

# Qualified tag names of the elements read by the cells below.
PAGE_TAG, TITLE_TAG, REDIRECT_TAG, TEXT_TAG = (
    '{}{}'.format(NAMESPACE, local_name)
    for local_name in ('page', 'title', 'redirect', 'text')
)

# SOURCE is the dump sample parsed by the cells below.
# NOTE(review): DEST is not referenced by any visible cell.
SOURCE = './wiki-dump/enwiki-1gb'
DEST = './wiki-dump/'


# Fast iteration over xml with low ram consumption
def fast_iter(context, func, dest):
    """Apply ``func`` to every element produced by ``context``, then free it.

    Args:
        context: iterable of ``(event, element)`` pairs, such as the object
            returned by ``etree.iterparse``.
        func: callable invoked as ``func(elem, dest)`` for each element.
        dest: value passed through to ``func`` unchanged.

    After ``func`` returns, the element is cleared and the siblings that
    precede it are removed from its parent, so the partially built tree does
    not keep growing while the document is streamed.
    """
    for _event, elem in context:
        # Execute operations over xml node
        func(elem, dest)
        # Clear data read
        elem.clear()
        parent = elem.getparent()
        if parent is None:
            # A root element has no parent to delete siblings from, yet
            # getprevious() can still return a document-level comment or
            # processing instruction; skip the cleanup instead of failing.
            continue
        while elem.getprevious() is not None:
            del parent[0]


In [None]:
# Extract titles: naive approach (every element of the dump is visited).
# Each page contributes its redirect target when it has one, otherwise its
# own title; all entries are lower-cased.

start_time = time.time()

context = etree.iterparse(SOURCE, events=('end',))

titles = []
# True while the last entry of `titles` belongs to the page being parsed, so
# a <redirect> only ever replaces the title of its own page.
page_recorded = False

for event, elem in context:
    if elem.tag == TITLE_TAG:
        page_recorded = elem.text is not None
        if page_recorded:
            titles.append(elem.text.lower())
    elif elem.tag == REDIRECT_TAG:
        target = elem.attrib['title'].lower()
        if page_recorded:
            titles[-1] = target
        else:
            # The page had no usable title: record the redirect target
            # instead of overwriting the previous page's entry.
            titles.append(target)
            page_recorded = True
    elif elem.tag == PAGE_TAG:
        page_recorded = False

    # Release the element and the siblings already processed before it.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context

# sys.getsizeof reports the list object only, not the strings it references.
print('Titles: {} --- {} MB'.format(len(titles), getsizeof(titles) / 1024**2))
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Write the collected titles to disk as indented JSON and report how long
# the serialisation took.
start_time = time.time()

with open('./wiki-dump/titles', 'w') as titles_file:
    json.dump(titles, titles_file, indent=2)

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

In [None]:
# Extract titles: "star" algorithm :D
# Only complete <page> elements are visited; each page contributes its
# redirect target when it has one, otherwise its own title (lower-cased).

start_time = time.time()

context = etree.iterparse(SOURCE, tag=PAGE_TAG, events=('end',))

titles = set()
for event, elem in context:
    redirect_child = elem.find(REDIRECT_TAG)
    if redirect_child is not None:
        titles.add(redirect_child.attrib['title'].lower())
    else:
        title_child = elem.find(TITLE_TAG)
        # Pages without a <title>, or with an empty one, are skipped.
        if title_child is not None and title_child.text is not None:
            titles.add(title_child.text.lower())

    # Release the page and the pages already processed before it.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context


# sys.getsizeof reports the set object only, not the strings it references.
print('Titles: {} --- {} MB'.format(len(titles), getsizeof(titles) / 1024**2))
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Count how often each title occurs.  When `titles` is a set (as built by the
# tag-filtered extraction cell) every count is 1; repeats only show up when
# `titles` is the list produced by the naive extraction cell.
repeated = titles
a = dict(Counter(repeated))

# Print a bounded summary instead of the whole mapping, which can hold one
# entry per page of the dump.
duplicated = {title: times for title, times in a.items() if times > 1}
print('{} distinct titles, {} repeated'.format(len(a), len(duplicated)))
print(Counter(a).most_common(20))

In [None]:
# Print the redirect target (or, when there is none, the title) of the pages
# found at the beginning of the dump.  `count` is a budget of parse events,
# not of pages.
count = 1000
for event, elem in etree.iterparse(SOURCE, events=('start', 'end')):
    # The children of an element are only guaranteed to be present once its
    # 'end' event fires, so the page is inspected there and not on 'start'.
    if event == 'end' and elem.tag == PAGE_TAG:
        redirect = elem.find(REDIRECT_TAG)
        if redirect is not None:
            print(redirect.attrib['title'])
        else:
            title = elem.find(TITLE_TAG)
            if title is not None:
                print(title.text)
        # Clear only finished pages: clearing on 'start' (or clearing the
        # children first) would discard the data read above.
        elem.clear()
    if count <= 0:
        break
    count -= 1
        

In [None]:
# Take the body of the 170th <text> element of the dump and list the wiki
# links ([[target]] / [[target|label]]) it contains.
count = 170
text = ''
for event, elem in etree.iterparse(SOURCE, tag=TEXT_TAG, events=('end',)):
    # The text of an element is only guaranteed to be complete on 'end'.
    count -= 1
    if count == 0:
        # An empty <text/> element has text None; fall back to ''.
        text = elem.text or ''
        break
    # Bodies that are skipped are not needed again.
    elem.clear()

# Raw string: same pattern as before, without invalid escape sequences.
m = re.findall(r'\[\[([^][]+)]]', text)
print(len(m))
print(m)

# Keep only the part after the last '|' of every link.
m = [link.split('|')[-1] for link in m]

print(m)


In [None]:
def _count_elements(tag):
    """Stream the dump once and return how many elements with ``tag`` end in it."""
    total = 0
    for _, node in etree.iterparse('./wiki-dump/enwiki-1gb', tag=tag, events=('end',)):
        total += 1
        # Free the element and the siblings that precede it.
        node.clear()
        while node.getprevious() is not None:
            del node.getparent()[0]
    return total


# One pass per tag: number of <page> elements, then number of <text> elements.
count1 = _count_elements(PAGE_TAG)
count2 = _count_elements(TEXT_TAG)

print(count1, count2)

In [None]:
vocabulary = []
alternative_keyphrases = set()

start_time = time.time()

# Stream the full dump.  The extraction logic of every branch is currently
# disabled (kept below as comments), so the loop only walks and clears the
# elements and the first timing measures the bare parsing cost.
# NOTE(review): the branches react to 'start' events, where lxml does not
# guarantee that an element's text is available yet - confirm before
# re-enabling them.
for event, elem in etree.iterparse('./wiki-dump/enwiki-20160113-pages-articles.xml', events=('start', 'end')):
    title = ''
    if event == 'start':
        tag = elem.tag
        if 'title' in tag:
            # Disabled: record the page title.
            #     if elem.text is not None:
            #         vocabulary.append(elem.text.lower())
            #         title = elem.text.lower()
            pass
        elif 'redirect' in tag:
            # Disabled: replace the recorded title with the redirect target.
            #     vocabulary[-1] = elem.attrib['title'].lower()
            #     title = elem.attrib['title'].lower()
            pass
        elif 'text' in tag:
            text = elem.text
            # Disabled: collect the keyphrases written in another
            # morphological form.
            #     if text is not None:
            #         keyphrases = re.findall('\[\[([^][]+)]]', text)
            #         for key in keyphrases:
            #             alternative_keyphrases.add(key.split('|')[-1].lower())

    elem.clear()

print("--- %s seconds ---" % (time.time() - start_time))


# Merge both sources into one set and time the merge.
start_time = time.time()
vocabulary = alternative_keyphrases | set(vocabulary)
print("--- %s seconds ---" % (time.time() - start_time))

# Report the vocabulary size and time the report.
start_time = time.time()
print('Vocabulary: ', len(vocabulary))
print("--- %s seconds ---" % (time.time() - start_time))

--------------
Extract keyphrases:
-------------------

In [None]:
# Wiki links are written [[target]] or [[target|label]]; the pattern captures
# everything between the brackets of links that contain no nested brackets.
extract_pattern = r'\[\[([^][]+)\]\]'

ignore = ['Image:', 'File:', 'Category:', 'Q:', 'image:', 'file:', 'category:', 'q:']
# Namespace names taken from `ignore`, compared case-insensitively.
ignored_namespaces = {prefix.rstrip(':').lower() for prefix in ignore}

extract_regex = re.compile(extract_pattern, re.IGNORECASE)


def _is_ignored_link(link):
    """Return True when the target of ``link`` is in an ignored namespace.

    Only the target (the part before the first '|') is inspected, and only
    its colon-separated prefixes, so 'File:x.jpg|thumb' and ':Category:Foo'
    are ignored while a target that merely contains 'q:' (e.g. 'Iraq: ...')
    is kept.
    """
    target = link.split('|', 1)[0]
    prefixes = target.split(':')[:-1]
    return any(prefix.strip().lower() in ignored_namespaces for prefix in prefixes)


context = etree.iterparse(SOURCE, tag=PAGE_TAG, events=('end',))

start_time = time.time()

vocabulary = Counter()
for event, elem in context:
    new_keys = []
    # Title of the page: the redirect target when present, else the title.
    redirect_child = elem.find(REDIRECT_TAG)
    if redirect_child is not None:
        new_keys.append(redirect_child.attrib['title'].lower())
    else:
        title_child = elem.find(TITLE_TAG)
        if title_child is not None and title_child.text is not None:
            new_keys.append(title_child.text.lower())

    # Keyphrases in other morphological form: the label (text after the last
    # '|') of every link in the page body.  Pages without a <text> element,
    # or with an empty one, contribute only their title.
    text_child = next(elem.iterdescendants(tag=TEXT_TAG), None)
    if text_child is not None and text_child.text is not None:
        for key in extract_regex.findall(text_child.text):
            if not _is_ignored_link(key):
                new_keys.append(key.split('|')[-1].lower())

    vocabulary.update(new_keys)

    # Release the page and the pages already processed before it.
    elem.clear()
    while elem.getprevious() is not None:
        del elem.getparent()[0]

del context


# sys.getsizeof reports the Counter object only, not the keys it references.
print('Vocabulary: {} ngrams --- {} MB'.format(len(vocabulary), getsizeof(vocabulary) / 1024**2))
print("--- %s seconds ---" % (time.time() - start_time))


# Save vocabulary
with open('./wiki-dump/vocabulary_dict', 'wb') as f:
    pickle.dump(vocabulary, f)


In [None]:
# with open('./wiki-dump/vocabulary2', 'rb') as f:
#     voc = pickle.load(f)
# len(voc)
    
# with open('./wiki-dump/vocabulary2', 'rb') as f:
#     voc2 = pickle.load(f)

# with open('./wiki-dump/rares', 'rb') as f:
#     rare = pickle.load(f)
# Show only the most frequent entries: printing the whole Counter would emit
# one item per distinct keyphrase of the dump.
print(vocabulary.most_common(20))

In [None]:
# voc2.remove("when the pawn hits the conflicts he thinks like a king what he knows throws the blows when he goes to the fight and he'll win the whole thing 'fore he enters the ring there's no body to batter when your mind is your might so when you go solo, you hold your own hand and remember that depth is the greatest of heights and if you know where you stand, then you know where to land and if you fall it won't matter, cuz you'll know that you're right}}")

# Longest key of the vocabulary.
longest_key = max(vocabulary, key=len)
print(longest_key)

# NOTE(review): `rares` is not assigned in any visible cell (the only nearby
# load is commented out and binds `rare`), so on a fresh kernel the lines
# below raise NameError - confirm where it is supposed to come from.
rares_size = len(rares)
print(rares_size)
longest_rare = max(rares, key=len)
print(longest_rare)


--------------
Split the wiki dump XML:
---------------

In [None]:
from contextlib import ExitStack
from random import random

start_time = time.time()

context = etree.iterparse(SOURCE, tag=PAGE_TAG, events=('end',))

xml_header = '<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n'
xml_tail = '</mediawiki>\n'

n = 2
name = './wiki-dump/enwiki-part'

# Splits wikipedia dump into n parts choosing randomly the articles of each part.
# Every part file is opened once, in 'wb' mode: truncating makes the cell
# safe to re-run (append mode added a second header to an existing file), and
# binary mode matches the bytes returned by etree.tostring.
with ExitStack() as stack:
    parts = [stack.enter_context(open(name + str(i), 'wb'))
             for i in range(1, n + 1)]

    for part in parts:
        part.write(xml_header.encode('utf-8'))

    for event, elem in context:
        # Uniform choice of the destination part; min() guards against
        # random() * n rounding up to n.
        part_index = min(int(random() * n), n - 1)
        parts[part_index].write(etree.tostring(elem))

        # Clear data read
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    for part in parts:
        part.write(xml_tail.encode('utf-8'))

del context

print("--- %s seconds ---" % (time.time() - start_time))

------------------
Count N-grams in articles
-------------------------

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


def _feature_names(vectorizer):
    """Return the feature names of ``vectorizer`` as a list.

    Uses ``get_feature_names_out`` when the installed scikit-learn provides
    it and falls back to the older ``get_feature_names`` otherwise.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


# Toy vocabulary and documents used to compare two tokenisation patterns.
minivoc = ['màchear', 'no, anda']
text = ['pará @ [[màchear]] no màchear', 
        'màchear màchear no, anda']

# vec tokenises on runs of non-whitespace characters; vec2 only yields the
# content of [[...]] links.  Both count n-grams of 1 to 10 tokens that appear
# in `minivoc`.
vec = CountVectorizer(lowercase=True, ngram_range=(1, 10), vocabulary=minivoc, token_pattern=r'[\S]+')
vec2 = CountVectorizer(lowercase=True, ngram_range=(1, 10), vocabulary=minivoc, token_pattern=r'\[\[([^][]+)\]\]')

m = vec.transform(text).toarray()
print(_feature_names(vec))
print(m)

m2 = vec2.transform(text).toarray()
print(_feature_names(vec2))
print(m2)

# Sanity check of the link pattern on a link that carries a label.
re.findall(r'\[\[([^][]+)\]\]', 'viendo si [[funca|es]] esto')