In [1]:
import os
import time
import concurrent.futures
from bs4 import BeautifulSoup
from collections import Counter
import re

In [2]:
files = os.listdir("wiki")
print(len(files))
files[:10]

999


['Furubira_District,_Hokkaido.html',
 'Valentin_Yanin.html',
 'Kings_XI_Punjab_in_2014.html',
 'William_Harvey_Lillard.html',
 'Radial_Road_3.html',
 'George_Weldrick.html',
 'Zgornji_Otok.html',
 'Blue_Heelers_(season_8).html',
 'Taggen_Nunatak.html',
 '1951_National_League_tie-breaker_series.html']

In [3]:

with open("wiki/Radial_Road_3.html") as f:
    print(f.read()[:2000])
    

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Radial Road 3 - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Radial_Road_3","wgTitle":"Radial Road 3","wgCurRevisionId":750120640,"wgRevisionId":750120640,"wgArticleId":22602027,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Infobox road temporary tracking category 1","Routes in Metro Manila"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","

In [4]:
# Simple reading:

def simple_read_files(files):
    content = []
    for file in files:
        with open("wiki/"+file,"r") as f:
            content.append(f.read())
    return content 

start1 = time.time()
simple_read_files(files)
end1 = time.time()
        
print(end1 - start1, "tiempo de lectura simple")

0.17531657218933105 tiempo de lectura simple


In [5]:
articles = [file.replace(".html","") for file in files]
articles[:10]

['Furubira_District,_Hokkaido',
 'Valentin_Yanin',
 'Kings_XI_Punjab_in_2014',
 'William_Harvey_Lillard',
 'Radial_Road_3',
 'George_Weldrick',
 'Zgornji_Otok',
 'Blue_Heelers_(season_8)',
 'Taggen_Nunatak',
 '1951_National_League_tie-breaker_series']

In [6]:
def read_files(filename):
    with open("wiki/"+filename,"r") as f:
        content = f.read()
        return content

start1 = time.time()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=2)
contents = list(pool.map(read_files,files))
end1 = time.time()
print(end1 - start1, "tiempo con dos threads")

0.35270142555236816 tiempo con dos threads


###### We gains to little with threads because the task is computationally intensive and the GIRL of Cython does not allow parallelization in code-execution (one thread at a time).

In [8]:
# Testing optimal amount of workers for reading content of 
# files. 

for i in range(1,7):
    start2 = time.time()
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=i)
    content2 = list(pool.map(read_files,files))
    end2 = time.time()
    print(end2 - start2, i, " workers")
    

0.344219446182251 1  workers
0.2848210334777832 2  workers
0.28457093238830566 3  workers
0.3295416831970215 4  workers
0.3415031433105469 5  workers
0.35746288299560547 6  workers


##### The efficiency changes for every execution. This is because we are working with few cores (less or equal than 2), but the performance is better than simple reading. 

In [22]:
# reading the content of html files. 

def parse_html(html_content):
    parser = BeautifulSoup(html_content, 'html.parser')
    return str(parser.find_all("div",id="content")[0])


In [25]:
# Using processes

del(pool)
for i in range(1,5):
    start2 = time.time()
    pool = concurrent.futures.ProcessPoolExecutor(max_workers=i)
    parsed = list(pool.map(parse_html,content2))
    end2 = time.time()
    print(end2 - start2, i, " workers")    

49.28571963310242 1  workers
35.53234243392944 2  workers
35.80071258544922 3  workers
38.409151554107666 4  workers


##### Since the time is variable for different reasons (like limited amount of CPU's), there is no significativelly gains in use different amount of workers,  each increment impacts in computation performance. But nevertheless it was suggested to use 3 workers because of the availables hardware capabilities. 

#####  Let's count up how many times each tag occurs. , if there are a lot of a tags on each page, we know that Wikipedia articles tend to be very connected to other articles or pages. On the other hand, a lot of div tags will tell us that Wikipedia pages tend to have a nested structure with many page elements.

##### Since the task is computationally intense, we gone a use Processes: 

In [26]:
def count_tags(html): 
    parser = BeautifulSoup(html,"html.parser")
    tags = {}
    for tag in parser.find_all(): 
        if tag.name not in tags: 
            tags[tag.name] = 1
        tags[tag.name] += 1
    return tags

start = time.time()

pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)
dictionaries = list(pool.map(count_tags,parsed))

end = time.time()

print(end - start)


18.765032529830933


In [27]:
# Combine all dictionaries in a single one to get trends 
# in trends in all the wiki html files.

combined_tags = {}
for dic in dictionaries: 
    for k,v in dic.items():
        if k not in combined_tags:
            combined_tags[k] = v
        combined_tags[k] += v
        
print(combined_tags)

{'s': 20, 'ul': 12080, 'p': 9001, 'bdi': 10, 'strong': 994, 'rp': 66, 'mrow': 6, 'span': 68366, 'div': 29738, 'del': 6, 'h6': 4, 'img': 7707, 'big': 120, 'q': 99, 'dt': 406, 'small': 3719, 'br': 5714, 'mo': 6, 'rb': 34, 'area': 60, 'abbr': 4416, 'dl': 566, 'caption': 341, 'i': 19200, 'ruby': 34, 'h3': 983, 'blockquote': 89, 'pre': 4, 'rt': 34, 'wbr': 124, 'h5': 10, 'cite': 4160, 'b': 15455, 'hr': 70, 'sub': 180, 'math': 6, 'table': 4964, 'mstyle': 6, 'code': 162, 'annotation': 6, 'h2': 5020, 'samp': 6, 'source': 6, 'sup': 11984, 'center': 78, 'td': 58825, 'semantics': 6, 'li': 87029, 'tr': 28464, 'h4': 150, 'u': 64, 'font': 82, 'th': 15396, 'a': 162452, 'noscript': 2000, 'map': 6, 'ol': 1658, 'dd': 1464, 'audio': 6, 'h1': 2000}


In [46]:
# Counting words in every html-file

def counts_words(html):
    soup = BeautifulSoup(html, 'html.parser')
    words = {}
    text = soup.get_text()
    text = re.sub("\W+", " ", text.lower()) 
# remember that \w+ matches one or more word characters (same as [a-zA-Z0-9_]+).
    words = text.split(" ")
    words = [w for w in words if len(w) >= 5] 
    return dict(Counter(words))

In [47]:
start = time.time()

pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)
dictionaries2 = list(pool.map(counts_words,parsed))

end = time.time()
print(end - start, "Taking the time for the task")

18.409634590148926 Taking the time for the task


In [52]:
# Combining dictionaries of word counts

combined_words = {}
for dic in dictionaries2: 
    for k,v in dic.items():
        if k not in combined_words:
            combined_words[k] = v
        combined_words[k] += v
print(len(combined_words), "words")

75653 words


In [55]:
# Finding the most common words in every html-file

def most_common_words(html):
    soup = BeautifulSoup(html, 'html.parser')
    words = {}
    text = soup.get_text()
    text = re.sub("\W+", " ", text.lower()) 
# remember that \w+ matches one or more word characters (same as [a-zA-Z0-9_]+).
    words = text.split(" ")
    words = [w for w in words if len(w) >= 5] 
    return dict(Counter(words).most_common(10))

In [56]:
start = time.time()

pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)
dictionaries2 = list(pool.map(most_common_words,parsed))

end = time.time()
print(end - start, "Taking the time for the task")

19.40528702735901 Taking the time for the task


In [57]:
# Combining dictionaries of word counts

combined_words = {}
for dic in dictionaries2: 
    for k,v in dic.items():
        if k not in combined_words:
            combined_words[k] = v
        combined_words[k] += v
print(len(combined_words), "words")

4766 words


###### Next steps for data