In [2]:
#Read-in Wikipedia data

import os

#list all files in the wiki folder
files = os.listdir("wiki")
print(files)
print(len(files))


['Margo_Reuten.html', 'Hope_7_(album).html', 'Nejat_Alp.html', 'Pr%C3%A9liminaires.html', 'EDP_Sarichioi_Wind_Farm.html', 'Kenny_Cordray.html', 'Amborella.html', 'Valentin_Yanin.html', 'Yarumal.html', 'Urs_Burkart.html', 'Charles_Stuart_(rugby_union).html', 'Fahy,_County_Mayo.html', 'Acacia_dermatophylla.html', 'Peter_Collingwood.html', 'Manhattan_Murder_Mystery.html', 'Reb_Russell.html', 'Shigeo_Kurata.html', '83_(number).html', 'I_Am_Cold.html', 'KMTZ.html', 'Vanavara_Airport.html', 'Kul_Gul.html', 'Lee_Henderson_Watkins.html', 'Zgornji_Otok.html', 'Watsonville_Junction,_California.html', 'Cosmopterix_similis.html', 'Tapat%C3%ADo_hot_sauce.html', 'Plze%C5%88_Zoo.html', 'Olivaceous_flatbill.html', 'Hermann_Nuding.html', 'Yoshinkan.html', 'Shellie_Morris.html', 'Derek_Acorah.html', 'List_of_Argentine_Primera_Divisi%C3%B3n_transfers_January_2011.html', 'DiGiorgio_Corporation.html', 'Peltigera_membranacea.html', 'Hepatitis_B_virus.html', 'Vaccinium_padifolium.html', 'Evene.html', 'Alexio

In [3]:
#Look at single file in the wiki folder
with open("wiki/Margo_Reuten.html","r") as f:
    print(f.read())

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Margo Reuten - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Margo_Reuten","wgTitle":"Margo Reuten","wgCurRevisionId":725341347,"wgRevisionId":725341347,"wgArticleId":45612085,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with Dutch-language external links","CS1 Dutch-language sources (nl)","Pages containing links to subscription-only content","Use British English from March 2015","Articles with hCards","1966 births","Living people","Dutch chefs","Head chefs of Michelin starred restaurants","People from Maasgouw"],"wgBreakFrames":false,

In [4]:
#read in the files into different lists
#need to look at performance issues (by using different threads)
#I/O bound task
import concurrent.futures
import re
import os
import threading
import sys
import time


filenames = ["wiki/{}".format(f) for f in files]
content = []
articles = []

#print(filenames)

lock = threading.Lock()
def open_file(filename):
    lock.acquire()
    with open(filename, 'r') as f:
        content.append(f.read())
        articles.append(filename.replace(".html","").replace("wiki/",""))
    sys.stdout.flush()
    lock.release()

#temp_fnames = ["wiki/Margo_Reuten.html","wiki/Eternamente_Romanticos.html"]

threads = []
start = time.time()
for i, file in enumerate(filenames):
    thread = threading.Thread(target=open_file, args=(file,))
    thread.start()
    threads.append(thread)

for i, thread in enumerate(threads):
    thread.join()
    
#print(articles)
#print(content)
print("Elapsed time: {}".format(time.time()-start))


Elapsed time: 1.2579128742218018


In [5]:
#Thread Pool Executor

start = time.time()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
pool.map(open_file, filenames)
print("Elapsed time Thread Pool (1 worker): {}".format(time.time()-start))

start = time.time()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=2)
pool.map(open_file, filenames)
print("Elapsed time Thread Pool (2 workers): {}".format(time.time()-start))

start = time.time()
pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)
pool.map(open_file, filenames)
print("Elapsed time Thread Pool (4 workers): {}".format(time.time()-start))

Elapsed time Thread Pool (1 worker): 0.017709732055664062
Elapsed time Thread Pool (2 workers): 0.007915019989013672
Elapsed time Thread Pool (4 workers): 0.008593320846557617


In [None]:
#Process Pool Executor

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=1)
pool.map(open_file, filenames)
print("Elapsed time Process Pool (w/ 1 workers): {}".format(time.time()-start))

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
pool.map(open_file, filenames)
print("Elapsed time Process Pool (w/ 2 workers): {}".format(time.time()-start))

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
pool.map(open_file, filenames)
print("Elapsed time Process Pool (w/ 4 workers): {}".format(time.time()-start))

Elapsed time Process Pool (w/ 1 workers): 0.07973337173461914
Elapsed time Process Pool (w/ 2 workers): 0.12804508209228516
Elapsed time Process Pool (w/ 4 workers): 0.13991785049438477


# Performance options for reading in files

Typical threading methods to run through the files and append them to a list is slow.  Performance can be optimized by using process or thread pool executors.  However, in general a thread pool executor with only 1 or 2 workers (typically depends on when it is being run) appears to be the fastest on this system.

In [None]:
#Removing extraneous markup

from bs4 import BeautifulSoup

   # soup = BeautifulSoup(html, 'html.parser')
    #return str(soup.find_all("div", id="content")[0])
def parse_html(content):
    lock.acquire()
    parser = BeautifulSoup(content, 'html.parser')
    #should only be one per page
    all_div = parser.find_all("div", id="content")[0]
    all_div = str(all_div)
    sys.stdout.flush()
    lock.release()
    return all_div

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=1)
#converting output to list so it is usable
parsed = pool.map(parse_html, content)
time.sleep(5)
parsed = list(parsed)
print("Elapsed time Process Pool (w/ 1 workers): {}".format(time.time()-start))



In [None]:
# Finding common tags

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=2)
parsed = pool.map(parse_html, content)
parsed = list(parsed)
print("Elapsed time Process Pool (w/ 2 workers): {}".format(time.time()-start))

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=4)
parsed = pool.map(parse_html, content)
parsed = list(parsed)
print("Elapsed time Process Pool (w/ 4 workers): {}".format(time.time()-start))

print(parsed)

def find_tags(parsed):
    newdata = BeautifulSoup(parsed, 'html.parser')
    tags = {}
    for tag in newdata.find_all():
        if tag.name in tags:
            tags[tag.name]+=1
        else:
            tags[tag.name]=1
    return tags

tags = find_tags(parsed[0])
print(tags)

pool = concurrent.futures.ProcessPoolExecutor(max_workers=1)
tags = pool.map(find_tags, parsed)
print(tags)

In [None]:
#Finding common words 
#focus on words with at least 5 characters
#focus on words that happen at least several times
import re
from collections import Counter

def get_word_counts(parsed)
    newdata = BeautifulSoup(parsed, 'html.parser')
    text = newdata.get_text()
    text = re.sub("\W+", " ", text.upper())
    words = text.split(" ")
    words = [w for w in words if len(w) >= 5]
    count_words = {}
    for w in words:
        if w in count_words:
            count_words[w]+=1
        else:
            count_words[w]=1
    return Counter(words)


start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)
words = pool.map(get_word_counts, parsed)
words = list(words)

count_words = {}
for w in words:
    for word, cnt in w:
        if word in count_words:
            count_words[word] = cnt
        count_words[word] += cnt
end = time.time()
print(end - start)
print(count_words.most_common(10))




# Suggested Solution
https://github.com/dataquestio/solutions/blob/master/Mission227Solutions.ipynb

# Next Steps
We've done some basic analysis of the data, but there's still quite a bit more depth to go into:

What tags have the most content inside of them?
What articles are most commonly linked to from our articles?
What phrases are the most common?
What's the distribution of letters per word? How many 3 letter words are there? 4 letter?
What's the average reading level of a Wikipedia article? You can calculate this with readability metrics.
What images are most commonly shown in articles?
Most problems in data engineering are also fundamentally scale problems. The more data you have, the harder it is to process, and the more tradeoffs you have to make. It would be useful to download more articles, so that scale becomes more of a problem. Can you work with 1 gigabyte of articles, and have a reasonably fast processing methods? How about 10 gigabytes?

We recommend creating a Github repository and placing this project there. It will help other people, including employers, see your work. As you start to put multiple projects on Github, you'll have the beginnings of a strong portfolio. You're welcome to keep working on the project here, but we recommend downloading it to your computer using the download icon above and working on it there.

Readability tests, readability formulas, or readability metrics are formulae for evaluating the readability of text, usually by counting syllables, words, and sentences. Readability tests are often used as an alternative to conducting an actual statistical survey of human readers of the subject text (a readability survey). Word processing applications often have readability tests built-in, which can be deployed on documents in-editing.

The application of a useful readability test protocol will give a rough indication of a work's readability, with accuracy increasing when finding the average readability of a large number of works. The tests generate a score based on characteristics such as statistical average word length (which is used as an unreliable proxy for semantic difficulty) and sentence length (as an unreliable proxy for syntactic complexity) of the work.

In [None]:
#Find tags with the most content

def get_tag_content(parsed)
    newdata = BeautifulSoup(parsed, 'html.parser')
    tag_content = {}
    for tag in newdata.find_all():
        #not exact, but rough enough (counts # of characters)
        if tag in tag_content:
            tag_content[tag]+=len(tag.text.rstrip())
        else:
            tag_content[tag]=len(tag.text.rstrip())
    return tag_content
            
start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)
count_tag_content = pool.map(get_tag_content, parsed)
count_tag_content = list(count_tag_content)

cnt_tag_content = {}
for item in count_tag_content:
    for tag, val in item:
        if tag in cnt_tag_content:
            cnt_tag_content[tag]+=val
        else:
            cnt_tag_content[tag]=val
        
print(cnt_tag_content.most_common(10))


In [None]:
# articles most commonly liked to from other articles
#this is in the a href partition, title values
def most_linked_articles(parsed):
    newdata = BeautifulSoup(parsed, 'html.parser')
    temp_links = newdata.find_all("a href")
    #pprint(soup.find_all(href=re.compile("my_url"))) # all links contains key "my_url"
    #pprint(soup.find_all(attrs={'href' : re.compile("elsie"), 'id': 'link1'})) 
    # url name contains elsie and have id = link1

In [None]:
#What are the most common phrases

def extract_sentences(html):
    clean = []
    brs_in_a_row = 0
    temp = ""
    for item in raw_text.contents:
        if item.name == "br":
            brs_in_a_row = brs_in_a_row + 1
        else:
            temp = temp + item
        if brs_in_a_row == 2:
            clean.append(temp)
            temp = ""
            brs_in_a_row = 0
    return clean

import nltk

def most_common_phrases(parsed):
    clean_phrases = []
    text = newdata.get_text()
    #text = re.sub("\W+", " ", text.upper())
    text = text.upper()
    sentences = text.split(".").rstrip(" ")

    
tokens = nltk.word_tokenize("I want to be in an n gram")
#this looks at words of 5 strung together per sentence
nltk.util.ngrams(tokens, 5)
        words = nltk.word_tokenize(sent)
        for phrase in nltk.util.ngrams(words, length):
            phrase_counter[bphrase] += 1


most_common_phrases = phrase_counter.most_common(50)
for k,v in most_common_phrases:
    print '{0: <5}'.format(v), k

In [1]:
# Distribution of letter (length) per word

def count_word_length(parsed):
    text = newdata.get_text()
    text = re.sub("\W+", " ", text.upper())
    words = text.split(" ")
    length_words = {}
    for w in words:
        if len(words) in length_words:
            length_words[len(words)] += 1
        else:
            length_words[len(words)]=1
    return length_words

start = time.time()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=3)
count_word_length = pool.map(count_word_length, parsed)
count_word_length = list(count_word_length)

word_lengths = []
for item in count_word_length:
    for cnt, val in item:
        if cnt in word_lengths:
            word_lengths[cnt]+=val
        else:
            word_lengths[cnt]=val

%matplotlib inline            
y_pos = np.arange(len(word_lengths))
plt.bar(y_pos, word_lengths, color="green")
plt.xticks(y_pos, word_lengths.index)
plt.ylabel("Number of Words")
plt.title("Number of Words with a Given Length")
plt.show()

NameError: name 'time' is not defined

# Average reading level of a wikipedia article

The application of a useful readability test protocol will give a rough indication of a work's readability, with accuracy increasing when finding the average readability of a large number of works. The tests generate a score based on characteristics such as statistical average word length (which is used as an unreliable proxy for semantic difficulty) and sentence length (as an unreliable proxy for syntactic complexity) of the work.

In [2]:
# word lengths and sentence lengths (i.e number of words per sentence)

In [None]:
# Images most commonly shown in articles
import imghdr

def most_common_images(parsed):
    newdata = BeautifulSoup(parsed, 'html.parser')
    temp_links = newdata.find_all(itemprop="image")
    #here we need to extract type (png?)
    for i in item_links:
    imghdr.what(temp_links)
    
    link = soup.find(itemprop="image")
    print(link["src"])