In [68]:
import requests
import os
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/Penguin"
OUT_DIR = "output"

In [20]:
r = requests.get(URL)

r.status_code

200

In [155]:
def get_content_from_response(response):
    html = r.text
    root = BeautifulSoup(html, "html.parser")
    body = root.find("div", {"class": "mw-parser-output"})
    
    data = {
        "content": [["h1", root.find("h1").text], ],
        "title": root.find("h1").text,
        "url": r.url,
        "links": [],
    }
    
    for item in body:
        new_content = []
        if item.name in [None, "div", "table", "ul", "ol", "style", "blockquote", "center", "link", "dl"]:
            pass
        elif item.name == "p":
            if len(item.text) > 2:
                for subitem in item.findAll("sup"):
                    subitem.decompose() 
                new_content = ["p", item.text[:-1]]
        elif item.name.startswith("h"):
            for subitem in item.findAll("span", {"class": "mw-editsection"}):
                subitem.decompose() 
            if not item.text in ["References", "Bibliography", "External links", "See also"]:
                new_content = [item.name, item.text]
        else:
            print(item.name)
            
        if new_content:
            data["content"].append(new_content)

    for link in body.findAll("a"):
        if "href" in link.attrs:
            if link.attrs["href"].startswith("/wiki/") and not ":" in link.attrs["href"]:
                data["links"].append(link.attrs["href"])
                
    return data

    
data = get_content_from_response(r)

# for line in data["content"]:
#     print(line)



In [82]:
def content_to_md(content):
    output = []
    for item in content:
        new_item = "#" * int(item[0][1]) + " " if item[0].startswith("h") else ""            
        output.append(new_item + item[1]) 
    return "\n\n".join(output)

def content_to_html(content):
    output = []
    for item in content:
        new_item = "<{0}>{1}</{0}>".format(item[0], item[1])
        output.append(new_item)
    body = "".join(output)
    with open("data/template1.html", "r") as f:
        template = f.read()
    return template.replace("[[BODY]]", body)
    

output = content_to_html(data["content"])

print(output)

 <!DOCTYPE html>
<html>
    <head>
        <meta charset="UTF-8">
        <title>
        [[TITLE]]
        </title>
        <style>
            html {
                background-color: #999999;
            }
            body {
                width: 80%;
                max-width: 500px;
                border: 1px solid #555555;
                padding: 10px;
                margin: 0px auto;
                background-color: #FFFFFF;
                font-family: Verdana, Geneva, sans-serif;
            }
            img {
                width: 100%;
            }
            p {
                text-align: justify;
            }
        </style>
    </head>

    <body>
        <h1>Penguin</h1><p>Penguins (order Sphenisciformes, family Spheniscidae) are a group of aquatic flightless birds.  They live almost exclusively in the Southern Hemisphere, with only one species, the Galapagos penguin, found north of the equator. Highly adapted for life in the water, penguins have countershade

In [83]:

def save_to_file(data):
    exports = [
        ("md", content_to_md),
        ("html", content_to_html),
    ]
    
    for export in exports:
        filename = data["title"].lower().replace(" ", "_") + "." + export[0]
        path = os.path.join(OUT_DIR, filename)
        text = export[1](data["content"])
        with open(path,"w") as f:
            f.write(text)

save_to_file(data)


In [156]:
BASE_URL = "https://en.wikipedia.org"

urls_queue = {"/wiki/Penguin",}
urls_seen = set()

session = requests.session()

IDX_STOP = 30

while urls_queue:
    slug = urls_queue.pop()
    url = BASE_URL + slug
    urls_seen.add(slug)
    r = session.get(url)
    data = get_content_from_response(r)
    new_links = set(data["links"])
    
    
    save_to_file(data)
    
    urls_queue = urls_queue.union(new_links.difference(urls_seen))
    print(r.status_code, len(urls_queue), data["title"])
    
    IDX_STOP -= 1
    if IDX_STOP <= 0:
        break





200 491 Penguin
200 568 Megapode
200 868 Taxonomy (biology)
200 875 Anthropodyptes
200 936 List of Sphenisciformes by population
200 993 Evolutionary taxonomy
200 1002 Monophyly
200 1020 Monotypic taxon
200 1394 Actinobacteria
200 1474 PubMed
200 1627 Linneus, Maine
200 1626 Merrill, Maine
200 1638 Waigeo brushturkey
200 1840 Chlorophyta
200 1847 Trebouxiophyceae
200 1938 Linnaean taxonomy
200 1958 Barthélemy Charles Joseph Dumortier
200 2157 Little penguin
200 2750 Linus Torvalds
200 2758 The Birds of Australia (Gould)
200 2776 Wrapper library
200 2784 Florencio Utreras
200 2893 Pullen Island (South Australia)
200 2898 Actinobacteridae
200 3028 Extensively drug-resistant tuberculosis
200 3032 Ulotrichales
200 3087 Dromornithidae
200 3100 Alcoota
200 3847 List of fossil sites
200 3985 Omo Kibish Formation


In [132]:
A = {"a", "b", "c", "d", "a"}
B = {"x", "y", "a", "b"}
C = set()

x = A.pop()
C.add(x)
A = A.union(B.difference(C))

print(A)
print(B)
print(C)

{'y', 'x', 'a', 'c', 'b'}
{'a', 'y', 'x', 'b'}
{'d'}


In [136]:
x = A.pop()
C.add(x)
A = A.union(B.difference(C))

print(A)
print(B)
print(C)

{'b'}
{'a', 'y', 'x', 'b'}
{'y', 'x', 'd', 'a', 'c'}
