# Python web scraping 

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import os
import uuid

## Získání obsahu

In [52]:
# Address of the page that the single-page example below scrapes.
URL = "https://en.wikipedia.org/wiki/Penguin"

# requests never times out on its own; give up after 10 seconds instead of
# letting the cell hang forever on a stalled connection.
r = requests.get(URL, timeout=10)

## Zpracování obsahu

In [87]:
def get_content_from_response(response):
    """Extract the title, text content and internal links from a fetched page.

    The page is expected to use MediaWiki markup: the article body is looked
    up as ``div.mw-parser-output`` and the title as the first ``h1``.

    Args:
        response: HTTP response object; its ``text`` and ``url`` attributes
            are read.

    Returns:
        A dict with the keys:
            "url": the URL of the response,
            "title": text of the first ``h1`` ("" when there is none),
            "content": list of ``(tag_name, text)`` tuples for the top-level
                paragraphs and headings of the article body, in page order,
            "links": list of ``href`` values found in the article body that
                start with "/wiki/" and contain no ":".
    """
    # Only real heading tags; a bare startswith("h") would also match "hr".
    heading_tags = ("h1", "h2", "h3", "h4", "h5", "h6")

    def handle_p(element):
        # Paragraphs consisting of a lone newline carry no content.
        if element.text == "\n":
            return []
        # Drop <sup> elements (removed from the tree) before reading the text.
        for item in element.find_all("sup"):
            item.decompose()
        clean_text = element.text.replace("\n", "")
        return [(element.name, clean_text)]

    def handle_h(element):
        # Remove the "mw-editsection" spans so they do not end up in the text.
        for item in element.find_all("span", {"class": "mw-editsection"}):
            item.decompose()
        return [(element.name, element.text)]

    # Parse the response that was passed in, not a module-level variable.
    tree = BeautifulSoup(response.text, "html.parser")
    main = tree.find("div", {"class": "mw-parser-output"})
    title_tag = tree.find("h1")
    outcome = {
        "url": response.url,
        "title": title_tag.text if title_tag is not None else "",
    }

    content = []
    links = []
    if main is not None:
        # content: only direct children of the article body are considered;
        # everything that is not a paragraph or heading is ignored.
        for element in main.find_all(recursive=False):
            if element.name == "p":
                content += handle_p(element)
            elif element.name in heading_tags:
                content += handle_h(element)

        # links: collected after the content pass, i.e. after the <sup>
        # elements inside top-level paragraphs have been removed.
        for link in main.find_all("a"):
            href = link.attrs.get("href")
            if href and ":" not in href and href.startswith("/wiki/"):
                links.append(href)

    outcome["content"] = content
    outcome["links"] = links
    return outcome
       
       
# Parse the page fetched above; `data` is the dict with "url", "title",
# "content" and "links" that the export helpers below consume.
data = get_content_from_response(r)

## Ukládání obsahu

In [106]:
def content2txt(data):
    """Render scraped page data as plain text.

    Args:
        data: dict with a "title" string and a "content" list of
            ``(tag_name, text)`` items.

    Returns:
        The title followed by the text of every content item, one per line
        (no trailing newline).
    """
    # Use the `data` parameter; the name `content` is not defined here.
    return "\n".join([data["title"]] + [item[1] for item in data["content"]])

def content2md(content):
    """Render scraped page data as Markdown.

    Args:
        content: dict with a "title" string and a "content" list of
            ``(tag_name, text)`` items.

    Returns:
        A string starting with the title as a level-1 heading. Items tagged
        ``h<N>`` become headings with N ``#`` characters; every other item is
        written as a plain line of text. Each line ends with a newline.
    """
    lines = ["# " + content["title"]]
    for item in content["content"]:
        tag, text = item[0], item[1]
        level = tag[1:]
        if tag.startswith("h") and level.isdecimal():
            lines.append("#" * int(level) + " " + text)
        else:
            # Paragraphs, plus tags such as "hr" whose suffix is not a number
            # and would otherwise make int() raise ValueError.
            lines.append(text)
    return "\n".join(lines) + "\n"
    
def content2html(content):
    """Render scraped page data as an HTML document.

    The template is read from ``data/template1.html``; its ``[[BODY]]``
    placeholder is replaced with the generated markup and ``[[TITLE]]`` with
    the page title.

    Args:
        content: dict with a "title" string and a "content" list of
            ``(tag_name, text)`` items.

    Returns:
        The filled-in template as a string.
    """
    # Imported locally so the function does not depend on a module-level
    # name `html`.
    from html import escape

    with open(os.path.join("data", "template1.html"), "r") as f:
        template = f.read()

    # The text is plain text taken from a web page; escape it so characters
    # like "<" and "&" cannot break or inject markup.
    title = escape(content["title"])
    parts = ["<h1>" + title + "</h1>"]
    for item in content["content"]:
        parts.append("<{0}>{1}</{0}>".format(item[0], escape(item[1])))
    body = "".join(parts)
    return template.replace("[[BODY]]", body).replace("[[TITLE]]", title)

def save_data(data, extension, func):
    """Convert scraped page data with `func` and write it to OUTPUT_DIR.

    The file name is built from the page title, a ``uuid.uuid1()`` value and
    the given extension.

    Args:
        data: dict with at least a "title" string; passed unchanged to `func`.
        extension: file extension without the leading dot (e.g. "html").
        func: callable that takes `data` and returns the text to write.
    """
    to_write = func(data)
    # Titles may contain path separators or other characters that are not
    # allowed in file names (e.g. "AC/DC"); replace them with underscores.
    safe_title = "".join(
        "_" if ch in '\\/:*?"<>|' else ch for ch in data["title"]
    )
    filename = safe_title + "_" + str(uuid.uuid1()) + "." + extension
    # Create the target directory on first use instead of failing.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    savepath = os.path.join(OUTPUT_DIR, filename)
    # Explicit UTF-8 so non-ASCII page text does not depend on the platform
    # default encoding.
    with open(savepath, "w", encoding="utf-8") as f:
        f.write(to_write)
    
# Directory that save_data() writes the exported files into.
OUTPUT_DIR = "output"

# Export the scraped page once per output format, HTML first, then Markdown.
for extension, converter in (("html", content2html), ("md", content2md)):
    save_data(data, extension, converter)

## Automatizace

In [111]:
# Breadth-first crawl: start from one article, follow the internal links
# found on every fetched page, and export each page as HTML and Markdown.
HOST = "https://en.wikipedia.org"
OUTPUT_DIR = "output"

# Upper bound on queue entries processed (fetched or skipped) in one run.
MAX_ITERATIONS = 11
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

slugs = ["/wiki/Penguin", ]
visited = []

session = requests.session()


idx = 0
while slugs:
    slug = slugs.pop(0)
    url = HOST + slug
    if slug not in visited:
        visited.append(slug)
        r = session.get(url, timeout=REQUEST_TIMEOUT)
        data = get_content_from_response(r)
        # Every link of the page is queued; duplicates are filtered out
        # when they reach the front of the queue.
        slugs += data["links"]
        save_data(data, "html", content2html)
        save_data(data, "md", content2md)
        print(r.status_code, data["title"], data["url"])
        # Random pause between requests.
        sleep_time = random.random() * 2 + 0.3
        time.sleep(sleep_time)
    else:
        # Report the entry that was skipped, not the page fetched before it.
        print("XXX", slug, url)

    idx += 1
    if idx >= MAX_ITERATIONS:
        break


print("\n")
print("Visited: ", len(visited))
print("In queue: ", len(slugs))

        

200 Penguin https://en.wikipedia.org/wiki/Penguin
200 Penguin (disambiguation) https://en.wikipedia.org/wiki/Penguin_(disambiguation)
200 Paleocene https://en.wikipedia.org/wiki/Paleocene
200 Year https://en.wikipedia.org/wiki/Megaannum
200 Precambrian https://en.wikipedia.org/wiki/Precambrian
200 Cambrian https://en.wikipedia.org/wiki/Cambrian
200 Ordovician https://en.wikipedia.org/wiki/Ordovician
200 Silurian https://en.wikipedia.org/wiki/Silurian
200 Devonian https://en.wikipedia.org/wiki/Devonian
200 Carboniferous https://en.wikipedia.org/wiki/Carboniferous
200 Permian https://en.wikipedia.org/wiki/Permian


Visited:  11
In queue:  4057
