In [29]:
import requests

In [30]:
def query_api(base_uri:str, article_name:str, timeout: float = 30):
    """Fetch the parsed form of a wiki page through the ``action=parse`` API.

    Args:
        base_uri: Scheme and host of the wiki, e.g. ``'https://ru.wikipedia.org'``.
        article_name: Page title to request; passed unescaped, `requests`
            takes care of URL encoding.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The decoded JSON body of the response.
    """
    # Send the query as `params` so that titles containing '&', '#', '+' or
    # '?' are percent-encoded rather than corrupting the query string.
    params = {'action': 'parse', 'page': article_name, 'format': 'json'}
    response = requests.get(f'{base_uri}/w/api.php', params=params, timeout=timeout)
    return response.json()

def query_wikipedia(article_name : str):
    """Query the wiki at https://ru.wikipedia.org for `article_name` via `query_api`."""
    return query_api('https://ru.wikipedia.org', article_name)

def query_mediawiki(article_name :str):
    """Query the wiki at https://www.mediawiki.org for `article_name` via `query_api`."""
    return query_api('https://www.mediawiki.org', article_name)

def query_wiki(article_name : str) -> list:
    """Gather the usable API responses for `article_name` from both wikis.

    Wikipedia is asked first, then MediaWiki. A response is kept only when
    it contains a 'parse' key; the kept responses are returned in that order.
    """
    found = []
    for fetch in (query_wikipedia, query_mediawiki):
        payload = fetch(article_name)
        if 'parse' in payload:
            found.append(payload)
    return found

# Demo call: the cell displays the list of responses returned for one article.
query_wiki('Медицина')  

[{'parse': {'title': 'Медицина',
   'pageid': 2703,
   'revid': 139993460,
   'langlinks': [{'lang': 'af',
     'url': 'https://af.wikipedia.org/wiki/Geneeskunde',
     'langname': 'африкаанс',
     'autonym': 'Afrikaans',
     '*': 'Geneeskunde'},
    {'lang': 'als',
     'url': 'https://als.wikipedia.org/wiki/Medizin',
     'langname': 'швейцарский немецкий',
     'autonym': 'Alemannisch',
     '*': 'Medizin'},
    {'lang': 'am',
     'url': 'https://am.wikipedia.org/wiki/%E1%88%95%E1%8A%AD%E1%88%9D%E1%8A%93',
     'langname': 'амхарский',
     'autonym': 'አማርኛ',
     '*': 'ሕክምና'},
    {'lang': 'an',
     'url': 'https://an.wikipedia.org/wiki/Medicina',
     'langname': 'арагонский',
     'autonym': 'aragonés',
     '*': 'Medicina'},
    {'lang': 'ar',
     'url': 'https://ar.wikipedia.org/wiki/%D8%B7%D8%A8',
     'langname': 'арабский',
     'autonym': 'العربية',
     '*': 'طب'},
    {'lang': 'arc',
     'url': 'https://arc.wikipedia.org/wiki/%DC%90%DC%A3%DC%9D%DC%98%DC%AC%DC%90',
 

In [31]:
import bs4

def parse_html_to_soup(response : dict) -> bs4.BeautifulSoup:
    """Parse the HTML stored at response['parse']['text']['*'] with 'html.parser'."""
    return bs4.BeautifulSoup(response['parse']['text']['*'], 'html.parser')

# for response in query_wiki('Пермь'):
#     print(parse_html_to_soup(response))

In [32]:
from urllib.parse import quote_plus

In [35]:
def find_links(soup: bs4.BeautifulSoup) -> list:
    """Collect internal article links from a parsed wiki page.

    Only anchors inside the ``mw-parser-output`` container are considered.
    A link is kept when its ``href`` starts with ``/wiki/``, does not point
    into the ``Файл:`` (file) namespace, and carries a non-blank ``title``.

    Args:
        soup: Parsed page HTML.

    Returns:
        A list of ``{'title': ..., 'href': ...}`` dicts in the order
        ``find_all`` yields the anchors; empty when the page has no
        ``mw-parser-output`` container.
    """
    container = soup.find('div', class_='mw-parser-output')
    if container is None:
        return []

    # MediaWiki leaves the namespace colon unescaped in hrefs, so only the
    # namespace name is percent-encoded. Encoding 'Файл:' as a whole turns
    # the colon into '%3A' and the prefix never matches a real href.
    # Built once here instead of on every loop iteration.
    file_prefix = '/wiki/' + quote_plus('Файл') + ':'

    links = []
    for anchor in container.find_all('a', href=True):
        href = anchor.get('href')
        if not href.startswith('/wiki/') or href.startswith(file_prefix):
            continue
        title = anchor.get('title')
        # Anchors without a usable title are skipped.
        if isinstance(title, str) and title.strip():
            links.append({'title': title, 'href': href})
    return links
# Demo: print every link dict extracted from each wiki response for 'Пермь'.
for response in query_wiki('Пермь'):
    for link in find_links(parse_html_to_soup(response)):
        print(link)

{'title': 'Пермь (значения)', 'href': '/wiki/%D0%9F%D0%B5%D1%80%D0%BC%D1%8C_(%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D1%8F)'}
{'title': 'Молотов (значения)', 'href': '/wiki/%D0%9C%D0%BE%D0%BB%D0%BE%D1%82%D0%BE%D0%B2_(%D0%B7%D0%BD%D0%B0%D1%87%D0%B5%D0%BD%D0%B8%D1%8F)'}
{'title': 'Герб Перми', 'href': '/wiki/%D0%93%D0%B5%D1%80%D0%B1_%D0%9F%D0%B5%D1%80%D0%BC%D0%B8'}
{'title': 'Россия', 'href': '/wiki/%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F'}
{'title': 'Федеративное устройство России', 'href': '/wiki/%D0%A4%D0%B5%D0%B4%D0%B5%D1%80%D0%B0%D1%82%D0%B8%D0%B2%D0%BD%D0%BE%D0%B5_%D1%83%D1%81%D1%82%D1%80%D0%BE%D0%B9%D1%81%D1%82%D0%B2%D0%BE_%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D0%B8'}
{'title': 'Пермский край', 'href': '/wiki/%D0%9F%D0%B5%D1%80%D0%BC%D1%81%D0%BA%D0%B8%D0%B9_%D0%BA%D1%80%D0%B0%D0%B9'}
{'title': 'Городской округ (Россия)', 'href': '/wiki/%D0%93%D0%BE%D1%80%D0%BE%D0%B4%D1%81%D0%BA%D0%BE%D0%B9_%D0%BE%D0%BA%D1%80%D1%83%D0%B3_(%D0%A0%D0%BE%D1%81%D1%81%D0%B8%D1%8F)'}
{'title': '15 мая', 'href':

In [6]:
# MAX_DEEP = 2

# def dive(level: int, article_name:str, dictionary : dict):
#     level += 1
#     if level <= MAX_DEEP:
#         responses = query_wiki(article_name)
#         for response in responses:
#             soup = parse_html_to_soup(response)
#             links = find_links(soup)
#             dictionary[article_name] = links
#             for link in links:
#                 dive(level, link['title'], dictionary)

# def dive_deep(article_name : str) -> dict:
#     result = {}
#     level = 0
#     dive(level, article_name, result)
#     return result


In [7]:
# import graphviz

# def make_dot_deeper(level : int, dot, parent, children, graph : dict):
#     level += 1
#     if level <= MAX_DEEP:
#         for child in children:
#             child_name = child['title']
#             dot.node(child_name)
#             dot.edge(parent, child_name)
#             if child_name in graph and not child_name in dot.body:
#                 make_dot_deeper(level, dot, child_name, graph[child_name], graph)

# def make_dot_graph(article_name : str):
#     dot = graphviz.Digraph()
#     graph = dive_deep(article_name)

#     root = next(iter(graph))
#     children = graph[root]

#     dot.node(root)
#     make_dot_deeper(0,dot, root, children, graph)

#     return dot.unflatten(stagger=10)

# make_dot_graph('Пермь')

In [38]:
from anytree import Node
from anytree.exporter import UniqueDotExporter

import random
# OS-backed random source used by make_tree to pick which links to follow.
# SystemRandom ignores seeding, so the generated trees are not reproducible.
r = random.SystemRandom()
# Depth at which make_tree stops following links.
MAX_HEIGHT = 10

def make_tree(article_name : str, height : int, parent_node, max_childer = 1000):
    """Recursively build an anytree hierarchy of articles linked from `article_name`.

    Args:
        article_name: Title of the article for the node created by this call.
        height: Depth of this node; links are not followed once it reaches
            the module-level `MAX_HEIGHT`.
        parent_node: Node to attach the new node to, or None to start a new
            tree rooted at `article_name`.
        max_childer: Upper bound on the number of links followed per fetched
            response; when it is not an int, every link is followed.

    Returns:
        `parent_node` (the newly created root when None was passed). The
        result is None only for a blank `article_name` with no parent.
    """
    if not article_name.strip():
        return parent_node

    if parent_node is None:
        parent_node = Node(article_name)
        current_node = parent_node
    else:
        current_node = Node(article_name, parent=parent_node)

    # Depth limit reached: keep the node as a leaf. Return the tree rather
    # than None so a root call with height >= MAX_HEIGHT still yields its root.
    if height >= MAX_HEIGHT:
        return parent_node
    height += 1

    for response in query_wiki(article_name):
        links = find_links(parse_html_to_soup(response))
        if isinstance(max_childer, int):
            # Sample without replacement so one link entry is not attached
            # (and its whole subtree re-fetched) several times per page.
            # Clamp at 0 because sample() rejects a negative size.
            sample_size = max(0, min(max_childer, len(links)))
            links = r.sample(links, k=sample_size)
        for link in links:
            make_tree(link['title'], height, current_node, max_childer)

    return parent_node

# .to_picture('output.png')
# Build the link tree for 'Пермь' (max_childer=3), write it as a Graphviz DOT
# file, then render that file to SVG with the `dot` command-line tool.
UniqueDotExporter(make_tree('Пермь', 0, None, 3)).to_dotfile('graph.dot')
!dot ./graph.dot -Tsvg -o graph.svg

1318.87s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [39]:
from anytree.exporter import DotExporter
# Build the link tree for 'Чёрная металлургия' (max_childer=3), write it as a
# Graphviz DOT file and render it to SVG with the `dot` command-line tool.
# NOTE(review): this uses DotExporter rather than UniqueDotExporter; presumably
# nodes sharing an article title then collapse into one DOT node — confirm intended.
DotExporter(make_tree('Чёрная металлургия', 0, None, 3)).to_dotfile('graph3.dot')
!dot ./graph3.dot -Tsvg -o graph3.svg

1341.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
