In [1]:
import re
import requests
import urllib
import networkx as nx
from bs4 import BeautifulSoup
from typing import List
from networkx.algorithms.shortest_paths.generic import shortest_path

In [2]:
def links_from_text(text):
    soup = BeautifulSoup(text)
    content = soup.find("div", {"id": "mw-content-text"})
    
    links = content.find_all("a")
    re_rul = re.compile(":[А-ЯA-Z]+")
    for link in links:
        href = link.get('href', '')
        if href.startswith('/wiki'):
            wiki_link = urllib.parse.unquote(href).split("#")[0]
            if not re.findall(re_rul, wiki_link):
                yield wiki_link 

In [3]:
def queue_func(G, start_url, end_url, queue):
    success = []
    urls_to_queue = []
    edges_to_add = []
    uniq_urls = []
    for node in queue:
        if node in visited_urls:
            continue
        visited_urls.append(node)
        urls = []
        for link in links_from_text(requests.get(node).text):
            urls.append("https://ru.wikipedia.org" + link)
        uniq_urls.extend(set(urls))
        print(f'{node} has {len(uniq_urls)} children')
        
        for url in uniq_urls:
            edges_to_add.append((node, url))

    print(f'adding {len(uniq_urls)} nodes to graph')
    G.add_nodes_from(uniq_urls)
    print(f'adding {len(edges_to_add)} edges to graph')
    G.add_edges_from(edges_to_add)

    for node in uniq_urls:
        if node != end_url:
            urls_to_queue.append(node)
        else:
            success = shortest_path(G, start_url, node)
            return urls_to_queue, success
    return urls_to_queue, success

In [7]:
def wiki_walk(start, end):
    if start == end:
        return "start_url and end_url must be different"
    G = nx.Graph()
    global visited_urls
    visited_urls = []
    urls_to_q, success = queue_func(G, start, end, [start])
    while urls_to_q and not success:
        urls_to_q, success = queue_func(G, start, end, urls_to_q)
    return success

In [8]:
start_url = "https://ru.wikipedia.org/wiki/Улица_Руту"
end_url = "https://ru.wikipedia.org/wiki/Языки_мира"

In [10]:
wiki_walk(start_url, end_url)

https://ru.wikipedia.org/wiki/Улица_Руту has 9 children
adding 9 nodes to graph
adding 9 edges to graph
https://ru.wikipedia.org/wiki/Пийскопи has 11 children
https://ru.wikipedia.org/wiki/Тоом-Кооли has 30 children
https://ru.wikipedia.org/wiki/Курьер has 225 children
https://ru.wikipedia.org/wiki/Кесклинн has 345 children
https://ru.wikipedia.org/wiki/Таллин has 1173 children
https://ru.wikipedia.org/wiki/Ваналинн has 1271 children
https://ru.wikipedia.org/wiki/Эстонский_язык has 1458 children
https://ru.wikipedia.org/wiki/Вышгород_(Таллин) has 1610 children
https://ru.wikipedia.org/wiki/Старый_город_(Таллин) has 1783 children
adding 1783 nodes to graph
adding 7906 edges to graph


['https://ru.wikipedia.org/wiki/Улица_Руту',
 'https://ru.wikipedia.org/wiki/Эстонский_язык',
 'https://ru.wikipedia.org/wiki/Языки_мира']