# Distance à l'article « Philosophie » de Wikipédia

In [34]:
import string
from collections import deque
from urllib.parse import unquote, urlparse

import requests as req
from bs4 import BeautifulSoup

In [35]:
# Crawl configuration.
# Site root (scheme + host); page paths below are appended to it to form URLs.
domainUrl = "https://fr.wikipedia.org"
# Page the search starts from.
sourcePath = "/wiki/Grève_étudiante_pour_le_climat"
# Page whose link distance from the source is being measured.
targetPath = "/wiki/Philosophie"

In [36]:
# Notebook-level setting. computeDistance() has its own `maxDistance` parameter
# that shadows this name, so the function never reads this global.
maxDistance = 2

def computeDistance(domain, source, target, maxDistance=2, timeout=10):
    """Breadth-first search for the link distance between two pages of a site.

    Starting at `source`, each page is fetched over HTTP and its ``<a href>``
    links are followed level by level until a link to `target` is found.

    Parameters
    ----------
    domain : str
        Site root including the scheme (e.g. 'https://fr.wikipedia.org').
        It is prepended to every path to build the URL that is fetched.
    source : str
        Path of the page the search starts from.
    target : str
        Path of the page being looked for. Percent-encoded and decoded
        spellings of the same path are treated as equal.
    maxDistance : int, optional
        Deepest level whose pages are still fetched. Links found on a page at
        that depth are still compared with `target`, so the returned distance
        can be as large as ``maxDistance + 1``.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout`` argument, so a
        stalled server cannot block the crawl forever.

    Returns
    -------
    tuple
        ``(distance, fromPage, visitedPaths)`` where `fromPage` is the path of
        the page on which the link to `target` was found and `visitedPaths`
        maps every dequeued path to ``(depth, parentPath)``.
        ``(-1, '', visitedPaths)`` when the queue empties without a match.
    """
    # `domain` carries a scheme, whereas urlparse().netloc is only the host:
    # compare hosts with hosts. Fall back to the raw value if no scheme given.
    domainHost = urlparse(domain).netloc or domain
    # hrefs are usually percent-encoded while caller-supplied paths may not be;
    # all comparisons are done on the decoded form.
    targetKey = unquote(target)

    searchList = deque()  # (path, depth, parent path) still to fetch
    spottedPaths = set()  # decoded paths already queued, to avoid duplicates
    visitedPaths = {}     # path -> (depth, parent path)
    searchList.append((source, 0, ''))
    spottedPaths.add(unquote(source))

    while searchList:
        path, depth, parent = searchList.popleft()
        visitedPaths[path] = (depth, parent)

        with req.get(domain + path, timeout=timeout) as response:
            contentType = response.headers.get('content-type')
            if response.status_code != 200 \
                    or (contentType is not None and not contentType.startswith('text/html')):
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

        depthLimitHit = False
        for a in soup.find_all('a'):
            href = a.attrs.get('href')
            if not href:
                continue

            try:
                hrefParsed = urlparse(href)
            except ValueError:
                # urlparse rejects some malformed hrefs (e.g. unmatched brackets)
                continue

            # mailto:, javascript:, tel: ... are not pages of the site
            if hrefParsed.scheme not in ('', 'http', 'https'):
                continue
            # link to another host
            if hrefParsed.netloc and hrefParsed.netloc != domainHost:
                continue
            # pure in-page anchor
            if not hrefParsed.path:
                continue

            linkKey = unquote(hrefParsed.path)
            # Exact match: a prefix test would also accept e.g. '<target>_suffix'
            if linkKey == targetKey:
                return depth + 1, path, visitedPaths
            if linkKey in spottedPaths:
                continue

            spottedPaths.add(linkKey)
            if depth < maxDistance:
                searchList.append((hrefParsed.path, depth + 1, path))
            else:
                depthLimitHit = True

        # Report once per page rather than once per skipped link
        if depthLimitHit:
            print("Reached max distance for :", path)

    return -1, '', visitedPaths

In [37]:
# Run the search, exploring pages down to depth 5.
dist, fromPage, visited = computeDistance(
    domainUrl,
    sourcePath,
    targetPath,
    maxDistance=5,
)

In [38]:
# Compute visit path: walk the parent links stored in `visited`
# (path -> (depth, parent path)) back from the page that linked to the target.
visitSequence = []
p = fromPage
# A failed search returns fromPage == '', which is not a key of `visited`;
# the membership test stops the walk instead of raising KeyError.
while p != sourcePath and p in visited:
    visitSequence.append(p)
    p = visited[p][1]

# Collected from the last page backwards; put it in source -> target order
visitSequence.reverse()
visitSeqStr = ','.join(visitSequence)

In [39]:
# computeDistance returns -1 when the target was not reached; report that
# case explicitly instead of printing -1 as if it were a distance.
if dist < 0:
    print("Aucun chemin trouvé vers %s depuis %s (%d pages visitées)" % (targetPath, sourcePath, len(visited)))
else:
    print("La distance à %s depuis %s est de %d, en passant par %s et en visitant %d pages" % (targetPath, sourcePath, dist, visitSeqStr, len(visited)))

La distance à /wiki/Philosophie depuis /wiki/Grève_étudiante_pour_le_climat est de 2, en passant par /wiki/%C3%89ducation_au_d%C3%A9veloppement_durable et en visitant 36 pages


In [40]:
# Display the list of intermediate page paths, ordered from the source side towards the target
visitSequence

['/wiki/%C3%89ducation_au_d%C3%A9veloppement_durable']