In [25]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import random



In [21]:
class Page:
    """
    A Wikipedia page, identified by its title.

    Two pages are equal (and hash identically) when their titles are equal,
    so pages can be used as set members and dict keys.

    Attributes are placeholders that nothing in this class populates yet:
    ``text`` (page text), ``links`` (outgoing links) and ``backlinks``
    (pages linking here).
    """
    def __init__(self, title):
        self.title = title
        self.text = None  # query to get text
        # Starts empty rather than [None] so that iterating over the links
        # of a not-yet-populated page never yields a None "page".
        self.links = []  # manipulate text to get links
        self.backlinks = None  # access the 'links to this page' in the tools on the sidebar

    def __repr__(self):
        return self.title

    def __str__(self):
        return self.title

    def __eq__(self, other):
        # Defer to Python's default handling for non-Page operands instead of
        # raising AttributeError on objects that have no ``title``.
        if not isinstance(other, Page):
            return NotImplemented
        return self.title == other.title

    def __hash__(self):
        # Must agree with __eq__: equal titles -> equal hashes.
        return hash(self.title)
    

In [22]:
class PageWalker:
    """
    Iterator that walks outward from a start page by following ``links``.

    Pages are taken from the end of ``self.pages`` (last added, first
    returned). Each page is queued at most once; ``self.visited`` records
    every page that has ever been queued, including the start page.

    Parameters
    ----------
    start_page : object exposing an iterable ``links`` attribute; must be hashable
    depth : stored on ``self.depth``. NOTE(review): the walk does not
        enforce a depth limit yet -- the intended semantics are not defined
        in this notebook.
    """
    def __init__(self, start_page, depth):
        self.start_page = start_page
        self.depth = depth
        self.pages = [start_page]
        # Seed with the start page so a link back to it is not walked twice.
        self.visited = {start_page}

    def __iter__(self):
        return self

    def __next__(self):
        if not self.pages:
            raise StopIteration
        page = self.pages.pop()
        # ``links`` may be None/empty or contain None placeholders on pages
        # that were never populated; skip those instead of queueing them.
        for link in page.links or ():
            if link is None or link in self.visited:
                continue
            self.visited.add(link)
            self.pages.append(link)
        return page

In [23]:
def get_linked_pages(url):
    """
    Fetch ``url`` and return the titles of the wiki pages it links to.

    Every ``<a>`` tag that has both ``href`` and ``title`` attributes and
    whose ``href`` starts with ``/wiki/`` contributes its ``title`` text,
    with spaces replaced by underscores. The collected list is then passed
    through ``refine`` before being returned.

    NOTE(review): the ``title`` attribute is tooltip text, which is not
    guaranteed to be a valid page name for every link -- confirm whether
    deriving names from ``href`` would be more appropriate.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    reqs = requests.get(url, timeout=10)
    soup = BeautifulSoup(reqs.text, 'html.parser')

    # Prefix test replaces the old '\/wiki\/.*' pattern, whose '\/' was an
    # invalid escape sequence in a non-raw string literal.
    titles = [
        link["title"].replace(' ', '_')
        for link in soup.find_all('a', href=True, title=True)
        if link["href"].startswith('/wiki/')
    ]

    return refine(titles)
    

def refine(titles):
    """
    Return the leading part of ``titles`` that precedes the first entry
    beginning with ``Category:``.

    If no entry begins with ``Category:``, a copy of the whole sequence is
    returned.
    """
    category_pattern = re.compile('Category:.*')
    for position, title in enumerate(titles):
        if category_pattern.match(title):
            # Cut right before the first category entry.
            return titles[:position]
    return titles[:len(titles)]

    

In [24]:
# Load only the "article" column of the CSV; a later cell picks a random
# row from this frame as the starting page of the walk.
articles = pd.read_csv('../top1000.csv', usecols=["article"])

Unnamed: 0,article
0,Main_Page
1,Special:Search
2,2022_Russian_invasion_of_Ukraine
3,Vladimir_Putin
4,The_Batman_(film)
...,...
995,West_Side_Story_(1961_film)
996,Battle_of_Kharkiv_(2022)
997,Patrick_Swayze
998,Sweden


In [56]:
def random_walk(n, curr, path=None):
    """
    Take up to ``n`` random steps through linked pages, starting at ``curr``.

    Parameters
    ----------
    n : number of steps to take; values <= 0 take no steps.
    curr : title of the starting page (appended to the wiki base URL).
    path : list to append visited titles to; a new list is created when
        omitted. The same list object is returned.

    Returns
    -------
    list
        ``path`` with ``curr`` and every subsequently chosen title appended
        in visiting order. It holds fewer than ``n + 1`` new entries when a
        page yields no links, in which case the walk stops early.
    """
    if path is None:
        path = []
    path.append(curr)
    # Iterating avoids unbounded recursion for large or negative n.
    for _ in range(n):
        links = get_linked_pages("https://en.wikipedia.org/wiki/" + curr)
        if not links:
            # Dead end: nothing to choose from (randint(1, 0) used to raise).
            break
        curr = random.choice(links)
        path.append(curr)
    return path
    
    

In [58]:
# Pick a random row position from however many articles were actually
# loaded, rather than assuming the frame has exactly 1000 rows.
idx = random.randrange(len(articles))
start = articles['article'].iloc[idx]
# Walk 5 random steps from the chosen article and display the titles visited.
path = random_walk(5, start, list())
path

['Kharkiv',
 'Pavlo_Lobtsov',
 'Ukrainian_Association_of_Football',
 'Hryhoriy_Surkis',
 'Template:Expand_Ukrainian',
 'This_high-risk_template_is_permanently_semi-protected_to_prevent_vandalism']

In [53]:
# Display the most recently computed walk again.
path