In [25]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import random



In [21]:
class Page:
    """
    A Wikipedia page, identified by its title.

    Two pages are equal (and hash alike) when their titles are equal, so
    pages can be stored in sets and used as dictionary keys.

    Attributes
    ----------
    title : the page title; also used as the repr/str of the page.
    text : page text, ``None`` until it is fetched.
    links : outgoing links; starts as an empty list so that code iterating
        over it (such as a page walker) never meets a ``None`` placeholder.
    backlinks : pages linking here, ``None`` until it is fetched.
    """
    def __init__(self, title):
        self.title = title
        self.text = None # query to get text
        self.links = [] # manipulate text to get links
        self.backlinks = None # access the 'links to this page' in the tools on the sidebar

    def __repr__(self):
        return self.title

    def __str__(self):
        return self.title

    def __eq__(self, other):
        # Comparing with something that is not a Page must not raise;
        # returning NotImplemented lets Python fall back to its default
        # handling, which yields False for `==`.
        if not isinstance(other, Page):
            return NotImplemented
        return self.title == other.title

    def __hash__(self):
        # Kept consistent with __eq__: equal titles give equal hashes.
        return hash(self.title)
    

In [22]:
class PageWalker:
    """
    Depth-first iterator over the pages reachable from a starting page.

    Each page is yielded at most once. Pages must be hashable and expose a
    ``links`` iterable of further pages.

    Parameters
    ----------
    start_page : the page the walk begins at (yielded first).
    depth : maximum number of link hops from ``start_page`` to follow.
        ``0`` yields only the start page; ``None`` means no limit. The hop
        count of a page is the one at which it was first discovered.
    """
    def __init__(self, start_page, depth):
        self.start_page = start_page
        self.depth = depth
        self.pages = [start_page]
        # The start page counts as visited so a cycle cannot re-queue it.
        self.visited = {start_page}
        # Hop count at which each queued page was discovered.
        self._levels = {start_page: 0}

    def __iter__(self):
        return self

    def __next__(self):
        if len(self.pages) == 0:
            raise StopIteration
        page = self.pages.pop()
        # Pages pushed onto self.pages from outside have no recorded level;
        # treat them as roots.
        level = self._levels.get(page, 0)
        if self.depth is None or level < self.depth:
            for link in page.links:
                # Skip placeholder entries and anything already queued.
                if link is None or link in self.visited:
                    continue
                self.visited.add(link)
                self._levels[link] = level + 1
                self.pages.append(link)
        return page

In [73]:
def get_linked_pages(url, timeout=10):
    """
    Download a page and return the titles of the wiki pages it links to.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10), so a
        stalled connection cannot hang the notebook.

    Returns
    -------
    list of str
        The ``title`` attribute of every ``<a>`` tag that carries both
        ``href`` and ``title`` and whose ``href`` starts with ``/wiki/``,
        in document order, with spaces replaced by underscores.
        Duplicates are kept.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, instead of silently
        scraping the error page.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Raw string: '/' needs no escaping, and '\/' in a normal string literal
    # is an invalid escape sequence that newer Python versions warn about.
    reg = re.compile(r'/wiki/.*')

    return [
        link["title"].replace(' ', '_')
        for link in soup.find_all('a', href=True, title=True)
        if reg.match(link["href"])
    ]
    

def refine_truncate(titles):
    """
    Return the leading part of ``titles`` that precedes the first entry
    starting with ``Category:``.

    If no entry starts with ``Category:``, a full copy of ``titles`` is
    returned. The input is not modified.
    """
    category_pattern = re.compile('Category:.*')
    # Position of the first category entry, or the length when there is none.
    cutoff = next(
        (position for position in range(len(titles))
         if category_pattern.match(titles[position])),
        len(titles),
    )
    return titles[:cutoff]

def refine_remove_colon(titles):
    """
    Remove every title that contains a colon, in place.

    Parameters
    ----------
    titles : list of str
        Titles to filter; the list is modified in place.

    Returns
    -------
    list of str
        The same list object, now holding only the colon-free titles in
        their original order.

    Notes
    -----
    Any title with a ``:`` is dropped, whatever the text before the colon is.
    """
    # Rebuild the contents in one pass. Removing elements while iterating
    # over the same list skips the element after each removal.
    titles[:] = [title for title in titles if ':' not in title]
    return titles
    
    

Load a DataFrame containing the 1,000 most-visited articles:

In [65]:
# Read only the "article" column of the top-1000 CSV; the path is relative
# to the directory the notebook is run from.
articles = pd.read_csv('../top1000.csv', usecols=["article"])

In [84]:
def random_walk(n, curr, path):
    """
    Take a random walk of ``n`` steps over linked articles.

    At each step the links of the current page are fetched with
    ``get_linked_pages`` and the next page is drawn uniformly from those
    linked titles that also appear in the module-level ``articles`` frame.
    A title linked several times is proportionally more likely to be drawn.

    Parameters
    ----------
    n : int
        Number of steps to take; values below 1 take no step.
    curr : str
        Title of the page to start from.
    path : list
        List the visited titles are appended to; it is modified in place.

    Returns
    -------
    list
        ``path``, extended with the starting title followed by one title
        per step (``n + 1`` new entries).

    Raises
    ------
    ValueError
        If a page links to no title listed in ``articles``.
    """
    # One set lookup per link instead of a linear scan of the frame.
    known_titles = set(articles["article"])

    for _ in range(n):
        path.append(curr)
        # NOTE(review): curr is concatenated unescaped; titles containing
        # '?' or '#' may not resolve to the intended page - confirm.
        links = get_linked_pages("https://en.wikipedia.org/wiki/" + curr)

        candidates = [title for title in links if title in known_titles]
        if not candidates:
            raise ValueError(
                "random walk reached a dead end: no page linked from "
                + repr(curr) + " is in the article list"
            )
        curr = random.choice(candidates)

    path.append(curr)
    return path
    
    

In [94]:
# Pick a random starting article and take a 5-step random walk from it.
# The row position is drawn from the frame's actual length rather than
# assuming the CSV holds exactly 1000 rows.
idx = random.randrange(len(articles))
start = articles['article'].iloc[idx]
path = random_walk(5, start, list())
path

['Clarence_Thomas',
 'American_Civil_War',
 'Wikipedia:Citation_needed',
 'Wikipedia:About',
 'Facebook',
 'Internet']

In [90]:
# Rows of the article list whose title contains a colon.
contain_values = articles.loc[articles['article'].str.contains(':')]
contain_values


Unnamed: 0,article
1,Special:Search
16,File:HispanTv_logo.svg
24,Template:Russo-Ukrainian_War_detailed_map
28,Wikipedia:Featured_pictures
33,Portal:Current_events
46,Spider-Man:_No_Way_Home
56,XXX:_Return_of_Xander_Cage
98,XXX:_State_of_the_Union
141,Help:IPA/English
201,Vikings:_Valhalla
