In [1]:
import wikipedia as wp
import pydot

def ascii(inp):
    return str(inp.encode("ascii",errors="ignore"))

class WikiScraper:
    def __init__(self, startpage,maxbreadth=10):
        self.startpage=startpage
        self.maxbreadth=maxbreadth
        self.maxdepth=0

        self.visited = set()

        self.graph=pydot.Dot()

    def connect(self,parent,children):
        self.visited.add(parent)
        for child in children:
            edge=pydot.Edge(ascii(parent),ascii(child))
            self.graph.add_edge(edge)

    def pickLinks(self,page,n):
        """Pick `n` links from page."""
        links=page.links
        indices=range(1,len(links),len(links)/n)
        return [links[i] for i in indices]

    def explore(self,pagename,depth):
        #Return if we've exceeded max depth.
        if depth==self.maxdepth:
            return
        #Return if we've already visited a page
        if pagename in self.visited:
            return

        try:
            page=wp.page(pagename)
        except wp.exceptions.DisambiguationError:
            #Return in the event of reaching a disambiguation page
            return
        except wp.exceptions.PageError:
            #We've tried to find a page that doesn't exist
            print "The page {} could not be found".format(pagename.encode("utf-8"))
            return

        print "Exploring \""+pagename.encode("utf-8")+"\" at depth "+str(depth)

        links=self.pickLinks(page,self.maxbreadth)

        self.connect(pagename,links)

        for link in links:
            self.explore(link,depth+1)

    def start(self,maxdepth=0):
        self.maxdepth=maxdepth
        self.explore(self.startpage,1)

if __name__ == "__main__":
    STARTPAGE="Cats"
    BREADTH=5
    MAXDEPTH=5
    w=WikiScraper(STARTPAGE,BREADTH)
    w.start(MAXDEPTH)
    w.graph.write(STARTPAGE+".dot")

Exploring "Cats" at depth 1
Exploring "3-mercapto-3-methylbutan-1-ol" at depth 2
Exploring "3-mercaptohexan-1-ol" at depth 3
Exploring "ChemSpider" at depth 4
Exploring "International Chemical Identifier" at depth 4
Exploring "Preferred IUPAC name" at depth 4




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Exploring "Thiol" at depth 4
Exploring "Cat meat" at depth 2
Exploring "Abalone" at depth 3
Exploring "Abalone viral ganglioneuritis" at depth 4
Exploring "Clockmaker" at depth 4
Exploring "Haliotis sorenseni" at depth 4
Exploring "Nerite" at depth 4
Exploring "Seaweed farming" at depth 4
Exploring "Cat senses" at depth 3
Exploring "Accuracy and precision" at depth 4
Exploring "Cat righting reflex" at depth 4
Exploring "Evolutionary tree" at depth 4
Exploring "List of cat breeds" at depth 4
Exploring "Scottish Fold" at depth 4
Exploring "Visual perception" at depth 4
Exploring "Felinology" at depth 3
Exploring "Aegean cat" at depth 4
Exploring "Cat communication" at depth 4


KeyboardInterrupt: 