In [1]:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np

In [2]:
class Page:
    '''
    Crawls English Wikipedia outward from one start article.

    Constructing a Page immediately runs the crawl (network I/O). Afterwards:
      - nodes    : list of article titles discovered, in discovery order
      - edges    : list of sorted [title, title] pairs, one per discovered node,
                   linking it to the article it was first found on
      - scores   : dict title -> int category-overlap score against the start article
      - mainTags : words of the start article's category names (split on '_')

    Titles are the raw path segment after '/wiki/' exactly as it appears in the
    page's href (no decoding or case normalisation is applied).
    '''

    _WIKI_URL = 'https://en.wikipedia.org/wiki/'
    _ARTICLE_PREFIX = '/wiki/'
    _CATEGORY_PREFIX = '/wiki/Category:'

    def __init__(self, myStart, myStop):
        '''
        @param myStart is a STRING, the title of the start article as used in its URL
        @param myStop is an INT, the tier limit handed to findAll
        '''
        self.start = myStart
        self.stop = myStop
        self.nodes = []
        self.edges = []
        self.scores = {}  # keyed by the titles stored in self.nodes
        self.mainTags = []  # words taken from the start article's category names
        self._startTags = None  # start article's categories, fetched once on first use
        self.findAll(myStop)

    @staticmethod
    def findTags(node):
        '''
        downloads an article and returns the names of the categories it links to
        @param node is a string representing a wikipedia article title
        @return list of category names (the href text after '/wiki/Category:');
                empty list when the page has no category box
        '''
        # 'with' guarantees the HTTP response is closed even if read/decode fails.
        with urllib.request.urlopen(Page._WIKI_URL + node) as response:
            html = response.read().decode('utf-8')

        # Only the category box is handed to the HTML parser. '.' does not match
        # newlines, so the segment ends at the last '<div' on the same line.
        match = re.search(r'<div id="catlinks".*<div', html)
        if match is None:
            return []

        soup = bs(match.group(0), 'lxml')
        prefix = Page._CATEGORY_PREFIX
        myTags = []
        for link in soup.find_all('a', href=True):
            url = link.get('href')
            if url.startswith(prefix):
                myTags.append(url[len(prefix):])
        return myTags

    @staticmethod
    def _scoreTags(tags1, tags2):
        '''
        category-overlap score of two tag lists, as an int percentage
        @param tags1 is a list of category names
        @param tags2 is a list of category names
        @return int(100 * shared / mean(len(tags1), len(tags2))); 0 when both lists are empty
        '''
        total = len(tags1) + len(tags2)
        if total == 0:
            # Neither page has categories: report "no overlap" instead of dividing by zero.
            return 0
        lookup = set(tags2)
        intersect = sum(1 for x in tags1 if x in lookup)
        score = float(intersect) / (total / 2)
        return int(score * 100)

    @staticmethod
    def tagScore(node1, node2):
        '''
        downloads both articles' categories and scores their overlap
        @param node1 is a string representing a wikipedia article title
        @param node2 is a string representing a wikipedia article title
        @return int score, see _scoreTags
        '''
        return Page._scoreTags(Page.findTags(node1), Page.findTags(node2))

    def _getStartTags(self):
        '''
        returns the start article's categories, downloading them only the first time
        '''
        # getattr keeps this usable on instances created without running __init__.
        if getattr(self, '_startTags', None) is None:
            self._startTags = Page.findTags(self.start)
        return self._startTags

    def findAll(self, n):
        '''
        runs the crawl from the start article and fills in mainTags
        @param n is an int; articles on tiers 0 .. n-2 are expanded, so nodes up to
               n-1 links away from the start article are collected
        '''
        startTags = self._getStartTags()
        self.findNext(self.start, 0, n)
        for string in startTags:
            self.mainTags.extend(string.split('_'))

    def findNext(self, center, tier, n):
        '''
        depth-first step of the crawl: records every not-yet-seen article linked
        from center, then recurses into it
        @param center is a string representing a wikipedia article title
        @param tier is an int, how many links center is away from the start article
        @param n is an int, the tier limit (see findAll)
        '''
        if tier + 1 >= n:
            return
        # Fetched once per Page rather than once per discovered node.
        startTags = self._getStartTags()
        for s in self.findTier(center):
            if s in self.nodes:
                continue
            self.nodes.append(s)
            self.scores[s] = Page._scoreTags(startTags, Page.findTags(s))
            self.edges.append(sorted([center, s]))
            self.findNext(s, tier + 1, n)

    def findTier(self, center):
        '''
        finds all nodes 1 branch away from center
        @param center is a string representing a wikipedia article title
        @return list of article titles linked from the page's <p> elements, in page
                order; may contain duplicates
        '''
        with urllib.request.urlopen(Page._WIKI_URL + center) as response:
            html = response.read()
        soup = bs(html, 'lxml')

        prefix = Page._ARTICLE_PREFIX
        nodeList = []
        for par in soup.find_all('p'):
            for link in par.find_all('a', href=True):
                url = link.get('href')
                # Any href containing ':' (e.g. 'File:', 'Help:' pages) is skipped.
                if not url.startswith(prefix) or ':' in url:
                    continue
                # Drop a '#section' fragment so the title names the whole article.
                curSubj = url[len(prefix):].split('#', 1)[0]
                if curSubj:  # '/wiki/#frag' would otherwise yield an empty title
                    nodeList.append(curSubj)
        return nodeList

    def getNodes(self):
        '''returns the list of discovered article titles'''
        return self.nodes

    def getEdges(self):
        '''returns the list of sorted [title, title] edges'''
        return self.edges

    def getMainTags(self):
        '''returns the words of the start article's category names'''
        return self.mainTags

    def getScores(self):
        '''returns the dict mapping discovered title -> overlap score'''
        return self.scores

    def __str__(self):
        return self.start

In [None]:
# Build the link graph for the 'science' article with a stop tier of 2.
# Constructing Page runs the whole crawl (live HTTP requests to Wikipedia),
# so this cell can take a while; afterwards print every article title found.
sci = Page('science', 2)
for elem in sci.getNodes():
    print(elem)

In [4]:
# math = Page('mathematics', 3)

In [5]:
# lit = Page('literature', 3)

In [6]:
# hist = Page('history', 3)

In [7]:
# art = Page('art', 3)

In [8]:
# cs = Page('computer_science', 3)