In [44]:
import urllib.request
from bs4 import BeautifulSoup as bs
import re

class Page:
    '''
    represents a single wikipedia page together with the pages it links to

    constructing a Page whose tier is below the stop tier downloads the
    article and recursively constructs a Page for every article link found
    inside its <p> elements, so construction cost grows quickly with
    (stop - tier); pages are not cached, so an article linked from several
    places is downloaded once per occurrence
    '''

    _BASE_URL = 'https://en.wikipedia.org/wiki/'  # every article lives at _BASE_URL + subject
    _LINK_PREFIX = '/wiki/'  # relative href prefix used by links to other articles

    def __init__(self, mySubj, myTier, myStop):
        '''
        constructor for page class
        @param mySubj is a string representing the desired subject (the last URL segment)
        @param myTier is the depth of this page (the caller passes 0 for the root)
        @param myStop is the first tier that is NOT crawled; pages at or beyond it
               are created with empty link lists and trigger no network access

        raises whatever urllib.request.urlopen raises if a page cannot be fetched
        '''
        self.tier = myTier # the tier of this page
        self.stop = myStop # which tier to not include
        self.subject = mySubj # every wikipedia page has shape 'en.wikipedia.org/wiki/subject'; the subject variable stores the last segment
        self.subList = [] # stores other page objects that are found on this page
        self.strList = [] # stores strings representing the names of the page objects
        self.scoreList = [] # stores floats representing the relevance of each page in subList to this page
        # '<' rather than '!=' so that a tier already past the stop tier can
        # never start an unbounded crawl
        if self.tier < self.stop:
            self._loadLinks()

    @staticmethod
    def _subjectFromHref(url):
        '''
        returns the subject named by an article href, or None if the href is not an article link
        @param url is the raw href string of an <a> tag
        '''
        # hrefs containing ':' are rejected (on wikipedia these are namespaced
        # pages such as 'File:' or 'Help:', and absolute URLs)
        if not url.startswith(Page._LINK_PREFIX) or ':' in url:
            return None
        # drop the '/wiki/' prefix and any '#section' fragment
        subject = url[len(Page._LINK_PREFIX):].partition('#')[0]
        # an empty subject would just re-request the wiki root, so skip it
        return subject or None

    def _loadLinks(self):
        '''
        downloads this page and fills subList / strList from the article links in its paragraphs
        '''
        # the context manager closes the HTTP response even if reading fails
        with urllib.request.urlopen(self._BASE_URL + self.subject) as response:
            html = response.read()
        soup = bs(html, 'lxml')
        for par in soup.find_all('p'):
            for link in par.find_all('a', href=True):
                curSubj = self._subjectFromHref(link.get('href'))
                if curSubj is None:
                    continue
                curPage = Page(curSubj, self.tier + 1, self.stop)
                self.subList.append(curPage)
                self.strList.append(str(curPage))

    def fillScoreList(self):
        '''
        computes the score of every page in subList against this page and stores
        the results in scoreList, in the same order as subList

        the list is rebuilt on every call, so calling this twice does not duplicate scores
        '''
        self.scoreList = [self.getScore(pg) for pg in self.subList]

    def getScoreList(self):
        '''
        getter method for scoreList (empty until fillScoreList has been called)
        '''
        return self.scoreList

    def getScore(self, aPage):
        '''
        returns a float representing the correlation between self and aPage
        @param aPage is another Page object

        score is calculated by len(intersection)/ avg(len(self),len(aPage))
        where intersection counts every entry of self's link list that also
        occurs in aPage's link list; the result lies between 0 and 1 when
        self's link list has no repeated entries and may exceed 1 otherwise
        returns 0.0 when neither page has any links
        '''
        selfList = self.getStrList()
        aList = aPage.getStrList()
        total = len(selfList) + len(aList)
        if total == 0:
            # two link-less pages share nothing; also avoids dividing by zero
            return 0.0
        # set membership keeps the count identical to a list scan but O(n + m)
        otherNames = set(aList)
        intersect = sum(1 for name in selfList if name in otherNames)
        return intersect / (total / 2)

    def getSubList(self):
        '''
        getter method for subList (the Page objects linked from this page)
        '''
        return self.subList

    def getStrList(self):
        '''
        getter method for strList (the subject names of the pages in subList)
        '''
        return self.strList

    def __str__(self):
        '''
        a page is represented by its subject string
        '''
        return self.subject

In [None]:
# build the page rooted at 'mathematics' (tier 0, stop tier 3), score its
# linked pages against it, then print one score per line
aPage = Page('mathematics', 0, 3)
aPage.fillScoreList()
scores = aPage.getScoreList()
for e in scores:
    print(e)

In [None]:
# aPage = Page('science', 0, 3)
# bPage = Page('mathematics', 0, 3)

In [None]:
# print('aPage = ' + str(aPage) + ' | ' + str(len(aPage.getSubList())))
# print('bPage = ' + str(bPage) +  ' | ' + str(len(bPage.getSubList())))

# print('-----------------------------------')

# for pg in aPage.getSubList():
#     print(pg)
    
# print('-----------------------------------')

# for pg in bPage.getSubList():
#     print(pg)

In [None]:
# aPage.getScore(bPage)