In [49]:
from urllib.request import urlopen
from urllib.parse import urljoin
from html.parser import HTMLParser
import re
import operator


In [50]:
class Collector(HTMLParser):
    'collects hyperlink URLs into a list'

    def __init__(self, url):
        'initializes parser, the url, and a list'
        HTMLParser.__init__(self)
        self.url = url
        self.links = []
        self.data = []

    def handle_starttag(self, tag, attrs):
        'collects hyperlink URLs in their absolute format'
        if tag == 'a':
            for attr in attrs:
                if attr[0] == 'href':
                    # construct absolute URL
                    absolute = urljoin(self.url, attr[1])
                    # collect HTTP URLs from cdm.depaul.edu pages and limit responses to active server pages
                    if absolute[:26] == 'https://www.cdm.depaul.edu' and absolute[-4:] == 'aspx':
                        self.links.append(absolute)
                        
    
    def getLinks(self):
        'returns hyperlinks URLs in their absolute format'
        return self.links
    

In [51]:
def getContent(url):
    '''takes in url and returns the content as a string'''
    response = urlopen(url)
    htmlSource = response.read()    
    response.close()
    content = htmlSource.decode().lower()
    return content
    

In [52]:
def getLinks(url):
    '''gets links from content'''
    linkContent = getContent(url)
    collector = Collector(url)
    collector.feed(linkContent)
    return collector.getLinks()
    

In [53]:
def wordDict(content):
    '''goes through content and pulls out all text then modifies the word dictionary'''
    global wDict
    pList = []
    paragraphs = re.findall(r'<p>(.*?)</p>',str(content))
    for p in paragraphs:
        pList += p.split(' ')
    headers = re.findall(r'<h2>(.*?)</h2>',str(content))
    for h in headers:
        pList += h.split(' ')

    for x in pList:
        if x in wDict:
            wDict[x] += 1
        elif x not in wDict:
            wDict[x] = 1


In [55]:
#declaring variables
wDict = {}
linksGathered, linksCalled, linksErr = [], [], []

#gets content from first link then gathers all links from that page
baseurl = 'https://www.cdm.depaul.edu'
links = getLinks(baseurl)
linksCalled.append(baseurl)
wordDict(getContent(baseurl))

for link in links:
    if link not in linksGathered:
        linksGathered.append(link)

#recursively runs the functions for all links gathered
#handles for errors with web page
for url in linksGathered:
    if url not in linksCalled:
        try:
            links = getLinks(url)
            linksCalled.append(url)
            wordDict(getContent(url))
        except:
            linksErr.append(url)
        for link in links:
            if link not in linksGathered:
                linksGathered.append(link)

top15 = sorted(wDict.items(), key=operator.itemgetter(1), reverse=True)[:25]
print(top15)

https://www.cdm.depaul.edu/prospective%20students/pages/mastersdegreestudents.aspx
https://www.cdm.depaul.edu/academics/pages/mfa-in-creative-producing.aspx
https://www.cdm.depaul.edu/academics/pages/windrider-forum-at-sundance.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-ms-in-cybersecurity-computer-security.aspx
https://www.cdm.depaul.edu/current%20students/pages/labsandresources/lab658-9.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-ma-in-animation-animator.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-mfa-in-film-and-television.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-ms-is-it-enterprise-management.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-ms-in-predictive-analytics-computational.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-ms-in-predictive-analytics-health-care.aspx
https://www.cdm.depaul.edu/academics/pages/current/requirements-ms-in

https://www.cdm.depaul.edu/academics/old/old/2013fall-%20editing/ms_se_projectmanagement.aspx
https://www.cdm.depaul.edu/academics/old/old/2013fall-%20editing/ms_se_gamingandentertainment.aspx
https://www.cdm.depaul.edu/academics/old/old/2013fall-%20editing/ms-se-entrepreneurship.aspx


In [56]:
'''
Below is the output of my script. I limited the links to active server pages (.aspx)
It checks 831 links and takes ~7 minutes to run.

[('the', 5546),
 ('and', 3489),
 ('of', 3135),
 ('to', 2894),
 ('in', 2720),
 ('a', 2575),
 ('students', 2187),
 ('for', 1593),
 ('is', 1292),
 ('or', 1277),
 ('with', 1098),
 ('be', 1002),
 ('will', 993),
 ('are', 982),
 ('course', 974),]
 '''



748
90
831
{'the': 5546, 'next': 18, 'generation': 16, 'of': 3135, 'visual': 45, 'storytellers,': 3, 'taught': 23, 'by': 347, 'professional': 108, 'filmmakers,': 3, 'utilizing': 5, 'latest': 15, 'technology': 90, 'and': 3489, 'a': 2575, '32,000': 4, 'sq.': 3, 'ft.': 3, 'production': 72, 'facility': 10, 'innovative': 27, 'nationally': 4, 'recognized': 4, 'programs,': 11, 'inspiring': 4, 'teachers,': 5, 'strong': 26, 'connections': 8, 'with': 1098, 'industry,': 16, 'drawing': 6, 'on': 635, 'cutting': 8, 'edge': 8, 'research': 224, 'established': 25, 'emerging': 20, 'design': 153, 'disciplines': 10, 'focusing': 6, 'collaboration,': 4, 'critical': 30, 'thinking,': 7, 'research,': 11, 'play,': 6, 'socio-cultural': 3, 'problem': 5, 'solving': 8, 'community': 23, 'endless': 3, 'possibilities': 3, 'ipd': 114, 'offers': 46, '<a': 1665, 'href="/ipd/programs/pages/programsofstudy.aspx">intensive': 3, 'certificate': 78, 'programs</a>': 4, 'designed': 91, 'for': 1593, 'it': 407, 'professionals': 41