In [91]:
from bs4 import BeautifulSoup
import urllib, random, re, string
import cssutils
from pprint import pprint
import numpy as np
import pandas as pd
from pandas.io.formats.style import Styler

In [2]:
 
def visible(element):
    #if element.parent.name in [ 'script', '[document]', 'head', 'title','style']:
    if element.parent.name in ['[document]', 'script', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element)):
        return False
    return True

In [3]:
def isAboutPage(element):
    if 'about' in element:
        return True
    return False

In [4]:
def websiteText(url):
    htmltext = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(htmltext)
    texts = soup.findAll(text=True)
    visible_texts = filter(visible, texts)
    result = filter(lambda x: x != '\n', visible_texts)
    articleResult = ""
 
    for text in result:
        articleResult += str(text.encode("utf-8"))
 
    articleResult = str(articleResult)
    #articleResult = BeautifulSoup(articleResult, convertEntities=BeautifulSoup.HTML_ENTITIES)
 
    return articleResult

In [5]:
def getAllHrefs(url):
    hrefs = []
    html_page = urllib.request.urlopen(url)
    soup = BeautifulSoup(html_page)
 
    for link in soup.findAll('a'):
        href = link.get('href')
 
        if href[0] == '/' :
            href = url + href
        hrefs.append(href)    
 
    return hrefs

# Semantic Text

In [6]:
def getAboutPages(url):
    allHrefs = getAllHrefs(url)
 
    aboutPages = filter(isAboutPage, allHrefs)
    return aboutPages
        

# Style Text

In [24]:
def isProperHex(t):
    return bool(re.search('[A-Fa-f0-9]{6}', t))


In [45]:
def onlyColour(t):
    startPos = t.find(':')
    return t[startPos+1:startPos+8]

In [7]:
def getStylePages(url):
    htmltext = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(htmltext)
    cssSheets = [link["href"] for link in soup.findAll('link') if 'stylesheet' in link.get('rel', [])]
    
    return [sheet for sheet in cssSheets if ('//' not in sheet or 'https://' in sheet)] 

In [96]:
def getFonts(url, top=3):
    
    fonts = []
    cssPagesSample = getStylePages(url)
    cssPages = list(cssPagesSample)
    
    for cssPage in cssPages:
        if cssPage[0:4]=='http':
            styleSample = cssPage
        else:
            styleSample = url+cssPage
            
        cssText = urllib.request.urlopen(styleSample).read()
        soup = BeautifulSoup(cssText)
        cssTextRaw = soup.findAll(text=True)
        
        if cssTextRaw == []:
            break
        
        for styletag in str(cssTextRaw[0]).split('}'):
            if 'font-family' in styletag:
                startPos = styletag.find('font-family')
                endPos = styletag.find(';',startPos)
                fonts.append(styletag[startPos:endPos])
                
    
    fonts = np.array(fonts)
    unique, counts = np.unique(fonts, return_counts=True)
    
    df = pd.DataFrame(np.asarray((unique, counts)).T,columns=['Font','Count'])
    df = df.sort_values(by='Count',ascending=False).head(top)
    
    return df

In [92]:
def getBackgroundColors(url, top=3, renderColor = True):
    
    backgroundColors = []
    cssPagesSample = getStylePages(url)
    cssPages = list(cssPagesSample)
    
    #print(cssPages)
    
    for cssPage in cssPages:
        if cssPage[0:4]=='http':
            styleSample = cssPage
        else:
            styleSample = url+cssPage
            
        cssText = urllib.request.urlopen(styleSample).read()
        soup = BeautifulSoup(cssText)
        cssTextRaw = soup.findAll(text=True)
        
        #print(cssTextRaw )
        
        if cssTextRaw == []:
            break
        for styletag in str(cssTextRaw[0]).split('}'):
            if 'background-color' in styletag:
                startPos = styletag.find('background-color')
                endPos = styletag.find(';',startPos)
                backgroundColors.append(styletag[startPos:endPos])
    
    bgColors = np.array(backgroundColors)
    unique, counts = np.unique(bgColors, return_counts=True)
    
    df = pd.DataFrame(np.asarray((unique, counts)).T,columns=['Colour','Count'])
    df['IsProper'] = df['Colour'].apply(isProperHex)
    df = df[df['IsProper']==True]
    df['Colour'] = df['Colour'].apply(onlyColour)
    df['Count'] = df['Count'].apply(lambda x : int(x))
    df = df.drop(['IsProper'],axis=1)
    
    df = df.sort_values(by='Count',ascending=False).head(top)
    
    if renderColor:
        return df.style.applymap(lambda x:"background-color: %s"%x, subset=['Colour']) 
    else:
        return df


# Testing Area

In [97]:
getFonts('https://www.stackoverflow.com/')

Unnamed: 0,Font,Count
16,font-family:inherit,8
8,"font-family:Arial,""Helvetica Neue"",Helvetica,s...",7
11,"font-family:Arial,""Helvetica Neue"",Helvetica,s...",6


In [93]:
getBackgroundColors('https://www.stackoverflow.com/',3)

Unnamed: 0,Colour,Count
210,#eff0f1,32
173,#d6d9dc,9
124,#F48024,9


In [78]:
getBackgroundColors('https://www.sfu.ca/',3)

Unnamed: 0,Colour,Count
25,#a6192e,12
43,#f7c7ce,4
31,#e0d10a,2


In [79]:
getBackgroundColors('https://www.ubc.ca/',3)

Unnamed: 0,Colour,Count
2,#0680a6,4
1,#002145,1
6,#ab1f2e,1


In [80]:
getBackgroundColors('https://www.uwaterloo.ca/',3)

Unnamed: 0,Colour,Count
26,#4e4e4e,16
71,#e4b429,5
34,#787878,4


In [88]:
getBackgroundColors('https://www.amazon.ca',5)

Unnamed: 0,Colour,Count
64,#f3f3f3,18
11,#232f3e,6
18,#444C55,3
8,#19222d,3
9,#232F3E,3


In [84]:
getBackgroundColors('https://www.dell.com/en-ca',3)

Unnamed: 0,Colour,Count


In [89]:
getBackgroundColors('https://www.bestbuy.ca',3)

Unnamed: 0,Colour,Count
3,#001e7,51
17,#e0e6e,40
19,#f4f6f,16


In [None]:
Hm.. Why does bestbuy use only 5 characters for their hex codes? Is that a different standard?

In [95]:
getBackgroundColors('https://www.worksafebc.com',5)

Unnamed: 0,Colour,Count
70,#f2cf5,16
46,#E6ECf,8
160,#f5f5f5,7
49,#F7F8F,6
102,#337ab7,6


In [98]:
getFonts('https://www.sfu.ca/')

Unnamed: 0,Font,Count
25,font-family:'DINWeb',5
16,"font-family:""DINWebLight"",sans-serif",4
41,"font-family:DINWeb,sans-serif",3


In [99]:
getFonts('http://www.ubc.ca')

Unnamed: 0,Font,Count
3,"font-family:Whitney SSm A,Whitney SSm B,Arial,...",3
4,"font-family:Whitney SSm A,Whitney SSm B,Arial,...",2
5,font-family:fontawesome,2


In [10]:
for a in aboutPages:
    print('')
    print('')
    print(a)
    print('')
    
    print(websiteText(a))



https://uwaterloo.ca/about/

b'[if lt IE 7]><div id="ie6message">Your version of Internet Explorer web browser is insecure, not supported by Microsoft, and does not work with this web site. Please use one of these links to upgrade to a modern web browser: <a href="http://www.mozilla.org/firefox/">Firefox</a>, <a href="http://www.google.com/chrome">Google Chrome</a>, <a href="http://windows.microsoft.com/en-US/internet-explorer/products/ie/home">Internet Explorer</a>.</div><![endif]'b' 'b'Skip to main'b'Skip to footer'b'University of Waterloo'b'Admissions'b'About Waterloo'b'Faculties & academics'b'Offices & services'b'Support Waterloo'b'Search'b'Menu'b'This site'b'About Waterloo home'b'Who we are'b'Facts'b'History'b'Leadership and governance'b'Rankings'b'Our differentiators'b'Learning'b'Entrepreneurship'b'Co-op and experiential learning'b'Research'b'Reports'b'Accountability'b'Contact us'b'UWaterloo'b'Admissions'b'About Waterloo'b'Faculties & academics'b'Offices & services'b'Support Wa

HTTPError: HTTP Error 404: Not Found