In [None]:
import certifi
import urllib3
from bs4 import BeautifulSoup as bs
import pprint as pp
import os
from multiprocessing import Pool
import json
from numpy.random import choice


# Web scraping

In [None]:
# Shared connection pool; TLS certificates are verified against certifi's CA bundle.
http = urllib3.PoolManager(
    ca_certs=certifi.where(),
    cert_reqs='CERT_REQUIRED',
)

In [None]:
# Local cache layout: everything that gets scraped is stored below dataDir.
dataDir = 'data'
urlListFName = dataDir + '/urls.txt'
if not os.path.exists(dataDir):
    os.makedirs(dataDir)

## wineonline.com.au

In [None]:
# wineonline scrape settings: the site root and the cache locations for this domain
domain = "http://www.wineonline.com.au"
dataDirThisDomain = '%s/wineonline' % dataDir
woAllTextFname = dataDirThisDomain + '/all.txt'

In [None]:
# Build the list of individual wine URLs by walking the paginated category listings.
# The list is cached in urlListFName, so the site is only crawled when no cache exists.
if os.path.exists(urlListFName):
    print("Skipping fetching from the search results pages")
else:
    individualPages = []

    # one paginated listing per wine category
    for wine_type in ['sparkling','white-wine','red-wine','imported']:
        print("Start of wine_type %s" % wine_type)
        page = 0
        while True:
            url = '%s/%s/?sort=alphaasc&page=%d' % (domain, wine_type, page)
            r = http.request('GET',url)
            if r.status != 200:
                # any non-200 answer ends the walk through this category
                print("end of %s at %d" % (wine_type, page))
                break
            print("Hit: " + url)
            soup = bs(r.data.decode('utf-8'), 'html.parser')
            for a in soup.select('.ProdHeading a'):
                individualPages.append(a.get('href'))
            page += 1
    # drop duplicate links (the original ordering is not kept)
    individualPages = list(set(individualPages))
    print("Writing list of urls to %s" % urlListFName)
    with open(urlListFName,'w') as f:
        f.write('\n'.join(individualPages))

# Whether freshly crawled or not, the list is always loaded back from the cache file.
print("Reading in from %s" % urlListFName)
with open(urlListFName,'r') as f:
    individualPages = [entry for entry in (line.strip() for line in f.read().split('\n')) if entry != '']
# To experiment on a subset, slice the list here, e.g. individualPages[:10]
print("done")

In [None]:
# Keep only the final URL segment of every product link (trailing slashes ignored).
paths = [page.rstrip('/').rsplit('/', 1)[-1] for page in individualPages]
assert(len(paths) == len(individualPages))
print("done")

In [None]:

# scrape all the wines in the list

def fetchPath(path):
    """Return the HTML of one wineonline product page, downloading it only if not cached.

    The page is cached at <dataDirThisDomain>/bottles/<path>/html.html. Non-breaking
    spaces are replaced by plain spaces before the page is saved.

    Parameters:
        path: path of the product page, appended to `domain`.

    Returns:
        The page HTML as a string. When there is no cached copy and the download
        answers with a status other than 200, an empty list is returned instead.
    """
    url = domain.rstrip('/') + '/' + path.lstrip('/')
    dirName = '%s/bottles/%s' % (dataDirThisDomain,path)
    # exist_ok avoids failing if the folder appears between a check and the creation
    os.makedirs(dirName, exist_ok=True)
    htmlFName = '%s/%s' % (dirName,'html.html')
    if not os.path.exists(htmlFName):
        print("Fetching %s" % url)
        r = http.request('GET',url)
        if r.status != 200:
            print("Error, status %d for url %s" % (r.status,url))
            return([])
        html = r.data.decode('utf-8').replace(u'\xa0', u' ')
        print("Saving to %s" % htmlFName)
        # The page was decoded as UTF-8, so store it as UTF-8 instead of relying on
        # the platform default encoding (which may not be able to represent it).
        with open(htmlFName,'w',encoding='utf-8') as f:
            f.write(html)
    else:
        print("Hit cache for %s" % htmlFName)
    with open(htmlFName,'r',encoding='utf-8') as f:
        html = f.read()
    return(html)
    

# Fetch every product page with 20 worker processes.
# Nothing is fetched when the combined text file already exists.
if os.path.exists(woAllTextFname):
    print("Skipping, because I'll read from cache")
else:
    with Pool(20) as workers:
        htmls = workers.map(fetchPath,paths)
    print("done")

In [None]:
# Collect the description paragraphs from every downloaded product page.
if not os.path.exists(woAllTextFname):
    paras = []
    # Name the parser explicitly (as the other cells do) so the result does not depend
    # on which optional parsers happen to be installed. Entries that are empty are
    # skipped: fetchPath returns an empty list for pages that failed to download.
    soups = [bs(html, 'html.parser') for html in htmls if html]
    for soup in soups:
        print('next soup')
        paras += soup.select('#ProductDescription p')

    print("done")
else:
    print("Skipping, because I'll read from cache")

In [None]:
# Blank out short pieces of text nested inside the description paragraphs:
# any span/strong/p element whose single string has fewer than 15 words is emptied.
if os.path.exists(woAllTextFname):
    print("Skipping, because I'll read from cache")
else:
    for para in paras:
        for tag in ('span','strong','p'):
            for el in para.find_all(tag):
                content = el.string
                if not content:
                    continue
                if len(content.replace('\n',' ').split(' ')) < 15:
                    content.replace_with('')

In [None]:
# Join the cleaned paragraphs into one text and cache it; then load the cached text.
if not os.path.exists(woAllTextFname):
    text = '\n'.join([para.get_text() for para in paras]).replace(u'\xa0',' ')
    # add a space after sentence punctuation, then collapse the double space this
    # creates wherever a space was already present
    for c in '.?!':
        text = text.replace(c,c + ' ').replace(c + '  ',c + ' ')
    # Scraped text can contain non-ASCII characters: write and read it as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(woAllTextFname,'w',encoding='utf-8') as f:
        f.write(text)

with open(woAllTextFname,'r',encoding='utf-8') as f:
    allTextWO = f.read()
    


## nakedwines.com.au

In [None]:
# nakedwines scrape settings: site root and cache folder for this domain.
# NOTE(review): the earlier note here said the page size 210 was found by trial and
# error and that the site fails when the number is too big, but it also named 210 as
# the failing value; the search URL in getSearchResults uses pageSize=20 - confirm
# which page size is intended.
domain = 'https://www.nakedwines.com.au'
dataDirThisDomain = '%s/nakedwines' % dataDir

In [None]:
def getSearchResults(searchPage):
    """Fetch one page of the nakedwines wine search and return its product links.

    Parameters:
        searchPage: page number, placed in the pageNum query parameter.

    Returns:
        A list with the href of every 'a.card__header' element on the page
        (empty when the page has none).

    Raises:
        AssertionError: when the response status is not 200.
    """
    query = "/wines/index?searchText=&sortWines=descprice&pageSize=20&view=Wines&layoutType=card&allWines=true&pageNum=%d"
    url = domain + (query % searchPage)
    print("Fetching " + url)
    r = http.request('GET',url)
    assert(r.status == 200)
    print("Hit: " + url)
    soup = bs(r.data.decode('utf-8'), 'html.parser')
    links = []
    for anchor in soup.select('a.card__header'):
        links.append(anchor.get('href'))
    print("Found %d results for searchPage %d" % (len(links),searchPage))
    return(links)



# Collect the product page paths for nakedwines; they are cached in paths.txt.
fname = dataDirThisDomain + '/paths.txt'
if not os.path.exists(dataDirThisDomain):
    os.makedirs(dataDirThisDomain)

if not os.path.exists(fname):
    print("No cache found, fetching from search results on " + domain)
    # walk the result pages from 0 upwards until one comes back empty
    searchPage = 0
    individualPages = []
    while True:
        results = getSearchResults(searchPage)
        individualPages.extend(results)
        if len(results) == 0:
            break
        searchPage += 1
    with open(fname,'w') as f:
        f.write('\n'.join(individualPages))

# The list is always read back from the file, which also checks that a fresh
# crawl was written correctly.
print("Reading urls from cache")
with open(fname,'r') as f:
    individualPages = [entry for entry in map(str.strip, f.read().split('\n')) if entry != '']

print("Using urls:")
print('\n'.join(individualPages[:3] + ['...']))
print("Discovered %d individual pages" % len(individualPages))

    


In [None]:

def fetchPath(path):
    """Download one nakedwines product page, save its HTML, and return its description text.

    NOTE: this definition replaces the `fetchPath` defined earlier in the notebook
    for wineonline.

    Parameters:
        path: site-relative path of the product page; it is appended to `domain`.

    Returns:
        The text of the element with id "tab-tick", with non-breaking spaces replaced
        by spaces and a space added after each '.', '?' and '!'.

    Raises:
        AssertionError: when the response status is not 200 or the page has no
            element with id "tab-tick".
    """
    url = domain + path
    print("Fetching " + url)
    r = http.request('GET',url)
    assert(r.status == 200)
    print("Hit: " + url)
    html = r.data.decode('utf-8')
    fname = dataDirThisDomain.rstrip('/') + '/' + path.lstrip('/') + '.html'
    directory = fname.rsplit('/', 1)[0]
    # This function runs in parallel worker processes whose pages can share a folder.
    # A separate "exists?" check followed by makedirs can fail with FileExistsError
    # when two workers race; exist_ok makes the creation safe.
    os.makedirs(directory, exist_ok=True)
    # The page was decoded as UTF-8, so store it as UTF-8 rather than in the
    # platform default encoding.
    with open(fname,'w',encoding='utf-8') as f:
        f.write(html)
    soup = bs(html, 'html.parser')
    textEls = soup.find(id="tab-tick")
    assert(textEls)
    text = textEls.get_text().replace(u'\xa0',' ').strip()
    # add a space after sentence punctuation, then collapse the double space this
    # creates wherever a space was already present
    for c in '.?!':
        text = text.replace(c,c + ' ').replace(c + '  ',c + ' ')
    return(text)

# Fetch every nakedwines description with 20 worker processes and cache the combined
# text; afterwards the text is always loaded from the cache file.
fname = '%s/all.txt' % dataDirThisDomain
if not os.path.exists(fname):
    print("No cache, fetching")
    with Pool(20) as p:
        texts = p.map(fetchPath,individualPages)
    # Scraped text can contain non-ASCII characters: write and read it as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(fname,'w',encoding='utf-8') as f:
        f.write('\n'.join(texts))
else:
    print("hit cache")

with open(fname,'r',encoding='utf-8') as f:
    allTextNW = f.read()


print("done")

## combining web scrapes

In [None]:
# One corpus: the wineonline text followed by the nakedwines text, newline-separated.
allText = '\n'.join([allTextWO, allTextNW])

In [None]:
def toWords(s):
    """Split a string into a list of words, where punctuation is a 'word' of its own.

    The text is split on spaces (newlines and tabs count as spaces). Leading
    brackets / apostrophes, trailing brackets and surrounding quote characters are
    stripped from each chunk. A trailing ',', '.', '?', '!' or ';' is split off and
    emitted as a separate word. A chunk consisting only of such a punctuation mark
    yields just the mark (no empty word is emitted in front of it).

    Finally, a word is lower-cased when its lower-case form also occurs among the
    words of the same text (e.g. 'Here' becomes 'here' if 'here' is present).

    Parameters:
        s: the text to split.

    Returns:
        The list of words, in order of appearance.
    """
    punctuation = ',.?!;'
    words = []
    for chunk in s.replace('\n',' ').replace('\t',' ').split(' '):
        chunk = chunk.lstrip('([{\'').rstrip(')]}').strip('`‘""')
        if chunk.strip() == '':
            continue
        if chunk[-1] in punctuation:
            stem = chunk[:-1]
            # a bare punctuation mark has no stem; do not emit an empty word for it
            if stem != '':
                words.append(stem)
            words.append(chunk[-1])
        else:
            words.append(chunk)

    vocabulary = set(words)
    return([w.lower() if w.lower() in vocabulary else w for w in words])

def testToWords():
    """Check toWords against a handful of hand-written examples.

    Every case is (input, expected words, report). `report` selects what is printed
    when the result differs, just before the assertion fails: 'pretty' prints the
    expected and the actual words, 'plain' prints the actual words, None prints nothing.
    """
    cases = [
        ('At the dawn', ['At','the','dawn'], None),
        ('Of a  new  age\n', ['Of','a','new','age'], None),
        ('At the dawn, of  a\nnew \n age!', ['At','the','dawn',',','of','a','new','age','!'], 'pretty'),
        ('Here is here', ['here','is','here'], None),
        ('Here is (bracket stuff)', ['Here','is','bracket','stuff'], 'plain'),
        ('Here is "a quote"', ['Here','is','a','quote'], None),
        ("Ha! That's funny", ['Ha','!',"That's",'funny'], None),
    ]
    for s, expected, report in cases:
        actual = toWords(s)
        if expected != actual:
            if report == 'pretty':
                print("expected: ")
                pp.pprint(expected)
                print("actual: ")
                pp.pprint(actual)
            elif report == 'plain':
                print(actual)
        assert(expected==actual)

testToWords()
print("test passed")

In [None]:
# Split the combined corpus into words.
allTextSplit = toWords(allText)
# Show the size and a short sample only: printing the whole list floods the notebook.
print("%d words" % len(allTextSplit))
print(allTextSplit[:50])

# Single Layer Chain

In [None]:
# Single-layer transition counts: dataOne[prevWord][thisWord] is the number of times
# thisWord directly followed prevWord. The empty string is used as the key for
# "start of a sentence", i.e. for words that follow a '.'.
dataOne = {}
for i in range(1,len(allTextSplit)):
    if i % 10000 == 0:
        print('...')
    thisWord = allTextSplit[i]
    prevWord = '' if allTextSplit[i-1] == '.' else allTextSplit[i-1]
    followers = dataOne.setdefault(prevWord, {})
    followers[thisWord] = followers.get(thisWord, 0) + 1

# remove single character sentences: a sentence may not open with these marks
for c in '?!.;':
    dataOne[''].pop(c, None)
print("done")
dataOne

In [None]:
# Save the single-layer transition table as JSON in the data folder.
fname = dataDir + '/dataOne.json'
with open(fname,'w') as f:
    f.write(json.dumps(dataOne))

In [None]:
def weightedRandom(dist):
    """Draw one key of `dist` at random, with probability proportional to its value.

    Parameters:
        dist: a non-empty dictionary; the keys are the candidates to choose from,
            the values are their weights.

    Returns:
        One of the keys of `dist`. The draw is made over key positions, so the key
        object itself is returned as stored in the dictionary.

    Raises:
        AssertionError: when `dist` is empty.
    """
    entries = list(dist.keys())
    assert(len(entries) > 0)
    scale = sum(dist[e] for e in entries)
    probabilities = [float(dist[e])/scale for e in entries]
    draw = choice(range(len(entries)), 1, p=probabilities)[0]
    return(entries[draw])

In [None]:
def generateSentenceOne():
    """Generate one sentence from the single-layer chain stored in `dataOne`.

    The first word is a weighted draw from `dataOne['']` (sentence openers). Further
    words are drawn from the successors of the latest word until the latest word
    is found in the string '?!.;'. The first word is capitalised and the words are
    joined with spaces, except that no space is put before a word found in the
    string '-.,?;!'. A result shorter than 10 characters is discarded and a new
    sentence is generated instead.
    """
    words = [weightedRandom(dataOne[''])]
    while words[-1] not in '?!.;':
        words.append(weightedRandom(dataOne[words[-1]]))

    opener = words[0]
    if len(opener) == 1:
        sentence = opener.upper()
    else:
        sentence = opener[0].upper() + opener[1:]
    for follower in words[1:]:
        if follower not in '-.,?;!':
            sentence += ' '
        sentence += follower

    if len(sentence) >= 10:
        return(sentence)
    # too short, try again
    return(generateSentenceOne())

In [None]:
# Ten sample sentences from the single-layer chain
list(generateSentenceOne() for _ in range(10))

# N Layer chain

In [None]:
# Number of preceding words each transition of the N-layer chain is keyed on.
numLayers = 2
assert numLayers > 0

In [None]:
# Cut allTextSplit into sentences, each one a list of words. A word found in the
# string '?!.;-' closes the current sentence and stays in it as its last word.
sentences = [[]]
for word in allTextSplit:
    current = sentences[-1]
    current.append(word)
    if word in '?!.;-':
        sentences.append([])


In [None]:
# Build the N-layer tables from the sentences:
#   dataNStart[firstN]     -> number of sentences that open with the word tuple firstN
#   dataN[prevWords][word] -> number of times word followed the word tuple prevWords
# solo counts the first sighting of a word after an already known tuple,
# many counts every repeated sighting.
dataNStart = {}
dataN = {}
solo = 0
many = 0
for sentence in sentences:
    if len(sentence) <= numLayers:
        # not long enough to hold numLayers words plus a following word
        continue

    # the start of the sentence
    firstN = tuple(sentence[:numLayers])
    dataNStart[firstN] = dataNStart.get(firstN, 0) + 1

    # every later word, keyed by the numLayers words in front of it
    for i in range(numLayers,len(sentence)):
        thisWord = sentence[i]
        prevWords = tuple(sentence[i-numLayers:i])
        followers = dataN.get(prevWords)
        if followers is None:
            dataN[prevWords] = {thisWord:1}
        elif thisWord in followers:
            followers[thisWord] += 1
            many += 1
        else:
            followers[thisWord] = 1
            solo += 1


assert(all(type(k) == tuple for k in dataN.keys()))
print("solo: %d" % solo)
print("many: %d" % many)
assert(all(k in dataN for k in dataNStart))


In [None]:
print("Starting")
# Pruning has to be repeated, which makes it a little messy:
# every word for which filter(word) returns True is removed,
# then every path that leads only to such a word,
# then every path that leads only to those paths, and so on.

def filter(word):
    """Return True when `word` must be removed from the chain, False otherwise.

    Removed are the listed first names and their possessive form (case-sensitive),
    and, ignoring case, a few words the source sites mention a lot.
    """
    names = ['Ben','James','Tyson','Steve','Andrew','Campbell','Jen','Margaret','Nigel', 'Kim','James-Paul','Gary']
    blockedNames = []
    for name in names:
        blockedNames += [name, name + "'s"]
    if word in blockedNames:
        return(True)
    siteWords = [
        'angel',"angel's","angels",'naked',  # Naked Wines mentions these a lot
        'points',                            # wineonline mentions these
    ]
    return(word.lower() in siteWords)

def getToPrune(startData,mainData):
    """Work out what one pruning pass has to remove from the chain tables.

    Parameters:
        startData: dictionary keyed by word tuples (the sentence openings).
        mainData: dictionary mapping a word tuple to a dictionary of
            {next word: count}.

    Returns:
        A dictionary {'start': set, 'count': int, 'main': {'leafs': dict, 'keys': set}}:
        - 'start': keys to remove from startData, because they contain a filtered
          word or do not occur as a key of mainData;
        - 'main'/'leafs': for a key of mainData, the set of next words to remove
          from its inner dictionary, because the word is filtered or because
          choosing it leads to a tuple that is not a key of mainData (unless the
          word is found in the string '.!?');
        - 'main'/'keys': keys of mainData whose inner dictionary is empty or is
          about to lose all of its entries;
        - 'count': the number of removals recorded above; 0 means nothing to prune.
    """
    startKeys = set(
        words for words in startData
        if any(filter(word) for word in words) or words not in mainData
    )
    count = len(startKeys)
    print("Populated ret[start] with %d" % count)

    leafs = {}
    keys = set()
    for prevWords, nextWords in mainData.items(): # prevWords is a tuple of words
        for nextWord in nextWords:
            # the tuple the chain would be keyed on after choosing nextWord
            newChain = (prevWords + (nextWord,))[1:]
            if filter(nextWord) or ((newChain not in mainData) and (nextWord not in '.!?')):
                leafs.setdefault(prevWords, set()).add(nextWord)
                count += 1

        # Remove the key itself when its inner dictionary is already empty or when
        # every entry of it was just marked for removal. (A separate branch for the
        # empty case is not needed: it could never be reached after this test.)
        doomed = leafs.get(prevWords, set())
        if len(nextWords) == 0 or len(doomed) == len(nextWords):
            keys.add(prevWords)
            count += 1

    return({'start': startKeys, 'count': count, 'main': {'leafs': leafs, 'keys': keys}})
    
print("Starting pruning")
# Apply pruning passes until a pass finds nothing left to remove.
toPrune = getToPrune(dataNStart,dataN)
while toPrune['count'] > 0:
    # sentence openings
    startDoomed = toPrune['start']
    assert(all(k in dataNStart for k in startDoomed))
    dataNStart = {k:v for k,v in dataNStart.items() if k not in startDoomed}

    # single next words inside the main table
    for k, leafDoomed in toPrune['main']['leafs'].items():
        assert(k in dataN)
        assert(all(j in dataN[k] for j in leafDoomed))
        dataN[k] = {j:n for j,n in dataN[k].items() if j not in leafDoomed}

    # whole keys of the main table
    keyDoomed = toPrune['main']['keys']
    assert(all(k in dataN for k in keyDoomed))
    dataN = {k:v for k,v in dataN.items() if k not in keyDoomed}
    toPrune = getToPrune(dataNStart,dataN)

# Keys left with exactly one possible next word versus all other keys
solo = sum(1 for k in dataN if len(dataN[k]) == 1)
many = len(dataN) - solo

print("Solo: %d" % solo)
print("Many: %d" % many)

print("done")

In [None]:
def capitalise(word):
    """Return `word` with its first character upper-cased and the rest unchanged.

    The empty string is returned unchanged.
    """
    # Slicing (instead of indexing word[0]) keeps this safe for the empty string.
    return(word[:1].upper() + word[1:])

assert(capitalise('a') == 'A')
assert(capitalise('hello') == 'Hello')
assert(capitalise('A') == 'A')
assert(capitalise('Hello') == 'Hello')
assert(capitalise('') == '')
    
def generateSentenceMany():
    """Generate one sentence from the N-layer chain (`dataNStart` and `dataN`).

    The opening words are a weighted draw from `dataNStart`. Each further word is a
    weighted draw from `dataN`, keyed on the last `numLayers` words, until the last
    word is found in the string '!?.'. The first word is capitalised and the words
    are joined with spaces, except that no space is put before a word found in the
    string ',.!?;'. Only sentences of more than 5 and fewer than 25 words are
    returned; otherwise a new sentence is generated.
    """
    chain = list(weightedRandom(dataNStart))
    while chain[-1] not in '!?.':
        context = tuple(chain[-numLayers:])
        candidate = weightedRandom(dataN[context])
        assert(type(candidate) == type(''))
        chain.append(candidate)

    pieces = [capitalise(chain[0])]
    for token in chain[1:]:
        pieces.append(token if token in ',.!?;' else ' ' + token)
    sentence = ''.join(pieces)

    if 5 < len(chain) < 25:
        return(sentence)
    # bad length, try again
    return(generateSentenceMany())

# Ten sample sentences from the N-layer chain, separated by blank lines
print('\n\n'.join(generateSentenceMany() for _ in range(10)))