In [71]:
from urllib import request, parse
from bs4 import BeautifulSoup
from google.cloud import datastore
import time

In [112]:
def getNewsHeadlines(ABN, supplierName, timeout=30):
    """Scrape Google's news search results page for a supplier name.

    Pauses for 5 seconds, requests the Google search URL built from
    ``supplierName``, and collects every anchor tag that carries the
    ``_PMs`` CSS class and an ``href`` attribute.

    Args:
        ABN: Value copied unchanged into the 'ABN' field of each result.
        supplierName: Search phrase; URL-encoded with ``quote_plus`` before
            being placed in the query string.
        timeout: Seconds to wait for the HTTP request before it is aborted.

    Returns:
        A list of dicts with the keys 'ABN', 'Headline' and 'URL', one per
        matching anchor. The list is empty when no anchor matches.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for a
            non-success status) or times out.
    """
    print('Waiting 5 seconds then scraping news headlines for {}'.format(supplierName))
    time.sleep(5)
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest"
    }
    url = 'https://www.google.com/search?q=' + parse.quote_plus(supplierName) + '&tbm=nws&source=lnt&tbs=sbd:1&sa=X&biw=1368&bih=807&dpr=2'

    req = request.Request(url, headers=header)
    # Use the response as a context manager so the connection is always
    # released, and bound the wait so one stalled request cannot hang the run.
    with request.urlopen(req, timeout=timeout) as response:
        page = response.read()

    soup = BeautifulSoup(page, "html.parser")

    # NOTE(review): "_PMs" is presumably the class Google used for headline
    # links when this was written; an empty result may just mean the markup
    # changed. href=True skips anchors without a link target, which would
    # otherwise raise KeyError below.
    links = soup.find_all("a", class_="_PMs", href=True)

    return [
        {'ABN': ABN, 'Headline': link.get_text(), 'URL': link['href']}
        for link in links
    ]

In [72]:
# Load every entity of kind 'suppliers' from Cloud Datastore, ordered by Name.
client = datastore.Client()
collection = 'suppliers'

query = client.query(kind=collection)
query.order = ['Name']

results = list(query.fetch())
supplier_count = len(results)

# Reduce each fetched entity to the two fields the scraper needs. The fetched
# objects are already Entity instances, so the key name and the 'Name'
# property are read from them directly; wrapping them in a second
# datastore.Entity(...) would pass the entity as the *key* of a new, empty
# entity.
suppliers = [
    {'ABN': entity.key.name, 'Name': entity.get('Name')}
    for entity in results
]

In [120]:
def saveToCloud(data, collection, client=None):
    """Store the first headline of ``data`` in Cloud Datastore.

    The entity is keyed by ``(collection, ABN)``, so a later call for the same
    ABN overwrites the earlier record.

    NOTE(review): only ``data[0]`` is persisted; any further headlines in
    ``data`` are dropped. That matches the existing stored schema (one
    'Headline'/'URL' pair per ABN) — confirm whether all headlines should be
    kept before changing it.

    Args:
        data: Sequence of dicts with the keys 'ABN', 'Headline' and 'URL'.
        collection: Datastore kind to write the entity to.
        client: Optional ``datastore.Client`` to reuse. A new client is
            created when omitted.

    Returns:
        None. Nothing is written when ``data`` is empty or None.
    """
    # Guard before touching Datastore: indexing data[0] on an empty list would
    # raise IndexError.
    if not data:
        print('No headlines to save')
        return

    if client is None:
        client = datastore.Client()

    first = data[0]
    record = datastore.Entity(key=client.key(collection, first['ABN']))
    record['ABN'] = first['ABN']
    record['Headline'] = first['Headline']
    record['URL'] = first['URL']

    client.put(record)
    print('Saved headlines for ABN: {}'.format(record.key.name))

In [125]:
# Scrape each supplier in turn; persist the result when anything was found,
# otherwise report that the search came back empty.
for s in suppliers:
    supplier_headlines = getNewsHeadlines(s['ABN'], s['Name'])

    if not supplier_headlines:
        print('No headlines for {} {}'.format(s['ABN'], s['Name']))
        continue

    saveToCloud(supplier_headlines, 'supplier_headlines')
        

Waiting 5 seconds then scraping news headlines for 4DATA Hall 1 Pty Ltd (trading as 4DATA)
Saved headlines for ABN: 51137899346
Waiting 5 seconds then scraping news headlines for A&A Testing Consultants Pty Ltd
Saved headlines for ABN: 95127484904
Waiting 5 seconds then scraping news headlines for ACSPRO Pty Ltd
Saved headlines for ABN: 49800667911
Waiting 5 seconds then scraping news headlines for AGIS Group Pty Ltd
No headlines for 34129384032 AGIS Group Pty Ltd
Waiting 5 seconds then scraping news headlines for ASG Group Ltd
Saved headlines for ABN: 57070045117
Waiting 5 seconds then scraping news headlines for ASPL Australia Pty Ltd
No headlines for 83123668940 ASPL Australia Pty Ltd
Waiting 5 seconds then scraping news headlines for Access Testing Pty Ltd trading as AccessHQ
No headlines for 13069942552 Access Testing Pty Ltd trading as AccessHQ
Waiting 5 seconds then scraping news headlines for AccessibilityOz Pty Ltd
No headlines for 52150446521 AccessibilityOz Pty Ltd
Waiting 5

Saved headlines for ABN: 97109189059
Waiting 5 seconds then scraping news headlines for Facet Consulting Pty Ltd
No headlines for 93103238759 Facet Consulting Pty Ltd
Waiting 5 seconds then scraping news headlines for Finite Group APAC Pty Ltd
Saved headlines for ABN: 43085406300
Waiting 5 seconds then scraping news headlines for Finkisoft Pty Ltd
No headlines for 98127371064 Finkisoft Pty Ltd
Waiting 5 seconds then scraping news headlines for Foresight IT Consulting Pty Ltd
Saved headlines for ABN: 82119675204
Waiting 5 seconds then scraping news headlines for Frontier Group Australia Pty Ltd
Saved headlines for ABN: 77087743879
Waiting 5 seconds then scraping news headlines for Fuji Xerox Australia Pty Ltd
Saved headlines for ABN: 63000341819
Waiting 5 seconds then scraping news headlines for Fujitsu Australia Ltd
Saved headlines for ABN: 19001011427
Waiting 5 seconds then scraping news headlines for Funnelback Pty Ltd
No headlines for 34116105296 Funnelback Pty Ltd
Waiting 5 seconds

Saved headlines for ABN: 72106208066
Waiting 5 seconds then scraping news headlines for Peoplebank Australia Ltd
Saved headlines for ABN: 42003995748
Waiting 5 seconds then scraping news headlines for Perocin Pty Ltd
No headlines for 15079494518 Perocin Pty Ltd
Waiting 5 seconds then scraping news headlines for Phoenix Management Services Pty Ltd
Saved headlines for ABN: 11151887802
Waiting 5 seconds then scraping news headlines for Powerdata Group Consulting Pty Ltd
No headlines for 15107160770 Powerdata Group Consulting Pty Ltd
Waiting 5 seconds then scraping news headlines for Precision Consulting Corporation Pty Limited
No headlines for 91131039506 Precision Consulting Corporation Pty Limited
Waiting 5 seconds then scraping news headlines for Predicate Partners Pty Ltd
No headlines for 34128711348 Predicate Partners Pty Ltd
Waiting 5 seconds then scraping news headlines for Procurement Professionals Pty Ltd
Saved headlines for ABN: 20076432212
Waiting 5 seconds then scraping news h

Saved headlines for ABN: 92104128001
