**ABC News Webcrawler**

Import dependencies

In [105]:
import nltk
import pandas as pd
import numpy as np
import json
from urllib.request import Request, urlopen
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt 
from nltk.stem.porter import PorterStemmer
import seaborn as sns
nltk.download('stopwords')
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Mounted at /content/drive


Helper functions

In [3]:
#frequency of words
def freq_words(x, terms = 30):
  """Plot a horizontal bar chart of the most frequent words in ``x``.

  Args:
    x: iterable of strings. The strings are joined with single spaces and
      split on whitespace; no other normalisation is applied here.
    terms: number of most frequent words to plot (default 30).

  Returns:
    None. The chart is rendered with ``plt.show()``.
  """
  tokens = ' '.join(x).split()
  if not tokens:
    # With no words the 'count' column would be an empty object-dtype column,
    # which DataFrame.nlargest refuses to rank, so stop before plotting.
    print("freq_words: no words to plot")
    return

  fdist = nltk.FreqDist(tokens)
  words_df = pd.DataFrame({'word': list(fdist.keys()), 'count': list(fdist.values())})

  # selecting top n most frequent words
  top_words = words_df.nlargest(columns="count", n=terms)

  # visualize words and frequencies on an explicit figure/axes pair
  fig, ax = plt.subplots(figsize=(12, 15))
  sns.barplot(data=top_words, x="count", y="word", ax=ax)
  ax.set_xlabel('Count', fontsize=46)
  ax.set_ylabel('Word', fontsize=46)
  ax.tick_params(axis='both', which='major', labelsize=46)
  plt.show()
  
#determines unique words
def uniqueWords(x):
  """Count how often each non-numeric token occurs in ``x``.

  Args:
    x: iterable of strings; they are joined with single spaces before being
      tokenised with ``nltk.tokenize.word_tokenize``.

  Returns:
    dict mapping each token to its frequency, leaving out tokens for which
    ``str.isdigit()`` is true.
  """
  # NOTE(review): word_tokenize needs NLTK tokenizer data; the visible setup
  # cell only downloads 'stopwords' - confirm the tokenizer data is installed.
  joined = ' '.join(x)
  tokens = nltk.tokenize.word_tokenize(joined)
  counts = nltk.FreqDist(tokens)
  return {token: count for token, count in counts.items() if not token.isdigit()}

Generate the list of archived captures of the ABC News page from the Wayback Machine index

In [4]:
import urllib.request

# Ask the Wayback Machine CDX endpoint for the captures of abc.net.au/news/.
# The `with` block closes the HTTP response as soon as the body has been read.
with urllib.request.urlopen('http://web.archive.org/cdx/search/cdx?url=abc.net.au/news/') as page:
  contents = page.read()  # raw bytes; also consumed by the cell that builds dictPages

# Decode once and split into one string per line of the response.
stringcontents = contents.decode("utf-8").splitlines()
len(stringcontents)  # number of lines (capture records) returned by the query

66304

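Generate a dictionary keyed by the eight-digit capture date (YYYYMMDD); each value is the full archive timestamp string of the first capture found for that date, which is later used to build the link to the archived page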

In [None]:
contentssplit = contents.splitlines()
dictPages = {}
for c in contentssplit:
  # Each line is whitespace-separated; the first field starting with "19" or
  # "20" is taken as the capture timestamp and the rest of the line is skipped.
  for cc in c.decode("utf-8").split():
    if cc.startswith(("19", "20")):
      testkey = int(cc[:8])  # leading eight digits, e.g. 20210101
      # Keep only the first capture seen per day, from 1 January 2021 onwards.
      if testkey >= 20210101 and testkey not in dictPages:
        dictPages[testkey] = cc
      break
len(dictPages)  # number of distinct days with a capture (845 when this was run)

Extraction of content from news and story pages using BeautifulSoup

In [94]:
# NOTE(review): `root` is not referenced by the visible cells, and its host
# differs from the 'abc.net.au/news' substring the crawler matches - confirm.
root = "http://www.abc.net/news/"

# Lower-case substrings that mark a synopsis or an article URL as relevant.
keyterms = [
    "republic-",
    "republic ",
    "constitution",
    "monarch",
]

# Article links the crawler has already seen; appended to while crawling.
articleurls = []
#news page: returns a list of synopses and call for article text
def extractcontentfromnewspage(link):
    """Collect key-term related text from one news index page.

    Two sources are scanned:
      * "CardDescription" synopses whose text contains any of ``keyterms``;
      * linked ABC news pages whose URL contains any of ``keyterms``; their
        text is fetched with ``extractcontentfromarticlepage``.

    Every ABC news link encountered is appended to the module-level
    ``articleurls`` list so the same URL is not handled twice.

    Args:
        link: URL of the news index page to download.

    Returns:
        list of str: matching synopsis texts followed by the non-empty article
        texts; empty when nothing matched.
    """
    from urllib.parse import urljoin  # stdlib; local so the cell is self-contained

    req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req) as response:  # close the connection once the body is read
        webpage = response.read()
    soup = BeautifulSoup(webpage, 'html5lib')
    output = []

    # add a synopsis to output if relevant ref keyterms
    for synopsis in soup.find_all("div", {"data-component": "CardDescription"}):
        text = synopsis.get_text()
        lowered = text.lower()
        if any(term in lowered for term in keyterms):
            output.append(text)

    # find the article pages
    for a in soup.find_all('a', href=True):
        # Test the href string, not the Tag object, and resolve relative hrefs
        # against the page URL so the result can be requested directly.
        url = urljoin(link, a['href'])
        # if a news site and not already crawled
        if 'abc.net.au/news' not in url or url in articleurls:
            continue
        articleurls.append(url)
        lowered_url = url.lower()
        if any(term in lowered_url for term in keyterms):
            articletext = extractcontentfromarticlepage(url)
            if articletext:
                output.append(articletext)
    return output

#extracts info from article page
def extractcontentfromarticlepage(link):
  """Download an article page and return the text of its body paragraphs.

  Args:
    link: URL of the article page to download.

  Returns:
    str: the text of every matching ``<p>`` element joined with single spaces,
    or an empty string when no paragraph matches.
  """
  req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
  with urlopen(req) as response:  # close the connection once the body is read
    webpage = response.read()
  soup = BeautifulSoup(webpage, 'html5lib')
  # NOTE(review): this class name looks build-generated; pages rendered with a
  # different class would yield an empty string - confirm against the captures.
  paragraphs = soup.find_all("p", {"class": "paragraph_paragraph__3Hrfa"})
  # Join with a space so the last word of one paragraph does not fuse with the
  # first word of the next when the text is tokenised later.
  return ' '.join(paragraph.get_text() for paragraph in paragraphs)

In [None]:
# Crawl results: capture date (key of dictPages) -> list of extracted texts.
# Re-running this cell discards everything collected so far.
dictContent = dict()

Construct the archive link for each capture date and call the news-page function to extract the matching synopses and article text

In [128]:

from bs4 import BeautifulSoup  # repeated from the setup cell; harmless re-import
i = 0
startindex = 828  # 1-based position in dictPages at which crawling (re)starts

#used for resuming after error
for i, (key, value) in enumerate(dictPages.items(), start=1):
  if i < startindex:
    continue  # already handled in an earlier run
  url = "https://web.archive.org/web/" + str(value) + "/http://abc.net.au/news"
  print(i)
  output = extractcontentfromnewspage(url)
  if output:
    # store a fresh list holding every extracted text for this date
    dictContent[key] = list(output)


828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845


In [129]:
# Back up the crawl results to the mounted Drive folder. A dict is not an
# array, so NumPy stores it pickled inside an object array; it can be read
# back with np.load(path, allow_pickle=True).item().
np.save(file="/content/drive/MyDrive/A3/ABCbackupFinal.npy", arr=dictContent)

In [113]:
# Show the capture dates for which at least one piece of text was stored.
dictContent.keys()

dict_keys([20210107, 20210115, 20210116, 20210119, 20210120, 20210127, 20210209, 20210210, 20210313, 20210314, 20210321, 20210513, 20210610, 20210615, 20210626, 20210819, 20210831, 20210922])