In [1]:
# !pip install google, timeout_decorator

import pandas as pd
import numpy as np
import requests
import timeout_decorator 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from urllib.request import urlopen
from bs4 import BeautifulSoup
from googlesearch import search
from io import BytesIO

# Company dataset to work with

In [2]:
## Retrieve cleaned company dataset
company_data = pd.read_csv('data/asx_listed_cleaned.csv', index_col=0)
company_data.head()

## Sort data by market cap
data = company_data.sort_values(by='Market Cap', ascending=False).reset_index(drop=True)

In [3]:
## Extract the first 20 lines of the dataset
data_work = data[:20].copy()

data_work.head()

Unnamed: 0,Code,Company,Sector,Market Cap
0,csl,csl ltd,health care,140501000000
1,cba,commonwealth bank of australia,financials,110976000000
2,bhp,bhp group ltd,materials,95298300000
3,wbc,westpac banking corporation,financials,58798200000
4,nab,national australia bank ltd,financials,50611200000


# Automating Google Searches

In [44]:
## Library to extract top level domain

# !pip3 install tld
from tld import get_tld

In [59]:
# !pip install PyPDF2
import PyPDF2
from PyPDF2 import PdfReader

Using the `googlesearch` API to extract search results, then use `BeautifulSoup` to extract data from HTMLs.

In [4]:
def google_search(company, query):  
    # Initialise a search object
    res = search(
        f'{company} {query}',
        tld='com',
        lang='en',
        num=2,
        start=0,
        stop=1,
        pause=2.0
    )
    return list(res).pop()

# Extract text and PDF from HTMLs

In [15]:
@timeout_decorator.timeout(10)
def extract_text_from_html(url):
    html = urlopen(url).read()
    soup = BeautifulSoup(html, features="html.parser")

    # Eliminate scripts and style elements in the HTML object
    for script in soup(["script", "style"]):
        script.extract()

    # Retrieve raw text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    
    # drop blank lines
    text_processed = '\n'.join(chunk for chunk in chunks if chunk)
    
    return text_processed

In [102]:
@timeout_decorator.timeout(20)
def extract_pdfs_from_html(url):
    ## Request contents from the url
    html = requests.get(url)
    
    ## Extract text from the html object
    soup = BeautifulSoup(html.text, features="html.parser")
    
    ## Find all links with href tags
    urls_extract_href = soup.find_all('a')
    
    ## Remove all href tags
    urls_extract = list(map(lambda url: url.get('href'), urls_extract_href))
    
    try:
        ## Keep only links with PDF
        pdf_urls = [link for link in urls_extract if '.pdf' in link]

        ## Fix any links without top-level domain

        # Get top level domain as str
        tld = get_tld(url, as_object=True).fld
        pdf_urls_tld = [f"https://{tld}{link}" if not link.startswith('http') else link for link in pdf_urls]

        return pdf_urls_tld[0]
    
    except TypeError:
        return 'Extract list empty.'

In [30]:
## Test codes

# url = 'https://www.commbank.com.au/about-us/investors/annual-reports/annual-report-2022.html'
# html = requests.get(url)
# soup = BeautifulSoup(html.text, features="html.parser")

# tld = get_tld(url, as_object=True) #Get the root as an object
# tld.fld #res.fld to extract the domain

# urls_extract = soup.find_all('a')
# urls_extract_href = list(map(lambda url: url.get('href'), urls_extract))

# Parse PDF from HTML

In [69]:
## Test code

# report = requests.get('https://www.commbank.com.au/content/dam/commbank-assets/about-us/2022-08/2022-annual-report_spreads.pdf')
# raw_data = report.content

# with BytesIO(raw_data) as data:
#     file_object = PyPDF2.PdfReader(data)

#     corpus = []
#     for page in range(len(file_object.pages)):
#         text = file_object.pages[page].extract_text()
#         corpus.append(text)

In [71]:
def get_corpus_from_pdf(url):
    """
    Retrieve the corpus from a PDF url.
    
    ------------
    Input(s):
    url: the URL to the PDF file
    ----------
    Output(s):
    The raw string corpus.
    
    """
    report = requests.get(url)
    raw_data = report.content

    with BytesIO(raw_data) as data:
        file_object = PyPDF2.PdfReader(data)

        corpus = []
        for page in range(len(file_object.pages)):
            text = file_object.pages[page].extract_text()
            corpus.append(text)
    
    return " ".join(corpus)

# Complete procedure to extract Privacy policies to working data csv

In [10]:
# Extract URLs for privacy policy for companies
urls = [
    google_search(data_work['Company'].loc[i], 'privacy policy') for i in range(len(data_work['Company']))
]

In [16]:
## Scrape privacy policies and pass them into Pandas dataframe

privacy_policy_text = []

# Loop through URLs and scrape text from each of them
for i in range(len(urls)):
    print(f"Iteration {i+1}")
    try:
        text = extract_text_from_html(urls[i])
        privacy_policy_text.append(text)
    except:
        privacy_policy_text.append(np.nan)
        pass

# Attach to current working data
data_work['privacy_policy'] = privacy_policy_text

Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20


In [18]:
data_work.head()

Unnamed: 0,Code,Company,Sector,Market Cap,privacy_policy
0,csl,csl ltd,health care,140501000000,Privacy Policy | CSL Limited\nSkip to main con...
1,cba,commonwealth bank of australia,financials,110976000000,Group Privacy Statement - CommBank\nSkip to ma...
2,bhp,bhp group ltd,materials,95298300000,
3,wbc,westpac banking corporation,financials,58798200000,Privacy Statement | Westpac\nclose\nWe use coo...
4,nab,national australia bank ltd,financials,50611200000,National Australia Bank privacy policy - NAB\n...


In [19]:
data_work.isna().sum()

Code              0
Company           0
Sector            0
Market Cap        0
privacy_policy    5
dtype: int64

# Complete procedure to extract raw text from PDF files

In [72]:
# Extract URLs for 2022 annual report for companies
annual_reports_2022 = [
    google_search(data_work['Company'].loc[i], '2022 annual report') for i in range(len(data_work['Company']))
]

In [79]:
annual_reports_2022

['https://investors.csl.com/annualreport/2022/',
 'https://www.commbank.com.au/about-us/investors/annual-reports/annual-report-2022.html',
 'https://www.bhp.com/investors/annual-reporting/annual-report-2022',
 'https://www.westpac.com.au/about-westpac/investor-centre/financial-information/annual-reports/',
 'https://www.nab.com.au/content/dam/nab/documents/reports/corporate/2022-annual-report.pdf',
 'https://www.anz.com/shareholder/centre/reporting/annual-report-annual-review/',
 'https://www.woolworthsgroup.com.au/content/dam/wwg/investors/reports/2022/full-year/2022%20Annual%20Report.pdf',
 'https://sitefinity.wesfarmers.com.au/docs/default-source/2022-full-year-results/wesfarmers-annual-report-2022_interactive_v2.pdf?sfvrsn=6e2f1abb_2',
 'https://www.transurban.com/investor-centre/reporting-suite',
 'https://www.fmgl.com.au/docs/default-source/announcements/full-year-2022-annual-report-and-4e.pdf?sfvrsn=d0a3336a_2',
 'https://www.macquarie.com/au/en/investors/reports/full-year-2022.

In [103]:
annual_reports_2022_pdfs = []
for link in annual_reports_2022:
    try:
        if not '.pdf' in link:
            annual_reports_2022_pdfs.append(extract_pdfs_from_html(link))
        else:
            annual_reports_2022_pdfs.append(link)
    except:
        annual_reports_2022_pdfs.append('Cannot get pdf.')
        pass

In [104]:
annual_reports_2022_pdfs

['Cannot get pdf.',
 'Extract list empty.',
 'Cannot get pdf.',
 'Extract list empty.',
 'https://www.nab.com.au/content/dam/nab/documents/reports/corporate/2022-annual-report.pdf',
 'Extract list empty.',
 'https://www.woolworthsgroup.com.au/content/dam/wwg/investors/reports/2022/full-year/2022%20Annual%20Report.pdf',
 'https://sitefinity.wesfarmers.com.au/docs/default-source/2022-full-year-results/wesfarmers-annual-report-2022_interactive_v2.pdf?sfvrsn=6e2f1abb_2',
 'Extract list empty.',
 'https://www.fmgl.com.au/docs/default-source/announcements/full-year-2022-annual-report-and-4e.pdf?sfvrsn=d0a3336a_2',
 'Extract list empty.',
 'https://www.telstra.com.au/content/dam/tcom/about-us/investors/pdf-g/TEL-AR-2022-Spreads-FINAL.pdf',
 'Cannot get pdf.',
 'Cannot get pdf.',
 'Cannot get pdf.',
 'Extract list empty.',
 'Cannot get pdf.',
 'Extract list empty.',
 'Extract list empty.',
 'https://www2.asx.com.au/content/dam/asx/about/media-releases/2022/41-22-august-asx-2022-annual-report.p

In [105]:
## Scrape annual reports and pass them into Pandas dataframe

annual_report_text = []

# Loop through URLs and scrape text from each of them
for i in range(len(annual_reports_2022)):
    print(f"Iteration {i+1}")
    try:
        url = extract_pdfs_from_html(annual_reports_2022_pdfs[i])
        annual_report_text.append(get_corpus_from_pdf(url))
    except:
        annual_report_text.append(np.nan)
        pass

# Attach to current working data
data_work['ann_report_2022'] = annual_report_text

Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
Iteration 9
Iteration 10
Iteration 11
Iteration 12
Iteration 13
Iteration 14
Iteration 15
Iteration 16
Iteration 17
Iteration 18
Iteration 19
Iteration 20
