### Web crawling models


In [None]:
import requests
from bs4 import BeautifulSoup, Tag


class Content:
    """Scraped article/page: the URL it came from, its title and its body."""

    def __init__(self, url: str, title: str, body: str):
        self.url, self.title, self.body = url, title, body

    def print(self) -> None:
        """Write the URL, the title and the body to stdout, in that order."""
        report = (
            f"URL: {self.url}",
            f"TITLE: {self.title}",
            f"BODY:\n{self.body}",
        )
        for line in report:
            print(line)


class Website:
    """Describes one site and where its title and body live on a page.

    Attributes:
        name: Human-readable name of the site.
        url: Base URL of the site.
        titleTag: CSS selector for the page title. It is a selector string
            (e.g. 'h1'), not a bs4 Tag: the crawler hands it to `select()`.
        bodyTag: CSS selector for the page body, used the same way.
    """

    def __init__(self, name: str, url: str, titleTag: str, bodyTag: str):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag


class Crawler:
    """Fetches pages and prints the content located by a site's selectors.

    Annotations that name bs4 or file-local classes are quoted so that they
    are not evaluated when the class body is executed.
    """

    def getPage(self, url: str, timeout: float = 30.0) -> "BeautifulSoup | None":
        """
        Download `url` and parse it with the 'html.parser' backend.

        `timeout` (seconds) is passed to requests so that an unresponsive
        server cannot block the crawl forever. Returns None when the request
        raises a RequestException (which includes timeouts). No status check
        is made, so an error page is parsed like any other response.
        """
        try:
            req = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            return None

        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj: "BeautifulSoup", selector: str) -> str:
        """
        Utility function used to obtain a string of content from a
        BeautifulSoup object and a selector. The text of every matching
        element is joined with newlines; an empty string is returned if
        no element matches the given selector.
        """
        selectedElems = pageObj.select(selector)
        if not selectedElems:
            return ''
        return '\n'.join(elem.get_text() for elem in selectedElems)

    def parse(self, site: "Website", url: str) -> None:
        """
        Extract content from a given page URL and print it.

        `site` supplies the `titleTag` and `bodyTag` selectors. Nothing is
        printed when the page cannot be fetched or when either the title or
        the body comes back empty.
        """
        bs = self.getPage(url)
        if bs is None:
            return

        title = self.safeGet(bs, site.titleTag)
        body = self.safeGet(bs, site.bodyTag)
        if title != '' and body != '':
            content = Content(url, title, body)
            content.print()


# One row per site: name, base URL, title selector, body selector.
siteData = [
    ["O'Reilly Media", 'http://oreilly.com', 'h1',
     'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1',
     'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'p.story-content'],
]

# Turn every row into a Website, keeping the order of siteData.
websites = []
for row in siteData:
    websites.append(Website(*row))

crawler = Crawler()

# Parse one hand-picked page per site; websites[i] matches siteData[i].
crawler.parse(
    websites[0],
    'http://shop.oreilly.com/product/0636920028154.do')

crawler.parse(
    websites[1],
    'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')

crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/'
    'idea-to-retire-old-methods-of-policy-education/')

crawler.parse(
    websites[3],
    'https://www.nytimes.com/2018/01/28/business/energy-environment/'
    'oil-boom.html')


In [None]:
from bs4 import BeautifulSoup
from urllib.request import urlopen


class Content:
    """Holds what was scraped from one article/page: URL, title and body."""

    def __init__(self, url: str, title: str, body: str):
        self.url, self.title, self.body = url, title, body

    def print(self) -> None:
        """Write the title, then the URL, then the body to stdout."""
        print('TITLE: {}'.format(self.title))
        print('URL: {}'.format(self.url))
        # The body starts on its own line, preceded by a single space.
        print('BODY:\n {}'.format(self.body))


def scrapeCNN(url: str) -> Content:
    """Download a CNN article and return its title and body as a Content.

    The title is the text of the first <h1>; the body is the text of the
    first <div class="article__content">. An element that is not found
    yields an empty string instead of raising AttributeError. The body is
    also echoed to stdout before returning.
    """
    # Close the HTTP response as soon as it has been parsed, and name the
    # parser explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    with urlopen(url) as response:
        bs = BeautifulSoup(response, 'html.parser')

    titleElem = bs.find('h1')
    bodyElem = bs.find('div', {'class': 'article__content'})
    title = titleElem.text if titleElem is not None else ''
    body = bodyElem.text if bodyElem is not None else ''
    print('body: ')
    print(body)
    return Content(url, title, body)


def scrapeBrookings(url: str) -> Content:
    """Download a Brookings article and return its title and body as a Content.

    The title is the text of the first <h1>; the body is the text of the
    first <div class="post-body">. An element that is not found yields an
    empty string instead of raising AttributeError on `None.text`.
    """
    # Close the HTTP response as soon as it has been parsed, and name the
    # parser explicitly so the result does not depend on which optional
    # parsers happen to be installed.
    with urlopen(url) as response:
        bs = BeautifulSoup(response, 'html.parser')

    titleElem = bs.find('h1')
    bodyElem = bs.find('div', {'class': 'post-body'})
    title = titleElem.text if titleElem is not None else ''
    body = bodyElem.text if bodyElem is not None else ''
    return Content(url, title, body)


# Scrape a single Brookings article and print what was extracted.
url = 'https://www.brookings.edu/research/robotic-rulemaking/'
content = scrapeBrookings(url)
content.print()

# The string literal below holds the equivalent CNN example. It is a bare
# string expression, not comments, so the code inside it does not run.
""" url = 'https://www.cnn.com/2023/04/03/investing/dogecoin-elon-musk-twitter/index.html'
content = scrapeCNN(url)
content.print() """

### Crawling websites through search query


In [None]:
import requests
from bs4 import BeautifulSoup, ResultSet


class Website:
    """Describes how to search one site and read the pages it returns.

    Attributes:
        name: Human-readable name of the site.
        url: Base URL; prepended to result links when they are relative.
        searchUrl: Search URL prefix; the topic is appended to it.
        resultListing: CSS selector matching each search result.
        resultUrl: CSS selector, applied inside one result, for the link
            whose `href` points at the article.
        absoluteUrl: True when result links are absolute URLs, False when
            they have to be prefixed with `url`.
        titleTag: CSS selector for the article title (a selector string,
            not a bs4 Tag).
        bodyTag: CSS selector for the article body (a selector string,
            not a bs4 Tag).
    """

    def __init__(self, name: str, url: str, searchUrl: str,
                 resultListing: str, resultUrl: str, absoluteUrl: bool,
                 titleTag: str, bodyTag: str):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag


class Content:
    """An article found while searching for a topic.

    The constructor takes, in this order: topic, url, title, body.
    """

    def __init__(self, topic, url, title, body):
        self.topic, self.url = topic, url
        self.title, self.body = title, body

    def print(self) -> None:
        """Write the topic, title, body and URL to stdout, in that order."""
        print(f"New article found for topic: {self.topic}")
        print(f"TITLE: {self.title}")
        print(f"BODY:\n{self.body}")
        print(f"URL: {self.url}")


class Crawler:
    """Searches a site for a topic and prints every article it can parse.

    Annotations that name bs4 or file-local classes are quoted so that they
    are not evaluated when the class body is executed.
    """

    def getPage(self, url: str, timeout: float = 30.0) -> "BeautifulSoup | None":
        """
        Download `url` and parse it with the 'html.parser' backend.

        `timeout` (seconds) is passed to requests so that an unresponsive
        server cannot block the crawl forever. Returns None when the request
        raises a RequestException (which includes timeouts).
        """
        try:
            req = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGetText(self, pageObj: "BeautifulSoup", selector: str) -> str:
        """
        Return the text of the first element matching `selector`, or an
        empty string when nothing matches.
        """
        childObj = pageObj.select(selector)
        if childObj:
            return childObj[0].get_text()

        return ""

    def search(self, topic: str, site: "Website") -> None:
        """
        Search a given website for a given topic and print all pages found.

        A result is skipped, and the remaining results are still processed,
        when it has no link, when its page cannot be fetched, or when the
        title or body comes back empty.
        """
        bs = self.getPage(site.searchUrl + topic)
        if bs is None:
            # The search page itself failed, so there is nothing to iterate.
            print("Something was wrong with that page or URL. Skipping!")
            return

        for result in bs.select(site.resultListing):
            links = result.select(site.resultUrl)
            if not links or "href" not in links[0].attrs:
                # A result without a usable link cannot be followed.
                continue
            url = links[0].attrs["href"]

            # Relative links have to be prefixed with the site's base URL.
            pageUrl = url if site.absoluteUrl else site.url + url
            page = self.getPage(pageUrl)
            if page is None:
                print("Something was wrong with that page or URL. Skipping!")
                # Skip only this result; a `return` would drop all the rest.
                continue

            title = self.safeGetText(page, site.titleTag)
            body = self.safeGetText(page, site.bodyTag)
            if title != '' and body != '':
                # Content takes (topic, url, title, body), in that order.
                content = Content(topic, url, title, body)
                content.print()


# One row per site, in the order of Website's constructor arguments:
# name, base URL, search URL prefix, result selector, link selector,
# whether links are absolute, title selector, body selector.
siteData = [
    [
        "O'Reilly Media", 'http://oreilly.com',
        'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True,
        'h1', 'section#product-description',
    ],
    [
        'Reuters', 'http://reuters.com',
        'http://www.reuters.com/search/news?blob=',
        'div.search-result-content', 'h3.search-result-title a', False,
        'h1', 'div.StandardArticleBody_body_1gnLA',
    ],
    [
        'Brookings', 'http://www.brookings.edu',
        'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True,
        'h1', 'div.post-body',
    ],
]

sites = []
for row in siteData:
    sites.append(Website(*row))

crawler = Crawler()

# Run every topic against every configured site.
topics = ['python', 'data science']
for topic in topics:
    print(f"GETTING INFO ABOUT: {topic}")
    for targetSite in sites:
        crawler.search(topic, targetSite)
        

### Crawling multiple page types


In [None]:
class Website:
    """Describes a site whose pages can be of several types.

    Every constructor argument is stored under the attribute of the same
    name, so none of them is silently dropped.

    NOTE(review): titleTag and bodyTag are annotated as `str` on the
    assumption that they are CSS selector strings, as in the other Website
    definitions of this file - confirm. `resultListing` keeps its original
    `list[str]` annotation; verify against the intended callers.
    """

    def __init__(self, type: str, name: str, url: str, searchUrl: str,
                 resultListing: list[str], resultUrl: str, absoluteUrl: bool,
                 titleTag: str, bodyTag: str, pageType: str):
        # `type` shadows the builtin inside this method only; the parameter
        # name is kept so that keyword callers keep working.
        self.type = type
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        self.pageType = pageType


class Webpage:
    """Common base class for all articles/pages.

    NOTE(review): titleTag is annotated as `str` on the assumption that it
    is a CSS selector string, like the titleTag values used elsewhere in
    this file - confirm.
    """

    def __init__(self, name: str, url: str, titleTag: str):
        self.name = name
        self.url = url
        self.titleTag = titleTag


class Product(Webpage):
    """Contains information to collect data from a product page.

    Derives from Webpage, whose initializer takes exactly the
    (name, url, titleTag) arguments forwarded here. `productNumber` and
    `price` are stored as `productNumberTag` and `priceTag`.
    """

    def __init__(self, name: str, url: str, titleTag: str, productNumber: str, price: str):
        super().__init__(name, url, titleTag)
        self.productNumberTag = productNumber
        self.priceTag = price


class Article(Webpage):
    """Contains information to collect data from an article page.

    Derives from Webpage, whose initializer takes exactly the
    (name, url, titleTag) arguments forwarded here.
    """

    def __init__(self, name: str, url: str, titleTag: str, bodyTag: str, dateTag: str):
        super().__init__(name, url, titleTag)
        self.bodyTag = bodyTag
        self.dateTag = dateTag