# Web Crawling Tutorial (Solution)

## Introduction to web crawling

## Low Level Crawling

In [64]:
import requests
import bs4
import json
import time

In [59]:
# Root address of the site: the crawl starts here and next-page links are built on top of it
base_url = 'https://techcrunch.com'
# Upper bound on how many listing pages the crawl loop visits
number_of_pages = 3

In [66]:
def get_article_urls(page):
    """Collect the link targets of all "read more" anchors on a listing page.

    Args:
        page: Parsed page object exposing ``find_all`` (the crawl loop passes
            a BeautifulSoup document).

    Returns:
        list: The ``href`` attribute of every ``<a class="read-more">``
        element, in the order ``find_all`` yields them.
    """
    read_more_links = page.find_all('a', {'class': 'read-more'})
    return [link.attrs['href'] for link in read_more_links]


def get_article_info(url, delay=1, timeout=10):
    """Download one article page and extract its metadata and body text.

    Args:
        url: Address of the article page.
        delay: Seconds to sleep before sending the request, so the site is
            not hit in rapid succession.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without it a stalled connection would block the crawl forever.

    Returns:
        dict: ``title``, ``url``, ``date``, ``authors``, ``tags`` and ``text``
        of the article, or an empty dict when the server does not answer
        with status 200.

    Raises:
        AttributeError: If the title, ``<time>`` or text element is not found
            on the page (``find`` returns ``None`` in that case).
        requests.exceptions.RequestException: Propagated unchanged when the
            request itself fails or times out.
    """

    # Wait for delay seconds to crawl the next page
    time.sleep(delay)

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print('Error getting page {} status_code:{}'.format(url, response.status_code))
        return {}

    # Converting raw response text to usable BeautifulSoup
    page = bs4.BeautifulSoup(response.text, "lxml")

    # Extract information
    title = page.find('h1', {'class': 'tweet-title'}).text

    authors_raw = page.find_all('a', {'rel': 'author'})
    authors = [author.text for author in authors_raw]

    date = page.find('time').attrs['datetime']

    # The 'acc-handle' blocks also contain a "Popular Posts" header that is not a tag
    tags_raw = page.find_all('div', {'class': 'acc-handle'})
    tags = [tag.get_text(strip=True) for tag in tags_raw if tag.text != 'Popular Posts']

    text = page.find('div', {'class': 'text'}).get_text(strip=True)

    # Combine all information in one dict
    article = {
        'title': title,
        'url': url,
        'date': date,
        'authors': authors,
        'tags': tags,
        'text': text
    }

    return article


def get_next_url(page, base_url):
    """Build the address of the next listing page from the pagination link.

    Args:
        page: Parsed listing page exposing ``find`` (the crawl loop passes a
            BeautifulSoup document).
        base_url: Address the pagination link is resolved against.

    Returns:
        str or None: The absolute URL of the next page, or ``None`` when the
        page has no ``<li class="next">`` element containing a link.
    """
    # Local import keeps this cell self-contained
    from urllib.parse import urljoin

    list_item = page.find('li', {'class': 'next'})
    if list_item is None:
        return None

    anchor = list_item.find('a')
    if anchor is None:
        return None

    href = anchor.attrs.get('href')
    if not href:
        return None

    # urljoin copes with root-relative and absolute hrefs and with a trailing
    # slash on base_url, where plain string concatenation would not
    return urljoin(base_url, href)

In [68]:
# Crawl up to number_of_pages listing pages, starting at base_url, and collect
# one info dict per successfully parsed article in `articles`.
current_url = base_url

articles = []
for n in range(number_of_pages):

    print('Crawling: {}'.format(current_url))
    # A timeout keeps a stalled server from blocking the whole crawl
    response = requests.get(current_url, timeout=10)

    # Simple error handling, just stop execution when no proper response received
    if response.status_code != 200:
        print('Error getting page {}'.format(current_url))
        break

    # Converting raw response text to usable BeautifulSoup
    page = bs4.BeautifulSoup(response.text, "lxml")

    article_urls = get_article_urls(page)
    # Run through all articles and extract the desired information
    for url in article_urls:
        try:
            article_info = get_article_info(url, delay=0.3)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt still works
            print('Error for article: {}'.format(url))
            # Skip the append: otherwise the previous article would be stored
            # again (or article_info would be undefined on the first failure)
            continue

        # get_article_info returns an empty dict for non-200 responses
        if article_info:
            articles.append(article_info)

    # Find reference to next page listing articles
    try:
        next_url = get_next_url(page, base_url)
    except AttributeError:
        # Raised when the page has no "next" pagination element
        next_url = None

    if not next_url:
        print('No next page found on {}'.format(current_url))
        break

    current_url = next_url
    

Crawling: https://techcrunch.com
Error for article: https://techcrunch.com/2018/03/02/blockchain-will-work-in-trucking-but-only-if-these-three-things-happen/
Crawling: https://techcrunch.com/page/2
Error for article: https://techcrunch.com/2018/03/02/toyota-creates-a-new-advanced-research-company-fo-focus-on-self-driving/
Crawling: https://techcrunch.com/page/3
Error for article: https://techcrunch.com/gallery/the-top-smartphones-of-mwc-2018/


In [69]:
# Scratch cell: `url` is the loop variable left over from the crawl loop above
# (the last article URL visited); presumably re-fetched to inspect why it failed.
response = requests.get(url)

In [70]:
# Show the raw HTML of the response as the cell output
response.text

'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml" lang="en">\n<head>\n\t<title>The top smartphones of MWC 2018  |  TechCrunch</title>\n\t<meta http-equiv="X-UA-Compatible" content="IE=Edge" />\n\t<meta charset="UTF-8">\n\t\t\t<script type="text/javascript">var _sf_startpt = (new Date()).getTime()</script>\n\t\t<meta name="p:domain_verify" content="6189ff68ce30e30f12b40b3b40873027"/>\n\t<meta name="HandheldFriendly" content="True">\n\t<meta name="MobileOptimized" content="320">\n\t<meta name="viewport" content="initial-scale=1.0,width=device-width,user-scalable=no,minimum-scale=1.0,maximum-scale=1.0">\n\t<meta http-equiv="cleartype" content="on">\n\t<meta name="apple-mobile-web-app-title" content="TechCrunch">\n\t<meta name="robots" content="NOYDIR,NOODP" />\n\t<link rel="shortcut icon" type="image/x-icon" href="https://s0.wp.com/wp-content/themes/vip/techcrunch-2013/assets/images/fav

In [60]:
# Scratch check of the next-page URL: same lookup and concatenation as in get_next_url
base_url + page.find('li', {'class': 'next'}).find('a').attrs['href']

'https://techcrunch.com/page/2'