# Webcrawling Tutorium (Solution)

## Introduction to web crawling

## Low Level Crawling

In [73]:
import requests
import bs4
import json
import time
import pandas as pd

In [74]:
base_url = 'https://techcrunch.com'
number_of_pages = 1

In [66]:
def get_article_urls(page):
    a_s = page.find_all('a', {'class': 'read-more'})
    
    hrefs = []
    for a in a_s:
        hrefs.append(a.attrs['href'])
        
    return hrefs


def get_article_info(url, delay=1):
    
    # Wait for delay seconds to crawl the next page
    time.sleep(delay)
    
    response = requests.get(url)
    if response.status_code != 200:
        print('Error getting page {} status_code:{}'.format(url, response.status_code))
        return {}
    
    # Converting raw response text to usable BeautifulSoup
    page = bs4.BeautifulSoup(response.text, "lxml")
    
    # Exctract Information
    title = page.find('h1', {'class': 'tweet-title'}).text
    
    authors_raw = page.find_all('a', {'rel': 'author'})
    authors = [author.text for author in authors_raw]
    
    date = page.find('time').attrs['datetime']
    
    tags_raw = page.find_all('div', {'class': 'acc-handle'})
    tags = [tag.get_text(strip=True) for tag in tags_raw if tag.text != 'Popular Posts']
    
    text = page.find('div', {'class': 'text'}).get_text(strip=True)
    
    # Combine all information in one set
    article = {
        'title': title,
        'url': url,
        'date': date,
        'authors': authors,
        'tags': tags,
        'text': text
    }
    
    return article


def get_next_url(page, base_url):
    
    list_item = page.find('li', {'class': 'next'})
    href = list_item.find('a').attrs['href']
    
    url = base_url + href
    return url

In [72]:
current_url = base_url

articles = []
for n in range(number_of_pages):
    
    print('Crawling: {}'.format(current_url))    
    response = requests.get(current_url)
    
    # Simple error handling, just stop execution when no proper response received
    if response.status_code != 200:
        print('Error getting page {}'.format(current_url))
        break
        
    # Converting raw response text to usable BeautifulSoup
    page = bs4.BeautifulSoup(response.text, "lxml")
    
    article_urls = get_article_urls(page)
    # Run through all articles and extract the desired information
    for url in article_urls:
        try:
            article_info = get_article_info(url, delay=0.3)
        except:
            print('Error for article: {}'.format(url))
        articles.append(article_info)
        
    # Find reference to next page listing articles
    current_url = get_next_url(page, base_url)
    
print('Finished crawling. Found {} Articles'.format(len(articles)))

Crawling: https://techcrunch.com
Crawling: https://techcrunch.com/page/2
Crawling: https://techcrunch.com/page/3
Error for article: https://techcrunch.com/gallery/the-top-smartphones-of-mwc-2018/
Finished crawling. Found 58 Articles


In [75]:
df = pd.DataFrame(articles)
df.head()

Unnamed: 0,authors,date,tags,text,title,url
0,[Josh Constine],2018-03-02 03:23:27,"[SXSW, TechCrunch, Entertainment]",TechCrunch invites you to our annual Crunch By...,Come to TechCrunch’s party and SXSW panels,https://techcrunch.com/2018/03/02/2018-party-a...
1,[Sarah Perez],2018-03-02 10:53:10,"[iphone apps, storage, iOS apps, Apps, Apps]","These days, home movies aren’t recorded with h...",Air’s app lets you record high-quality home mo...,https://techcrunch.com/2018/03/02/air-lets-you...
2,[Jonathan Salama],2018-03-02 07:45:38,"[trucking, Transportation]",Jonathan SalamaContributorJonathan Salama is c...,Blockchain will work in trucking — but only if...,https://techcrunch.com/2018/03/02/blockchain-w...
3,[John Biggs],2018-03-02 10:22:48,"[robots, Gadgets]",Researchers took part in the Ski Robot Challen...,These robotic skiers hit the slopes in style,https://techcrunch.com/2018/03/02/these-electr...
4,"[Ingrid Lunden, Steve O'Hear]",2018-03-02 03:18:05,"[RPA, uipath, Artificial Intelligence]",The initial hype around bots — applications th...,UiPath raising around $120M at $1B+ valuation ...,https://techcrunch.com/2018/03/02/uipath-rpa/


In [None]:
df.to_csv('my_crawled_articles.csv')