In [69]:
from bs4 import BeautifulSoup
import re
from urllib import request
import requests
import pandas as pd
from pprint import pprint
from datetime import datetime as dt
import time
import timeit

In [2]:
# Fetch and parse a single archive page. Presumably kept for interactive
# inspection of the markup -- the functions defined later in this notebook
# each build their own local soup object rather than using `p1_soup`.
start_url = 'https://news.warwickshire.gov.uk/fireincidents/2011/01/page/2/'
req = request.Request(start_url, headers={'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'})
# Parse while the response is open; leaving the `with` block then closes the
# HTTP connection instead of keeping it open for the life of the kernel.
with request.urlopen(req) as content:
    p1_soup = BeautifulSoup(content, 'html.parser')

### Define functions

In [4]:
def incident_page_scrape(url):
    '''Scrape one incident report page.

    Parameters
    ----------
    url : str
        Address of the incident report page to download.

    Returns
    -------
    dict
        'title' -- text of the ``h1`` element with class ``entry-title``
        'date'  -- ``datetime`` attribute of the ``time`` element with
                   class ``entry-date``
        'text'  -- text of the ``div`` with class ``entry-content``, with
                   non-breaking spaces (``\\xa0``) removed

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched (``HTTPError`` is a subclass).
    AttributeError, TypeError
        If an expected element is absent: BeautifulSoup's ``find`` returns
        ``None`` in that case and the subsequent access fails.
    '''

    ua_header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    req = request.Request(url, headers=ua_header)
    # Parse while the response is open; the context manager then closes the
    # connection instead of leaving it for the garbage collector.
    with request.urlopen(req) as response:
        p1_soup = BeautifulSoup(response, 'html.parser')

    # All three fields are looked up inside the div with id="content".
    page_body = p1_soup.find('div', id='content')

    return {
        'title': page_body.find('h1', class_='entry-title').text,
        'date': page_body.find('time', class_='entry-date')['datetime'],
        'text': page_body.find('div', class_='entry-content').text.replace(u'\xa0', u''),
    }

In [25]:
# Probably not needed: the archive pages already seem to include incidents that are only a few days old
def live_page_urls(url):
    '''List the incident links shown on the live incidents page.

    Parameters
    ----------
    url : str
        Address of the live incidents page.

    Returns
    -------
    list of str
        The ``href`` of the first anchor inside every ``div`` with class
        ``promo-box``, in document order. Empty if the page has no such div.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched (``HTTPError`` is a subclass).
    TypeError
        If a promo box contains no anchor (``find`` returns ``None``).
    '''

    ua_header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    req = request.Request(url, headers=ua_header)
    # Parse while the response is open; the context manager then closes the
    # connection instead of leaving it for the garbage collector.
    with request.urlopen(req) as response:
        p1_soup = BeautifulSoup(response, 'html.parser')

    # find_all is the current spelling; findAll is a deprecated alias.
    return [box.find('a')['href'] for box in p1_soup.find_all('div', class_='promo-box')]

In [18]:
# Try the helper on the live incidents page; the value of this expression
# (the list of incident urls it returns) is displayed as the cell output.
live_page_urls('https://news.warwickshire.gov.uk/fireincidents/')

['https://news.warwickshire.gov.uk/fireincidents/2018/11/25/road-traffic-collision-princes-drive-leamington/',
 'https://news.warwickshire.gov.uk/fireincidents/2018/11/21/horse-rescue-frankton-lane-stretton-on-dunsmore/',
 'https://news.warwickshire.gov.uk/fireincidents/2018/11/18/road-traffic-collision-b4451-gaydon/',
 'https://news.warwickshire.gov.uk/fireincidents/2018/11/16/van-fire-junction-16-to-15-m40-south-2-appliances-attended/',
 'https://news.warwickshire.gov.uk/fireincidents/2018/11/16/rtc-persons-trapped-murray-road-rugby-2-appliances-attending/',
 'https://news.warwickshire.gov.uk/fireincidents/2018/11/15/house-fire-lime-grove-hurley/']

In [5]:
def arch_incid_urls(url):
    '''List the incident links shown on one archive page.

    Parameters
    ----------
    url : str
        Address of an archive page.

    Returns
    -------
    list of str
        The ``href`` of the first anchor inside every ``header`` with class
        ``entry-header`` found within the page's first ``div`` of class
        ``entry-content``, in document order.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched (``HTTPError`` is a subclass).
    AttributeError, TypeError
        If the ``entry-content`` div, or the anchor inside a header, is
        absent (``find`` returns ``None``).
    '''

    ua_header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    req = request.Request(url, headers=ua_header)
    # Parse while the response is open; the context manager then closes the
    # connection instead of leaving it for the garbage collector.
    with request.urlopen(req) as response:
        p1_soup = BeautifulSoup(response, 'html.parser')

    incids_pane = p1_soup.find('div', class_='entry-content')
    # find_all is the current spelling; findAll is a deprecated alias.
    return [
        header.find('a')['href']
        for header in incids_pane.find_all('header', class_='entry-header')
    ]

In [6]:
def get_archive_urls(url):
    '''List the archive links advertised on a page.

    Parameters
    ----------
    url : str
        Address of a page containing an ``aside`` element with id
        ``archives-2``.

    Returns
    -------
    list of str
        The ``href`` of the first anchor inside every ``li`` of that
        ``aside``, in document order.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched (``HTTPError`` is a subclass).
    AttributeError, TypeError
        If the ``archives-2`` aside, or the anchor inside a list item, is
        absent (``find`` returns ``None``).
    '''

    ua_header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    req = request.Request(url, headers=ua_header)
    # Parse while the response is open; the context manager then closes the
    # connection instead of leaving it for the garbage collector.
    with request.urlopen(req) as response:
        p1_soup = BeautifulSoup(response, 'html.parser')

    archive_widget = p1_soup.find('aside', id='archives-2')
    # find_all is the current spelling; findAll is a deprecated alias.
    return [item.find('a')['href'] for item in archive_widget.find_all('li')]

### Scrape the data

In [26]:
# Start from incidents homepage
start_page = 'https://news.warwickshire.gov.uk/fireincidents/'

# Browser-like User-Agent sent with the status probes below. It never changes,
# so it is built once here rather than once per archive.
ua_header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

# Upper bound, in seconds, on how long a status probe may wait for the server.
# Without a timeout a stalled connection would block the scrape indefinitely.
# The value is a tunable choice, not something the site dictates.
REQUEST_TIMEOUT = 30

# One dict per incident, as returned by incident_page_scrape
allrecords = []

# get archive urls from incidents homepage
for arch_url in get_archive_urls(start_page):
    # Construct archive page urls, starting with page 1
    page_num = 1
    page_url = arch_url + 'page/' + str(page_num)

    # The first probe checks the archive's own url; every later probe checks
    # the numbered page that is about to be scraped.
    response = requests.get(arch_url, headers=ua_header, timeout=REQUEST_TIMEOUT)

    # increment page numbers and scrape pages until page number is beyond range of real pages
    while response.status_code == 200:
        #print(page_url) # include to check that pages have all been scraped
        for incid_url in arch_incid_urls(page_url):
            allrecords.append(incident_page_scrape(incid_url))

        # Move on to the next page and probe it before deciding to scrape it
        page_num += 1
        page_url = arch_url + 'page/' + str(page_num)
        response = requests.get(page_url, headers=ua_header, timeout=REQUEST_TIMEOUT)

https://news.warwickshire.gov.uk/fireincidents/2018/11/page/1
https://news.warwickshire.gov.uk/fireincidents/2018/11/page/2
https://news.warwickshire.gov.uk/fireincidents/2018/11/page/3
https://news.warwickshire.gov.uk/fireincidents/2018/10/page/1
https://news.warwickshire.gov.uk/fireincidents/2018/10/page/2
https://news.warwickshire.gov.uk/fireincidents/2018/09/page/1
https://news.warwickshire.gov.uk/fireincidents/2018/09/page/2
https://news.warwickshire.gov.uk/fireincidents/2018/09/page/3
https://news.warwickshire.gov.uk/fireincidents/2018/08/page/1
https://news.warwickshire.gov.uk/fireincidents/2018/08/page/2
https://news.warwickshire.gov.uk/fireincidents/2018/08/page/3
https://news.warwickshire.gov.uk/fireincidents/2018/08/page/4
https://news.warwickshire.gov.uk/fireincidents/2018/08/page/5
https://news.warwickshire.gov.uk/fireincidents/2018/08/page/6
https://news.warwickshire.gov.uk/fireincidents/2018/07/page/1
https://news.warwickshire.gov.uk/fireincidents/2018/07/page/2
https://

In [27]:
# Number of incident records collected by the scrape loop
len(allrecords)

3327

In [61]:
# make dataframe from scraped data
# (allrecords is a list of dicts, one per incident, so the dict keys become
# the columns and each incident becomes one row)
recordsdf = pd.DataFrame(allrecords)

In [63]:
# Strip newline characters from every incident's text, then keep only the
# date, title and text columns, in that order.
text_no_newlines = recordsdf['text'].map(lambda raw: raw.replace('\n', ''))
recordsdf = recordsdf.assign(text=text_no_newlines)[['date', 'title', 'text']]

In [66]:
# Write the cleaned records to CSV. to_csv is called with its default
# settings, so the DataFrame index is written as an unnamed first column.
# NOTE(review): the destination is an absolute path on a local Windows drive
# and its directory must already exist for this cell to succeed -- consider a
# configurable, relative output location.
recordsdf.to_csv(r'C:\git\fire\data\Fire_Regions\WarwickshireFire.csv')