## Dependencies

In [1]:
import time
import warnings
from io import StringIO
from pprint import pprint

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser

warnings.filterwarnings('ignore')

## Windows

In [2]:
# Windows: the ChromeDriver binary is expected to sit next to this notebook.
executable_path = {"executable_path": "chromedriver.exe"}


def init_browser(driver_path=executable_path["executable_path"], headless=False):
    """Launch a Chrome browser controlled through splinter.

    Parameters
    ----------
    driver_path : str, optional
        Path to the ChromeDriver binary. Defaults to the Windows path
        configured above, captured when this cell is run.
    headless : bool, optional
        Run Chrome without a visible window. Defaults to False, so the
        window is shown.

    Returns
    -------
    The object returned by ``Browser("chrome", ...)``.
    """
    return Browser("chrome", executable_path=driver_path, headless=headless)


browser = init_browser()

## Mac

In [None]:
# Ask the shell where the chromedriver binary is installed (macOS); compare the
# printed path with the `executable_path` value used in the next cell.
# Reference: https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [None]:
# macOS: point splinter at the ChromeDriver binary and open a visible Chrome window.
executable_path = dict(executable_path='/usr/local/bin/chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

## 2019–20 Australian bushfire season

Scrape data related to the 2019-2020 Australian bushfire season from the following site: <https://en.wikipedia.org/wiki/2019%E2%80%9320_Australian_bushfire_season>

In [3]:
# Visit url.
url = "https://en.wikipedia.org/wiki/2019%E2%80%9320_Australian_bushfire_season"
browser.visit(url)

# Wait for page/page elements to load.
time.sleep(1)

# Scrape page into soup.
html = browser.html
soup = bs(html, "html.parser")

# Get table data using pandas.
# Parse the HTML the browser has already downloaded instead of handing pandas
# the url, which would fetch the same page a second time. The text is wrapped
# in StringIO because recent pandas versions deprecate passing raw HTML text.
tables = pd.read_html(StringIO(html))
tables[1]

Unnamed: 0_level_0,State / territory,Fatalities,Homeslost,Area(estimated),Area(estimated),Notes
Unnamed: 0_level_1,State / territory,Fatalities,Homeslost,ha,acres,Notes
0,Australian Capital Territory,1,0,56688,140080,Area;[91][92][93] fatality[c][95]
1,New South Wales,25,2439,5400000,13300000,Area;[96] fatalities;[18][37] homes[96]
2,Northern Territory,0,5,6800000,16800000,"Area, includes mainly scrub fires, which are w..."
3,Queensland,0,48,2500000,6180000,"Area, includes scrub fires;[49] homes[97][d]"
4,South Australia,3,151,490000,1210000,Area;[49] fatalities;[102] homes (KI:65)[103] ...
5,Tasmania,0,2,36000,89000,Area;[49] homes[97]
6,Victoria,5,396,1500000,3710000,Area;[49] fatalities;[15] homes[105]
7,Western Australia,0,1,2200000,5440000,"Area, includes scrub fires;[49] homes[97]"
8,Total,34,"3,500+",18736070,46300000,[e][b][109][110] Total area estimate as of 13 ...


In [4]:
# Convert table from website to pandas dataframe.
# Work on a copy: renaming the columns here (and deleting columns later) would
# otherwise change the object stored in `tables`, and re-running this cell
# after a column has been removed would fail on the column count.
australia_fire_2019_df = tables[1].copy()
australia_fire_2019_df.columns = ["state", "fatalities", "homes_lost", "area_ha", "area_acres", "notes"]

australia_fire_2019_df

Unnamed: 0,state,fatalities,homes_lost,area_ha,area_acres,notes
0,Australian Capital Territory,1,0,56688,140080,Area;[91][92][93] fatality[c][95]
1,New South Wales,25,2439,5400000,13300000,Area;[96] fatalities;[18][37] homes[96]
2,Northern Territory,0,5,6800000,16800000,"Area, includes mainly scrub fires, which are w..."
3,Queensland,0,48,2500000,6180000,"Area, includes scrub fires;[49] homes[97][d]"
4,South Australia,3,151,490000,1210000,Area;[49] fatalities;[102] homes (KI:65)[103] ...
5,Tasmania,0,2,36000,89000,Area;[49] homes[97]
6,Victoria,5,396,1500000,3710000,Area;[49] fatalities;[15] homes[105]
7,Western Australia,0,1,2200000,5440000,"Area, includes scrub fires;[49] homes[97]"
8,Total,34,"3,500+",18736070,46300000,[e][b][109][110] Total area estimate as of 13 ...


In [5]:
# Remove the notes column.
# drop() returns a new dataframe, and errors="ignore" turns a second run of
# this cell into a no-op instead of a KeyError.
australia_fire_2019_df = australia_fire_2019_df.drop(columns="notes", errors="ignore")

australia_fire_2019_df

Unnamed: 0,state,fatalities,homes_lost,area_ha,area_acres
0,Australian Capital Territory,1,0,56688,140080
1,New South Wales,25,2439,5400000,13300000
2,Northern Territory,0,5,6800000,16800000
3,Queensland,0,48,2500000,6180000
4,South Australia,3,151,490000,1210000
5,Tasmania,0,2,36000,89000
6,Victoria,5,396,1500000,3710000
7,Western Australia,0,1,2200000,5440000
8,Total,34,"3,500+",18736070,46300000


In [6]:
# Keep every row except the one whose state is 'Total'.
is_state_row = australia_fire_2019_df["state"].ne('Total')
australia_fire_2019_df = australia_fire_2019_df.loc[is_state_row]

australia_fire_2019_df

Unnamed: 0,state,fatalities,homes_lost,area_ha,area_acres
0,Australian Capital Territory,1,0,56688,140080
1,New South Wales,25,2439,5400000,13300000
2,Northern Territory,0,5,6800000,16800000
3,Queensland,0,48,2500000,6180000
4,South Australia,3,151,490000,1210000
5,Tasmania,0,2,36000,89000
6,Victoria,5,396,1500000,3710000
7,Western Australia,0,1,2200000,5440000


In [7]:
# Convert homes_lost from type string to type int, then check data types.
# assign() builds a new dataframe, so the converted column is not written into
# a frame that was produced by filtering another one (which pandas reports
# with a SettingWithCopyWarning).
australia_fire_2019_df = australia_fire_2019_df.assign(
    homes_lost=australia_fire_2019_df["homes_lost"].astype(int)
)

australia_fire_2019_df.dtypes

state         object
fatalities     int64
homes_lost     int32
area_ha        int64
area_acres     int64
dtype: object

In [8]:
# Count the non-null entries per column; a count lower than the number of rows
# means that column holds null values.
australia_fire_2019_df.notna().sum()

state         8
fatalities    8
homes_lost    8
area_ha       8
area_acres    8
dtype: int64

In [9]:
# Convert dataframe to a list of dictionaries, one per row.
# https://stackoverflow.com/questions/26716616/convert-a-pandas-dataframe-to-a-dictionary
# 'records' is the documented orient for this shape. 'range' is not a valid
# orient: older pandas only accepted it because orients were matched by their
# first letter, and pandas 2.x raises a ValueError for it.
australia_fire_2019_dict = australia_fire_2019_df.to_dict('records')

australia_fire_2019_dict

[{'state': 'Australian Capital Territory',
  'fatalities': 1,
  'homes_lost': 0,
  'area_ha': 56688,
  'area_acres': 140080},
 {'state': 'New South Wales',
  'fatalities': 25,
  'homes_lost': 2439,
  'area_ha': 5400000,
  'area_acres': 13300000},
 {'state': 'Northern Territory',
  'fatalities': 0,
  'homes_lost': 5,
  'area_ha': 6800000,
  'area_acres': 16800000},
 {'state': 'Queensland',
  'fatalities': 0,
  'homes_lost': 48,
  'area_ha': 2500000,
  'area_acres': 6180000},
 {'state': 'South Australia',
  'fatalities': 3,
  'homes_lost': 151,
  'area_ha': 490000,
  'area_acres': 1210000},
 {'state': 'Tasmania',
  'fatalities': 0,
  'homes_lost': 2,
  'area_ha': 36000,
  'area_acres': 89000},
 {'state': 'Victoria',
  'fatalities': 5,
  'homes_lost': 396,
  'area_ha': 1500000,
  'area_acres': 3710000},
 {'state': 'Western Australia',
  'fatalities': 0,
  'homes_lost': 1,
  'area_ha': 2200000,
  'area_acres': 5440000}]

## List of major bushfires in Australia

Scrape data related to past Australian bushfire seasons from the following site:
<https://en.wikipedia.org/wiki/List_of_major_bushfires_in_Australia>

In [10]:
# Download the page. requests.get() only returns once the whole response has
# arrived, so no extra wait is needed before parsing.
url = "https://en.wikipedia.org/wiki/List_of_major_bushfires_in_Australia"
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()

# Scrape page into soup.
soup = bs(response.text, "lxml")

In [11]:
# Use the first table found on the page.
table = soup.find_all('table')[0]

dates = []
names = []
states = []
area_ha = []
area_acres = []
fatalities = []
homes_destroyed = []

# Destination list for each <td>, in the table's left-to-right column order.
columns = [dates, names, states, area_ha, area_acres, fatalities, homes_destroyed]

for row in table.tbody.find_all('tr'):
    # Read the row's cells once, stripping the trailing newline from each.
    cells = [cell.text.rstrip("\n") for cell in row.find_all('td')]
    if not cells:
        # A row without any <td> cell carries no data.
        print('This is a header row.')
        continue
    if len(cells) < len(columns):
        # Append nothing for an incomplete row: a partial append would leave
        # the lists with different lengths and misalign every later row.
        print(f'Skipping row with {len(cells)} of {len(columns)} expected cells.')
        continue
    # zip() stops after the last destination list, so extra cells are ignored.
    for column, cell_text in zip(columns, cells):
        column.append(cell_text)

australia_fire_seasons_dict = {
    "date": dates,
    "name": names,
    "state": states,
    "area_ha": area_ha,
    "area_acres": area_acres,
    "fatalities": fatalities,
    "homes_destroyed": homes_destroyed,
}

print(australia_fire_seasons_dict)

This is a header row.
This is a header row.
{'date': ['6 February 1851', '1 February 1898', 'February – March 1926', 'December 1938 – January 1939', '14 January – 14 February 1944', '18 November 1944', 'November 1951 – January 1952', '2 January 1955', '30 November 1957', '2 December 1957', 'January – March 1961', '14 – 16 January 1962', '16 February – 13 March 1965', '5 – 14 March 1965', '7 February 1967', '1968 – 69[clarification needed]', '29 November 1968', '8 January 1969', '1969 – 70[clarification needed]', '1974 – 1975 summer fire season(defined as October 1974 to February 1975 in Queensland only)', '12 February 1977', '17 December 1977', '4 April 1978', 'December 1979', '3 November 1980', '9 January 1983', '16 February 1983', '25 December 1984', 'Mid-January 1985', '1984 – 1985 season[clarification needed]', '14 January 1985', '27 December 1993 – 16 January 1994', '8 January 1997', '21 January 1997', '2 December 1997', '2 December 1997', '2 December 1997', '2 December 1998', '25

In [12]:
# Build the dataframe: load each dictionary key as a row, then flip the result
# so that the keys become the columns.
seasons_by_field = pd.DataFrame.from_dict(australia_fire_seasons_dict, orient='index')
australia_fire_seasons_df = seasons_by_field.T

australia_fire_seasons_df

Unnamed: 0,date,name,state,area_ha,area_acres,fatalities,homes_destroyed
0,6 February 1851,Black Thursday bushfires,Victoria,5000000,12000000,approx. 12,Nil
1,1 February 1898,Red Tuesday bushfires,Victoria,260000,640000,12,Nil
2,February – March 1926,1926 bushfires,Victoria,390000,960000,60,1000
3,December 1938 – January 1939,Black Friday bushfires,Victoria,2000000,4900000,71,3700
4,14 January – 14 February 1944,1944 Victorian bushfires,Victoria,1000000,2500000,15–20,approx. 500
...,...,...,...,...,...,...,...
63,25 November – 2 December 2015,2015 Pinery bushfire,South Australia,85000,210000,2,91
64,January 2016,2016 Murray Road bushfire (Waroona and Harvey),Western Australia,69165,170910,2,181
65,11 – 14 February 2017,2017 New South Wales bushfires,New South Wales,52000,130000,Nil,35
66,early February 2019,Tingha bushfire,New South Wales,23419,57870,Nil,19


In [13]:
# Drop nas/null values.
# Empty area_ha strings are turned into NaN first so that dropna() removes
# those rows as well. The result is assigned back rather than using
# inplace=True on a selected column, which leaves the dataframe unchanged
# when pandas copy-on-write is active.
australia_fire_seasons_df = australia_fire_seasons_df.assign(
    area_ha=australia_fire_seasons_df['area_ha'].replace('', np.nan)
).dropna(how="any")

In [14]:
# Count the non-null entries per column; equal counts across all columns mean
# no column is missing values for any remaining row.
australia_fire_seasons_df.notna().sum()

date               57
name               57
state              57
area_ha            57
area_acres         57
fatalities         57
homes_destroyed    57
dtype: int64

In [15]:
# Check data types: show the dtype pandas currently holds for each column.
australia_fire_seasons_df.dtypes

date               object
name               object
state              object
area_ha            object
area_acres         object
fatalities         object
homes_destroyed    object
dtype: object

In [16]:
# Clean up fatalities column
print(australia_fire_seasons_df["fatalities"].values)

# Swap the listed non-numeric cell values for plain digit strings (the range
# keeps its lower bound). The cleaned column is assigned back instead of
# using inplace=True on a selected column, which leaves the dataframe
# unchanged when pandas copy-on-write is active.
australia_fire_seasons_df["fatalities"] = australia_fire_seasons_df["fatalities"].replace({
    "Nil": "0",
    "approx. 12": "12",
    "15–20": "15",
    "5[e]": "5"
})

print(australia_fire_seasons_df["fatalities"].values)

['approx. 12' '12' '60' '71' '15–20' '11' '2' 'Nil' 'Nil' '3' '62' 'Nil'
 'Nil' '6' '4' '2' '2' '5[e]' '75' 'Nil' 'Nil' '5' '3' '4' 'Nil' '3' '2'
 'Nil' 'Nil' '4' 'Nil' '2' '9' 'Nil' '4' '2' 'Nil' '1' '5' 'Nil' '1' '3'
 '173' '1' 'Nil' '1' 'Nil' 'Nil' 'Nil' 'Nil' '4' 'Nil' '2' '2' 'Nil' 'Nil'
 '34[j]']
['12' '12' '60' '71' '15' '11' '2' '0' '0' '3' '62' '0' '0' '6' '4' '2'
 '2' '5' '75' '0' '0' '5' '3' '4' '0' '3' '2' '0' '0' '4' '0' '2' '9' '0'
 '4' '2' '0' '1' '5' '0' '1' '3' '173' '1' '0' '1' '0' '0' '0' '0' '4' '0'
 '2' '2' '0' '0' '34[j]']


In [17]:
# Clean up area_ha column
print(australia_fire_seasons_df["area_ha"].values)

# Replace the one range value with its lower bound. The cleaned column is
# assigned back instead of using inplace=True on a selected column, which
# leaves the dataframe unchanged when pandas copy-on-write is active.
australia_fire_seasons_df["area_ha"] = australia_fire_seasons_df["area_ha"].replace(
    {"39,000–160,000": "39,000"}
)

print(australia_fire_seasons_df["area_ha"].values)

['5,000,000' '260,000' '390,000' '2,000,000' '1,000,000' '4,000,000'
 '39,000–160,000' '1,800,000' '315,000' '251,000' '264,000' '40,000,000'
 '45,000,000' '117,000,000' '103,000' '54,000' '114,000' '1,000,000'
 '418,000' '500,000' '516,000' '3,500,000' '50,800' '400,000' '10,500'
 '400' '23,000' '753,314' '38,000,000' '160,000' '1,300,000' '2,110,000'
 '77,964' '30,000' '160,000' '184,000' '9,000' '1,048,000' '1,360,000'
 '12,000' '95,000' '40,000' '450,000' '20,000' '54,000' '100,000'
 '800,000' '20,000' '98,923' '52,373' '200,000' '10,016' '85,000' '69,165'
 '52,000' '23,419' '18,626,000']
['5,000,000' '260,000' '390,000' '2,000,000' '1,000,000' '4,000,000'
 '39,000' '1,800,000' '315,000' '251,000' '264,000' '40,000,000'
 '45,000,000' '117,000,000' '103,000' '54,000' '114,000' '1,000,000'
 '418,000' '500,000' '516,000' '3,500,000' '50,800' '400,000' '10,500'
 '400' '23,000' '753,314' '38,000,000' '160,000' '1,300,000' '2,110,000'
 '77,964' '30,000' '160,000' '184,000' '9,000' '1,048

In [18]:
# Clean up area_acres column
print(australia_fire_seasons_df["area_acres"].values)

# Replace the one range value with its lower bound. The cleaned column is
# assigned back instead of using inplace=True on a selected column, which
# leaves the dataframe unchanged when pandas copy-on-write is active.
australia_fire_seasons_df["area_acres"] = australia_fire_seasons_df["area_acres"].replace(
    {"96,000–395,000": "96,000"}
)

print(australia_fire_seasons_df["area_acres"].values)

['12,000,000' '640,000' '960,000' '4,900,000' '2,500,000' '9,900,000'
 '96,000–395,000' '4,400,000' '780,000' '620,000' '650,000' '99,000,000'
 '110,000,000' '290,000,000' '250,000' '130,000' '280,000' '2,500,000'
 '1,030,000' '1,200,000' '1,280,000' '8,600,000' '126,000' '990,000'
 '26,000' '990' '57,000' '1,861,480' '94,000,000' '400,000' '3,200,000'
 '5,200,000' '192,650' '74,000' '400,000' '450,000' '22,000' '2,590,000'
 '3,400,000' '30,000' '230,000' '99,000' '1,100,000' '49,000' '130,000'
 '250,000' '2,000,000' '49,000' '244,440' '129,420' '490,000' '24,750'
 '210,000' '170,910' '130,000' '57,870' '46,030,000']
['12,000,000' '640,000' '960,000' '4,900,000' '2,500,000' '9,900,000'
 '96,000' '4,400,000' '780,000' '620,000' '650,000' '99,000,000'
 '110,000,000' '290,000,000' '250,000' '130,000' '280,000' '2,500,000'
 '1,030,000' '1,200,000' '1,280,000' '8,600,000' '126,000' '990,000'
 '26,000' '990' '57,000' '1,861,480' '94,000,000' '400,000' '3,200,000'
 '5,200,000' '192,650' '74,0

In [19]:
# Remove homes_destroyed column.
# errors="ignore" lets this cell be re-run without a KeyError once the column
# is already gone.
australia_fire_seasons_df = australia_fire_seasons_df.drop(columns="homes_destroyed", errors="ignore")

In [20]:
# Remove the final row (2019–20 Australian bushfire season) by its index label.
# Note: every run of this cell removes one more row.
last_row_label = australia_fire_seasons_df.index[-1:]
australia_fire_seasons_df.drop(index=last_row_label, inplace=True)

In [21]:
# Parse the fatalities strings as numbers; errors='coerce' turns any value
# that cannot be parsed into NaN instead of raising.
fatalities_numeric = pd.to_numeric(australia_fire_seasons_df['fatalities'], errors='coerce')
australia_fire_seasons_df['fatalities'] = fatalities_numeric

In [22]:
# Check data types: show each column's dtype after the fatalities conversion.
australia_fire_seasons_df.dtypes

date          object
name          object
state         object
area_ha       object
area_acres    object
fatalities     int64
dtype: object

In [23]:
# Remove commas from numbers in the area_ha and area_acres columns.
# regex=True makes replace() act on the comma inside each string rather than
# on whole-cell matches. The cleaned columns are assigned back instead of
# using inplace=True on a selected column, which leaves the dataframe
# unchanged when pandas copy-on-write is active.
australia_fire_seasons_df['area_ha'] = australia_fire_seasons_df['area_ha'].replace(',', '', regex=True)
australia_fire_seasons_df['area_acres'] = australia_fire_seasons_df['area_acres'].replace(',', '', regex=True)

australia_fire_seasons_df.head()

Unnamed: 0,date,name,state,area_ha,area_acres,fatalities
0,6 February 1851,Black Thursday bushfires,Victoria,5000000,12000000,12
1,1 February 1898,Red Tuesday bushfires,Victoria,260000,640000,12
2,February – March 1926,1926 bushfires,Victoria,390000,960000,60
3,December 1938 – January 1939,Black Friday bushfires,Victoria,2000000,4900000,71
4,14 January – 14 February 1944,1944 Victorian bushfires,Victoria,1000000,2500000,15


In [24]:
# Parse both area columns as numbers (values that cannot be parsed become NaN),
# then list the resulting dtypes.
for area_column in ('area_ha', 'area_acres'):
    australia_fire_seasons_df[area_column] = pd.to_numeric(
        australia_fire_seasons_df[area_column], errors='coerce'
    )

australia_fire_seasons_df.dtypes

date          object
name          object
state         object
area_ha        int64
area_acres     int64
fatalities     int64
dtype: object

In [25]:
# Convert dataframe to a list of dictionaries, one per row.
# 'records' is the documented orient for this shape. 'range' is not a valid
# orient: older pandas only accepted it because orients were matched by their
# first letter, and pandas 2.x raises a ValueError for it.
australia_fire_seasons_dict = australia_fire_seasons_df.to_dict('records')

australia_fire_seasons_dict

[{'date': '6 February 1851',
  'name': 'Black Thursday bushfires',
  'state': 'Victoria',
  'area_ha': 5000000,
  'area_acres': 12000000,
  'fatalities': 12},
 {'date': '1 February 1898',
  'name': 'Red Tuesday bushfires',
  'state': 'Victoria',
  'area_ha': 260000,
  'area_acres': 640000,
  'fatalities': 12},
 {'date': 'February – March 1926',
  'name': '1926 bushfires',
  'state': 'Victoria',
  'area_ha': 390000,
  'area_acres': 960000,
  'fatalities': 60},
 {'date': 'December 1938 – January 1939',
  'name': 'Black Friday bushfires',
  'state': 'Victoria',
  'area_ha': 2000000,
  'area_acres': 4900000,
  'fatalities': 71},
 {'date': '14 January – 14 February 1944',
  'name': '1944 Victorian bushfires',
  'state': 'Victoria',
  'area_ha': 1000000,
  'area_acres': 2500000,
  'fatalities': 15},
 {'date': 'November 1951 – January 1952',
  'name': '1951–52 bushfires',
  'state': 'Victoria',
  'area_ha': 4000000,
  'area_acres': 9900000,
  'fatalities': 11},
 {'date': '2 January 1955',
  '