#### Install the necessary package.

In [1]:
!pip install watson-developer-cloud==1.5



#### We start by importing the following libraries.

In [2]:
import requests
import urllib.request
import time
import datetime
import csv
import re
from bs4 import BeautifulSoup

#### I've pre-defined some redundant functions here so as to reduce the complexity and size of the code

In [3]:
#function to check if we have access to the website
def access_to_server(a):
    if(access.status_code == 200):
        print("Access to PlaneCrashInfo.com: Granted")
    else:
        print("Access to PlaneCrashInfo.com: Denied")
        
#function to pause for a little so that the website server does not flag us as a spammer
def take_a_break():
    time.sleep(0.1)      #we pause for 3 seconds
    print("Sleeping for few seconds...")

#function to access the website
def access_website(url):
    access = None
    access = requests.get(url,timeout=10)
    return access   

def get_data(search_string, regex):
    pattern = re.compile(regex)
    matches = pattern.finditer(search_string)

    for match in matches:
        match = match.span()

    span = search_string[match[0]:match[1]]
    return span

#### Next, we set the url to the Plane Crash Info website and access the site with our requests library. We also set the base url to create a dictionary with years as the key.

In [4]:
base_url = "http://www.planecrashinfo.com"
access = requests.get('http://www.planecrashinfo.com/database.htm')
access_to_server(access)
source = BeautifulSoup(access.text, 'html.parser')

Access to PlaneCrashInfo.com: Granted


 #### Here we build a dictionary with the url stores as the value and the year as the key

In [5]:
a_tags = source.find_all("a")
years = {a.text.strip(): {"url": base_url + a["href"] if a["href"][0] == "/" else base_url + "/" + a["href"]} for a in a_tags if a.text.strip().isdigit()}

#### Here we pause for a little to avoid getting flagged as a spammer.

In [6]:
take_a_break()

Sleeping for few seconds...


#### Now we start collecting the unstructured data

In [7]:
done = 0
for year, database in years.items():       #first we loop for each year from 1920 until 2019
    print(year)
    take_a_break()
    response_year = access_website(database["url"])
    access_to_server(response_year)
    parser_year = BeautifulSoup(response_year.content, 'html.parser')
#    if done == 0:
#        break
    a_tags2 = parser_year.find_all("a")
    a_tags2 = [a for a in a_tags2 if "Return to Home" not in a.text]
    for a in a_tags2:

            # request to crash detail page
            response_crash = access_website(base_url + "/" + year + a["href"] if a["href"][0] == "/" else base_url + "/" + year + "/" + a["href"])
            parser_crash = BeautifulSoup(response_crash.content, 'html.parser')

            # get all table content except the first row(table title)
            tr_tags = parser_crash.find_all("tr")
            tr_tags = tr_tags[1:]

            database = [tr.find_all("td")[1].text.strip() for tr in tr_tags]
            # get all aboard
            aboard = database[9]
            all_aboard = get_data(aboard, r'^\d+|^\W')

            # get all passengers aboard
            passengers_aboard = get_data(aboard, r'(?<=\(passengers:)\d+|(?<=\(passengers:)\W')

            # get all crew aboard
            crew_aboard = get_data(
                aboard, r'(?<=crew:)\d+|(?<=crew:)\W')

            # get all fatalities
            fatalities = database[10]

            all_fatalities = get_data(fatalities, r'^\d+|^\W')

            # get all passengers fatalities
            passenger_fatalities = get_data(fatalities, r'(?<=\(passengers:)\d+|(?<=\(passengers:)\W')

            # get all crew fatalities
            crew_fatalities = get_data(fatalities, r'(?<=crew:)\d+|(?<=crew:)\W')

            # remove aboard info and store each value separately
            database.pop(9)
            database.insert(9, all_aboard)
            database.insert(10, passengers_aboard)
            database.insert(11, crew_aboard)

            # remove fatalities info and store each value separately
            database.pop(12)
            database.insert(12, all_fatalities)
            database.insert(13, passenger_fatalities)
            database.insert(14, crew_fatalities)
            take_a_break()

1920
Sleeping for few seconds...
Access to PlaneCrashInfo.com: Granted
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleeping for few seconds...
Sleep

IndexError: list index out of range

In [11]:
import re
import requests
from bs4 import BeautifulSoup
import datetime
import csv
from time import sleep


def get_span_with_regex(search_string, regex):
    pattern = re.compile(regex)
    matches = pattern.finditer(search_string)

    for match in matches:
        match = match.span()

    span = search_string[match[0]:match[1]]
    return span


def request_to_server(url):
    response = None
    try:
        response = requests.get(url, timeout=10)
        return response
    except:
        print("sleep for 10 seconds")
        sleep(10)
        response = request_to_server(url)
        return response


base_uri = "http://www.planecrashinfo.com"
year_data = {}
csv_writer = csv.writer(open(
    "planecrashinfo_"+datetime.datetime.now().strftime("%Y%m%d%H%M%S")+".csv", 'w'))

# write data header to csv file
csv_writer.writerow(["date", "time", "location", "operator", "flight_no", "route", "ac_type", "registration", "cn_ln", "all_aboard",
                     "passengers_aboard", "crew_aboard", "all_fatalities", "passenger_fatalities", "crew_fatalities", "ground", "summary"])

# request to list of years
response = request_to_server("http://www.planecrashinfo.com/database.htm")


if(response.status_code == 200):
    parser = BeautifulSoup(response.content, 'html.parser')
    a_tags = parser.find_all("a")
    # build a dict contain the year as key and page url as value
    year_data = {a.text.strip(): {"url": base_uri + a["href"] if a["href"][0] ==
                                  "/" else base_uri + "/" + a["href"]} for a in a_tags if a.text.strip().isdigit()}

    # loop for each year
    for year, data in year_data.items():
        print(year)
        if year == 1970:
            break
        # request to list of year's crashes
        response_year = request_to_server(data["url"])
        parser_year = BeautifulSoup(response_year.content, 'html.parser')

        # get all <a> tag except  "Return to Home" link
        a_tags = parser_year.find_all("a")
        a_tags = [a for a in a_tags if "Return to Home" not in a.text]

        for a in a_tags:
            # print("-", sep=' ', end='', flush=True)

            # request to crash detail page
            response_crash = request_to_server(
                base_uri + "/" + year + a["href"] if a["href"][0] == "/" else base_uri + "/" + year + "/" + a["href"])
            parser_crash = BeautifulSoup(response_crash.content, 'html.parser')

            # get all table content except the first row(table title)
            tr_tags = parser_crash.find_all("tr")
            tr_tags = tr_tags[1:]

            # write data to csv file
            data = [tr.find_all("td")[1].text.strip() for tr in tr_tags]

            # get all aboard
            aboard = data[9]
            # pattern = re.compile(r'^\d+|^\W')
            # matches = pattern.finditer(aboard)

            # for match in matches:
            #     all_aboard_span = match.span()

            # all_aboard = aboard[all_aboard_span[0]:all_aboard_span[1]]
            all_aboard = get_span_with_regex(aboard, r'^\d+|^\W')

            # get all passengers aboard
            passengers_aboard = get_span_with_regex(
                aboard, r'(?<=\(passengers:)\d+|(?<=\(passengers:)\W')

            # get all crew aboard
            crew_aboard = get_span_with_regex(
                aboard, r'(?<=crew:)\d+|(?<=crew:)\W')

            # get all fatalities
            fatalities = data[10]

            all_fatalities = get_span_with_regex(fatalities, r'^\d+|^\W')

            # get all passengers fatalities
            passenger_fatalities = get_span_with_regex(
                fatalities, r'(?<=\(passengers:)\d+|(?<=\(passengers:)\W')

            # get all crew fatalities
            crew_fatalities = get_span_with_regex(
                fatalities, r'(?<=crew:)\d+|(?<=crew:)\W')

            # remove aboard info and store each value separately
            data.pop(9)
            data.insert(9, all_aboard)
            data.insert(10, passengers_aboard)
            data.insert(11, crew_aboard)

            # remove fatalities info and store each value separately
            data.pop(12)
            data.insert(12, all_fatalities)
            data.insert(13, passenger_fatalities)
            data.insert(14, crew_fatalities)

            csv_writer.writerow(data)

            # sleep to overcome Server connection refused error
            sleep(0.1)
else:
    print("Cannot fetch data")

1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970


IndexError: list index out of range