# Aircraft Crashes Data Collection And Cleaning

## Overview

Use web scraping to gather the data of all accidents since 1918.

## Data Collection

In [None]:
from bs4 import BeautifulSoup
import math
import pandas as pd
import re
import requests
from urllib.parse import unquote

In [61]:
# Fetch total number of accidents
root_url = 'https://www.baaa-acro.com'

response = requests.get(root_url)

if response.status_code != 200:
	print('Failed to fetch page.')
else:
	soup = BeautifulSoup(response.content, 'html.parser')
	accident_files = soup.find('div', {'class': 'total-accident-files'})
	nb_crashes = int(accident_files.text.replace(',', ''))

In [74]:
# Fetch details of every accident
nb_rows_per_page = 20
nb_pages = math.ceil(nb_crashes / nb_rows_per_page)
crash_list = []

for i in range(nb_pages):
	listing_url = '{}/crash-archives?page={}'.format(root_url, i)
	response = requests.get(listing_url)
	soup = BeautifulSoup(response.content, 'html.parser')
	anchors = soup.find_all('a', {'class': 'red-btn'})

	for a in anchors:
		link = a['href']
		print(link)
		details_url = root_url + link
		response = requests.get(details_url)
		soup = BeautifulSoup(response.content, 'html.parser')
		details = {}
		
		details_div = soup.find('div', {'class': 'crash-details'})
		details['date'] = details_div.find('div', {'class': 'crash-date'}).find('span').next_sibling.text
		details['type_of_aircraft'] = details_div.find('div', {'class': 'crash-aircraft'}).find('a').find('div').text
		operator_div = details_div.find('div', {'class': 'crash-operator'})

		# Extract airline company name from image link
		if (operator_div.find('img')):
			pattern = re.compile(r'(?<=target_id=)[\w\s\-\(\)\.\'\&]+(?= \(\d+\))')
			img_link = unquote(operator_div.find('img').parent['href'])
			details['operator'] = pattern.search(img_link).group(0)
		else:
			details['operator'] = operator_div.find('a').find('div').text

		reg_div = details_div.find('div', {'class': 'crash-registration'})
		details['registration'] = reg_div.find('div').text if reg_div else None
		
		details['flight_phase'] = details_div.find('div', {'class': 'crash-flight-phase'}).find('a').find('div').text
		details['flight_type'] = details_div.find('div', {'class': 'crash-flight-type'}).find('a').find('div').text
		
		survivors_div = details_div.find('div', {'class': 'crash-survivors'})
		details['survivors'] = survivors_div.find('a').find('div').text if survivors_div else None
		
		details['site'] = details_div.find('div', {'class': 'crash-site'}).find('a').find('div').text
		
		schedule_div = details_div.find('div', {'class': 'crash-schedule'})
		details['schedule'] = schedule_div.find('div').text if schedule_div else None
		
		msn_div = details_div.find('div', {'class': 'crash-construction-num'})
		details['msn'] = msn_div.find('div').text if msn_div else None
		
		yom_div = details_div.find('div', {'class': 'crash-yom'})
		details['yom'] = yom_div.find('div').text if yom_div else None
		
		location_details = details_div.find('div', {'class': 'crash-location'}).select('a')
		details['location'] = ', '.join(item.text.strip() for item in location_details)
		
		details['country'] = details_div.find('div', {'class': 'crash-country'}).find('a').find('div').text
		details['region'] = details_div.find('div', {'class': 'crash-region'}).find('a').find('div').text
		details['crew_on_board'] = details_div.find('div', {'class': 'crash-crew-on-board'}).find('div').text
		details['crew_fatalities'] = details_div.find('div', {'class': 'crash-crew-fatalities'}).find('div').text
		details['pax_on_board'] = details_div.find('div', {'class': 'crash-pax-on-board'}).find('div').text
		details['pax_fatalities'] = details_div.find('div', {'class': 'crash-pax-fatalities'}).find('div').text
		
		others_div = details_div.find('div', {'class': 'crash-other-fatalities'})
		details['other_fatalities'] = others_div.find('div').text if others_div else None
		
		details['total_fatalities'] = details_div.find('div', {'class': 'crash-total-fatalities'}).find('div').text
		details['circumstances'] = details_div.find('div', {'class': 'crash-circumstances'}).find('div').text
		
		crash_list.append(details)

# Export data to CSV
crashes_df = pd.DataFrame(crash_list)
crashes_df.to_csv('data/crashes_scraped_data.csv', index=False)

/crash/crash-cessna-525a-citation-cj2-mesquite-metro-1-killed
/crash/crash-antonov-32-bagdogra
/crash/crash-bae-3201-jetstream-32ep-gueppi
/crash/crash-antonov-26-omdurman-46-killed
/crash/crash-ilyushin-ii-76td-near-nyala-7-killed-0
/crash/crash-canadair-regionaljet-crj-900lr-toronto
/crash/crash-cessna-208a-cargomaster-codys-corner-1-killed
/crash/crash-learjet-35a-scottsdale-1-killed
/crash/crash-antonov-26b-100-ndele
/crash/crash-beechcraft-f90-king-air-sao-paulo-2-killed
/crash/crash-cessna-208b-grand-caravan-ex-norton-sound-10-killed
/crash/crash-beechcraft-350-super-king-air-ampatuan-4-killed
/crash/crash-socata-tbm-700-c2-leuzigen
/crash/crash-pzl-mielec-2r-arkalyk
/crash/crash-learjet-55-philadelphia-7-killed
/crash/crash-canadair-crj-701er-washington-dc-64-killed
/crash/crash-beechcraft-1900d-gpoc-unity-20-killed
/crash/crash-cessna-s550-citation-sii-el-volcan-3-killed
/crash/ground-fire-airbus-a321-busan
/crash/crash-antonov-26b-kongolo
/crash/crash-piper-pa-31-325-navajo-cr

AttributeError: 'NoneType' object has no attribute 'find'

---

## Data Cleaning

## End