In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
import re
from dateutil.parser import parse
from datetime import datetime
import pycountry_convert as pc

### Defined functions

In [2]:
# Change column names to name_part
def change_col_names(df: pd.DataFrame, old_symbol: str, new_symbol: str) -> pd.DataFrame:
    """Lower-case every column name and replace one substring with another.

    Each column label is first lower-cased; afterwards every occurrence of
    ``old_symbol`` is replaced with ``new_symbol``. Because lower-casing
    happens first, ``old_symbol`` is matched against the lower-cased labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels are rewritten. The labels are assigned back
        to ``df.columns``, so the frame passed in is modified in place.
    old_symbol : str
        Literal text to look for in each column name (not a regular expression).
    new_symbol : str
        Text that replaces every occurrence of ``old_symbol``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, returned for convenient display or chaining.
    """
    # regex=False keeps the match literal regardless of the pandas version's
    # default, so symbols such as '.' are never interpreted as a regex pattern.
    df.columns = df.columns.str.lower().str.replace(old_symbol, new_symbol, regex=False)
    return df

# Dataset 1 - Historical Plane Crash Data 
This dataset contains all of the plane crashes that happened throughout history. Data are extracted using web scraping methods in R and preprocessed to create an organized and informative dataset.
All data are collected from Ronan Hubert: [Aviation Accident Bureau Archive](https://www.baaa-acro.com/)
The dataset is available [here](https://www.kaggle.com/datasets/abeperez/historical-plane-crash-data).

### 2.1. Reading, analyzing the dataset

In [3]:
# Load the plane-crash CSV into a DataFrame (the path is relative to the working directory)
avioset_aab = pd.read_csv('data/dataset_1_accident_bureau/Plane Crashes.csv')

In [4]:
# 28536 rows × 24 columns
avioset_aab

Unnamed: 0,Date,Time,Aircraft,Operator,Registration,Flight phase,Flight type,Survivors,Crash site,Schedule,...,Country,Region,Crew on board,Crew fatalities,Pax on board,PAX fatalities,Other fatalities,Total fatalities,Circumstances,Crash cause
0,1918-05-02,,De Havilland DH.4,United States Signal Corps - USSC,AS-32084,Takeoff (climb),Test,No,Airport (less than 10 km from airport),Dayton - Dayton,...,United States of America,North America,2.0,2.0,0.0,0.0,0.0,2,The single engine airplane departed Dayton-McC...,Technical failure
1,1918-06-08,,Handley Page V/1500,Handley Page Aircraft Company Ltd,E4104,Takeoff (climb),Test,Yes,Airport (less than 10 km from airport),Cricklewood - Cricklewood,...,United Kingdom,Europe,6.0,5.0,0.0,0.0,0.0,5,"Assembled at Cricklewood Airfield in May 1918,...",Technical failure
2,1918-06-11,,Avro 504,Royal Air Force - RAF,A8544,Flight,Training,Yes,"Plain, Valley",Abukir - Abukir,...,Egypt,Africa,2.0,1.0,0.0,0.0,0.0,1,The single engine aircraft was completing a lo...,Unknown
3,1918-06-19,,De Havilland DH.4,United States Signal Corps - USSC,AS-32098,Flight,Military,No,Airport (less than 10 km from airport),Wright Patterson AFB-Wright Patterson AFB,...,United States of America,North America,1.0,1.0,0.0,0.0,0.0,1,"Lt. Frank Stuart Patterson, son and nephew of ...",Technical failure
4,1918-06-24,,Breguet 14,French Air Force - Armée de l'Air,AS-4130,Landing (descent or approach),Military,Yes,,,...,France,Europe,,0.0,,0.0,0.0,0,The aircraft crashed iupon landing somewhere i...,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28531,2022-05-20,,Embraer EMB-110 Bandeirante,Sales Serviços Aéreos,PT-SHN,Landing (descent or approach),Cargo,Yes,Airport (less than 10 km from airport),Jundiaí – Eldorado do Sul,...,Brazil,South America,2.0,0.0,0.0,0.0,0.0,0,The airplane departed Jundiaí on a cargo fligh...,Unknown
28532,2022-05-23,18H 29M 0S,Piper PA-61 Aerostar (Ted Smith 601),Raul Ignacion Posada,N66CG,Flight,Private,No,"Plain, Valley",Celaya - Durango,...,Mexico,Central America,1.0,1.0,1.0,1.0,0.0,2,While approaching Durango Airport on a flight ...,Unknown
28533,2022-05-24,15H 40M 0S,De Havilland DHC-3 Otter,Yakutat Coastal Airlines,N703TH,Landing (descent or approach),Charter/Taxi (Non Scheduled Revenue Flight),Yes,Airport (less than 10 km from airport),Yakutat – Dry Bay,...,United States of America,North America,1.0,0.0,3.0,0.0,0.0,0,The single engine airplane departed Yakutat on...,Unknown
28534,2022-05-29,10H 7M 0S,De Havilland DHC-6 Twin Otter,Tara Air,9N-AET,Flight,Scheduled Revenue Flight,No,Mountains,Pokhara – Jomsom,...,Nepal,Asia,3.0,3.0,19.0,19.0,0.0,22,The twin engine airplane departed Pokhara City...,Human factor


<div style="background-color: #9df9ef; padding: 10px;"></div>

### 2.2. Dataset cleaning, normalization

#### 2.2.1. Let's see what data types we have in all columns.

In [5]:
avioset_aab.dtypes

Date                 object
Time                 object
Aircraft             object
Operator             object
Registration         object
Flight phase         object
Flight type          object
Survivors            object
Crash site           object
Schedule             object
MSN                  object
YOM                 float64
Flight no.          float64
Crash location       object
Country              object
Region               object
Crew on board       float64
Crew fatalities     float64
Pax on board        float64
PAX fatalities      float64
Other fatalities    float64
Total fatalities      int64
Circumstances        object
Crash cause          object
dtype: object

<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.2. Let's change column names like `Flight phase` into `flight_phase`.

In [6]:
# Work on an independent copy so we don't lose the original:
# plain assignment would only create a second name for the same DataFrame,
# and the in-place column renames that follow would then alter avioset_aab too.
avioset_aab_low = avioset_aab.copy()

In [7]:
# def change_col_names(df, old_symbol, new_symbol):
#    df.columns = df.columns.str.lower().str.replace(old_symbol, new_symbol)
#    return df

In [11]:
# Lower-case all column names and replace spaces with underscores.
# change_col_names assigns to df.columns, so avioset_aab_low is modified in place.
change_col_names(avioset_aab_low, ' ', '_')

Unnamed: 0,date,time,aircraft,operator,registration,flight_phase,flight_type,survivors,crash_site,schedule,...,country,region,crew_on_board,crew_fatalities,pax_on_board,pax_fatalities,other_fatalities,total_fatalities,circumstances,crash_cause
0,1918-05-02,,De Havilland DH.4,United States Signal Corps - USSC,AS-32084,Takeoff (climb),Test,No,Airport (less than 10 km from airport),Dayton - Dayton,...,United States of America,North America,2.0,2.0,0.0,0.0,0.0,2,The single engine airplane departed Dayton-McC...,Technical failure
1,1918-06-08,,Handley Page V/1500,Handley Page Aircraft Company Ltd,E4104,Takeoff (climb),Test,Yes,Airport (less than 10 km from airport),Cricklewood - Cricklewood,...,United Kingdom,Europe,6.0,5.0,0.0,0.0,0.0,5,"Assembled at Cricklewood Airfield in May 1918,...",Technical failure
2,1918-06-11,,Avro 504,Royal Air Force - RAF,A8544,Flight,Training,Yes,"Plain, Valley",Abukir - Abukir,...,Egypt,Africa,2.0,1.0,0.0,0.0,0.0,1,The single engine aircraft was completing a lo...,Unknown
3,1918-06-19,,De Havilland DH.4,United States Signal Corps - USSC,AS-32098,Flight,Military,No,Airport (less than 10 km from airport),Wright Patterson AFB-Wright Patterson AFB,...,United States of America,North America,1.0,1.0,0.0,0.0,0.0,1,"Lt. Frank Stuart Patterson, son and nephew of ...",Technical failure
4,1918-06-24,,Breguet 14,French Air Force - Armée de l'Air,AS-4130,Landing (descent or approach),Military,Yes,,,...,France,Europe,,0.0,,0.0,0.0,0,The aircraft crashed iupon landing somewhere i...,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28531,2022-05-20,,Embraer EMB-110 Bandeirante,Sales Serviços Aéreos,PT-SHN,Landing (descent or approach),Cargo,Yes,Airport (less than 10 km from airport),Jundiaí – Eldorado do Sul,...,Brazil,South America,2.0,0.0,0.0,0.0,0.0,0,The airplane departed Jundiaí on a cargo fligh...,Unknown
28532,2022-05-23,18H 29M 0S,Piper PA-61 Aerostar (Ted Smith 601),Raul Ignacion Posada,N66CG,Flight,Private,No,"Plain, Valley",Celaya - Durango,...,Mexico,Central America,1.0,1.0,1.0,1.0,0.0,2,While approaching Durango Airport on a flight ...,Unknown
28533,2022-05-24,15H 40M 0S,De Havilland DHC-3 Otter,Yakutat Coastal Airlines,N703TH,Landing (descent or approach),Charter/Taxi (Non Scheduled Revenue Flight),Yes,Airport (less than 10 km from airport),Yakutat – Dry Bay,...,United States of America,North America,1.0,0.0,3.0,0.0,0.0,0,The single engine airplane departed Yakutat on...,Unknown
28534,2022-05-29,10H 7M 0S,De Havilland DHC-6 Twin Otter,Tara Air,9N-AET,Flight,Scheduled Revenue Flight,No,Mountains,Pokhara – Jomsom,...,Nepal,Asia,3.0,3.0,19.0,19.0,0.0,22,The twin engine airplane departed Pokhara City...,Human factor


In [13]:
# The original 'Flight no.' header keeps its trailing dot after the
# space-to-underscore step, so rename 'flight_no.' to 'flight_no'.
avioset_aab_low.rename(columns={'flight_no.': 'flight_no'}, inplace=True)

In [14]:
avioset_aab_low.columns

Index(['date', 'time', 'aircraft', 'operator', 'registration', 'flight_phase',
       'flight_type', 'survivors', 'crash_site', 'schedule', 'msn', 'yom',
       'flight_no', 'crash_location', 'country', 'region', 'crew_on_board',
       'crew_fatalities', 'pax_on_board', 'pax_fatalities', 'other_fatalities',
       'total_fatalities', 'circumstances', 'crash_cause'],
      dtype='object')

<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.3. Let's analyze the NaN values in each column
- TODO

<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.4. Let's look at the unique values in each column 
- TODO

<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.5. Let's analyze columns 6, 7, 28 
- TODO

<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.6. How many NaN values do we have? Should we replace them or not?
- TODO


<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.7. Let's convert date columns into datetime 
- TODO

<div style="background-color: #9df9ef; padding: 10px;"></div>

#### 2.2.8. Let's take a look at the summary statistics for the numerical columns. It's a good idea to change some dtypes first!

#### 2.2.9. Flight phases
Picture is from https://docs.flybywiresim.com/pilots-corner/advanced-guides/flight-phases/
![Flight phases](data/images/flight-phases.png)

<div style="background-color: #9df9ef; padding: 10px;"></div>