In [1]:
!pip install --quiet geocoder
print('pip install geocoder complete.')

pip install geocoder complete.


In [2]:
import pandas as pd

# Path of the crash dataset CSV.
CRASH = '/kaggle/input/airplane-crashes/Airplane_Crashes_and_Fatalities_Since_1908.csv'

# Columns removed right after loading, and text columns whose missing values
# are replaced with empty strings.
unused_columns = ['Time', 'Flight #', 'cn/In', 'Aboard', 'Fatalities', 'Ground']
text_columns = ['Operator', 'Route', 'Type', 'Registration', 'Summary']

# Load the CSV with 'Date' parsed as datetimes, then trim and clean it.
df = pd.read_csv(filepath_or_buffer=CRASH, parse_dates=['Date'])
df = df.drop(columns=unused_columns)
df[text_columns] = df[text_columns].fillna(value='')

# Calendar year of each crash, used for the per-year views further down.
df['year'] = df['Date'].dt.year
df.head()

Unnamed: 0,Date,Location,Operator,Route,Type,Registration,Summary,year
0,1908-09-17,"Fort Myer, Virginia",Military - U.S. Army,Demonstration,Wright Flyer III,,"During a demonstration flight, a U.S. Army fly...",1908
1,1912-07-12,"AtlantiCity, New Jersey",Military - U.S. Navy,Test flight,Dirigible,,First U.S. dirigible Akron exploded just offsh...,1912
2,1913-08-06,"Victoria, British Columbia, Canada",Private,,Curtiss seaplane,,The first fatal airplane accident in Canada oc...,1913
3,1913-09-09,Over the North Sea,Military - German Navy,,Zeppelin L-1 (airship),,The airship flew into a thunderstorm and encou...,1913
4,1913-10-17,"Near Johannisthal, Germany",Military - German Navy,,Zeppelin L-2 (airship),,Hydrogen gas which was being vented was sucked...,1913


In [3]:
from plotly.express import histogram
# Histogram of crash dates across the whole dataset; the figure is the cell's
# last expression so the notebook renders it.
crash_dates_fig = histogram(df, x='Date')
crash_dates_fig

In [4]:
from plotly.express import scatter
# Count crashes per year and plot the counts with a LOWESS trendline.
# The index and value columns are named explicitly so the frame always has
# 'year' and 'count' columns, independent of how the installed pandas version
# names the result of value_counts().
crashes_per_year = df['year'].value_counts().rename_axis('year').reset_index(name='count')
scatter(data_frame=crashes_per_year, x='year', y='count', trendline='lowess')

The histogram suggests that crashes per year peaked in the 1980s, while the LOWESS trendline on the scatter plot shows that the smoothed crashes-per-year trend peaked in 1977.

In [5]:
# Crash counts for the 50 most frequent values of 'Operator'.
# The columns are named explicitly ('Operator', 'count') so the plot does not
# depend on how the installed pandas version names the value_counts() result.
top_operators = df['Operator'].value_counts().head(n=50).rename_axis('Operator').reset_index(name='count')
histogram(data_frame=top_operators, x='Operator', y='count', title='Top 50 operators by crash count')

In [6]:
# Crash counts for the 50 most frequent values of 'Type'.
# The columns are named explicitly ('Type', 'count') so the plot does not
# depend on how the installed pandas version names the value_counts() result.
top_types = df['Type'].value_counts().head(n=50).rename_axis('Type').reset_index(name='count')
histogram(data_frame=top_types, x='Type', y='count', title='Top 50 types by crash count')

If we combine these two views of the data, we get an entirely different story, because operators use more than one type of aircraft and each type of aircraft is usually used by multiple operators.

In [7]:
# Count crashes for every (Operator, Type) pair and keep the 50 most frequent.
# reset_index(name='count') fixes the name of the count column, so size='count'
# works regardless of how the installed pandas version names the
# value_counts() result.
operator_type_counts = df[['Operator', 'Type']].value_counts().reset_index(name='count').head(n=50)
scatter(data_frame=operator_type_counts, x='Type', y='Operator', size='count', height=900)

We can geocode the crash locations and then plot the crashes on a map, colored by year.

In [8]:
from geocoder import arcgis
from arrow import now

time_start = now()
# this takes about forty minutes so sit tight
# Look up each distinct, non-null location string once with the ArcGIS geocoder
# and keep whatever `.latlng` returns for it.
unique_locations = df['Location'].dropna().unique().tolist()
locations = {}
for place in unique_locations:
    locations[place] = arcgis(location=place).latlng

# The dict is keyed by location, so the frame is built with one column per
# location and then transposed to one row per location.
locations_df = pd.DataFrame.from_dict(data=locations).T
locations_df.columns = ['latitude', 'longitude']

elapsed = now() - time_start
print('found {} locations in {}'.format(len(locations_df), elapsed))

found 4303 locations in 0:42:47.068498


In [9]:
from plotly.express import scatter_mapbox
# Move the location strings out of the index into an 'index' column so they can
# be joined against the 'Location' column of the crash data.
geocoded = locations_df.reset_index()

# Inner join keeps only crashes whose location has an entry in the geocoding
# results; the helper join column is dropped afterwards.
map_df = df.merge(right=geocoded, left_on='Location', right_on='index', how='inner')
map_df = map_df.drop(columns=['index'])

# One marker per crash, colored by year.
scatter_mapbox(
    data_frame=map_df,
    lat='latitude',
    lon='longitude',
    color='year',
    mapbox_style='open-street-map',
    zoom=1,
    height=900,
    hover_data=['Date', 'Location', 'Operator', 'Summary'],
)