In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [24]:
#Import the data
df = pd.read_csv('../data/AviationData.csv', encoding='latin-1')
state_codes = pd.read_csv('../data/USState_Codes.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [25]:
# Change the date column to a datetime object
df['Event.Date'] = pd.to_datetime(df['Event.Date'])
#Creating a column for the year
df['Year'] = df['Event.Date'].dt.year
#Making everything in the df lowercase so we can get more accurate counts.
df = df.applymap(lambda x: x.lower() if type(x) == str else x)
# Filtering out 'Amateur.Built' aircraft
df = df[df['Amateur.Built'] == 'no']
# And only keep 'airplane' from the 'Aircraft.Category' column
df = df[df['Aircraft.Category'] == 'airplane']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24417 entries, 5 to 88886
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Event.Id                24417 non-null  object        
 1   Investigation.Type      24417 non-null  object        
 2   Accident.Number         24417 non-null  object        
 3   Event.Date              24417 non-null  datetime64[ns]
 4   Location                24411 non-null  object        
 5   Country                 24410 non-null  object        
 6   Latitude                19171 non-null  object        
 7   Longitude               19165 non-null  object        
 8   Airport.Code            15574 non-null  object        
 9   Airport.Name            16012 non-null  object        
 10  Injury.Severity         23604 non-null  object        
 11  Aircraft.damage         23147 non-null  object        
 12  Aircraft.Category       24417 non-null  object

In [27]:
# Only using US incidents
df = df[df['Country'] == 'united states']

# pull state info from locations
df['State'] = df['Location'].str.split(',').str[-1].str.strip().str.upper()
df['State'].value_counts()

CA    2009
TX    1700
AK    1590
FL    1498
AZ     785
CO     643
GA     629
WA     603
IL     490
NY     487
NC     480
MI     467
ID     456
OH     448
OR     434
PA     433
MO     389
AR     375
WI     363
VA     363
MN     356
UT     336
IN     320
NV     316
OK     311
KS     304
MT     304
AL     302
NM     301
NJ     299
TN     294
LA     290
SC     268
MD     235
MA     219
IA     216
MS     203
NE     194
WY     185
KY     165
ND     148
ME     147
SD     129
CT     123
HI      91
WV      89
NH      83
VT      53
PR      48
        43
RI      41
DE      24
OF      13
AO      11
PO       8
DC       6
GU       2
VI       1
GM       1
Name: State, dtype: int64

In [28]:
# Cleaning up the FAR.Description column using a .replace() 
# See here for more info on FAR: 
# https://pilotinstitute.com/part-91-vs-121-vs-135/

df['FAR.Description'] = df['FAR.Description'].replace({
    '091': 'part 91',
    'part 91: general aviation': 'part 91',
    '137': 'part 137',
    '135': 'part 135',
    '121': 'part 121',
    'part 137: agricultural': 'part 137',
    'part 135: air taxi & commuter': 'part 135',
    '129': 'part 129',
    'part 121: air carrier': 'part 121',
    '133': 'part 133',
    'part 129: foreign': 'part 129',
    'part 133: rotorcraft external load': 'part 133',
    'part 133: rotorcraft ext. load': 'part 133',
    '125' : 'part 125',
    'part 125: 20+ pax,6000+ lbs': 'part 125',
    '107': 'part 107',
    '103': 'part 103',
    '437': 'part 437',
    'part91f': 'part 91',
    'part 91 subpart f': 'part 91',
    'part 91f: special flt ops.': 'part 91',
    'part 91 subpart k: fractional': 'part 91',
    '091k': 'part 91',
    'pubu': 'public',
    'public use': 'public',
    'public aircraft': 'public',
    'armed forces': 'armf',
}
)

df['FAR.Description'].value_counts()

part 91     18388
part 137     1099
part 135      756
part 121      632
public        134
part 129       59
nusc           12
unknown        10
nusn           10
part 125        8
unk             7
part 133        2
armf            2
Name: FAR.Description, dtype: int64

In [29]:
#Now grouping the 'Purpose.of.flight' category into more general categories

df['purpose_binned'] = df['Purpose.of.flight'].replace(['public aircraft', 'public aircraft - federal', 'public aircraft - state', 'public aircraft - local', 'public aircraft - u.s. govt.'], 'public')
# Grouping 'aerial application', 'business', 'other work use', 'banner tow', 'skydiving', 'glider tow', 'air drop' into 'business'
df['purpose_binned'] = df['purpose_binned'].replace(['aerial application', 'business', 'other work use', 'banner tow', 'skydiving', 'glider tow', 'air drop'], 'business')
# Grouping 'executive/corporate', 'ferry' into 'small transport'.
df['purpose_binned'] = df['purpose_binned'].replace(['executive/corporate', 'ferry'], 'small transport')
# Adding more into the public category
df['purpose_binned'] = df['purpose_binned'].replace(['pubs', 'public', 'publ', 'firefighting'], 'public')
# Adding 'flight test' to 'instructional
df['purpose_binned'] = df['purpose_binned'].replace(['flight test'], 'instructional')
# Grouping air show categories
df['purpose_binned'] = df['purpose_binned'].replace(['air race show', 'air race/show', 'asho', 'air show'])

df['purpose_binned'].value_counts()

personal              13078
instructional          3179
business               2167
unknown                 551
positioning             337
small transport         286
aerial observation      155
public                  139
Name: purpose_binned, dtype: int64

## Filtering the 'Make' series


In [32]:
# Checking out the raw 'Make' values
df['Make'].value_counts()

cessna                          7697
piper                           4322
beech                           1522
boeing                           456
mooney                           391
                                ... 
martin charles a                   1
x-air llc                          1
ploof progressive aerodyne's       1
t bird                             1
fernandes dale r                   1
Name: Make, Length: 1015, dtype: int64

In [31]:

# Make a list of all the entries that have "cessna" in them and return their strings in a list
cessna_list = df[df['Make'].str.contains('cessna')]['Make'].unique().tolist()

#Do the same with the other big makes
grumman_list = df[df['Make'].str.contains('grumman')]['Make'].unique().tolist()
dehavilland_list = df[df['Make'].str.contains('havilland')]['Make'].unique().tolist()
cirrus_list = df[df['Make'].str.contains('cirrus')]['Make'].unique().tolist()
piper_list = df[df['Make'].str.contains('piper')]['Make'].unique().tolist()
boeing_list = df[df['Make'].str.contains('boeing')]['Make'].unique().tolist()
mooney_list = df[df['Make'].str.contains('mooney')]['Make'].unique().tolist()
airtractor_list = df[df['Make'].str.contains('air tractor')]['Make'].unique().tolist()

print(cessna_list)
print(grumman_list)
print(dehavilland_list)
print(cirrus_list)
print(piper_list)
print(boeing_list)
print(mooney_list)
print(airtractor_list)

['cessna', 'cessna aircraft co', 'cessna/air repair inc', 'cessna/weaver', 'cessna ector', 'cessna aircraft', 'cessna aircraft co.']
['grumman', 'grumman american', 'grumman-schweizer', 'grumman american avn. corp.', 'grumman american aviation', 'grumman acft eng', 'grumman acft eng cor-schweizer', 'grumman aircraft eng corp', 'grumman american aviation corp', 'grumman aircraft', 'grumman schweizer', 'grumman acft eng cor', 'grumman aircraft cor-schweizer', 'grumman american avn. corp']
['de havilland', 'dehavilland', 'dehavilland canada']
['cirrus design corp.', 'cirrus', 'cirrus design', 'cirrus design corp', 'cirrus design corporation']
['piper', 'piper-aerostar', 'piper aircraft', 'piper/cub crafters', 'piper cub crafters', 'piper aerostar', 'new piper aircraft inc', 'piper aircraft inc', 'jetprop dlx piper', "piper/wally's flyers inc", 'piper aircraft, inc.', 'new piper', 'piper / laudeman', 'piper-harris', 'piper aircraft corporation']
['boeing', 'boeing stearman', 'boeing of can

In [34]:
df['Make'] = df['Make'].replace(cessna_list, 'cessna')
df['Make'] = df['Make'].replace(grumman_list, 'grumman')
df['Make'] = df['Make'].replace(dehavilland_list, 'dehavilland')
df['Make'] = df['Make'].replace(cirrus_list, 'cirrus')
df['Make'] = df['Make'].replace(piper_list, 'piper')
df['Make'] = df['Make'].replace(mooney_list, 'mooney')
df['Make'] = df['Make'].replace(airtractor_list, 'airtractor')

# And we can now see the totals, with slightly different numbers than before.
df['Make'].value_counts()

cessna                            7730
piper                             4368
beech                             1522
boeing                             456
mooney                             437
                                  ... 
siler ronald e                       1
javron                               1
american champion aircraft cor       1
continental                          1
fernandes dale r                     1
Name: Make, Length: 968, dtype: int64