In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the two raw inputs: the US state abbreviation lookup table and the
# aviation accident records (the latter is decoded as latin-1).
state_codes = pd.read_csv('../data/USState_Codes.csv')
df = pd.read_csv('../data/AviationData.csv', encoding='latin-1')

In [None]:
# Parse the event date into datetimes and derive the calendar year from it
df['Event.Date'] = pd.to_datetime(df['Event.Date'])
df['Year'] = df['Event.Date'].dt.year

# Lowercase every string cell so differently-cased spellings are counted
# together; non-string cells (numbers, dates, NaN) are passed through as-is.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated, so prefer the new name whenever this pandas provides it.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
df = elementwise(lambda value: value.lower() if isinstance(value, str) else value)

# Renaming columns in titlecase (aesthetic purposes only)
df = df.rename(columns={
    'Event.Id': 'Event.ID',
    'Aircraft.damage': 'Aircraft.Damage',
    'Purpose.of.flight': 'Purpose.of.Flight',
    'Air.carrier': 'Air.Carrier',
    'Broad.phase.of.flight': 'Broad.Phase.of.Flight',
})

# Keep only rows whose 'Amateur.Built' is 'no' and whose 'Aircraft.Category'
# is 'airplane'. Rows where either field is missing fail the equality test
# and are therefore dropped as well.
df = df[df['Amateur.Built'] == 'no']
df = df[df['Aircraft.Category'] == 'airplane']

df.info()

# Geographic Filtering

In [None]:
# Only using US incidents. Take an explicit copy so that the 'State' column
# added below is written to an independent frame rather than to a slice of
# the previous one (this is what triggers pandas' SettingWithCopyWarning).
df = df[df['Country'] == 'united states'].copy()

# Pull state info from locations and create new column: the state is the text
# after the last comma in 'Location', trimmed and upper-cased. A location
# with no comma at all ends up with its whole text in 'State'.
df['State'] = df['Location'].str.split(',').str[-1].str.strip().str.upper()
df['State'].value_counts()

In [None]:
# Merge state names to abbreviated codes. A left join keeps every incident,
# including rows whose 'State' value has no match in the lookup table.
df = pd.merge(df, state_codes, how='left', left_on='State', right_on='Abbreviation')

# 'Abbreviation' duplicates 'State' after the join, so drop it. The axis must
# be given by keyword: pandas 2.0 removed the positional form `drop(labels, 1)`.
df = df.drop(columns=['Abbreviation'])
df = df.rename(columns={'State':'State.Code','US_State':"State.Name"})


# FAR Codes & Purpose of Flight

In [None]:
# Normalising the FAR.Description column so each regulation part has one label.
# See here for more info on FAR:
# https://pilotinstitute.com/part-91-vs-121-vs-135/

# Canonical label -> every raw spelling that should collapse into it
far_variants = {
    'part 91': [
        '091',
        'part 91: general aviation',
        'part91f',
        'part 91 subpart f',
        'part 91f: special flt ops.',
        'part 91 subpart k: fractional',
        '091k',
    ],
    'part 137': ['137', 'part 137: agricultural'],
    'part 135': ['135', 'part 135: air taxi & commuter'],
    'part 121': ['121', 'part 121: air carrier'],
    'part 129': ['129', 'part 129: foreign'],
    'part 133': [
        '133',
        'part 133: rotorcraft external load',
        'part 133: rotorcraft ext. load',
    ],
    'part 125': ['125', 'part 125: 20+ pax,6000+ lbs'],
    'part 107': ['107'],
    'part 103': ['103'],
    'part 437': ['437'],
    'public': ['pubu', 'public use', 'public aircraft'],
    'armf': ['armed forces'],
}

# Flatten into the raw -> canonical lookup that .replace() expects
far_lookup = {
    raw_label: canonical
    for canonical, raw_labels in far_variants.items()
    for raw_label in raw_labels
}

df['FAR.Description'] = df['FAR.Description'].replace(far_lookup)

df['FAR.Description'].value_counts()

In [None]:
# Inspect the raw purpose-of-flight categories and how often each occurs
purpose_counts = df['Purpose.of.Flight'].value_counts()
purpose_counts

In [None]:
# Collapse 'Purpose.of.Flight' into broader categories, stored in the new
# 'Purpose.Binned' column. Each entry is (broad category, raw values folded
# into it); the entries are applied in the order listed. Values not listed
# here keep their original label.
purpose_groups = [
    ('public', ['public aircraft', 'public aircraft - federal', 'public aircraft - state',
                'public aircraft - local', 'public aircraft - u.s. govt.']),
    ('business', ['aerial application', 'business', 'other work use', 'banner tow', 'air drop']),
    ('recreational', ['skydiving', 'glider tow']),
    ('small transport', ['executive/corporate', 'ferry']),
    # 'pubs' and 'firefighting' also belong to 'public'
    ('public', ['pubs', 'firefighting']),
    # 'flight test' is folded into 'instructional'
    ('instructional', ['flight test']),
    ('air show', ['air race show', 'air race/show', 'asho']),
]

binned_purpose = df['Purpose.of.Flight']
for broad_category, raw_values in purpose_groups:
    binned_purpose = binned_purpose.replace(raw_values, broad_category)
df['Purpose.Binned'] = binned_purpose

df['Purpose.Binned'].value_counts()

## Filtering the 'Make' series


In [None]:
# Frequency of each raw 'Make' value, before any spellings are merged
raw_make_counts = df['Make'].value_counts()
raw_make_counts

In [None]:

def find_make_variants(makes, keyword):
    """Return every distinct value in `makes` that contains `keyword`.

    Parameters
    ----------
    makes : pandas.Series
        The 'Make' column to search.
    keyword : str
        Literal substring to look for (not treated as a regular expression).

    Returns
    -------
    list
        Unique matching values, in order of first appearance.
    """
    # na=False: rows with a missing make must count as "no match". Without it
    # the mask contains NaN and boolean indexing raises a ValueError.
    matches = makes.str.contains(keyword, regex=False, na=False)
    return makes[matches].unique().tolist()


# Make a list of all the entries that have "cessna" in them and return their strings in a list
cessna_list = find_make_variants(df['Make'], 'cessna')

# Do the same with the other big makes
grumman_list = find_make_variants(df['Make'], 'grumman')
dehavilland_list = find_make_variants(df['Make'], 'havilland')
cirrus_list = find_make_variants(df['Make'], 'cirrus')
piper_list = find_make_variants(df['Make'], 'piper')
boeing_list = find_make_variants(df['Make'], 'boeing')
mooney_list = find_make_variants(df['Make'], 'mooney')
airtractor_list = find_make_variants(df['Make'], 'air tractor')

print(cessna_list)
print(grumman_list)
print(dehavilland_list)
print(cirrus_list)
print(piper_list)
print(boeing_list)
print(mooney_list)
print(airtractor_list)

In [None]:
# Collapse every spelling variant collected above into one canonical make name.
# The groups are applied in order, so a value that appears in more than one
# list is claimed by the first group that contains it.
# NOTE: boeing_list was collected and printed with the others but was never
# applied; it is included here so all eight makes are handled consistently.
make_groups = [
    (cessna_list, 'cessna'),
    (grumman_list, 'grumman'),
    (dehavilland_list, 'dehavilland'),
    (cirrus_list, 'cirrus'),
    (piper_list, 'piper'),
    (boeing_list, 'boeing'),
    (mooney_list, 'mooney'),
    (airtractor_list, 'airtractor'),
]
for variant_names, canonical_make in make_groups:
    df['Make'] = df['Make'].replace(variant_names, canonical_make)

# And we can now see the totals, with slightly different numbers than before
df['Make'].value_counts()

In [None]:
# Build a separate frame restricted to the 25 most frequent makes
make_frequencies = df['Make'].value_counts()
top_makes = make_frequencies.index[:25].tolist()

is_top_make = df['Make'].isin(top_makes)
top_makes_df = df.loc[is_top_make].copy()
top_makes_df['Make'].value_counts()

# Airports

In [None]:
# Fold the different spellings of a private airstrip into the single label 'private'
private_aliases = ['private airstrip', 'private strip', 'pvt']
df['Airport.Name'] = df['Airport.Name'].replace(to_replace=private_aliases, value='private')

df['Airport.Name'].value_counts()

# Injury Severity

In [None]:
# Show the 20 most common 'Injury.Severity' values.
# The value tells us how many people died in a given incident.
severity_counts = df['Injury.Severity'].value_counts()
severity_counts.iloc[:20]

In [None]:
# Derive a 'Num.Fatalities' column from the count embedded in 'Injury.Severity':
# take the text after the last '(' and before the following ')', then parse it.
# Anything that does not parse as a number (labels with no parenthesised count,
# missing values) is counted as 0 fatalities.
text_in_parens = (
    df['Injury.Severity']
    .str.split('(').str[-1]
    .str.split(')').str[0]
)
parsed_counts = pd.to_numeric(text_in_parens, errors='coerce')
df['Num.Fatalities'] = parsed_counts.fillna(0).astype(int)

df['Num.Fatalities'].value_counts()

# Injury Severity Labels

In [None]:
# The fatality count now lives in its own column, so keep only the part of
# each 'Injury.Severity' value that comes before the first '('.
severity_label = df['Injury.Severity'].str.split('(', n=1).str[0]
df['Injury.Severity'] = severity_label

df['Injury.Severity'].value_counts()

# Weather

In [None]:
# Put the 'Weather.Condition' values back in upper case
weather_upper = df['Weather.Condition'].str.upper()
df['Weather.Condition'] = weather_upper

df['Weather.Condition'].value_counts()

# VMC - Visual Meteorological Conditions - generally clear and good visibility; pilots can navigate and operate aircraft by visual reference to the ground
# IMC - Instrument Meteorological Conditions - reduced visibility due to factors like fog, rain, or low clouds; pilots may need to rely on instruments for navigation and control
# UNK - Unknown

# Save cleaned .csv

In [None]:
# Write the cleaned frame to a new csv file, leaving out the row index
df.to_csv(path_or_buf='../data/AviationData_cleaned.csv', index=False)