# Data Cleaning

First, we will open the AviationData csv and look at the columns.

In [439]:
import pandas as pd
import numpy as np
 
df = pd.read_csv('./data/AviationData.csv', encoding='latin1')

df.head(5)

  df = pd.read_csv('./data/AviationData.csv', encoding='latin1')


Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.flight,Air.carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.phase.of.flight,Report.Status,Publication.Date
0,20001218X45444,Accident,SEA87LA080,1948-10-24,"MOOSE CREEK, ID",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,UNK,Cruise,Probable Cause,
1,20001218X45447,Accident,LAX94LA336,1962-07-19,"BRIDGEPORT, CA",United States,,,,,...,Personal,,4.0,0.0,0.0,0.0,UNK,Unknown,Probable Cause,19-09-1996
2,20061025X01555,Accident,NYC07LA005,1974-08-30,"Saltville, VA",United States,36.922223,-81.878056,,,...,Personal,,3.0,,,,IMC,Cruise,Probable Cause,26-02-2007
3,20001218X45448,Accident,LAX96LA321,1977-06-19,"EUREKA, CA",United States,,,,,...,Personal,,2.0,0.0,0.0,0.0,IMC,Cruise,Probable Cause,12-09-2000
4,20041105X01764,Accident,CHI79FA064,1979-08-02,"Canton, OH",United States,,,,,...,Personal,,1.0,2.0,,0.0,VMC,Approach,Probable Cause,16-04-1980


Then, we will look at the column names and the dataframe info to find possible null values.

In [440]:
df.columns

Index(['Event.Id', 'Investigation.Type', 'Accident.Number', 'Event.Date',
       'Location', 'Country', 'Latitude', 'Longitude', 'Airport.Code',
       'Airport.Name', 'Injury.Severity', 'Aircraft.damage',
       'Aircraft.Category', 'Registration.Number', 'Make', 'Model',
       'Amateur.Built', 'Number.of.Engines', 'Engine.Type', 'FAR.Description',
       'Schedule', 'Purpose.of.flight', 'Air.carrier', 'Total.Fatal.Injuries',
       'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured',
       'Weather.Condition', 'Broad.phase.of.flight', 'Report.Status',
       'Publication.Date'],
      dtype='object')

In [441]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88889 entries, 0 to 88888
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                88889 non-null  object 
 1   Investigation.Type      88889 non-null  object 
 2   Accident.Number         88889 non-null  object 
 3   Event.Date              88889 non-null  object 
 4   Location                88837 non-null  object 
 5   Country                 88663 non-null  object 
 6   Latitude                34382 non-null  object 
 7   Longitude               34373 non-null  object 
 8   Airport.Code            50132 non-null  object 
 9   Airport.Name            52704 non-null  object 
 10  Injury.Severity         87889 non-null  object 
 11  Aircraft.damage         85695 non-null  object 
 12  Aircraft.Category       32287 non-null  object 
 13  Registration.Number     87507 non-null  object 
 14  Make                    88826 non-null

We decided to drop the Latitude, Longitude, Airport Code, and Airport Name columns, instead choosing to use the Location column.

In [442]:
dropped_cols = ['Latitude', 'Longitude', 'Airport.Code', 'Airport.Name', 'Schedule', 'Injury.Severity', 'Publication.Date', 'Report.Status', 'Air.carrier', 'Aircraft.Category', 'FAR.Description']
dropped_df = df.drop(columns=dropped_cols)
dropped_df = dropped_df.dropna(axis='index', subset=['Location', 'Make', 'Model', 'Registration.Number', 'Engine.Type'])
dropped_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81262 entries, 0 to 88767
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                81262 non-null  object 
 1   Investigation.Type      81262 non-null  object 
 2   Accident.Number         81262 non-null  object 
 3   Event.Date              81262 non-null  object 
 4   Location                81262 non-null  object 
 5   Country                 81043 non-null  object 
 6   Aircraft.damage         79329 non-null  object 
 7   Registration.Number     81262 non-null  object 
 8   Make                    81262 non-null  object 
 9   Model                   81262 non-null  object 
 10  Amateur.Built           81243 non-null  object 
 11  Number.of.Engines       79550 non-null  float64
 12  Engine.Type             81262 non-null  object 
 13  Purpose.of.flight       78471 non-null  object 
 14  Total.Fatal.Injuries    70887 non-null  flo

Separated Location column into two new columns: City and State

In [443]:
dropped_df[['Location_City', 'Location_State']] = dropped_df['Location'].str.split(', ', n=1, expand=True)
dropped_df['Location_City'] = dropped_df['Location_City'].str.title()
dropped_df[['Location_City', 'Location_State']].head(10)

Unnamed: 0,Location_City,Location_State
0,Moose Creek,ID
1,Bridgeport,CA
2,Saltville,VA
3,Eureka,CA
5,Boston,MA
6,Cotton,MN
7,Pullman,WA
8,East Hanover,NJ
9,Jacksonville,FL
10,Hobbs,NM


Standardized the string format of the Make column.

In [444]:
dropped_df['Make'] = dropped_df['Make'].str.title()
dropped_df['Make'].value_counts()

Make
Cessna               25820
Piper                14168
Beech                 5035
Bell                  2405
Boeing                1579
                     ...  
Zivko Aeronautics        1
Lee S. Harvard           1
Miller/Bell              1
Stern                    1
Stephen J Hoffman        1
Name: count, Length: 7083, dtype: int64

Let's assume that if there are values in the Total Injuries columns that are NaN, they should be zero.

In [445]:
dropped_df[['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured']] = dropped_df[['Total.Fatal.Injuries', 'Total.Serious.Injuries', 'Total.Minor.Injuries', 'Total.Uninjured']].fillna(value=0)
dropped_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81262 entries, 0 to 88767
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                81262 non-null  object 
 1   Investigation.Type      81262 non-null  object 
 2   Accident.Number         81262 non-null  object 
 3   Event.Date              81262 non-null  object 
 4   Location                81262 non-null  object 
 5   Country                 81043 non-null  object 
 6   Aircraft.damage         79329 non-null  object 
 7   Registration.Number     81262 non-null  object 
 8   Make                    81262 non-null  object 
 9   Model                   81262 non-null  object 
 10  Amateur.Built           81243 non-null  object 
 11  Number.of.Engines       79550 non-null  float64
 12  Engine.Type             81262 non-null  object 
 13  Purpose.of.flight       78471 non-null  object 
 14  Total.Fatal.Injuries    81262 non-null  flo

In [446]:
dropped_df['Amateur.Built'] = dropped_df['Amateur.Built'].fillna('No')
dropped_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81262 entries, 0 to 88767
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Event.Id                81262 non-null  object 
 1   Investigation.Type      81262 non-null  object 
 2   Accident.Number         81262 non-null  object 
 3   Event.Date              81262 non-null  object 
 4   Location                81262 non-null  object 
 5   Country                 81043 non-null  object 
 6   Aircraft.damage         79329 non-null  object 
 7   Registration.Number     81262 non-null  object 
 8   Make                    81262 non-null  object 
 9   Model                   81262 non-null  object 
 10  Amateur.Built           81262 non-null  object 
 11  Number.of.Engines       79550 non-null  float64
 12  Engine.Type             81262 non-null  object 
 13  Purpose.of.flight       78471 non-null  object 
 14  Total.Fatal.Injuries    81262 non-null  flo

In [447]:
dropped_df['Weather.Condition'] = dropped_df['Weather.Condition'].fillna('VMC')
dropped_df['Weather.Condition'] = dropped_df['Weather.Condition'].str.upper()
dropped_df['Weather.Condition'].value_counts()

Weather.Condition
VMC    74761
IMC     5649
UNK      852
Name: count, dtype: int64

In [448]:
dropped_df['Purpose.of.flight'] = dropped_df['Purpose.of.flight'].fillna('Unknown')
dropped_df['Purpose.of.flight'].value_counts()
dropped_df.isna().sum()

Event.Id                      0
Investigation.Type            0
Accident.Number               0
Event.Date                    0
Location                      0
Country                     219
Aircraft.damage            1933
Registration.Number           0
Make                          0
Model                         0
Amateur.Built                 0
Number.of.Engines          1712
Engine.Type                   0
Purpose.of.flight             0
Total.Fatal.Injuries          0
Total.Serious.Injuries        0
Total.Minor.Injuries          0
Total.Uninjured               0
Weather.Condition             0
Broad.phase.of.flight     20050
Location_City                 0
Location_State              553
dtype: int64

In [449]:
dropped_df['Broad.phase.of.flight'] = dropped_df['Broad.phase.of.flight'].apply(lambda x: np.random.choice(['Landing', 'Takeoff', 'Cruise'], p=[0.4,0.35,0.25]) if pd.isna(x) else x)
dropped_df['Broad.phase.of.flight'].value_counts()

Broad.phase.of.flight
Landing        23411
Takeoff        19369
Cruise         15172
Maneuvering     8079
Approach        6459
Climb           2022
Taxi            1946
Descent         1859
Go-around       1353
Standing         938
Unknown          537
Other            117
Name: count, dtype: int64

In [450]:
dropped_df['Aircraft.damage'] = dropped_df['Aircraft.damage'].apply(lambda x: np.random.choice(['Substantial', 'Destroyed', 'Unknown'], p=[0.7,0.2,0.1]) if pd.isna(x) else x)
dropped_df['Aircraft.damage'].value_counts()

Aircraft.damage
Substantial    61667
Destroyed      17236
Minor           2166
Unknown          193
Name: count, dtype: int64

In [451]:
dropped_df['Number.of.Engines'] = dropped_df['Number.of.Engines'].apply(lambda x: np.random.choice([1,2], p=[.75,.25]) if pd.isna(x) else x)
dropped_df['Number.of.Engines'].value_counts()

Number.of.Engines
1.0    68851
2.0    10899
0.0      658
3.0      464
4.0      387
8.0        3
Name: count, dtype: int64

In [452]:
dropped_df.isna().sum()

Event.Id                    0
Investigation.Type          0
Accident.Number             0
Event.Date                  0
Location                    0
Country                   219
Aircraft.damage             0
Registration.Number         0
Make                        0
Model                       0
Amateur.Built               0
Number.of.Engines           0
Engine.Type                 0
Purpose.of.flight           0
Total.Fatal.Injuries        0
Total.Serious.Injuries      0
Total.Minor.Injuries        0
Total.Uninjured             0
Weather.Condition           0
Broad.phase.of.flight       0
Location_City               0
Location_State            553
dtype: int64

In [453]:
dropped_df.to_csv('./data/AviationData_clean.csv', encoding='utf-8')