In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw accident records, then print column dtypes and non-null counts.
raw_csv_path = './Data/aviation-accident-data-2023-05-16.csv'
df = pd.read_csv(raw_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23967 entries, 0 to 23966
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          23967 non-null  object
 1   type          23967 non-null  object
 2   registration  22419 non-null  object
 3   operator      23963 non-null  object
 4   fatalities    20029 non-null  object
 5   location      23019 non-null  object
 6   country       23967 non-null  object
 7   cat           23967 non-null  object
 8   year          23967 non-null  object
dtypes: object(9)
memory usage: 1.6+ MB


In [3]:
# Give the terse `cat` column a self-explanatory name.
df = df.rename(columns={'cat': 'accident_category'})

In [4]:
# Count fully identical rows, remove them, then display the count.
# The count is bound to a name and placed last because a notebook cell only
# displays the value of its final expression; as a bare first statement it
# was computed and silently discarded.
duplicate_row_count = df.duplicated().sum()
df = df.drop_duplicates()
duplicate_row_count

In [5]:
def _aircraft_fatalities(raw):
    """Extract the count that precedes the first '+' in a raw fatalities cell.

    Handles cells of the form "12" and "12+3". For a string containing '+',
    the stripped text before the first '+' is returned as an int when it is
    all digits. Otherwise the whole cell is converted when its string form is
    all digits. Everything else (including missing values) becomes NaN.
    """
    if isinstance(raw, str) and '+' in raw:
        head = str(raw).split('+')[0].strip()
        if head.isdigit():
            return int(head)
    # A string with '+' and a non-numeric head also lands here and yields NaN,
    # because the full text (which contains '+') is never all digits.
    return int(raw) if str(raw).isdigit() else np.nan


df['aircraft_fatalities'] = df['fatalities'].apply(_aircraft_fatalities)

In [6]:
def _ground_fatalities(raw):
    """Extract the count that follows the first '+' in a raw fatalities cell.

    For a string containing '+', the stripped text between the first and
    second '+' (or the end) is returned as an int when it is all digits.
    A plain all-digit cell has no '+' part and maps to 0. Everything else
    (including missing values) becomes NaN.
    """
    if isinstance(raw, str) and '+' in raw:
        tail = str(raw).split('+')[1].strip()
        if tail.isdigit():
            return int(tail)
    # Plain numbers carry no '+' component, so the count is zero.
    return 0 if str(raw).isdigit() else np.nan


df['ground_fatalities'] = df['fatalities'].apply(_ground_fatalities)

In [7]:
def _total_fatalities(raw):
    """Return the summed fatalities for a raw cell such as "12" or "12+3".

    * String with '+': every '+'-separated part is stripped and must be all
      digits; the parts are summed. If any part is not numeric (e.g. "12+"
      or "3+?"), NaN is returned instead of raising ValueError.
    * String without '+': converted when all digits, otherwise NaN, so that
      no free-text value is left behind in a column meant to be numeric.
    * Non-string cells (missing values, or numbers when the cell is re-run
      on an already converted column) are passed through unchanged.
    """
    if not isinstance(raw, str):
        return raw
    if '+' in raw:
        parts = [part.strip() for part in raw.split('+')]
        if all(part.isdigit() for part in parts):
            return sum(int(part) for part in parts)
        return np.nan
    return int(raw) if raw.isdigit() else np.nan


# The raw column becomes the total; the per-source columns derived from it
# must already exist, because 'fatalities' is no longer available afterwards.
df = df.rename(columns={'fatalities': 'total_fatalities'})
df['total_fatalities'] = df['total_fatalities'].apply(_total_fatalities)

In [8]:
# Frequency of each total-fatalities value (missing values are excluded by default).
df['total_fatalities'].value_counts()

0.0      10705
2.0       1136
1.0       1061
3.0        992
4.0        891
         ...  
124.0        1
188.0        1
166.0        1
149.0        1
181.0        1
Name: total_fatalities, Length: 203, dtype: int64

In [9]:
# Missing values per column after the fatalities columns were derived.
df.isna().sum()

date                      0
type                      0
registration           1434
operator                  4
total_fatalities       3833
location                932
country                   0
accident_category         0
year                      0
aircraft_fatalities    3833
ground_fatalities      3833
dtype: int64

In [10]:
# Dtypes and non-null counts after de-duplication and fatalities parsing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23852 entries, 0 to 23966
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 23852 non-null  object 
 1   type                 23852 non-null  object 
 2   registration         22418 non-null  object 
 3   operator             23848 non-null  object 
 4   total_fatalities     20019 non-null  float64
 5   location             22920 non-null  object 
 6   country              23852 non-null  object 
 7   accident_category    23852 non-null  object 
 8   year                 23852 non-null  object 
 9   aircraft_fatalities  20019 non-null  float64
 10  ground_fatalities    20019 non-null  float64
dtypes: float64(3), object(8)
memory usage: 2.2+ MB


In [11]:
# Keep only rows that have at most MAX_MISSING_PER_ROW missing fields; rows
# with more missing fields than that are dropped.
# A row whose raw fatalities value was missing carries NaN in all three
# fatalities columns, so with a threshold of 3 it survives only when every
# other field is present.
MAX_MISSING_PER_ROW = 3
missing_per_row = df.isna().sum(axis=1)
# .copy() makes the filtered frame independent of the original, so later
# column assignments do not raise SettingWithCopyWarning.
df = df[missing_per_row <= MAX_MISSING_PER_ROW].copy()

In [12]:
# Dtypes and non-null counts after filtering out rows with too many missing fields.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22946 entries, 4 to 23966
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 22946 non-null  object 
 1   type                 22946 non-null  object 
 2   registration         21682 non-null  object 
 3   operator             22943 non-null  object 
 4   total_fatalities     20019 non-null  float64
 5   location             22780 non-null  object 
 6   country              22946 non-null  object 
 7   accident_category    22946 non-null  object 
 8   year                 22946 non-null  object 
 9   aircraft_fatalities  20019 non-null  float64
 10  ground_fatalities    20019 non-null  float64
dtypes: float64(3), object(8)
memory usage: 2.1+ MB


In [13]:
# Missing values per column that remain after the row filter.
df.isna().sum()

date                      0
type                      0
registration           1264
operator                  3
total_fatalities       2927
location                166
country                   0
accident_category         0
year                      0
aircraft_fatalities    2927
ground_fatalities      2927
dtype: int64

In [14]:
# Impute missing fatality counts with each column's median.
# The fill is done on the frame and the result re-bound: calling
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, which
# under pandas Copy-on-Write never updates `df` (and warns in pandas 2.x).
fatality_columns = ['total_fatalities', 'aircraft_fatalities', 'ground_fatalities']
fatality_medians = df[fatality_columns].median()
# A Series passed as the fill value is matched by column label, so only the
# three fatalities columns are filled; other columns keep their NaNs.
df = df.fillna(fatality_medians)

In [15]:
# Check that the fatalities columns have no missing values after median imputation.
df.isna().sum()

date                      0
type                      0
registration           1264
operator                  3
total_fatalities          0
location                166
country                   0
accident_category         0
year                      0
aircraft_fatalities       0
ground_fatalities         0
dtype: int64

In [16]:
# Parse both temporal columns as datetimes; values that cannot be parsed
# become NaT (errors='coerce') rather than raising.
# NOTE(review): `year` ends up as a full timestamp, not an integer year -
# confirm this is what downstream consumers of the cleaned data expect.
for column in ('date', 'year'):
    df[column] = pd.to_datetime(df[column], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22946 entries, 4 to 23966
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 22581 non-null  datetime64[ns]
 1   type                 22946 non-null  object        
 2   registration         21682 non-null  object        
 3   operator             22943 non-null  object        
 4   total_fatalities     22946 non-null  float64       
 5   location             22780 non-null  object        
 6   country              22946 non-null  object        
 7   accident_category    22946 non-null  object        
 8   year                 22907 non-null  datetime64[ns]
 9   aircraft_fatalities  22946 non-null  float64       
 10  ground_fatalities    22946 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(6)
memory usage: 2.1+ MB


In [17]:
# Discard rows whose date or year could not be parsed (NaT), then re-inspect.
df = df.dropna(subset=['date', 'year'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22581 entries, 61 to 23966
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 22581 non-null  datetime64[ns]
 1   type                 22581 non-null  object        
 2   registration         21327 non-null  object        
 3   operator             22578 non-null  object        
 4   total_fatalities     22581 non-null  float64       
 5   location             22444 non-null  object        
 6   country              22581 non-null  object        
 7   accident_category    22581 non-null  object        
 8   year                 22581 non-null  datetime64[ns]
 9   aircraft_fatalities  22581 non-null  float64       
 10  ground_fatalities    22581 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(6)
memory usage: 2.1+ MB


In [18]:
# Missing values per column after dropping rows with unparseable date/year.
df.isna().sum()

date                      0
type                      0
registration           1254
operator                  3
total_fatalities          0
location                137
country                   0
accident_category         0
year                      0
aircraft_fatalities       0
ground_fatalities         0
dtype: int64

In [19]:
# Replace missing text fields with explicit placeholders.
# A dict passed to DataFrame.fillna maps column -> fill value. The result is
# re-bound instead of using `df[col].fillna(..., inplace=True)`, which acts
# on a temporary Series and does not update `df` under pandas Copy-on-Write.
text_placeholders = {
    'registration': 'Unknown',
    'location': 'Unknown location',
    'operator': 'Unknown',
}
df = df.fillna(text_placeholders)


In [20]:
# Check that no missing values remain after filling the text columns.
df.isna().sum()

date                   0
type                   0
registration           0
operator               0
total_fatalities       0
location               0
country                0
accident_category      0
year                   0
aircraft_fatalities    0
ground_fatalities      0
dtype: int64

In [21]:
# Cast the three fatalities columns to integers.
# The bit width of `int` follows the platform/NumPy default.
for column in ('total_fatalities', 'ground_fatalities', 'aircraft_fatalities'):
    df[column] = df[column].astype(int)

In [22]:
# Confirm the fatalities columns now have an integer dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22581 entries, 61 to 23966
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 22581 non-null  datetime64[ns]
 1   type                 22581 non-null  object        
 2   registration         22581 non-null  object        
 3   operator             22581 non-null  object        
 4   total_fatalities     22581 non-null  int32         
 5   location             22581 non-null  object        
 6   country              22581 non-null  object        
 7   accident_category    22581 non-null  object        
 8   year                 22581 non-null  datetime64[ns]
 9   aircraft_fatalities  22581 non-null  int32         
 10  ground_fatalities    22581 non-null  int32         
dtypes: datetime64[ns](2), int32(3), object(6)
memory usage: 1.8+ MB


In [23]:
# Final check for missing values before export.
df.isna().sum()

date                   0
type                   0
registration           0
operator               0
total_fatalities       0
location               0
country                0
accident_category      0
year                   0
aircraft_fatalities    0
ground_fatalities      0
dtype: int64

In [24]:
# Final count of fully duplicated rows before export.
df.duplicated().sum()

0

In [25]:
# Write the cleaned frame to disk; index=False omits the row index column.
cleaned_csv_path = './Data/cleaned_aviation_data.csv'
df.to_csv(cleaned_csv_path, index=False)