# Mitigating Bird Strikes in Aviation Project - Part 1

In [1]:
# Importing libraries

import numpy as np #for numerical computation
import pandas as pd #for data reading and manipulation
import matplotlib.pyplot as plt #to visualize data
import statistics as st
import seaborn as sns #to visualize data
from scipy.stats import kurtosis
from scipy.stats import norm #probability distribution function
from tabulate import tabulate #to print table
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Reading data from the existing .xlsx file with pandas read_excel() and storing it in a dataframe.
# The workbook is referenced relative to the working directory instead of through a
# machine-specific absolute path, so the notebook also runs on other machines as long as
# the data file sits in the directory the notebook is run from.
birdstrike_data = pd.read_excel("DS1_C7_S4_Project_BirdStrike_Data.xlsx")
birdstrike_data

Unnamed: 0,Record ID,Aircraft: Type,Airport: Name,Altitude bin,Aircraft: Make/Model,Wildlife: Number struck,Wildlife: Number Struck Actual,Effect: Impact to flight,FlightDate,Effect: Indicated Damage,...,Remains of wildlife sent to Smithsonian,Remarks,Wildlife: Size,Conditions: Sky,Wildlife: Species,Pilot warned of birds or wildlife?,Cost: Total $,Feet above ground,Number of people injured,Is Aircraft Large?
0,202152,Airplane,LAGUARDIA NY,> 1000 ft,B-737-400,Over 100,859,Engine Shut Down,2000-11-23,Caused damage,...,False,FLT 753. PILOT REPTD A HUNDRED BIRDS ON UNKN T...,Medium,No Cloud,Unknown bird - medium,N,30736,1500.0,0,Yes
1,208159,Airplane,DALLAS/FORT WORTH INTL ARPT,< 1000 ft,MD-80,Over 100,424,,2001-07-25,Caused damage,...,False,102 CARCASSES FOUND. 1 LDG LIGHT ON NOSE GEAR ...,Small,Some Cloud,Rock pigeon,Y,0,0.0,0,No
2,207601,Airplane,LAKEFRONT AIRPORT,< 1000 ft,C-500,Over 100,261,,2001-09-14,No damage,...,False,FLEW UNDER A VERY LARGE FLOCK OF BIRDS OVER AP...,Small,No Cloud,European starling,N,0,50.0,0,No
3,215953,Airplane,SEATTLE-TACOMA INTL,< 1000 ft,B-737-400,Over 100,806,Precautionary Landing,2002-09-05,No damage,...,False,"NOTAM WARNING. 26 BIRDS HIT THE A/C, FORCING A...",Small,Some Cloud,European starling,Y,0,50.0,0,Yes
4,219878,Airplane,NORFOLK INTL,< 1000 ft,CL-RJ100/200,Over 100,942,,2003-06-23,No damage,...,False,NO DMG REPTD.,Small,No Cloud,European starling,N,0,50.0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25553,321151,Airplane,REDDING MUNICIPAL,> 1000 ft,EMB-120,1,1,,2011-12-30,No damage,...,False,DUCK? NO DMG REPTD.,Large,Overcast,Unknown bird - large,N,0,1500.0,0,No
25554,319677,Airplane,ORLANDO INTL,< 1000 ft,A-321,1,1,,2011-12-30,No damage,...,False,,Small,Some Cloud,Tree swallow,Y,0,0.0,0,No
25555,319680,,,,EC-135,,1,,NaT,No damage,...,False,STRUCK BIRD ON RT FRONT DURING T/O. BIRD REPTD...,,No Cloud,Unknown bird - small,,0,,0,
25556,319679,Airplane,DETROIT METRO WAYNE COUNTY ARPT,< 1000 ft,B-757-200,1,1,,2011-12-31,No damage,...,False,PILOTS REPORT STRIKING UNKNOWN BIRD ON RWY 21L...,Medium,Some Cloud,Unknown bird - medium,Y,0,0.0,0,Yes


In [3]:
# Print the dataframe's shape as a (number of rows, number of columns) tuple
print(birdstrike_data.shape) 

(25558, 26)


In [4]:
# Print a summary of the dataset: index range, column names, non-null counts and dtypes
birdstrike_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25558 entries, 0 to 25557
Data columns (total 26 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   Record ID                                25558 non-null  int64         
 1   Aircraft: Type                           25429 non-null  object        
 2   Airport: Name                            25429 non-null  object        
 3   Altitude bin                             25429 non-null  object        
 4   Aircraft: Make/Model                     25558 non-null  object        
 5   Wildlife: Number struck                  25429 non-null  object        
 6   Wildlife: Number Struck Actual           25558 non-null  int64         
 7   Effect: Impact to flight                 2078 non-null   object        
 8   FlightDate                               25429 non-null  datetime64[ns]
 9   Effect: Indicated Damage               

In [5]:
# Tally the null entries per column (isna() is the method that isnull() aliases)
missing_values = birdstrike_data.isna().sum()
print('Missing values in each column:\n', missing_values)

Missing values in each column:
 Record ID                                      0
Aircraft: Type                               129
Airport: Name                                129
Altitude bin                                 129
Aircraft: Make/Model                           0
Wildlife: Number struck                      129
Wildlife: Number Struck Actual                 0
Effect: Impact to flight                   23480
FlightDate                                   129
Effect: Indicated Damage                       0
Aircraft: Number of engines?                 267
Aircraft: Airline/Operator                   129
Origin State                                 449
When: Phase of flight                        129
Conditions: Precipitation                  23543
Remains of wildlife collected?                 0
Remains of wildlife sent to Smithsonian        0
Remarks                                     4771
Wildlife: Size                               129
Conditions: Sky                      

In [6]:
#Separate the columns into categorical and numerical data types and display them.
def separate_columns(df):
    """Split the column labels of ``df`` into two lists based on dtype.

    A column whose dtype compares equal to ``'object'`` is reported as
    categorical. Every other column is reported as numerical, which includes
    datetime and boolean columns, not only numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(cat_cols, num_cols)``, each in the original column order.
    """
    # Evaluate the dtype check once per column, then partition on the result.
    object_flags = [(name, df[name].dtype == 'object') for name in df.columns]
    cat_cols = [name for name, is_object in object_flags if is_object]
    num_cols = [name for name, is_object in object_flags if not is_object]
    return cat_cols, num_cols

# Classify the dataset's columns and print both groups side by side.
cat_cols, num_cols = separate_columns(birdstrike_data)

table = [cat_cols, num_cols]  # NOTE(review): not referenced by any cell visible here
column_groups = {'Categorical': cat_cols, 'Numerical': num_cols}
column_headers = ['Categorical columns:', 'Numerical columns:']
print(tabulate(column_groups, headers=column_headers))

Categorical columns:                Numerical columns:
----------------------------------  ---------------------------------------
Aircraft: Type                      Record ID
Airport: Name                       Wildlife: Number Struck Actual
Altitude bin                        FlightDate
Aircraft: Make/Model                Remains of wildlife collected?
Wildlife: Number struck             Remains of wildlife sent to Smithsonian
Effect: Impact to flight            Cost: Total $
Effect: Indicated Damage            Feet above ground
Aircraft: Number of engines?        Number of people injured
Aircraft: Airline/Operator
Origin State
When: Phase of flight
Conditions: Precipitation
Remarks
Wildlife: Size
Conditions: Sky
Wildlife: Species
Pilot warned of birds or wildlife?
Is Aircraft Large?


In [8]:
# Fill gaps in the int64/float64 columns with each column's own mean
# (a plain column-wise mean over the whole dataset; no grouping is applied).
numerical_cols = birdstrike_data.select_dtypes(include=['float64', 'int64']).columns
numeric_part = birdstrike_data[numerical_cols]
birdstrike_data[numerical_cols] = numeric_part.fillna(numeric_part.mean())

# Fill gaps in the object-dtype columns with "Not Available"; cells holding the
# literal text 'NaN' receive the same placeholder.
categorical_cols = birdstrike_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    filled = birdstrike_data[col].fillna('Not Available')
    birdstrike_data[col] = filled.replace('NaN', 'Not Available', regex=False)

# Handling missing values in FlightDate: missing dates become 'Not Available',
# every other value is converted to a datetime.
def to_datetime_if_valid(date_str):
    """Convert a single FlightDate value to a datetime, or mark it as unavailable.

    Parameters
    ----------
    date_str : str, datetime-like, or missing value
        One cell of the FlightDate column.

    Returns
    -------
    str or datetime
        ``'Not Available'`` when the value is missing (NaT, NaN, None) or is
        already that placeholder; otherwise the result of ``pd.to_datetime``
        called with the ``'%d-%m-%Y %H:%M:%S'`` format.
    """
    # Compare against the placeholder only for real strings so that datetime
    # values are never compared with text.
    is_placeholder = isinstance(date_str, str) and date_str == 'Not Available'
    # Missing values used to fall through to pd.to_datetime and come back still
    # missing, so a datetime-typed column kept its gaps; flag them explicitly.
    if is_placeholder or pd.isna(date_str):
        return 'Not Available'
    return pd.to_datetime(date_str, format='%d-%m-%Y %H:%M:%S')

# Run the converter over every FlightDate cell, one value at a time
birdstrike_data['FlightDate'] = birdstrike_data['FlightDate'].apply(to_datetime_if_valid)

# Counting missing values in each column after data preprocessing
missing_values = birdstrike_data.isnull().sum()  # replaces the tally taken before the fills
print('Missing values in each column:\n', missing_values)

Missing values in each column:
 Record ID                                  0
Aircraft: Type                             0
Airport: Name                              0
Altitude bin                               0
Aircraft: Make/Model                       0
Wildlife: Number struck                    0
Wildlife: Number Struck Actual             0
Effect: Impact to flight                   0
FlightDate                                 0
Effect: Indicated Damage                   0
Aircraft: Number of engines?               0
Aircraft: Airline/Operator                 0
Origin State                               0
When: Phase of flight                      0
Conditions: Precipitation                  0
Remains of wildlife collected?             0
Remains of wildlife sent to Smithsonian    0
Remarks                                    0
Wildlife: Size                             0
Conditions: Sky                            0
Wildlife: Species                          0
Pilot warned of birds o

In [10]:
# Save the dataframe to 'birdstrike_data.xlsx' (a path relative to the working directory), without the index column
birdstrike_data.to_excel('birdstrike_data.xlsx', index = False) 

In [9]:
import os
# Print the current working directory, i.e. the directory against which relative file paths are resolved
print(os.getcwd()) 

d:\Documents\NIIT\Course7\Sprint4
