# Wildlife Strike Analysis

*This notebook should be used to record all of your analysis.*

## Imports

In [1]:
import pandas as pd

from redshift_connector import connect

import altair as alt

from scipy.stats import zscore

### Making a Connection

In [2]:
def get_db_connection():
    """Open a connection to the Redshift data warehouse.

    Connection settings are read from environment variables so that no
    credentials are stored in the notebook:

    - ``REDSHIFT_USER`` (required)
    - ``REDSHIFT_PASSWORD`` (required)
    - ``REDSHIFT_HOST`` (optional, defaults to the c17 cluster endpoint)
    - ``REDSHIFT_DATABASE`` (optional, defaults to ``dw_air_travel``)
    - ``REDSHIFT_PORT`` (optional, defaults to ``5439``)

    Returns:
        The connection object returned by ``redshift_connector.connect``.

    Raises:
        RuntimeError: If a required environment variable is not set.
    """
    import os

    required = ("REDSHIFT_USER", "REDSHIFT_PASSWORD")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing))

    return connect(
        host=os.environ.get(
            "REDSHIFT_HOST",
            "c17-redshift-cluster.cdq12ms5gjyk.eu-west-2.redshift.amazonaws.com"),
        database=os.environ.get("REDSHIFT_DATABASE", "dw_air_travel"),
        user=os.environ["REDSHIFT_USER"],
        password=os.environ["REDSHIFT_PASSWORD"],
        port=int(os.environ.get("REDSHIFT_PORT", 5439)))


# One connection and one cursor are shared by every query in this notebook.
conn = get_db_connection()
curs = conn.cursor()

**How significant a problem are wildlife strikes?**


1) Get the relevant columns from the `wildlife_strike` table to answer this question and load them into a pandas DataFrame

In [73]:
# Select the date, repair-cost, damage, injury and fatality columns
# for every row of wildlife_strike.
query = """SELECT incident_date ,cost_repairs, indicated_damage, 
nr_injuries,nr_fatalities
FROM wildlife_strike"""
# Run the query on the shared cursor, then load its result set into a DataFrame.
curs.execute(query)
strikes_significance = curs.fetch_dataframe()

2) Clean the data

In [82]:
# Reduce each incident date to its calendar year. The column keeps its
# original name, so 'incident_date' holds years from this point on.
incident_timestamps = pd.to_datetime(strikes_significance['incident_date'])
strikes_significance['incident_date'] = incident_timestamps.dt.year

In [84]:
# Keep only incidents from 2010 onwards (2010 itself is included).
# .copy() detaches the result from the unfiltered frame so that the column
# assignments made during cleaning act on an independent DataFrame and do
# not raise SettingWithCopyWarning.
from_2010 = strikes_significance['incident_date'] >= 2010
strikes_significance = strikes_significance.loc[from_2010].copy()

In [None]:
def convert_str_to_ints(df, column_name):
    """Return a copy of ``df`` with ``column_name`` converted to integers.

    Empty or whitespace-only strings and missing values (None/NaN) become 0;
    every other value is parsed with ``pd.to_numeric``. The input DataFrame
    is not modified, so re-running a cell that calls this function is safe.

    Args:
        df: DataFrame containing the column to convert.
        column_name: Name of the column to convert.

    Returns:
        A new DataFrame whose ``column_name`` column has an integer dtype.

    Raises:
        ValueError: If the column holds a non-blank string that is not numeric.
    """
    converted = df.copy()
    values = converted[column_name]

    if not pd.api.types.is_numeric_dtype(values):
        # Swap blank strings for 0 element by element; non-string entries
        # (including None/NaN) pass through and are handled by fillna below.
        without_blanks = [
            0 if isinstance(value, str) and not value.strip() else value
            for value in values
        ]
        values = pd.Series(without_blanks, dtype=object)

    # fillna(0) is needed because NaN cannot be cast to an integer dtype.
    numeric = pd.to_numeric(values).fillna(0).astype(int)
    # Assign raw values so no index alignment takes place.
    converted[column_name] = numeric.to_numpy()
    return converted
    

In [None]:
# Convert every numeric-looking column to integers, one column at a time.
for numeric_column in ('nr_injuries', 'nr_fatalities', 'cost_repairs', 'indicated_damage'):
    strikes_significance = convert_str_to_ints(strikes_significance, numeric_column)

In [85]:
# Preview the first rows of the frame after the cleaning steps.
strikes_significance.head()

Unnamed: 0,incident_date,cost_repairs,indicated_damage,nr_injuries,nr_fatalities
87761,2010,0,0,0,0
87762,2010,0,0,0,0
87763,2010,0,0,0,0
87764,2010,0,0,0,0
87765,2010,0,0,0,0


3) Make the Visualisations

_Annual trend chart_

In [93]:
# Number of recorded strikes per year ('incident_date' holds the year).
incidents_per_year = strikes_significance.groupby('incident_date').size()
annual_strikes = incidents_per_year.reset_index(name='Total Incidents')

In [98]:
# Line chart of the number of strikes per year.
# The year is encoded as ordinal (':O'), matching the other year-based
# charts in this notebook. Left to type inference, the integer year is
# treated as quantitative, which gives a continuous numeric axis and
# comma-formatted labels such as "2,010".
yearly_trends = (
    alt.Chart(annual_strikes, title='Yearly Trend of Wildlife Strikes (2010-2024)')
    .mark_line()
    .encode(
        x=alt.X('incident_date:O', title='Year of Incident'),
        y=alt.Y('Total Incidents:Q'),
        tooltip=[
            alt.Tooltip('incident_date:O', title='Year of Incident'),
            alt.Tooltip('Total Incidents:Q'),
        ],
    )
)
yearly_trends

_Damage, injury and repair-cost charts_

In [113]:
# Count strikes with and without reported damage directly from the data, so
# the figures stay correct whenever the query or the year filter changes.
# NOTE(review): assumes indicated_damage is 0 for "no damage" and non-zero
# otherwise after the integer conversion; confirm against the source table.
damage_reported = strikes_significance['indicated_damage'].ne(0)
damages = pd.DataFrame({
    'Damage Reported': ['Yes', 'No'],
    'Count': [int(damage_reported.sum()), int((~damage_reported).sum())],
})

In [138]:
# Same damage / no-damage split, restricted to incidents from 2024 and
# computed from the data instead of being typed in by hand.
# NOTE(review): assumes indicated_damage is 0 for "no damage" and non-zero
# otherwise after the integer conversion; confirm against the source table.
is_2024 = strikes_significance['incident_date'] == 2024
damage_reported_2024 = strikes_significance.loc[is_2024, 'indicated_damage'].ne(0)
damages_2024 = pd.DataFrame({
    'Damage Reported': ['Yes', 'No'],
    'Count': [int(damage_reported_2024.sum()), int((~damage_reported_2024).sum())],
})

In [129]:
# Pie chart: share of strikes with vs. without reported damage.
damages_chart = (
    alt.Chart(damages)
    .mark_arc()
    .encode(
        theta=alt.Theta('Count'),
        color=alt.Color('Damage Reported', scale=alt.Scale(scheme='bluegreen')),
        tooltip=['Count', 'Damage Reported'],
    )
    .properties(title='Proportion of Reported Damage in Wildlife Strikes')
)
damages_chart

In [141]:
# Pie chart of the damage / no-damage split for 2024 only.
damage_colour_2024 = alt.Color('Damage Reported').scale(scheme='bluegreen')
damages_2024_base = alt.Chart(
    damages_2024,
    title='Proportion of Reported Damage in Wildlife Strikes 2024',
)
damages_2024_chart = damages_2024_base.mark_arc().encode(
    theta='Count',
    color=damage_colour_2024,
    tooltip=['Count', 'Damage Reported'],
)
damages_2024_chart

In [149]:
# Yearly totals of injuries and fatalities, one frame each.
strikes_by_year = strikes_significance.groupby('incident_date')
injuries = strikes_by_year['nr_injuries'].sum().reset_index()
fatalities = strikes_by_year['nr_fatalities'].sum().reset_index()

In [179]:
# Join the two yearly totals on the year, their only shared column.
injury_fatalities = pd.merge(injuries, fatalities, on='incident_date', how='inner')
injury_fatalities


Unnamed: 0,incident_date,nr_injuries,nr_fatalities
0,2010,14,0
1,2011,13,1
2,2012,18,2
3,2013,10,1
4,2014,10,0
5,2015,6,0
6,2016,7,4
7,2017,6,6
8,2018,11,3
9,2019,7,3


In [193]:
# Bar chart of injuries and fatalities per year.
# transform_fold reshapes the two count columns into long form
# ('incident_type' holds the source column name, 'count' its value) so one
# bar mark can be coloured by type.
# A title is set so the figure stands alone like the other charts, and the
# tooltip includes the year so a hovered bar can be identified.
injuries_fatalities_chart = (
    alt.Chart(injury_fatalities, title='Injuries and Fatalities from Wildlife Strikes per Year')
    .transform_fold(['nr_injuries', 'nr_fatalities'], as_=['incident_type', 'count'])
    .mark_bar()
    .encode(
        x=alt.X('incident_date:O', title='Incident Year'),
        y=alt.Y('count:Q', title='Total Injuries and Fatalities'),
        color=alt.Color('incident_type:N').scale(scheme='bluepurple'),
        tooltip=[
            alt.Tooltip('incident_date:O', title='Incident Year'),
            alt.Tooltip('nr_injuries:Q'),
            alt.Tooltip('nr_fatalities:Q'),
        ],
    )
)

In [158]:
# Sum of repair costs per year.
repair_cost_totals = strikes_significance.groupby('incident_date')['cost_repairs'].sum()
cost_repairs_by_year = repair_cost_totals.reset_index()

In [194]:
# Horizontal bar chart of total repair cost per year; bar colour follows the
# cost value and the colour legend is suppressed.
cost_repairs_chart = (
    alt.Chart(cost_repairs_by_year)
    .mark_bar()
    .encode(
        x=alt.X('cost_repairs:Q', title='Cost of Repairs'),
        y=alt.Y('incident_date:O', title='Year'),
        tooltip=['cost_repairs', 'incident_date'],
        color=alt.Color('cost_repairs', legend=None, scale=alt.Scale(scheme='blues')),
    )
    .properties(title='Total Cost Repairs Per year')
)


**Are strikes by particular animals more likely/dangerous than others?**


In [200]:
# Select the species plus the damage, repair-cost, injury and fatality
# columns for every row of wildlife_strike.
strikes_query = """SELECT species, indicated_damage,cost_repairs, nr_injuries, nr_fatalities 
FROM wildlife_strike"""
# Run the query on the shared cursor, then load its result set into a DataFrame.
curs.execute(strikes_query)
animal_details = curs.fetch_dataframe()

In [202]:
# Convert every numeric-looking column to integers, one column at a time.
for numeric_column in ('nr_injuries', 'nr_fatalities', 'cost_repairs', 'indicated_damage'):
    animal_details = convert_str_to_ints(animal_details, numeric_column)

In [293]:
# Show how many rows each distinct species value has.
animal_details['species'].value_counts()

species
Unknown bird - small        50726
Unknown bird - medium       38709
Unknown bird                27860
Mourning dove               15331
Barn swallow                10083
                            ...  
Long-tailed weasel              1
Little owl                      1
Rook                            1
Broad-tailed hummingbird        1
Frigatebirds                    1
Name: count, Length: 916, dtype: int64

In [258]:
# The ten species with the most recorded strikes.
strikes_per_species = animal_details.groupby('species').size()
most_struck_first = strikes_per_species.sort_values(ascending=False)
top_10 = most_struck_first.head(10).reset_index(name='num_strikes')


In [259]:
# The last ten species when ordered from most to fewest recorded strikes.
species_strike_counts = animal_details.groupby('species').size()
ranked_by_strikes = species_strike_counts.sort_values(ascending=False)
bottom_10 = ranked_by_strikes.tail(10).reset_index(name='num_strikes')

In [292]:
# Stack the most- and least-struck species into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# index labels of top_10 and bottom_10 are both kept, leaving duplicate
# labels that make label-based lookups ambiguous.
animal_num_strikes = pd.concat([top_10, bottom_10], ignore_index=True)


In [263]:
# Total repair cost per species, with 'species' kept as a regular column.
animal_cost_effect = animal_details.groupby('species', as_index=False)['cost_repairs'].sum()

In [291]:
# Attach each selected species' total repair cost to its strike count;
# 'species' is the only column the two frames share.
animals_count_and_cost = pd.merge(animal_num_strikes, animal_cost_effect, on='species', how='inner')


In [270]:
# Scatter plot of strike count against total repair cost, one point per species.
animals_chart = (
    alt.Chart(animals_count_and_cost)
    .mark_circle(size=60)
    .encode(
        x=alt.X('num_strikes', title='Number of Strikes'),
        y=alt.Y('cost_repairs', title='Total Cost Repairs'),
        color=alt.Color('species'),
        tooltip=['species', 'num_strikes', 'cost_repairs'],
    )
    .properties(title='Strike Frequency and Repair Costs by Top 10 and Bottom 10 Species')
)

**When and in what conditions are strikes most likely?**


In [272]:
# Select the date, time-of-day, sky, precipitation, flight-phase, height and
# speed columns for every row of wildlife_strike.
conditions_query = """SELECT incident_date, time_of_day, sky, precipitation, phase_of_flight, height, speed
FROM wildlife_strike"""
# Run the query on the shared cursor, then load its result set into a DataFrame.
curs.execute(conditions_query)
strike_conditions = curs.fetch_dataframe()

In [294]:
# Convert the height and speed columns to integers, one column at a time.
for numeric_column in ('height', 'speed'):
    strike_conditions = convert_str_to_ints(strike_conditions, numeric_column)

In [None]:
def convert_nulls_to_unknown(df, column_name):
    """Return a copy of ``df`` with blanks in ``column_name`` set to 'unknown'.

    Empty or whitespace-only strings and missing values (None/NaN) are
    replaced with the string 'unknown'; every other value is kept as is.
    The input DataFrame is not modified.

    Args:
        df: DataFrame containing the column to clean.
        column_name: Name of the column to clean.

    Returns:
        A new DataFrame in which ``column_name`` has no blank or missing
        entries.
    """
    def _is_blank(value):
        # Strings count as blank when nothing is left after stripping.
        if isinstance(value, str):
            return not value.strip()
        # is_scalar guards pd.isna, which returns an array for list-likes.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    result = df.copy()
    result[column_name] = [
        'unknown' if _is_blank(value) else value
        for value in result[column_name]
    ]
    return result

In [295]:
# Preview the first rows of the strike-conditions frame.
strike_conditions.head()

Unnamed: 0,incident_date,time_of_day,sky,precipitation,phase_of_flight,height,speed
0,1990-09-19,Day,Some Cloud,,Approach,100,135
1,1990-08-07,Night,No Cloud,,Climb,200,70
2,1990-07-13,Day,Some Cloud,,Landing Roll,0,125
3,1990-10-09,Day,Overcast,Rain,Climb,300,180
4,1990-02-22,Night,No Cloud,,Approach,2000,190


**Which airlines/airports/states would be likely potential customers for any of this technology?**
