In [208]:
# for data
import pandas as pd
import numpy as np
from pandasgui import show
import os
import json

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [209]:
# Directory one level above the notebook's current working directory;
# every data path below is built relative to it.
parent_folder = os.path.split(os.getcwd())[0]

In [210]:
# Folder that receives one JSON lookup file per encoded column.
json_path = os.path.join(parent_folder, *("data", "jsons", "row-data"))

def make_json(encoder, filename: str):
    """Write an encoder's code-to-label lookup to ``<json_path>/<filename>.json``.

    Args:
        encoder: Object exposing a ``classes_`` sequence; the position of each
            entry in ``classes_`` is used as its integer code.
        filename: Base name of the output file, without the ``.json`` suffix.

    The file holds a single JSON object whose keys are the integer codes
    (JSON stores them as strings) and whose values are ``str()`` of each class.
    The target directory ``json_path`` is created if it does not exist yet.
    """
    encoded_mapping = {
        code: str(label) for code, label in enumerate(encoder.classes_)
    }

    # Create the output directory on demand so the first write on a fresh
    # checkout does not fail with FileNotFoundError.
    os.makedirs(json_path, exist_ok=True)

    file_path = os.path.join(json_path, f"{filename}.json")
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(encoded_mapping, f)


In [211]:
# Location of the cleaned dataset produced by the previous stage.
aviation_data_path = os.path.join(parent_folder, 'data', 'clean_aviation_data_2.csv')

# The file is decoded as latin-1 rather than pandas' default encoding.
df = pd.read_csv(
    filepath_or_buffer=aviation_data_path,
    encoding='latin-1',
)

In [212]:
# index = df[df["location"] == "Zolfo Springs, FL"].index[0]
# print(index)


In [213]:
# gui = show(df)

In [214]:
df.dtypes

Unnamed: 0               int64
location                object
injury_severity         object
aircraft_damage         object
make                    object
model                   object
number_of_engines      float64
engine_type             object
weather_conditions      object
phase_of_flight         object
Year                     int64
Month                    int64
Day                      int64
injuries               float64
pax_onboard            float64
fatality_percentage    float64
survived               float64
AmateurBuilt            object
dtype: object

In [215]:
# Remove every run of digits from the severity labels. The pattern is a raw
# string and regex=True is passed explicitly: pandas >= 2.0 treats the pattern
# as a literal substring by default, which would leave the digits in place.
df['injury_severity'] = df['injury_severity'].str.replace(r'\d+', '', regex=True)

In [216]:
# Columns whose values get surrounding spaces and parentheses trimmed.
cols = ['injury_severity']
for col in cols:
    # Each value is cast to str first, so non-string entries are trimmed
    # using their string form.
    df[col] = df[col].map(lambda value: str(value).strip(' ()'))

In [217]:
# Give the label column its modelling name and bring the amateur-built flag
# in line with the snake_case used by the other columns.
df = df.rename(
    columns=dict(
        injury_severity='target',
        AmateurBuilt='amateur_built',
    )
)

In [218]:
# gui = show(df)

In [219]:
# Fold the 'Incident' label into 'Non-Fatal' so the target has two classes.
df['target'] = df['target'].replace({'Incident': 'Non-Fatal'})

In [220]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,"CAMBRIA, NY",Non-Fatal,Destroyed,Unknown,64,1.0,Reciprocating,VMC,CLIMB,1982,6,13,1.0,1.0,0.0,1.0,No
1,1,"MCWHORTER, KY",Non-Fatal,Destroyed,Unknown,KR-2,1.0,Reciprocating,VMC,CRUISE,1982,7,1,2.0,2.0,0.0,2.0,Yes
2,2,"FREDERICK, MD",Fatal,Destroyed,Unknown,WINDWAGON,1.0,Reciprocating,VMC,APPROACH,1982,7,16,1.0,1.0,100.0,0.0,Yes
3,3,"VENTURA, CA",Non-Fatal,Destroyed,Unknown,MIDGET MUSTANG,1.0,Reciprocating,VMC,MANEUVERING,1982,8,21,0.0,1.0,0.0,1.0,Yes
4,4,"SIDNEY, NE",Non-Fatal,Substantial,Unknown,SKYBOLT,1.0,Reciprocating,VMC,LANDING,1982,8,24,0.0,1.0,0.0,1.0,Yes


In [221]:
df.target.value_counts(ascending=False)

Non-Fatal    39511
Fatal         9150
Name: target, dtype: int64

In [222]:
df.dtypes

Unnamed: 0               int64
location                object
target                  object
aircraft_damage         object
make                    object
model                   object
number_of_engines      float64
engine_type             object
weather_conditions      object
phase_of_flight         object
Year                     int64
Month                    int64
Day                      int64
injuries               float64
pax_onboard            float64
fatality_percentage    float64
survived               float64
amateur_built           object
dtype: object

In [223]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,"CAMBRIA, NY",Non-Fatal,Destroyed,Unknown,64,1.0,Reciprocating,VMC,CLIMB,1982,6,13,1.0,1.0,0.0,1.0,No
1,1,"MCWHORTER, KY",Non-Fatal,Destroyed,Unknown,KR-2,1.0,Reciprocating,VMC,CRUISE,1982,7,1,2.0,2.0,0.0,2.0,Yes
2,2,"FREDERICK, MD",Fatal,Destroyed,Unknown,WINDWAGON,1.0,Reciprocating,VMC,APPROACH,1982,7,16,1.0,1.0,100.0,0.0,Yes
3,3,"VENTURA, CA",Non-Fatal,Destroyed,Unknown,MIDGET MUSTANG,1.0,Reciprocating,VMC,MANEUVERING,1982,8,21,0.0,1.0,0.0,1.0,Yes
4,4,"SIDNEY, NE",Non-Fatal,Substantial,Unknown,SKYBOLT,1.0,Reciprocating,VMC,LANDING,1982,8,24,0.0,1.0,0.0,1.0,Yes


In [224]:
df.columns

Index(['Unnamed: 0', 'location', 'target', 'aircraft_damage', 'make', 'model',
       'number_of_engines', 'engine_type', 'weather_conditions',
       'phase_of_flight', 'Year', 'Month', 'Day', 'injuries', 'pax_onboard',
       'fatality_percentage', 'survived', 'amateur_built'],
      dtype='object')

In [225]:
df['location'].unique()

array(['CAMBRIA, NY', 'MCWHORTER, KY', 'FREDERICK, MD', ..., 'KENTON, OH',
       'SPRUCE CREEK, FL', 'FLORENCE, NJ'], dtype=object)

In [226]:
from sklearn.preprocessing import LabelEncoder
import warnings

# Silence every warning category from this point on.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
warnings.filterwarnings('ignore')

# Categorical columns that are replaced by integer codes.
columns_to_encode = [
    'target',
    'location',
    'aircraft_damage',
    'make',
    'model',
    'engine_type',
    'weather_conditions',
    'phase_of_flight',
    'amateur_built',
]

for column in columns_to_encode:
    # A fresh encoder per column, so each column gets its own code space.
    labelencoder = LabelEncoder()

    # Fit on the column's values, then overwrite the column with the codes.
    encoded_values = labelencoder.fit_transform(df[column].values)
    df[column] = encoded_values

    # Persist this column's code-to-label lookup as <column>.json.
    make_json(labelencoder, column)

In [227]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,1725,1,0,2962,975,1.0,0,2,1,1982,6,13,1.0,1.0,0.0,1.0,0
1,1,7690,1,0,2962,3850,1.0,0,2,2,1982,7,1,2.0,2.0,0.0,2.0,1
2,2,4220,0,0,2962,6614,1.0,0,2,0,1982,7,16,1.0,1.0,100.0,0.0,1
3,3,12732,1,0,2962,4232,1.0,0,2,6,1982,8,21,0.0,1.0,0.0,1.0,1
4,4,11329,1,2,2962,5814,1.0,0,2,5,1982,8,24,0.0,1.0,0.0,1.0,1


In [232]:
# Keep only the listed columns, with the target first; any column not
# listed here (such as 'Unnamed: 0') is dropped.
df = df[[
    'target',
    'location',
    'aircraft_damage',
    'make',
    'model',
    'number_of_engines',
    'engine_type',
    'weather_conditions',
    'phase_of_flight',
    'Year',
    'Month',
    'Day',
    'injuries',
    'pax_onboard',
    'fatality_percentage',
    'survived',
    'amateur_built',
]]

In [235]:
df.head()

Unnamed: 0,target,location,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,1,1725,0,2962,975,1.0,0,2,1,1982,6,13,1.0,1.0,0.0,1,0
1,1,7690,0,2962,3850,1.0,0,2,2,1982,7,1,2.0,2.0,0.0,2,1
2,0,4220,0,2962,6614,1.0,0,2,0,1982,7,16,1.0,1.0,100.0,0,1
3,1,12732,0,2962,4232,1.0,0,2,6,1982,8,21,0.0,1.0,0.0,1,1
4,1,11329,2,2962,5814,1.0,0,2,5,1982,8,24,0.0,1.0,0.0,1,1


In [233]:
# Destination for the label-encoded dataset.
encoded_aviation_data_path = os.path.join(parent_folder, 'data', 'encoded_aviation_data_3.csv')

# NOTE(review): the row index is written too (pandas default index=True), so
# the file gains an unnamed leading column - confirm downstream readers expect it.
df.to_csv(path_or_buf=encoded_aviation_data_path)

In [234]:
# Destination for the copy of the encoded dataset used for visualisation.
viz_aviation_data_path = os.path.join(parent_folder, 'data', 'viz_aviation_data.csv')

# NOTE(review): the row index is written too (pandas default index=True), so
# the file gains an unnamed leading column - confirm downstream readers expect it.
df.to_csv(path_or_buf=viz_aviation_data_path)