In [83]:
# for data
import pandas as pd
import numpy as np
from pandasgui import show
import os
import json

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [84]:
parent_folder = os.path.dirname(os.getcwd())

In [85]:
json_path = os.path.join(parent_folder, "data", "jsons", "row-data")

def make_json(encoder, encoded_values, filename: str):
    categories = encoder.classes_
    encoded_mapping = dict(zip(categories, encoded_values))
    encoded_mapping = {str(k): str(v) for k, v in encoded_mapping.items()}

    file_path = os.path.join(json_path, f"{filename}.json")
    with open(file_path, 'w') as f:
        f.write(json.dumps(encoded_mapping))


In [86]:
aviation_data_path = os.path.join(
    parent_folder, 'data', 'clean_aviation_data_2.csv')


df = pd.read_csv(aviation_data_path, encoding='latin-1')

In [87]:
# gui = show(df)

In [88]:
df.dtypes

Unnamed: 0               int64
location                object
injury_severity         object
aircraft_damage         object
make                    object
model                   object
number_of_engines      float64
engine_type             object
weather_conditions      object
phase_of_flight         object
Year                     int64
Month                    int64
Day                      int64
injuries               float64
pax_onboard            float64
fatality_percentage    float64
survived               float64
AmateurBuilt            object
dtype: object

In [89]:
df['injury_severity'] = df['injury_severity'].str.replace('\d+', '')

In [90]:
cols = ['injury_severity']
for col in cols:
    df[col] = df[col].map(lambda x: str(x).lstrip(' ()').rstrip(' ()'))

In [91]:
df = df.rename(columns={'injury_severity': 'target',
               'AmateurBuilt': 'amateur_built'})

In [92]:
# gui = show(df)

In [93]:
df.replace({'target': {'Incident': 'Non-Fatal'}}, inplace=True)

In [94]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,"CAMBRIA, NY",Non-Fatal,Destroyed,Unknown,64,1.0,Reciprocating,VMC,CLIMB,1982,6,13,1.0,1.0,0.0,1.0,No
1,1,"MCWHORTER, KY",Non-Fatal,Destroyed,Unknown,KR-2,1.0,Reciprocating,VMC,CRUISE,1982,7,1,2.0,2.0,0.0,2.0,Yes
2,2,"FREDERICK, MD",Fatal,Destroyed,Unknown,WINDWAGON,1.0,Reciprocating,VMC,APPROACH,1982,7,16,1.0,1.0,100.0,0.0,Yes
3,3,"VENTURA, CA",Non-Fatal,Destroyed,Unknown,MIDGET MUSTANG,1.0,Reciprocating,VMC,MANEUVERING,1982,8,21,0.0,1.0,0.0,1.0,Yes
4,4,"SIDNEY, NE",Non-Fatal,Substantial,Unknown,SKYBOLT,1.0,Reciprocating,VMC,LANDING,1982,8,24,0.0,1.0,0.0,1.0,Yes


In [95]:
df.target.value_counts(ascending=False)

Non-Fatal    39511
Fatal         9150
Name: target, dtype: int64

In [96]:
df.dtypes

Unnamed: 0               int64
location                object
target                  object
aircraft_damage         object
make                    object
model                   object
number_of_engines      float64
engine_type             object
weather_conditions      object
phase_of_flight         object
Year                     int64
Month                    int64
Day                      int64
injuries               float64
pax_onboard            float64
fatality_percentage    float64
survived               float64
amateur_built           object
dtype: object

In [97]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,"CAMBRIA, NY",Non-Fatal,Destroyed,Unknown,64,1.0,Reciprocating,VMC,CLIMB,1982,6,13,1.0,1.0,0.0,1.0,No
1,1,"MCWHORTER, KY",Non-Fatal,Destroyed,Unknown,KR-2,1.0,Reciprocating,VMC,CRUISE,1982,7,1,2.0,2.0,0.0,2.0,Yes
2,2,"FREDERICK, MD",Fatal,Destroyed,Unknown,WINDWAGON,1.0,Reciprocating,VMC,APPROACH,1982,7,16,1.0,1.0,100.0,0.0,Yes
3,3,"VENTURA, CA",Non-Fatal,Destroyed,Unknown,MIDGET MUSTANG,1.0,Reciprocating,VMC,MANEUVERING,1982,8,21,0.0,1.0,0.0,1.0,Yes
4,4,"SIDNEY, NE",Non-Fatal,Substantial,Unknown,SKYBOLT,1.0,Reciprocating,VMC,LANDING,1982,8,24,0.0,1.0,0.0,1.0,Yes


In [98]:
df.columns

Index(['Unnamed: 0', 'location', 'target', 'aircraft_damage', 'make', 'model',
       'number_of_engines', 'engine_type', 'weather_conditions',
       'phase_of_flight', 'Year', 'Month', 'Day', 'injuries', 'pax_onboard',
       'fatality_percentage', 'survived', 'amateur_built'],
      dtype='object')

In [99]:
df['location'].unique()

array(['CAMBRIA, NY', 'MCWHORTER, KY', 'FREDERICK, MD', ..., 'KENTON, OH',
       'SPRUCE CREEK, FL', 'FLORENCE, NJ'], dtype=object)

In [100]:

# # Encoding categorical data values (Transforming object data types to integers)
# from sklearn.preprocessing import LabelEncoder
# labelencoder = LabelEncoder()

# # label vs hot_encoder???

# # Encode target column
# target_encoded_values = df.iloc[:, 0] = labelencoder.fit_transform(df.iloc[:, 0].values)
# make_json(labelencoder, target_encoded_values, 'target')

# # Encode location column
# location_encoded_values = df.iloc[:, 1] = labelencoder.fit_transform(df.iloc[:, 1].values)
# make_json(labelencoder, location_encoded_values, 'location')

# # Encode aircraft_damage
# aircraft_damage_encoded_values = df.iloc[:, 2] = labelencoder.fit_transform(df.iloc[:, 2].values)
# make_json(labelencoder, aircraft_damage_encoded_values, 'aircraft_damage')

# # Encode make
# make_encoded_values = df.iloc[:, 3] = labelencoder.fit_transform(df.iloc[:, 3].values)
# make_json(labelencoder, make_encoded_values, 'make')

# # Encode model
# model_encoded_values = df.iloc[:, 4] = labelencoder.fit_transform(df.iloc[:, 4].values)
# make_json(labelencoder, model_encoded_values, 'model')

# # Encode engine_type
# engine_type_encoded_values = df.iloc[:, 6] = labelencoder.fit_transform(df.iloc[:, 6].values)
# make_json(labelencoder, engine_type_encoded_values, 'engine_type')

# # Encode weather_conditions
# weather_conditions_encoded_values = df.iloc[:, 7] = labelencoder.fit_transform(df.iloc[:, 7].values)
# make_json(labelencoder, weather_conditions_encoded_values, 'weather_conditions')

# # Encode phase_of_flight
# phase_of_flight_encoded_values = df.iloc[:, 8] = labelencoder.fit_transform(df.iloc[:, 8].values)
# make_json(labelencoder, phase_of_flight_encoded_values, 'phase_of_flight')

# # Encode amateur_built
# amateur_built_encoded_values = df.iloc[:, 16] = labelencoder.fit_transform(df.iloc[:, 16].values)
# make_json(labelencoder, amateur_built_encoded_values, 'amateur_built')

In [101]:
from sklearn.preprocessing import LabelEncoder
import warnings

# Ignore all warnings
warnings.filterwarnings('ignore')

# Define the columns to be encoded
columns_to_encode = ['location', 'aircraft_damage', 'make', 'model',
                     'engine_type', 'weather_conditions', 'phase_of_flight', 'amateur_built']

for column in columns_to_encode:
    # Create a new LabelEncoder for each column
    labelencoder = LabelEncoder()

    # Encode the column
    encoded_values = df[column] = labelencoder.fit_transform(
        df[column].values)
    
    # x = list(zip(labelencoder.classes_, encoded_values))

    print(column, ": ", labelencoder.classes_)
    print("encoded_values", len(encoded_values))
    print("labelencoder.classes_", len(labelencoder.classes_))
    print("labelencoder.classes_", len(labelencoder))
    print("\n\n")

    # Save the encoded mapping to a JSON file
    make_json(labelencoder, encoded_values, column)

location :  ['(N) SKWENTNA, AK' '0WASSO, OK' '1 1/2 MI.N. MAY, KS' ... 'ZWINGLE, IA'
 'Zhejiang Province, P.R. China, China' 'Zolfo Springs, FL']
encoded_values 48661
labelencoder.classes_ 13797



aircraft_damage :  ['Destroyed' 'Minor' 'Substantial']
encoded_values 48661
labelencoder.classes_ 3



make :  ['1ST FTR GP' '67 FLYING DUTCHMAN' '85 MANISTA' ... 'ZIVKO AERONAUTICS'
 'ZUKOWSKI' 'ZWART']
encoded_values 48661
labelencoder.classes_ 3213



model :  ['&GCBC' '(MODIFIED)' '(SOLOY CONVERSION)' ... 'ZUNI II' 'Zlin 50LX'
 'Zodiac 601 XL']
encoded_values 48661
labelencoder.classes_ 6702



engine_type :  ['Reciprocating' 'Turbo Fan' 'Turbo Jet' 'Turbo Prop' 'Turbo Shaft'
 'Unknown']
encoded_values 48661
labelencoder.classes_ 6



weather_conditions :  ['IMC' 'UNK' 'VMC']
encoded_values 48661
labelencoder.classes_ 3



phase_of_flight :  ['APPROACH' 'CLIMB' 'CRUISE' 'DESCENT' 'GO-AROUND' 'LANDING' 'MANEUVERING'
 'OTHER' 'STANDING' 'TAKEOFF' 'TAXI' 'UNKNOWN']
encoded_values 48661
labe

In [102]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,1725,Non-Fatal,0,2962,975,1.0,0,2,1,1982,6,13,1.0,1.0,0.0,1.0,0
1,1,7690,Non-Fatal,0,2962,3850,1.0,0,2,2,1982,7,1,2.0,2.0,0.0,2.0,1
2,2,4220,Fatal,0,2962,6614,1.0,0,2,0,1982,7,16,1.0,1.0,100.0,0.0,1
3,3,12732,Non-Fatal,0,2962,4232,1.0,0,2,6,1982,8,21,0.0,1.0,0.0,1.0,1
4,4,11329,Non-Fatal,2,2962,5814,1.0,0,2,5,1982,8,24,0.0,1.0,0.0,1.0,1


In [103]:
# Encode model
df.iloc[:, 4] = labelencoder.fit_transform(df.iloc[:, 4].values)

In [104]:
# Encode phase_of_flight
df.iloc[:, 8] = labelencoder.fit_transform(df.iloc[:, 8].values)

In [105]:
# Encode amateur_built
df.iloc[:, 16] = labelencoder.fit_transform(df.iloc[:, 16].values)

In [106]:
df.head()

Unnamed: 0.1,Unnamed: 0,location,target,aircraft_damage,make,model,number_of_engines,engine_type,weather_conditions,phase_of_flight,Year,Month,Day,injuries,pax_onboard,fatality_percentage,survived,amateur_built
0,0,1725,Non-Fatal,0,2962,975,1.0,0,2,1,1982,6,13,1.0,1.0,0.0,1,0
1,1,7690,Non-Fatal,0,2962,3850,1.0,0,2,2,1982,7,1,2.0,2.0,0.0,2,1
2,2,4220,Fatal,0,2962,6614,1.0,0,2,0,1982,7,16,1.0,1.0,100.0,0,1
3,3,12732,Non-Fatal,0,2962,4232,1.0,0,2,6,1982,8,21,0.0,1.0,0.0,1,1
4,4,11329,Non-Fatal,2,2962,5814,1.0,0,2,5,1982,8,24,0.0,1.0,0.0,1,1


In [107]:
df = df[['target', 'location', 'aircraft_damage',
         'make', 'model', 'number_of_engines', 'engine_type',
         'weather_conditions', 'phase_of_flight', 'Year',
                 'Month', 'Day', 'injuries', 'pax_onboard', 'fatality_percentage',
                 'survived', 'amateur_built']]

In [108]:
encoded_aviation_data_path = os.path.join(
    parent_folder, 'data', 'encoded_aviation_data_3.csv')

df.to_csv(encoded_aviation_data_path)

In [109]:
viz_aviation_data_path = os.path.join(
    parent_folder, 'data', 'viz_aviation_data.csv')

df.to_csv(viz_aviation_data_path)