<b>Import Pandas for later use</b>

In [1]:
from pandas import Series, DataFrame
import pandas as pd

<b>Read the CSV file in</b>

In [None]:
# Scratch call removed: `pd.read_csv()` without a file path raises TypeError,
# which stops a Restart & Run All. The real load is done by the read_csv call on the CSV file below.

In [2]:
# Load the raw table from the CSV export sitting next to the notebook
df = pd.read_csv(filepath_or_buffer='Flight Risk JSON.csv')

In [3]:
# Peek at the first two rows of the freshly loaded frame
df.head(2)

Unnamed: 0,date,plane_type,loc,country,ref,airline,fat,px,cat,phase,...,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31
0,d,t,l,c,r,o,f,px,cat,p,...,,,,,,,,,,
1,1993-01-06,Dash 8-311,near Paris Charles de Gualle,France,D-BEAT,Lufthansa Cityline,4,20,A1,approach,...,,,,,,,,,,


In [4]:
# Remove the first row (label 0): it holds short placeholder codes ('d', 't', 'l', ...)
# rather than a real record.
# errors='ignore' keeps the cell idempotent: re-running it after label 0 is already
# gone is a no-op instead of a KeyError.
df = df.drop(index=0, errors='ignore')

In [5]:
# Inspect the column labels; the output lists the named columns followed by
# trailing 'Unnamed: N' columns
df.columns

Index([u'date', u'plane_type', u'loc', u'country', u'ref', u'airline', u'fat',
       u'px', u'cat', u'phase', u'cert', u'meta', u'cause', u'notes',
       u'Unnamed: 14', u'Unnamed: 15', u'Unnamed: 16', u'Unnamed: 17',
       u'Unnamed: 18', u'Unnamed: 19', u'Unnamed: 20', u'Unnamed: 21',
       u'Unnamed: 22', u'Unnamed: 23', u'Unnamed: 24', u'Unnamed: 25',
       u'Unnamed: 26', u'Unnamed: 27', u'Unnamed: 28', u'Unnamed: 29',
       u'Unnamed: 30', u'Unnamed: 31'],
      dtype='object')

In [6]:
# Keep only the thirteen named columns that come before 'notes';
# 'notes' holds long free-text strings that this analysis does not use.
df = df[[
    'date', 'plane_type', 'loc', 'country', 'ref', 'airline', 'fat',
    'px', 'cat', 'phase', 'cert', 'meta', 'cause',
]]

In [7]:
# Look at the first five rows of the trimmed frame
df.head()

Unnamed: 0,date,plane_type,loc,country,ref,airline,fat,px,cat,phase,cert,meta,cause
1,1993-01-06,Dash 8-311,near Paris Charles de Gualle,France,D-BEAT,Lufthansa Cityline,4,20,A1,approach,confirmed,human_error,pilot & ATC error
2,1993-01-09,Hawker Siddeley HS-748-234 Srs,near Surabaya Airport,Indonesia,PK-IHE,Bouraq Indonesia,15,29,A1,initial_climb,probable,mechanical,engine failure
3,1993-01-31,Shorts SC.7 Skyvan 3-100,Mt. Kapur,Indonesia,9M-PID,Pan Malaysian Air Transport,14,29,A1,en_route,probable,weather,low visibility
4,1993-02-08,Tupolev 154M,near Tehran-Mehra...,Iran,EP-ITD,Iran Air Tours,131,67,A1,en_route,confirmed,human_error,"pilot error, collision"
5,1993-02-28,Dornier 228-201,near Orchid Island,Taiwan,B-12238,Formosa Airlines,6,22,A1,approach,probable,weather,bad weather


In [8]:
# Write the trimmed frame to disk as CSV for later use in Tableau
# (the row index is written as the first column, which is the to_csv default).
df.to_csv(path_or_buf='processed.csv', index=True)

In [9]:
# Rank by the 'fat' (fatality) column, highest first, and show the top three.
# NOTE: 'fat' still holds text at this point, so the ordering is lexicographic
# rather than numeric.
df.sort_values('fat', ascending=False).head(3)

Unnamed: 0,date,plane_type,loc,country,ref,airline,fat,px,cat,phase,cert,meta,cause
98,1995-12-07,Tupolev 154B,near Grossevichi,Russia,RA-85164,Khabarovsk United Air,98,59,A1,en_route,confirmed,mechanical,fuel imbalance
246,2001-09-11,Boeing 767-222,"New York, NY",USA,N612UA,United Airlines,965,165,H1,en_route,confirmed,criminal,intentionally crashed
346,2006-10-29,Boeing 737-2B7,Abuja International Airport,Nigeria,5N-BFK,ADC Airlines,96,59,A1,initial_climb,confirmed,human_error,pilot error


In [10]:
# The values in 'fat' are strings, not numbers, so convert the column to int.
# Missing entries are replaced with 0 first, before the conversion.
df['fat'] = df['fat'].fillna(0).astype(int)

In [11]:
# Ten deadliest incidents, now that 'fat' is numeric.
# The first two rows are the 9/11 attacks (meta == 'criminal').
# Apart from criminal activity, which other causes account for the most fatalities?
# NOTE(review): the original plan here was to leave out the 2014-07-17 row
# (meta cause is 'criminal' but no detailed cause was recorded) and to keep the columns
# ['date','loc','country','fat','meta','cause']; the later top-10 cells keep that
# row and omit 'cause' -- confirm which is intended.
df.sort_values('fat', ascending=False).head(10)

Unnamed: 0,date,plane_type,loc,country,ref,airline,fat,px,cat,phase,cert,meta,cause
247,2001-09-11,Boeing 767-223ER,"New York, NY",USA,N334AA,American Airlines,1692,216,H1,en_route,confirmed,criminal,intentionally crashed
246,2001-09-11,Boeing 767-222,"New York, NY",USA,N612UA,United Airlines,965,165,H1,en_route,confirmed,criminal,intentionally crashed
125,1996-11-12,Boeing 747-168B,"Charkhi Dadri, Haryana",India,HZ-AIH,Saudi Arabian Airlines,312,98,A1,en_route,confirmed,human_error,"pilot error, mid air collision"
433,2014-07-17,Boeing 777-2H6ER,near Donetsk,Ukraine,9M-MRD,Malaysia Airlines,298,96,A1,en_route,suspected,criminal,unknown
253,2001-11-12,Airbus A300B4-605R,"Belle Harbor, NY",USA,N14053,American Airlines,265,91,A1,en_route,suspected,human_error,pilot error
43,1994-04-26,Airbus A300b4-622r,Nagoya-Komaki,Japan,B-1816,China Airlines,264,91,A1,approach,probable,human_error,pilot error
432,2014-03-08,Boeing 777-2H6ER,South China Sea,"Malaysia, Pacific Ocean",9M-MRO,Malaysia Airlines,239,87,A1,en_route,unknown,unknown,unknown
102,1996-01-08,Antonov An-32B,N'Dolo Airport,DR Congo (then Zaire),RA-262222,Moscow Airways,237,87,A1,initial_climb,probable,human_error,overload
150,1997-09-26,Airbus A300,near Medan-Polonia Airport,Indonesia,PK-GAI,Garuda Indonesia,234,86,A1,approach,probable,human_error,atc error & bad weather
115,1996-07-17,Boeing 747-131,"near East Moriches, NY",USA,N93119,Trans World Airlines,230,86,A1,en_route,confirmed,mechanical,fuel tank explosion


In [12]:
# df2: a copy of df ordered by fatality count, largest first
df2 = df.sort_values(by=['fat'], ascending=[False])

In [13]:
# Top 10 after skipping the two 9/11 rows: positions 2-11 of the ranking,
# restricted to five columns and renumbered from 0.
(
    df2
    .iloc[2:12]
    .loc[:, ['date', 'loc', 'country', 'fat', 'meta']]
    .reset_index(drop=True)
)

Unnamed: 0,date,loc,country,fat,meta
0,1996-11-12,"Charkhi Dadri, Haryana",India,312,human_error
1,2014-07-17,near Donetsk,Ukraine,298,criminal
2,2001-11-12,"Belle Harbor, NY",USA,265,human_error
3,1994-04-26,Nagoya-Komaki,Japan,264,human_error
4,2014-03-08,South China Sea,"Malaysia, Pacific Ocean",239,unknown
5,1996-01-08,N'Dolo Airport,DR Congo (then Zaire),237,human_error
6,1997-09-26,near Medan-Polonia Airport,Indonesia,234,human_error
7,1996-07-17,"near East Moriches, NY",USA,230,mechanical
8,1998-09-02,"Atlantic Ocean near St Margarets Bay, Nova Scotia",Canada,229,mechanical
9,1997-08-06,Nimitz Hill,Guam,228,human_error


In [14]:
# Save the ten-row table (positions 2-11 of the ranking, five columns,
# index renumbered from 0) to a CSV file.
(
    df2
    .iloc[2:12]
    .loc[:, ['date', 'loc', 'country', 'fat', 'meta']]
    .reset_index(drop=True)
    .to_csv('top10.csv')
)