## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import radians, cos, sin, asin, sqrt
from datetime import datetime as dt

%matplotlib inline

In [None]:
# import tensorflow as tf
# print("tf version = ", tf.__version__)
# with tf.device("/gpu:0"):
#     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
#     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
#     c = tf.matmul(a, b)
# with tf.Session() as sess:
#     print (sess.run(c))

## Read data

In [2]:
admin = pd.read_csv('./AirTracks/all_csv/Jan2017/admin.csv')

In [3]:
admin.head(2)

Unnamed: 0,FlightId,FlightCallsign,AircraftModel,AircraftRegistration,Airline,Origin,Destination,SchdeuledDeparture,ScheduledArrival,RealDeparture,EstimatedArrival,FlightTime
0,c244ac4,SAS1749,ATR 72-600,OY-JZE,SAS,Tallinn Lennart Meri Airport,Stockholm Arlanda Airport,1484156000.0,1484161000.0,1484159000.0,,3762.0
1,c32d217,OKA2925,Boeing 737-8AS,B-5578,OKAir,Changsha Huanghua International Airport,Hangzhou Xiaoshan International Airport,1484694000.0,1484698000.0,1484694000.0,1484698000.0,3850.0


## Rename columns

In [4]:
admin.rename(columns={
    'FlightId': 'flight_id',
    'FlightCallsign': 'flight_callsign',
    'AircraftModel': 'aircraft_model',
    'AircraftRegistration': 'aircraft_registration',
    'SchdeuledDeparture': 'scheduled_departure_utc',
    'ScheduledArrival': 'scheduled_arrival_utc',
    'RealDeparture': 'real_departure_utc',
    'EstimatedArrival': 'estimated_arrival_utc',
    'FlightTime': 'real_flight_duration'
     }, inplace=True)

admin.columns = admin.columns.map(lambda x: x.lower())

In [5]:
admin.columns

Index(['flight_id', 'flight_callsign', 'aircraft_model',
       'aircraft_registration', 'airline', 'origin', 'destination',
       'scheduled_departure_utc', 'scheduled_arrival_utc',
       'real_departure_utc', 'estimated_arrival_utc', 'real_flight_duration'],
      dtype='object')

In [6]:
admin.dtypes

flight_id                   object
flight_callsign             object
aircraft_model              object
aircraft_registration       object
airline                     object
origin                      object
destination                 object
scheduled_departure_utc    float64
scheduled_arrival_utc      float64
real_departure_utc         float64
estimated_arrival_utc      float64
real_flight_duration       float64
dtype: object

## Handling null data

In [7]:
admin.isnull().sum()

flight_id                       0
flight_callsign              3851
aircraft_model              18818
aircraft_registration       20741
airline                     24109
origin                      41361
destination                 88792
scheduled_departure_utc     23538
scheduled_arrival_utc       23538
real_departure_utc          64318
estimated_arrival_utc      502967
real_flight_duration       160380
dtype: int64

In [8]:
admin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1158127 entries, 0 to 1158126
Data columns (total 12 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   flight_id                1158127 non-null  object 
 1   flight_callsign          1154276 non-null  object 
 2   aircraft_model           1139309 non-null  object 
 3   aircraft_registration    1137386 non-null  object 
 4   airline                  1134018 non-null  object 
 5   origin                   1116766 non-null  object 
 6   destination              1069335 non-null  object 
 7   scheduled_departure_utc  1134589 non-null  float64
 8   scheduled_arrival_utc    1134589 non-null  float64
 9   real_departure_utc       1093809 non-null  float64
 10  estimated_arrival_utc    655160 non-null   float64
 11  real_flight_duration     997747 non-null   float64
dtypes: float64(5), object(7)
memory usage: 106.0+ MB


In [None]:
# Drop rows where both origin and destination are NaN
# admin[~(admin['origin'].isnull() & admin['destination'].isnull())]

In [None]:
# Drop rows where either origin and destination are NaN
# admin.dropna(subset=['origin', 'destination'], inplace=True)

In [9]:
# ls_object_columns = admin.loc[:, admin.dtypes == object].columns.tolist()
str_cols = admin.columns[admin.dtypes==object]

In [10]:
# Fill NaN with empty string
admin[str_cols] = admin[str_cols].fillna('')

In [11]:
# Strip leading and trailing spaces in object columns
admin[str_cols] = admin[str_cols].apply(lambda x: x.str.strip())

## Convert to datetime

In [12]:
def get_utc_datetime(value):
    try:
        return dt.fromtimestamp(value)
    except:
        return pd.NaT
    
admin['scheduled_departure_dt'] = admin['scheduled_departure_utc'].map(get_utc_datetime)
admin['scheduled_arrival_dt'] = admin['scheduled_arrival_utc'].map(get_utc_datetime)
admin['real_departure_dt'] = admin['real_departure_utc'].map(get_utc_datetime)
admin['estimated_arrival_dt'] = admin['estimated_arrival_utc'].map(get_utc_datetime)

**ScheduledArrival and ScheduledDeparture**

In [13]:
admin['scheduled_arrival_dt'][:5]

0   2017-01-12 02:55:00
1   2017-01-18 08:15:00
2   2017-01-18 06:55:00
3   2017-01-19 16:00:00
4   1970-01-01 07:30:00
Name: scheduled_arrival_dt, dtype: datetime64[ns]

In [14]:
admin['scheduled_departure_dt'][:5]

0   2017-01-12 01:40:00
1   2017-01-18 06:55:00
2   2017-01-18 02:40:00
3   2017-01-19 13:35:00
4   1970-01-01 07:30:00
Name: scheduled_departure_dt, dtype: datetime64[ns]

Observation: There are some dates where year = 1970. Check if there are dates that do not fall within Jan 2017

In [15]:
admin['scheduled_arrival_year'] = admin['scheduled_arrival_dt'].dt.year.astype('Int64')
admin['scheduled_arrival_month'] = admin['scheduled_arrival_dt'].dt.month.astype('Int64')
admin['scheduled_arrival_day'] = admin['scheduled_arrival_dt'].dt.day.astype('Int64')

In [16]:
admin['scheduled_departure_year'] = admin['scheduled_departure_dt'].dt.year.astype('Int64')
admin['scheduled_departure_month'] = admin['scheduled_departure_dt'].dt.month.astype('Int64')
admin['scheduled_departure_day'] = admin['scheduled_departure_dt'].dt.day.astype('Int64')

In [17]:
admin['scheduled_arrival_year'].value_counts(dropna=False).sort_index()

1970     128300
2016        691
2017    1005598
NaN       23538
Name: scheduled_arrival_year, dtype: Int64

In [18]:
admin['scheduled_arrival_month'].value_counts(dropna=False).sort_index()

1      1132784
2         1114
12         691
NaN      23538
Name: scheduled_arrival_month, dtype: Int64

In [19]:
admin['scheduled_arrival_day'].value_counts(dropna=False).sort_index()

1      155243
2       33402
3       32255
4       31822
5       32915
6       36249
7       28600
8       21304
9        4243
10      14092
11      32730
12      30556
13      10439
14      34495
15      33462
16      37084
17      35475
18      35181
19      35782
20      35365
21      35349
22      33942
23      36959
24      37419
25      38499
26      40999
27      41485
28      40693
29      38743
30      40729
31      39078
NaN     23538
Name: scheduled_arrival_day, dtype: Int64

In [20]:
admin['scheduled_departure_year'].value_counts(dropna=False).sort_index()

1970     128300
2016       3524
2017    1002765
NaN       23538
Name: scheduled_departure_year, dtype: Int64

In [21]:
admin['scheduled_departure_month'].value_counts(dropna=False).sort_index()

1      1131064
2            1
12        3524
NaN      23538
Name: scheduled_departure_month, dtype: Int64

In [22]:
admin['scheduled_departure_day'].value_counts(dropna=False).sort_index()

1      155169
2       33510
3       32118
4       31752
5       33169
6       36585
7       28289
8       17350
9        7585
10      14064
11      33359
12      29961
13      10971
14      34679
15      33698
16      36879
17      35448
18      35284
19      35626
20      35689
21      35272
22      34125
23      36959
24      37288
25      38642
26      41338
27      41430
28      40776
29      38452
30      40733
31      38389
NaN     23538
Name: scheduled_departure_day, dtype: Int64

In [23]:
admin['scheduled_arrival_dt'].isnull().sum()

23538

In [24]:
admin['scheduled_departure_dt'].isnull().sum()

23538

In [25]:
schedule_arrival_missing_index = admin.loc[admin['scheduled_arrival_dt'].isnull(), 'scheduled_arrival_dt'].index.tolist()
# # returns the same result
# admin['ScheduledArrival'][admin['ScheduledArrival'].isnull()].index

In [26]:
schedule_departure_missing_index = admin.loc[admin['scheduled_departure_dt'].isnull(), 'scheduled_departure_dt'].index.tolist()

In [27]:
np.array_equal(schedule_departure_missing_index, schedule_arrival_missing_index)

True

Observation: Rows with NaT for ScheduledArrival also had NaT for ScheduledDeparture

In [28]:
estimated_arrival_missing_index = admin.loc[admin['estimated_arrival_dt'].isnull(), 'estimated_arrival_dt'].index.tolist()

In [29]:
len(estimated_arrival_missing_index)

502967

In [30]:
len(list(set(estimated_arrival_missing_index).intersection(set(schedule_departure_missing_index))))

23538

EstimatedArrival shares common missing values for ScheduledArrival and ScheduledDeparture

In [31]:
# # Calculate scheduled flight time in timedelta type
# admin['scheduled_flight_time'] = admin['scheduled_arrival'] - admin['scheduled_departure']

# # Convert scheduled flight time from timedelta to seconds
# admin['scheduled_flight_time'] = admin['scheduled_flight_time'].map(lambda x: x.total_seconds())

In [32]:
admin['scheduled_flight_duration'] = admin['scheduled_arrival_utc'] - admin['scheduled_departure_utc']

**FlightTime**

In [33]:
admin.dtypes

flight_id                            object
flight_callsign                      object
aircraft_model                       object
aircraft_registration                object
airline                              object
origin                               object
destination                          object
scheduled_departure_utc             float64
scheduled_arrival_utc               float64
real_departure_utc                  float64
estimated_arrival_utc               float64
real_flight_duration                float64
scheduled_departure_dt       datetime64[ns]
scheduled_arrival_dt         datetime64[ns]
real_departure_dt            datetime64[ns]
estimated_arrival_dt         datetime64[ns]
scheduled_arrival_year                Int64
scheduled_arrival_month               Int64
scheduled_arrival_day                 Int64
scheduled_departure_year              Int64
scheduled_departure_month             Int64
scheduled_departure_day               Int64
scheduled_flight_duration       

In [34]:
# (admin['estimated_arrival'] - admin['real_departure'])[1].total_seconds()
(admin['estimated_arrival_utc'] - admin['real_departure_utc'])[1]

4135.0

In [35]:
admin.shape

(1158127, 23)

In [36]:
admin['real_flight_duration'].isnull().sum()

160380

In [37]:
real_flight_duration_null_idx = admin[admin['real_flight_duration'].isnull()].index.tolist()

In [38]:
real_departure_null_idx = admin[admin['real_departure_utc'].isnull()].index.tolist()

In [39]:
print(len(real_flight_duration_null_idx))
len(real_departure_null_idx)

160380


64318

In [40]:
len(set(real_flight_duration_null_idx).intersection(real_departure_null_idx))

40619

In [41]:
# admin.dropna(subset=['flight_time'], inplace=True)

In [42]:
# admin['real_departure'].isnull().sum()flight_time_null_idx

In [43]:
# def get_h_m_s(value):
#     h = value // 3600
#     m = value % 3600 // 60
#     s = value % 60
#     try:
#         return int(h), int(m), int(s)
#     except:
#         return np.nan

In [44]:
# admin['flight_time'].apply(get_h_m_s)

In [45]:
# import datetime
# str(datetime.timedelta(seconds=7144))

In [46]:
# # Reference: https://stackoverflow.com/questions/775049/how-do-i-convert-seconds-to-hours-minutes-and-seconds
# m, s = divmod(7144, 60)
# h, m = divmod(m, 60)
# h, m, s

In [47]:
# # Reference: https://stackoverflow.com/questions/1384406/convert-seconds-to-hhmmss-in-python
# s = 7144
# m = s // 60
# h = m // 60
# h, m%60, s%60

## Analysis of a single flight (flight_id = c244ac4)

In [48]:
admin.columns

Index(['flight_id', 'flight_callsign', 'aircraft_model',
       'aircraft_registration', 'airline', 'origin', 'destination',
       'scheduled_departure_utc', 'scheduled_arrival_utc',
       'real_departure_utc', 'estimated_arrival_utc', 'real_flight_duration',
       'scheduled_departure_dt', 'scheduled_arrival_dt', 'real_departure_dt',
       'estimated_arrival_dt', 'scheduled_arrival_year',
       'scheduled_arrival_month', 'scheduled_arrival_day',
       'scheduled_departure_year', 'scheduled_departure_month',
       'scheduled_departure_day', 'scheduled_flight_duration'],
      dtype='object')

In [49]:
admin[admin.flight_id == 'c244ac4']

Unnamed: 0,flight_id,flight_callsign,aircraft_model,aircraft_registration,airline,origin,destination,scheduled_departure_utc,scheduled_arrival_utc,real_departure_utc,...,scheduled_arrival_dt,real_departure_dt,estimated_arrival_dt,scheduled_arrival_year,scheduled_arrival_month,scheduled_arrival_day,scheduled_departure_year,scheduled_departure_month,scheduled_departure_day,scheduled_flight_duration
0,c244ac4,SAS1749,ATR 72-600,OY-JZE,SAS,Tallinn Lennart Meri Airport,Stockholm Arlanda Airport,1484156000.0,1484161000.0,1484159000.0,...,2017-01-12 02:55:00,2017-01-12 02:27:23,NaT,2017,1,12,2017,1,12,4500.0


In [50]:
import datetime
str(datetime.timedelta(seconds=admin.loc[admin.flight_id == 'c244ac4', 'real_flight_duration'].values[0]))

'1:02:42'

## Route

In [51]:
admin['route'] = admin.origin + " -> " + admin.destination

In [52]:
admin['route'].value_counts()[:20]

 ->                                                                                              35666
Seoul Gimpo International Airport -> Jeju International Airport                                   2058
Jeju International Airport -> Seoul Gimpo International Airport                                   1971
Sydney Kingsford Smith Airport -> Melbourne Airport                                               1554
Ho Chi Minh City International Airport -> Hanoi Noi Bai International Airport                     1466
Melbourne Airport -> Sydney Kingsford Smith Airport                                               1454
Hanoi Noi Bai International Airport -> Ho Chi Minh City International Airport                     1402
Delhi Indira Gandhi International Airport -> Mumbai Chhatrapati Shivaji International Airport     1233
Mumbai Chhatrapati Shivaji International Airport -> Delhi Indira Gandhi International Airport     1195
Dubai International Airport ->                                           

In [53]:
# admin.dropna(subset=['real_flight_duration']).isnull().sum()

In [54]:
admin.isnull().sum()

flight_id                         0
flight_callsign                   0
aircraft_model                    0
aircraft_registration             0
airline                           0
origin                            0
destination                       0
scheduled_departure_utc       23538
scheduled_arrival_utc         23538
real_departure_utc            64318
estimated_arrival_utc        502967
real_flight_duration         160380
scheduled_departure_dt        23538
scheduled_arrival_dt          23538
real_departure_dt             64318
estimated_arrival_dt         502967
scheduled_arrival_year        23538
scheduled_arrival_month       23538
scheduled_arrival_day         23538
scheduled_departure_year      23538
scheduled_departure_month     23538
scheduled_departure_day       23538
scheduled_flight_duration     23538
route                             0
dtype: int64

In [55]:
mask = admin['route'] == 'Taiwan Taoyuan International Airport -> Hong Kong International Airport'
df_twn_hkg = admin[mask]

In [56]:
df_twn_hkg.sort_values(by='scheduled_departure_dt')['scheduled_departure_dt'].head(32)

930360    1970-01-01 07:30:00
554909    1970-01-01 07:30:00
311509    1970-01-01 07:30:00
959131    1970-01-01 07:30:00
268583    1970-01-01 07:30:00
792492    1970-01-01 07:30:00
1069001   1970-01-01 07:30:00
431010    1970-01-01 07:30:00
1069647   1970-01-01 07:30:00
957928    1970-01-01 07:30:00
138451    1970-01-01 07:30:00
1104197   1970-01-01 07:30:00
1102567   1970-01-01 07:30:00
469801    1970-01-01 07:30:00
1034334   1970-01-01 07:30:00
315696    1970-01-01 07:30:00
337403    1970-01-01 07:30:00
509209    1970-01-01 07:30:00
708040    1970-01-01 07:30:00
24196     1970-01-01 07:30:00
266773    1970-01-01 07:30:00
789218    1970-01-01 07:30:00
17778     1970-01-01 07:30:00
113366    1970-01-01 07:30:00
757536    1970-01-01 07:30:00
730102    1970-01-01 07:30:00
254181    1970-01-01 07:30:00
93302     1970-01-01 07:30:00
1060331   1970-01-01 07:30:00
833060    1970-01-01 07:30:00
749480    1970-01-01 07:30:00
369045    2016-12-31 22:00:00
Name: scheduled_departure_dt, dtype: dat

In [57]:
df_twn_hkg['scheduled_departure_dt'].sort_values()[:32]

930360    1970-01-01 07:30:00
554909    1970-01-01 07:30:00
311509    1970-01-01 07:30:00
959131    1970-01-01 07:30:00
268583    1970-01-01 07:30:00
792492    1970-01-01 07:30:00
1069001   1970-01-01 07:30:00
431010    1970-01-01 07:30:00
1069647   1970-01-01 07:30:00
957928    1970-01-01 07:30:00
138451    1970-01-01 07:30:00
1104197   1970-01-01 07:30:00
1102567   1970-01-01 07:30:00
469801    1970-01-01 07:30:00
1034334   1970-01-01 07:30:00
315696    1970-01-01 07:30:00
337403    1970-01-01 07:30:00
509209    1970-01-01 07:30:00
708040    1970-01-01 07:30:00
24196     1970-01-01 07:30:00
266773    1970-01-01 07:30:00
789218    1970-01-01 07:30:00
17778     1970-01-01 07:30:00
113366    1970-01-01 07:30:00
757536    1970-01-01 07:30:00
730102    1970-01-01 07:30:00
254181    1970-01-01 07:30:00
93302     1970-01-01 07:30:00
1060331   1970-01-01 07:30:00
833060    1970-01-01 07:30:00
749480    1970-01-01 07:30:00
369045    2016-12-31 22:00:00
Name: scheduled_departure_dt, dtype: dat

In [58]:
df_twn_hkg['scheduled_arrival_dt'].sort_values()[:32]

138451    1970-01-01 07:30:00
792492    1970-01-01 07:30:00
959131    1970-01-01 07:30:00
266773    1970-01-01 07:30:00
1069001   1970-01-01 07:30:00
431010    1970-01-01 07:30:00
1069647   1970-01-01 07:30:00
93302     1970-01-01 07:30:00
268583    1970-01-01 07:30:00
930360    1970-01-01 07:30:00
469801    1970-01-01 07:30:00
315696    1970-01-01 07:30:00
337403    1970-01-01 07:30:00
113366    1970-01-01 07:30:00
509209    1970-01-01 07:30:00
1034334   1970-01-01 07:30:00
1102567   1970-01-01 07:30:00
1104197   1970-01-01 07:30:00
554909    1970-01-01 07:30:00
833060    1970-01-01 07:30:00
311509    1970-01-01 07:30:00
708040    1970-01-01 07:30:00
957928    1970-01-01 07:30:00
1060331   1970-01-01 07:30:00
254181    1970-01-01 07:30:00
757536    1970-01-01 07:30:00
24196     1970-01-01 07:30:00
730102    1970-01-01 07:30:00
17778     1970-01-01 07:30:00
789218    1970-01-01 07:30:00
749480    1970-01-01 07:30:00
369045    2016-12-31 23:30:00
Name: scheduled_arrival_dt, dtype: datet

In [59]:
df_twn_hkg[df_twn_hkg['scheduled_departure_dt'].isnull()].index

Int64Index([841654], dtype='int64')

In [60]:
df_twn_hkg.loc[841654]

flight_id                                                              c0d8908
flight_callsign                                                         CAL641
aircraft_model                                                 Airbus A350-941
aircraft_registration                                                  B-18901
airline                                                         China Airlines
origin                                    Taiwan Taoyuan International Airport
destination                                    Hong Kong International Airport
scheduled_departure_utc                                                    NaN
scheduled_arrival_utc                                                      NaN
real_departure_utc                                                         NaN
estimated_arrival_utc                                                      NaN
real_flight_duration                                                      5438
scheduled_departure_dt                              

In [61]:
df_twn_hkg[df_twn_hkg['real_flight_duration'].isnull()].index

Int64Index([  24196,  131209,  254181,  266773,  268583,  431010,  509209,
             631005,  708040,  792492,  833060,  860491,  930360,  957928,
             959131, 1034334, 1060331, 1069001, 1104197, 1157632],
           dtype='int64')

In [62]:
# (df_twn_hkg['scheduled_arrival_year'].dropna().astype(int).astype(str) + '-' + \
# df_twn_hkg['scheduled_arrival_month'].dropna().astype(int).astype(str) + '-' + \
# df_twn_hkg['scheduled_arrival_day'].dropna().astype(int).astype(str)) \
# .value_counts() \
# .sort_index()
df_twn_hkg['scheduled_arrival_dt'].dt.date.value_counts(dropna=False).sort_index()

1970-01-01    31
2016-12-31     1
2017-01-01    35
2017-01-02    36
2017-01-03    38
2017-01-04    33
2017-01-05    39
2017-01-06    42
2017-01-07    31
2017-01-08    26
2017-01-09     6
2017-01-10    11
2017-01-11    36
2017-01-12    38
2017-01-13    12
2017-01-14    39
2017-01-15    49
2017-01-16    35
2017-01-17    43
2017-01-18    42
2017-01-19    37
2017-01-20    38
2017-01-21    46
2017-01-22    36
2017-01-23    39
2017-01-24    43
2017-01-25    44
2017-01-26    46
2017-01-27    40
2017-01-28    47
2017-01-29    38
2017-01-30    40
2017-01-31    33
NaN            1
Name: scheduled_arrival_dt, dtype: int64

In [63]:
df_twn_hkg['scheduled_departure_dt'].dt.date.value_counts(dropna=False).sort_index()

1970-01-01    31
2016-12-31     3
2017-01-01    34
2017-01-02    37
2017-01-03    39
2017-01-04    33
2017-01-05    39
2017-01-06    44
2017-01-07    29
2017-01-08    23
2017-01-09     8
2017-01-10    11
2017-01-11    37
2017-01-12    37
2017-01-13    15
2017-01-14    39
2017-01-15    45
2017-01-16    37
2017-01-17    43
2017-01-18    43
2017-01-19    36
2017-01-20    40
2017-01-21    43
2017-01-22    35
2017-01-23    40
2017-01-24    44
2017-01-25    45
2017-01-26    44
2017-01-27    43
2017-01-28    45
2017-01-29    36
2017-01-30    39
2017-01-31    33
NaN            1
Name: scheduled_departure_dt, dtype: int64

In [64]:
flights_in_1970 = df_twn_hkg[(df_twn_hkg['scheduled_arrival_dt'].dt.date == pd.Timestamp("1970-01-01 00:00:00")) & 
                             (df_twn_hkg['scheduled_departure_dt'].dt.date == pd.Timestamp("1970-01-01 00:00:00"))]
flights_in_1970.sort_values(by='real_departure_dt')

Unnamed: 0,flight_id,flight_callsign,aircraft_model,aircraft_registration,airline,origin,destination,scheduled_departure_utc,scheduled_arrival_utc,real_departure_utc,...,real_departure_dt,estimated_arrival_dt,scheduled_arrival_year,scheduled_arrival_month,scheduled_arrival_day,scheduled_departure_year,scheduled_departure_month,scheduled_departure_day,scheduled_flight_duration,route
749480,c0b7b08,EVA6521,Boeing 747-45EF(SCD),B-16481,EVA Air Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483229000.0,...,2017-01-01 08:07:58,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
1034334,c1011b8,UPS54,Boeing 747-4R7(F),N582UP,UPS,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483423000.0,...,2017-01-03 13:52:36,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
1104197,c108418,FDX9171,McDonnell Douglas MD-11F,N623FE,FedEx,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483442000.0,...,2017-01-03 19:15:53,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
113366,c12a899,CAL6843,Boeing 747-409(F),B-18720,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483516000.0,...,2017-01-04 15:51:28,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
930360,c1a53ee,FDX169,Boeing 777-FS2,N855FD,FedEx,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483777000.0,...,2017-01-07 16:16:04,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
1069647,c1a80e2,EVA6529,Boeing 747-45EF(SCD),B-16481,EVA Air Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483786000.0,...,2017-01-07 18:47:22,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
17778,c1bc4fa,EVA6521,Boeing 747-45E(BDSF),B-16406,EVA Air Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1483836000.0,...,2017-01-08 08:42:00,2017-01-08 10:14:01,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
268583,c21164c,FDX9171,McDonnell Douglas MD-11F,N573FE,FedEx,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1484047000.0,...,2017-01-10 19:16:18,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
730102,c2328ba,CAL6843,Boeing 747-409(F),B-18723,China Airlines Cargo,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1484120000.0,...,2017-01-11 15:40:58,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...
509209,c2a9dd4,FDX169,Boeing 777-FS2,N861FD,FedEx,Taiwan Taoyuan International Airport,Hong Kong International Airport,0.0,0.0,1484382000.0,...,2017-01-14 16:21:43,NaT,1970,1,1,1970,1,1,0.0,Taiwan Taoyuan International Airport -> Hong K...


In [73]:
flights_in_1970.shape

(31, 24)

In [74]:
flights_in_1970['flight_callsign'].value_counts()

EVA6521    6
UPS54      4
FDX169     4
FDX9171    3
CAL6843    3
CPA3331    2
EVA6529    2
CAL2919    2
AHK3331    1
CPA25      1
NCA699     1
CRK9269    1
HDA867     1
Name: flight_callsign, dtype: int64

In [75]:
flights_in_1970['aircraft_model'].value_counts()

Boeing 747-45E(BDSF)        5
Boeing 777-FS2              4
McDonnell Douglas MD-11F    3
Boeing 747-45EF(SCD)        3
Boeing 747-409(F)           3
Boeing 747-4R7(F)           3
Boeing 737-8FH              2
Boeing 777-267              2
Airbus A330-342             1
Boeing 747-481(F)           1
Airbus A3004F-605R          1
Boeing 747-44A(F)           1
Airbus A330-243F            1
Airbus A300F4-605R          1
Name: aircraft_model, dtype: int64

In [76]:
flights_in_1970['aircraft_registration'].value_counts()

B-16406    4
B-16481    3
B-18657    2
N582UP     2
B-HNB      2
B-LAA      1
N583UP     1
B-18710    1
N623FE     1
B-18720    1
N574UP     1
N857FD     1
N612FE     1
B-16401    1
N861FD     1
B-LNZ      1
B-LDC      1
N573FE     1
B-LDA      1
N855FD     1
B-18723    1
JA04KZ     1
N885FD     1
Name: aircraft_registration, dtype: int64

In [77]:
flights_in_1970['airline'].value_counts()

EVA Air Cargo                                     8
FedEx                                             7
UPS                                               4
China Airlines Cargo                              3
Air Hong Kong (DHL cs)                            2
China Airlines                                    2
Cathay Pacific                                    2
Hong Kong Airlines                                1
Cathay Dragon                                     1
Nippon Cargo Airlines (Green freighter Livery)    1
Name: airline, dtype: int64

In [79]:
df_twn_hkg.loc[df_twn_hkg['airline'] == 'EVA Air Cargo', 'real_flight_duration'].value_counts()

4473.0    8
4151.0    8
5000.0    7
4300.0    4
5235.0    3
Name: real_flight_duration, dtype: int64

## Trails

In [80]:
trails = pd.read_feather('./AirTracks/all_csv/Jan2017/trails.feather')

In [81]:
trails.rename(columns={
    'FlightId': 'flight_id',
    'Timestamp': 'timestamp_utc'
     }, inplace=True)

trails.columns = trails.columns.map(lambda x: x.lower())

In [None]:
# trails.loc[trails['flight_id'] == 'c0b7b08', 'timestamp'].max() - \
# trails.loc[trails['flight_id'] == 'c0b7b08', 'timestamp'].min()

In [170]:
# flight_c24c29f = trails.loc[trails['flight_id'] == 'c24c29f', :]

In [171]:
# flight_c24c29f.head()

In [172]:
# flight_c24c29f.loc[71866, 'timestamp_utc'] - \
# admin.loc[admin['flight_id'] == 'c24c29f', 'real_departure_utc']

In [208]:
trails[trails.flight_id.isin(df_twn_hkg.flight_id.unique())].shape

(289890, 7)

In [189]:
combined_twn_hkg = pd.merge(trails,
                            df_twn_hkg,
                            on='flight_id')
# del trails, df_twn_hkg

In [207]:
combined_twn_hkg.shape

(289890, 31)

In [194]:
combined_twn_hkg.head(3)

Unnamed: 0,flight_id,timestamp_utc,latitude,longitude,altitude,heading,speed,flight_callsign,aircraft_model,aircraft_registration,airline,origin,destination,scheduled_departure_utc,scheduled_arrival_utc,real_departure_utc,estimated_arrival_utc,real_flight_duration,scheduled_departure_dt,scheduled_arrival_dt,real_departure_dt,estimated_arrival_dt,scheduled_arrival_year,scheduled_arrival_month,scheduled_arrival_day,scheduled_departure_year,scheduled_departure_month,scheduled_departure_day,scheduled_flight_duration,route
0,c36108c,1484814608,25.078995,121.23764,0,317,0,CPA405,Airbus A330-343,B-HLU,Cathay Pacific (Oneworld livery),Taiwan Taoyuan International Airport,Hong Kong International Airport,1484814000.0,1484822000.0,1484816000.0,1484821000.0,5550.0,2017-01-19 16:20:00,2017-01-19 18:25:00,2017-01-19 16:49:23,2017-01-19 18:18:40,2017,1,19,2017,1,19,7500.0,Taiwan Taoyuan International Airport -> Hong Kong International Airport
1,c36108c,1484814671,25.078762,121.237846,0,317,9,CPA405,Airbus A330-343,B-HLU,Cathay Pacific (Oneworld livery),Taiwan Taoyuan International Airport,Hong Kong International Airport,1484814000.0,1484822000.0,1484816000.0,1484821000.0,5550.0,2017-01-19 16:20:00,2017-01-19 18:25:00,2017-01-19 16:49:23,2017-01-19 18:18:40,2017,1,19,2017,1,19,7500.0,Taiwan Taoyuan International Airport -> Hong Kong International Airport
2,c36108c,1484814702,25.078505,121.238098,0,312,10,CPA405,Airbus A330-343,B-HLU,Cathay Pacific (Oneworld livery),Taiwan Taoyuan International Airport,Hong Kong International Airport,1484814000.0,1484822000.0,1484816000.0,1484821000.0,5550.0,2017-01-19 16:20:00,2017-01-19 18:25:00,2017-01-19 16:49:23,2017-01-19 18:18:40,2017,1,19,2017,1,19,7500.0,Taiwan Taoyuan International Airport -> Hong Kong International Airport


In [195]:
combined_twn_hkg.sort_values(by=['flight_id', 'timestamp_utc'],
                             inplace=True)

In [220]:
combined_twn_hkg[combined_twn_hkg.duplicated(subset=['flight_id', 'timestamp_utc'])]

Unnamed: 0,flight_id,timestamp_utc,latitude,longitude,altitude,heading,speed,flight_callsign,aircraft_model,aircraft_registration,airline,origin,destination,scheduled_departure_utc,scheduled_arrival_utc,real_departure_utc,estimated_arrival_utc,real_flight_duration,scheduled_departure_dt,scheduled_arrival_dt,real_departure_dt,estimated_arrival_dt,scheduled_arrival_year,scheduled_arrival_month,scheduled_arrival_day,scheduled_departure_year,scheduled_departure_month,scheduled_departure_day,scheduled_flight_duration,route,time_since_real_departure
136362,c126012,1483506833,22.582352,117.917992,34000,237,461,CPA475,Boeing 777-367(ER),B-KPQ,Cathay Pacific,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483500000.0,1483508000.0,1483505000.0,1483510000.0,5725.0,2017-01-04 11:20:00,2017-01-04 13:25:00,2017-01-04 12:35:52,2017-01-04 14:14:21,2017,1,4,2017,1,4,7500.0,Taiwan Taoyuan International Airport -> Hong Kong International Airport,2281.0


In [221]:
combined_twn_hkg[(combined_twn_hkg['flight_id'] == 'c126012') &
                 (combined_twn_hkg['timestamp_utc'] == 1483506833)]

Unnamed: 0,flight_id,timestamp_utc,latitude,longitude,altitude,heading,speed,flight_callsign,aircraft_model,aircraft_registration,airline,origin,destination,scheduled_departure_utc,scheduled_arrival_utc,real_departure_utc,estimated_arrival_utc,real_flight_duration,scheduled_departure_dt,scheduled_arrival_dt,real_departure_dt,estimated_arrival_dt,scheduled_arrival_year,scheduled_arrival_month,scheduled_arrival_day,scheduled_departure_year,scheduled_departure_month,scheduled_departure_day,scheduled_flight_duration,route,time_since_real_departure
136361,c126012,1483506833,22.684389,118.088074,34000,237,461,CPA475,Boeing 777-367(ER),B-KPQ,Cathay Pacific,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483500000.0,1483508000.0,1483505000.0,1483510000.0,5725.0,2017-01-04 11:20:00,2017-01-04 13:25:00,2017-01-04 12:35:52,2017-01-04 14:14:21,2017,1,4,2017,1,4,7500.0,Taiwan Taoyuan International Airport -> Hong Kong International Airport,2281.0
136362,c126012,1483506833,22.582352,117.917992,34000,237,461,CPA475,Boeing 777-367(ER),B-KPQ,Cathay Pacific,Taiwan Taoyuan International Airport,Hong Kong International Airport,1483500000.0,1483508000.0,1483505000.0,1483510000.0,5725.0,2017-01-04 11:20:00,2017-01-04 13:25:00,2017-01-04 12:35:52,2017-01-04 14:14:21,2017,1,4,2017,1,4,7500.0,Taiwan Taoyuan International Airport -> Hong Kong International Airport,2281.0


In [226]:
combined_twn_hkg.loc[136361-3:136361+4, ['flight_id', 'timestamp_utc', 'latitude', 'longitude', 'heading', 'speed']]

Unnamed: 0,flight_id,timestamp_utc,latitude,longitude,heading,speed
136358,c126012,1483506598,22.827221,118.327026,237,459
136359,c126012,1483506673,22.739691,118.180527,237,460
136360,c126012,1483506723,22.684389,118.088074,237,460
136362,c126012,1483506833,22.582352,117.917992,237,461
136363,c126012,1483506938,22.43413,117.668556,237,462
136364,c126012,1483507003,22.357731,117.540176,237,461
136365,c126012,1483507018,22.345169,117.51149,247,461


Drop 136361 since it has the same latitude and longitude as 136360

In [232]:
combined_twn_hkg = combined_twn_hkg.drop(136361)
combined_twn_hkg.shape

KeyError: '[136361] not found in axis'

In [227]:
combined_twn_hkg['time_since_real_departure'] = combined_twn_hkg['timestamp_utc'] - combined_twn_hkg['real_departure_utc']

In [228]:
# Shift latitude and longitude by 1
# for comparison of distance
unique_flight_ids = combined_twn_hkg.flight_id.unique().tolist()

df_prev_lat_lon = pd.DataFrame(columns=['flight_id', 'timestamp_utc', 'prev_latitude', 'prev_longitude'])

for flight_id in unique_flight_ids:
    dummy_df = combined_twn_hkg.loc[combined_twn_hkg['flight_id'] == flight_id, 
                                    ['flight_id', 'timestamp_utc', 'latitude', 'longitude']]
    dummy_df[['latitude', 'longitude']] = dummy_df[['latitude', 'longitude']].shift(1)
    dummy_df.rename(columns={'latitude': 'prev_latitude',
                             'longitude': 'prev_longitude'}, inplace=True)
    df_prev_lat_lon = pd.concat([df_prev_lat_lon, dummy_df])
df_prev_lat_lon

Unnamed: 0,flight_id,timestamp_utc,prev_latitude,prev_longitude
87482,c0aabc0,1483193923,,
87483,c0aabc0,1483193942,25.089270,121.240158
87484,c0aabc0,1483194080,25.089512,121.240349
87485,c0aabc0,1483194271,25.089546,121.240372
87486,c0aabc0,1483194304,25.089550,121.240372
...,...,...,...,...
227685,c52959e,1485870339,22.308723,113.916298
227686,c52959e,1485870348,22.308483,113.916603
227687,c52959e,1485870355,22.308449,113.916817
227688,c52959e,1485870366,22.308659,113.916946


In [229]:
# print(len(unique_flight_ids))
# df_prev_lat_lon[['prev_latitude', 'prev_longitude']].isnull().sum()

In [230]:
merged_df = pd.merge(combined_twn_hkg,
                     df_prev_lat_lon,
                     how='left',
                     left_on=['flight_id', 'timestamp_utc'],
                     right_on=['flight_id', 'timestamp_utc'])
merged_df.shape

(289889, 33)

In [216]:
# np.array_equal(combined_twn_hkg.flight_id.value_counts().sort_index().values,
#                df_prev_lat_lon.flight_id.value_counts().sort_index().values)

In [217]:
# np.array_equal(combined_twn_hkg.timestamp_utc.tolist(),
#                df_prev_lat_lon.timestamp_utc.tolist())

In [100]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

## Plot of all flights (Jakarta Soekarno Hatta International Airport -> Singapore Changi Airport)

In [None]:
# Reference: https://pbpython.com/pandas-qcut-cut.html
# Reference: https://stackoverflow.com/questions/50145702/pandas-cut-doesnt-bin-zero-values
cut_labels = ['0 <= speed < 100', 
              '100 <= speed < 200',
              '200 <= speed < 300',
              '300 <= speed < 400',
              '400 <= speed < 500',
              '500 <= speed < 600',
              '600 <= speed < 700',
              '700 <= speed < 800']
cut_bins = [-np.inf, 99, 199, 299, 399, 499, 599, 699, 799]
combined_jkt_sin['speed_interval'] = pd.cut(combined_jkt_sin['speed'], bins=cut_bins, labels=cut_labels)

# pd.cut(combined_jkt_sin['speed'], bins=np.linspace(0, 800, 9))

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(ax = ax, x='longitude', y='latitude', data=combined_jkt_sin)
plt.show()

In [None]:
holding_stack_flight_ids = combined_jkt_sin.loc[combined_jkt_sin['latitude'] > 1.5, 'flight_id'].unique().tolist()

## Plot of all flights that did not fly a holding pattern (Jakarta Soekarno Hatta International Airport -> Singapore Changi Airport)

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(ax = ax, x='longitude', y='latitude', hue='speed_interval', marker='<',
#                 palette=sns.color_palette("Blues", 8),
                palette=sns.cubehelix_palette(8),
                data=combined_jkt_sin[~combined_jkt_sin['flight_id'].isin(holding_stack_flight_ids)])
plt.show();

## Plot of all flights that flew a holding pattern (Jakarta Soekarno Hatta International Airport -> Singapore Changi Airport)

In [None]:
combined_jkt_sin.loc[combined_jkt_sin['flight_id'].isin(holding_stack_flight_ids), 'speed_interval'].unique()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(ax = ax, x='longitude', y='latitude', hue='speed_interval', marker='<',
#                 palette=sns.color_palette("Blues", 7),
                palette=sns.cubehelix_palette(7),
                data=combined_jkt_sin[combined_jkt_sin['flight_id'].isin(holding_stack_flight_ids)])
plt.show();

In [None]:
combined_jkt_sin['real_departure'].min()

In [None]:
combined_jkt_sin['real_departure'].max()