## Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

%matplotlib inline

In [None]:
# import tensorflow as tf
# print("tf version = ", tf.__version__)
# with tf.device("/gpu:0"):
#     a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
#     b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
#     c = tf.matmul(a, b)
# with tf.Session() as sess:
#     print (sess.run(c))

## Read data

In [None]:
admin = pd.read_csv('./AirTracks/all_csv/Jan2017/admin.csv')

In [None]:
admin.head(2)

In [None]:
# trails = pd.read_feather('./AirTracks/all_csv/Jan2017/trails.feather')
# trails = pd.read_csv('./AirTracks/all_csv/Jan2017/trails.csv', low_memory=True)

## Rename columns

In [None]:
# Map the raw CamelCase headers onto the snake_case names used by the rest of
# the notebook.
# NOTE(review): 'SchdeuledDeparture' is the spelling this notebook was written
# against (presumably a typo in the CSV header itself -- confirm). The correctly
# spelled key is mapped as well, so a corrected export still yields a
# `scheduled_departure` column instead of failing later with a KeyError.
column_renames = {
    'FlightId': 'flight_id',
    'FlightCallsign': 'flight_callsign',
    'AircraftModel': 'aircraft_model',
    'AircraftRegistration': 'aircraft_registration',
    'SchdeuledDeparture': 'scheduled_departure',
    'ScheduledDeparture': 'scheduled_departure',
    'ScheduledArrival': 'scheduled_arrival',
    'RealDeparture': 'real_departure',
    'EstimatedArrival': 'estimated_arrival',
    'FlightTime': 'flight_time',
}

# rename() ignores mapping keys that are not present, so listing both spellings
# is safe. Rebinding (instead of inplace=True) keeps the cell re-runnable.
admin = admin.rename(columns=column_renames)

# Lower-case every header that was not explicitly renamed above.
admin.columns = admin.columns.str.lower()

In [None]:
admin.columns

In [None]:
admin.dtypes

## Handling null data

In [None]:
admin.isnull().sum()

In [None]:
admin.info()

In [None]:
# Drop rows where both origin and destination are NaN
# admin[~(admin['origin'].isnull() & admin['destination'].isnull())]

In [None]:
# Drop rows where either origin or destination is NaN
required_airport_columns = ['origin', 'destination']
admin = admin.dropna(subset=required_airport_columns)

In [None]:
# Names of all columns stored with the generic `object` dtype.
is_object_column = admin.dtypes.eq(object)
str_cols = admin.columns[is_object_column]

In [None]:
# Replace missing values in the object columns with the empty string
object_columns_filled = admin[str_cols].fillna('')
admin[str_cols] = object_columns_filled

In [None]:
# Remove leading/trailing whitespace from every object column
def _strip_column(column):
    """Return `column` with surrounding whitespace stripped from each value."""
    return column.str.strip()

admin[str_cols] = admin[str_cols].apply(_strip_column)

## Convert to datetime

In [None]:
def get_utc_datetime(value):
    """Convert a Unix timestamp (seconds since the epoch) to a naive UTC datetime.

    Parameters
    ----------
    value : int or float
        Seconds since 1970-01-01 00:00:00 UTC.

    Returns
    -------
    datetime.datetime or pandas.NaT
        Timezone-naive datetime expressed in UTC, or ``pd.NaT`` when `value`
        cannot be converted (NaN, None, non-numeric, out of range).
    """
    # Imported locally so the helper keeps working even if the global name
    # `datetime` is rebound (e.g. by a later `import datetime`).
    from datetime import datetime, timezone

    try:
        # fromtimestamp() without `tz` returns *local* time, which makes the
        # result depend on the machine running the notebook. Convert in UTC
        # and drop tzinfo so the column stays timezone-naive.
        return datetime.fromtimestamp(value, tz=timezone.utc).replace(tzinfo=None)
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures become NaT; anything else should surface.
        return pd.NaT
    
# Run every epoch-seconds column through get_utc_datetime, one column at a time.
for timestamp_col in ('scheduled_departure',
                      'scheduled_arrival',
                      'real_departure',
                      'estimated_arrival'):
    admin[timestamp_col] = admin[timestamp_col].map(get_utc_datetime)

**ScheduledArrival and ScheduledDeparture**

In [None]:
admin['scheduled_arrival'][:5]

In [None]:
admin['scheduled_departure'][:5]

Observation: Some dates have year = 1970. Check whether any other dates fall outside Jan 2017.

In [None]:
# Split the scheduled arrival into year / month / day helper columns.
for date_part in ('year', 'month', 'day'):
    admin['scheduled_arrival_' + date_part] = getattr(admin['scheduled_arrival'].dt, date_part)

In [None]:
# Split the scheduled departure into year / month / day helper columns.
for date_part in ('year', 'month', 'day'):
    admin['scheduled_departure_' + date_part] = getattr(admin['scheduled_departure'].dt, date_part)

In [None]:
admin['scheduled_arrival_year'].value_counts(dropna=False).sort_index()

In [None]:
admin['scheduled_arrival_month'].value_counts(dropna=False).sort_index()

In [None]:
admin['scheduled_arrival_day'].value_counts(dropna=False).sort_index()

In [None]:
admin['scheduled_arrival'].isnull().sum()

In [None]:
admin['scheduled_departure'].isnull().sum()

In [None]:
# Index labels of the rows whose scheduled arrival is missing (NaT).
arrival_is_missing = admin['scheduled_arrival'].isnull()
schedule_arrival_missing_index = admin.index[arrival_is_missing].tolist()
# # returns the same result
# admin['ScheduledArrival'][admin['ScheduledArrival'].isnull()].index

In [None]:
# Index labels of the rows whose scheduled departure is missing (NaT).
departure_is_missing = admin['scheduled_departure'].isnull()
schedule_departure_missing_index = admin.index[departure_is_missing].tolist()

In [None]:
np.array_equal(schedule_departure_missing_index, schedule_arrival_missing_index)

Observation: Rows with NaT for ScheduledArrival also had NaT for ScheduledDeparture

In [None]:
# Index labels of the rows whose estimated arrival is missing (NaT).
estimated_arrival_is_missing = admin['estimated_arrival'].isnull()
estimated_arrival_missing_index = admin.index[estimated_arrival_is_missing].tolist()

In [None]:
len(estimated_arrival_missing_index)

In [None]:
# Number of rows missing both the estimated arrival and the scheduled departure.
len(set(estimated_arrival_missing_index) & set(schedule_departure_missing_index))

Observation: EstimatedArrival shares missing rows with ScheduledArrival and ScheduledDeparture (the count above is the size of the overlap).

In [None]:
# Calculate scheduled flight time in timedelta type
admin['scheduled_flight_time'] = admin['scheduled_arrival'] - admin['scheduled_departure']

In [None]:
# Express the scheduled flight time as a float number of seconds
# (missing durations become NaN).
admin['scheduled_flight_time'] = admin['scheduled_flight_time'].dt.total_seconds()

**FlightTime**

In [None]:
# Spot check: elapsed seconds between real departure and estimated arrival for
# the row whose index *label* is 1.
# NOTE(review): `[1]` is a label lookup, not a position; rows were dropped
# earlier with dropna(), so this raises KeyError if label 1 was among them --
# confirm whether `.iloc[...]` was intended.
(admin['estimated_arrival'] - admin['real_departure'])[1].total_seconds()

In [None]:
admin.shape

In [None]:
# Keep only the rows that have a recorded flight time.
admin = admin.dropna(subset=['flight_time'])

In [None]:
admin['flight_time'].isnull().sum()

In [None]:
admin['real_departure'].isnull().sum()

In [None]:
def get_h_m_s(value):
    """Split a duration in seconds into whole (hours, minutes, seconds).

    Parameters
    ----------
    value : int or float
        Duration in seconds.

    Returns
    -------
    tuple of (int, int, int) or float
        ``(hours, minutes, seconds)``, or ``np.nan`` when `value` cannot be
        split (NaN, infinity, None, non-numeric).
    """
    try:
        # The arithmetic lives inside the try so that non-numeric input
        # (e.g. None) yields NaN like every other invalid value, instead of
        # raising before the handler is reached.
        minutes, seconds = divmod(value, 60)
        hours, minutes = divmod(minutes, 60)
        return int(hours), int(minutes), int(seconds)
    except (TypeError, ValueError, OverflowError):
        # int(nan) -> ValueError, int(inf) -> OverflowError, bad type -> TypeError
        return np.nan

In [None]:
admin['flight_time'].apply(get_h_m_s)

In [None]:
# Example: render 7144 seconds as an "H:MM:SS" string.
# NOTE: this import rebinds the global name `datetime` from the class imported
# at the top of the notebook (`from datetime import datetime`) to the module.
# Until the class is imported again, `datetime.fromtimestamp(...)` raises
# AttributeError in any cell executed after this one.
import datetime
str(datetime.timedelta(seconds=7144))

In [None]:
# Reference: https://stackoverflow.com/questions/775049/how-do-i-convert-seconds-to-hours-minutes-and-seconds
m, s = divmod(7144, 60)
h, m = divmod(m, 60)
h, m, s

In [None]:
# Reference: https://stackoverflow.com/questions/1384406/convert-seconds-to-hhmmss-in-python
s = 7144
m = s // 60
h = m // 60
h, m%60, s%60

## Analysis of a single flight (flight_id = c244ac4)

In [None]:
admin.columns

In [None]:
admin[admin.flight_id == 'c244ac4']

In [None]:
str(datetime.timedelta(seconds=admin.loc[admin.flight_id == 'c244ac4', 'flight_time'].values[0]))

## Singapore Changi Airport

In [None]:
df_origin_sca = admin.loc[admin.origin.str.contains('Changi', na=False)]

In [None]:
df_dest_sca = admin.loc[admin.destination.str.contains('Changi', na=False)]

In [None]:
print(df_origin_sca.shape)
df_dest_sca.shape

In [None]:
df_origin_sca.destination.value_counts()[:10]

In [None]:
df_dest_sca.origin.value_counts()[:10]

## Route

In [None]:
admin['route'] = admin.origin + " -> " + admin.destination

In [None]:
admin['route'].value_counts()[:20]

In [None]:
# admin.dropna(subset=['flight_time']).isnull().sum()

In [None]:
admin.isnull().sum()

In [None]:
# mask = df_dest_sca.origin.str.contains('Jakarta Soekarno Hatta International Airport')
# df_jkt_sin = df_dest_sca.loc[mask]
# del admin

In [None]:
# Select every flight on the Taoyuan -> Hong Kong route.
mask = admin['route'] == 'Taiwan Taoyuan International Airport -> Hong Kong International Airport'
# .copy() gives an independent frame, so later column assignments on the
# subset cannot trigger SettingWithCopyWarning.
df_twn_hkg = admin[mask].copy()
# `admin` is deliberately NOT deleted here: later cells still read
# `real_departure` from it, and deleting it makes them fail with NameError.

In [None]:
df_twn_hkg.sort_values(by='scheduled_departure').head(20)

In [None]:
df_twn_hkg['scheduled_departure'].sort_values()[:15]

In [None]:
df_twn_hkg['scheduled_arrival'].sort_values()[:15]

In [None]:
df_twn_hkg[df_twn_hkg['scheduled_departure'].isnull()].index

In [None]:
# Inspect one row by its index label.
# NOTE(review): 841654 is hard-coded, presumably taken from the output of the
# previous cell (rows with a missing scheduled departure) -- it raises KeyError
# if the input data or the earlier filtering changes.
df_twn_hkg.loc[841654]

In [None]:
# (df_twn_hkg['scheduled_arrival_year'].dropna().astype(int).astype(str) + '-' + \
# df_twn_hkg['scheduled_arrival_month'].dropna().astype(int).astype(str) + '-' + \
# df_twn_hkg['scheduled_arrival_day'].dropna().astype(int).astype(str)) \
# .value_counts() \
# .sort_index()
df_twn_hkg['scheduled_arrival'].dt.date.value_counts(dropna=False).sort_index()

In [None]:
df_twn_hkg['scheduled_departure'].dt.date.value_counts(dropna=False).sort_index()

In [None]:
# Flights whose scheduled arrival AND scheduled departure both fall on
# 1970-01-01, i.e. whose schedule timestamps were effectively zero.
# Compare normalised (midnight) timestamps with a Timestamp: comparing the
# `datetime.date` objects produced by `.dt.date` against a `pd.Timestamp`
# is never equal on pandas 2.x, which would leave this frame empty.
epoch_day = pd.Timestamp("1970-01-01")
arrives_on_epoch_day = df_twn_hkg['scheduled_arrival'].dt.normalize() == epoch_day
departs_on_epoch_day = df_twn_hkg['scheduled_departure'].dt.normalize() == epoch_day
flights_in_1970 = df_twn_hkg[arrives_on_epoch_day & departs_on_epoch_day]
flights_in_1970.sort_values(by='real_departure')

In [None]:
flights_in_1970['flight_callsign'].value_counts()

In [None]:
flights_in_1970['aircraft_model'].value_counts()

In [None]:
flights_in_1970['aircraft_registration'].value_counts()

In [None]:
flights_in_1970['airline'].value_counts()

In [None]:
df_twn_hkg.loc[df_twn_hkg['airline'] == 'EVA Air Cargo', 'flight_time'].value_counts()

In [None]:
trails = pd.read_feather('./AirTracks/all_csv/Jan2017/trails.feather')

In [None]:
# Give the flight key the same snake_case name used in `admin`.
trails = trails.rename(columns={'FlightId': 'flight_id'})

# Lower-case all remaining headers.
trails.columns = [column_name.lower() for column_name in trails.columns]

In [None]:
# Span between the first and last recorded trail timestamp of flight c0b7b08.
c0b7b08_timestamps = trails.loc[trails['flight_id'] == 'c0b7b08', 'timestamp']
c0b7b08_timestamps.max() - c0b7b08_timestamps.min()

In [None]:
flight_c24c29f = trails.loc[trails['flight_id'] == 'c24c29f', :]

In [None]:
flight_c24c29f.head()

In [None]:
# An earlier cell ran `import datetime`, which leaves the name bound to the
# module; re-import the class so the timestamp conversion works again.
from datetime import datetime

# Convert the trail point's epoch timestamp with the same helper that produced
# the `real_departure` column, so the two values are directly comparable.
# 71866 is the index label of a trail row of this flight (see head() above).
get_utc_datetime(float(flight_c24c29f.loc[71866, 'timestamp']))

In [None]:
admin.loc[admin['flight_id'] == 'c24c29f', 'real_departure'].values[0]

In [None]:
from datetime import datetime

# Seconds elapsed between the flight's real departure and the trail point at
# index label 71866. The trail timestamp goes through get_utc_datetime -- the
# helper used to build `real_departure` -- so both sides of the subtraction
# share one timestamp convention.
trail_point_time = get_utc_datetime(float(flight_c24c29f.loc[71866, 'timestamp']))
departure_times = admin.loc[admin['flight_id'] == 'c24c29f', 'real_departure']
(trail_point_time - departure_times).map(lambda x: int(x.total_seconds()))

In [None]:
# `df_jkt_sin` is not defined by any executed cell (its definition only exists
# as commented-out code), so build it here: flights arriving at Changi whose
# origin is Jakarta Soekarno Hatta.
jakarta_origin_mask = df_dest_sca['origin'].str.contains(
    'Jakarta Soekarno Hatta International Airport', regex=False, na=False)
df_jkt_sin = df_dest_sca.loc[jakarta_origin_mask]

# Attach the flight-level columns to every trail point of those flights
# (inner join on the shared flight key).
combined_jkt_sin = pd.merge(trails,
                            df_jkt_sin,
                            on='flight_id')
# Drop the large inputs; only the merged frame is used from here on.
del trails, df_jkt_sin

In [None]:
combined_jkt_sin.head(3)

## Plot of all flights (Jakarta Soekarno Hatta International Airport -> Singapore Changi Airport)

In [None]:
# Reference: https://pbpython.com/pandas-qcut-cut.html
# Reference: https://stackoverflow.com/questions/50145702/pandas-cut-doesnt-bin-zero-values
# Bucket the speed into 100-wide, left-closed intervals: [0, 100), [100, 200), ...
cut_labels = ['{} <= speed < {}'.format(lower, lower + 100) for lower in range(0, 800, 100)]
# The edges sit on the hundreds and right=False makes each bin [lower, upper),
# so the bins match the labels for non-integer speeds too (99.5 belongs to
# '0 <= speed < 100'). The first edge is -inf so that 0 (and any negative
# value) still lands in the first bin. Speeds >= 800 are left as NaN.
cut_bins = [-np.inf, 100, 200, 300, 400, 500, 600, 700, 800]
combined_jkt_sin['speed_interval'] = pd.cut(combined_jkt_sin['speed'],
                                            bins=cut_bins,
                                            labels=cut_labels,
                                            right=False)

# pd.cut(combined_jkt_sin['speed'], bins=np.linspace(0, 800, 9))

In [None]:
# Scatter every trail point of the merged frame by position
# (longitude on x, latitude on y).
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(ax = ax, x='longitude', y='latitude', data=combined_jkt_sin)
plt.show()

In [None]:
# Ids of flights with at least one trail point north of latitude 1.5; the
# sections below treat these as the flights that flew a holding pattern.
# NOTE(review): the 1.5 threshold is presumably read off the scatter plot
# above -- confirm.
holding_stack_flight_ids = combined_jkt_sin.loc[combined_jkt_sin['latitude'] > 1.5, 'flight_id'].unique().tolist()

## Plot of all flights that did not fly a holding pattern (Jakarta Soekarno Hatta International Airport -> Singapore Changi Airport)

In [None]:
# Trail points of the flights that are NOT in holding_stack_flight_ids,
# coloured by speed bucket.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(ax = ax, x='longitude', y='latitude', hue='speed_interval', marker='<',
#                 palette=sns.color_palette("Blues", 8),
                # 8 colours: one per speed_interval label
                palette=sns.cubehelix_palette(8),
                data=combined_jkt_sin[~combined_jkt_sin['flight_id'].isin(holding_stack_flight_ids)])
plt.show();

## Plot of all flights that flew a holding pattern (Jakarta Soekarno Hatta International Airport -> Singapore Changi Airport)

In [None]:
combined_jkt_sin.loc[combined_jkt_sin['flight_id'].isin(holding_stack_flight_ids), 'speed_interval'].unique()

In [None]:
# Trail points of the flights that ARE in holding_stack_flight_ids,
# coloured by speed bucket.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(ax = ax, x='longitude', y='latitude', hue='speed_interval', marker='<',
#                 palette=sns.color_palette("Blues", 7),
                # NOTE(review): 7 colours are requested while speed_interval is
                # created with 8 labels -- presumably only 7 buckets occur in
                # this subset; confirm seaborn accepts the shorter palette.
                palette=sns.cubehelix_palette(7),
                data=combined_jkt_sin[combined_jkt_sin['flight_id'].isin(holding_stack_flight_ids)])
plt.show();

In [None]:
combined_jkt_sin['real_departure'].min()

In [None]:
combined_jkt_sin['real_departure'].max()