https://www.transtats.bts.gov/Fields.asp?Table_ID=236

In [1]:
from __future__ import division #, print_function # Imports from __future__ since we're running Python 2

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `helpers` package) are picked up live.
%reload_ext autoreload
%autoreload 2

In [3]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Single seed shared by every stochastic step in this notebook (sampling, splitting, models).
random_state = 0
# Render matplotlib figures inline and use the ggplot look for all plots.
%matplotlib inline
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from datetime import datetime
from sklearn.ensemble import IsolationForest
from helpers.feature_engineering import dateStrToDayYear, getMappingOfSimilarCategoricalColumns, \
    compareSimilarCategoricalColumns 
from helpers.my_one_hot_encoder import MyOneHotEncoder
from helpers.py_helpers import is_number
from scipy.stats import skew, kurtosis
from helpers.outliers import MyOutliers
from sklearn.preprocessing import StandardScaler
from helpers.plot_helper import scatter_2d_label
from sklearn.decomposition import PCA # Import the PCA module
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA
from sklearn.manifold import MDS
from sklearn.dummy import DummyClassifier
from sklearn.manifold import Isomap
from helpers.performance_issues import subsample_keeping_class_proportions
from flights_delay.feature_processing import FlightDelayFeatureProcessing
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

In [4]:
# Keep the worker count low: using only a few cores instead of all of them
# avoids making the system unstable while models are being fitted.
n_jobs = 3
# Seeded NumPy generator, tied to the notebook-wide seed.
rng = np.random.RandomState(random_state)

# Flying to New York City - Use all training data

In [5]:
# Name of the boolean label column that is separated from the features further down.
target_col = 'IS_DELAYED'

In [6]:
# Build the CSV location relative to the current working directory, then
# resolve it to a canonical absolute path.
path_data = os.path.join(os.getcwd(), '../Data', 'train_data_numerical_normalized.csv')
path_data = os.path.realpath(path_data)
# Fail early if the dataset is not where we expect it.
assert os.path.isfile(path_data)
path_data

'/home/student/pligor.george@gmail.com/msc_Artificial_Intelligence/dme_Data_Mining/dmedatarats/Data/train_data_numerical_normalized.csv'

In [7]:
# Load the comma-separated training data; the first row holds the column names.
df = pd.read_csv(path_data, sep=',', header=0)
df.shape

(433495, 539)

In [8]:
# Eyeball 10 randomly drawn rows (seeded, so the same rows show on every run).
df.sample(10, random_state=random_state)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_NUM,DEP_TIME,DEP_DELAY,DEP_DELAY_GROUP,DISTANCE,DISTANCE_GROUP,...,DEP_TIME_BLK_9,DEP_TIME_BLK_10,DEP_TIME_BLK_11,DEP_TIME_BLK_12,DEP_TIME_BLK_13,DEP_TIME_BLK_14,DEP_TIME_BLK_15,DEP_TIME_BLK_16,DEP_TIME_BLK_17,IS_DELAYED
378736,-1.354511,-1.61941,0.369682,-0.96274,-0.954384,-0.001447,0.722605,1.164551,-0.233592,-0.25187,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
423345,1.358389,1.030006,-0.99447,0.553547,-0.977922,1.588134,0.195416,0.329982,2.114479,2.266057,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,False
150004,-0.450211,-0.441892,1.279117,0.553547,-0.775886,0.397407,-0.458298,-0.504587,-0.317054,-0.25187,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
233834,-0.450211,-0.736272,1.165437,-0.96274,2.407655,-0.785537,-0.289598,-0.504587,-0.6203,-0.611574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
406842,-0.450211,-0.441892,-0.880791,1.564406,-0.389467,0.8235,0.174328,0.329982,-0.408862,-0.25187,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,False
336024,-1.354511,-1.325031,-1.449188,-0.457311,-0.664733,-2.330368,-0.690261,-0.921871,0.167027,0.107834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
9065,-0.450211,-0.736272,-0.653432,1.564406,-0.312314,-0.752461,-0.247423,-0.087302,-0.408862,-0.25187,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
134634,1.358389,1.324385,-1.335508,0.553547,2.312195,0.553058,1.650457,2.416404,-0.552139,-0.611574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
256092,0.454089,0.441246,0.938079,-0.457311,0.678905,-1.135751,-0.500473,-0.504587,-1.050131,-0.971278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
297582,0.454089,0.146867,-0.767112,1.058976,2.584845,-0.989828,-0.289598,-0.504587,0.784647,0.827241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False


In [9]:
df.info()  # verify from the dtype summary that every column is numerical (or boolean)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 433495 entries, 0 to 433494
Columns: 539 entries, QUARTER to IS_DELAYED
dtypes: bool(1), float64(538)
memory usage: 1.7 GB


In [10]:
# Separate the label from the features: yy is the target column, XX is everything else.
yy = df[target_col]
XX = df.drop([target_col], axis='columns')
XX.shape, yy.shape

((433495, 538), (433495,))

In [11]:
del df  # to save memory: only XX and yy are used from here on

In [12]:
# Keep the columns whose name does not end in a number. One-hot columns carry a
# numeric suffix (e.g. DEP_TIME_BLK_9), so this leaves the original features.
# NOTE(review): relies on helpers.py_helpers.is_number accepting a single character - confirm.
columns_not_one_hot = np.array(
    list(filter(lambda name: not is_number(name[-1]), XX.columns))
)
columns_not_one_hot

array(['QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_NUM',
       'DEP_TIME', 'DEP_DELAY', 'DEP_DELAY_GROUP', 'DISTANCE',
       'DISTANCE_GROUP', 'YDAY'], 
      dtype='|S15')

In [13]:
# Summary statistics of the non-one-hot columns, to check the normalisation (mean ~0, std ~1).
XX[columns_not_one_hot].describe()

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_NUM,DEP_TIME,DEP_DELAY,DEP_DELAY_GROUP,DISTANCE,DISTANCE_GROUP,YDAY
count,433495.0,433495.0,433495.0,433495.0,433495.0,433495.0,433495.0,433495.0,433495.0,433495.0,433495.0
mean,5.320645e-13,2.46634e-13,1.056013e-14,1.652396e-13,1.189981e-14,1.761014e-14,-8.641548e-14,-8.012561e-14,4.514881e-13,4.637949e-13,2.712315e-14
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-1.354511,-1.61941,-1.676546,-1.468169,-1.087113,-2.591083,-3.410555,-0.9218714,-1.338075,-1.330982,-1.749922
25%,-0.4502109,-0.7362718,-0.8807909,-0.9627398,-0.7850394,-0.8341777,-0.3739479,-0.5045869,-0.7162816,-0.6115742,-0.8562147
50%,-0.4502109,-0.1475127,0.02864386,0.04811833,-0.2495455,-0.01117555,-0.3106853,-0.5045869,-0.1556937,-0.2518703,-0.01055624
75%,0.4540889,0.735626,0.8243993,1.058976,0.3389093,0.8040441,-0.07872222,-0.0873025,0.4633178,0.4675374,0.8543217
max,1.358389,1.618765,1.733834,1.564406,3.776139,2.076487,29.99213,4.920111,5.462705,2.266057,1.757639


### Subsampling

In [14]:
# One stratified shuffle split whose "test" side holds 50% of the rows; it is
# used below purely to subsample while keeping the class proportions.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=random_state)

In [15]:
# Take the single (train, test) index pair produced by the splitter and keep
# only the test-side positions as the subsample ("survivors").
# The builtin next() works on Python 2 and 3, whereas the generator's .next()
# method exists on Python 2 only. A named throwaway variable is used instead of
# `_` so that IPython's last-output variable is not overwritten.
_unused_train_idx, survivors = next(sss.split(XX, yy))

In [16]:
# Row positions selected for the subsample.
survivors

array([337054, 157315, 332854, ..., 301122,  50102, 211954])

In [17]:
# `survivors` holds row positions, so both the features and the labels must be
# indexed positionally with .iloc. Plain `yy[survivors]` is a label lookup that
# only matches because the index happens to be the default RangeIndex.
XX_sub = XX.iloc[survivors]
yy_sub = yy.iloc[survivors]
XX_sub.shape, yy_sub.shape

((216748, 538), (216748,))

### Dummy Classifier

In [18]:
# Baseline that draws predictions at random according to the class frequencies.
# The strategy is pinned explicitly: 'stratified' was the default in the
# scikit-learn version that produced the recorded scores, but newer releases
# default to 'prior', which would silently change this baseline.
dc = DummyClassifier(strategy='stratified', random_state=random_state).fit(XX, yy)

In [19]:
# Score of the dummy baseline on the full dataset.
dc.score(XX, yy)

0.66220602313752175

In [20]:
# Baseline predictions for every row; reused by the metric cells below.
dummy_preds = dc.predict(XX)

In [21]:
# average=None returns one F1 score per class instead of a single aggregate.
arr = f1_score(yy, dummy_preds, average=None)
arr

array([ 0.78490239,  0.21366126])

In [22]:
# Default averaging ('binary'): F1 of the positive class, i.e. delayed flights.
f1_score(yy, dummy_preds)

0.21366126087423479

In [23]:
# Precision of the dummy baseline on the positive (delayed) class.
precision_score(yy, dummy_preds)

0.21376227623407043

In [24]:
# Support-weighted average of the per-class F1 scores (not the binary score).
f1_score(yy, dummy_preds, average='weighted')

0.66214804182359899

### Isolation Forest
The target tells us the contamination directly: it is the fraction of flights that are delayed.

In [25]:
# Fraction of delayed flights: the number of True labels over the total number of labels.
# The explicit float() keeps this a true division regardless of the Python version.
contamination = float(yy.sum()) / len(yy)
contamination

0.2148905985074799

In [26]:
# Unsupervised anomaly detector; the expected share of outliers is set to the
# delayed-flight fraction. n_estimators and max_samples are left at their defaults.
ad = IsolationForest(
    contamination=contamination,
    n_jobs=n_jobs,
    random_state=random_state,
)
ad

IsolationForest(bootstrap=False, contamination=0.214890598507,
        max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=3,
        random_state=0, verbose=0)

In [27]:
# Fit on the 50% stratified subsample only; the labels are not used.
ad.fit(XX_sub)

IsolationForest(bootstrap=False, contamination=0.214890598507,
        max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=3,
        random_state=0, verbose=0)

In [28]:
# Predict on the full dataset (not just the subsample used for fitting).
preds = ad.predict(XX)  # +1 is inlier while -1 is outlier

In [29]:
# Sanity check: there should be one prediction per row of XX.
preds.shape

(433495,)

In [30]:
# Peek at the first 60 raw predictions (+1 / -1).
preds[:60]

array([ 1,  1, -1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1])

In [31]:
# Convert the IsolationForest output to the target's boolean encoding:
# -1 (outlier) -> True (delayed), +1 (inlier) -> False (not delayed).
# One vectorised comparison replaces the earlier in-place relabelling, which
# only worked when the two assignments ran in that exact order and relied on
# the `np.bool` alias that was removed in NumPy 1.24.
y_pred = (preds == -1)

In [35]:
# average=None returns one F1 score per class instead of a single aggregate.
arr = f1_score(yy, y_pred, average=None)
arr

array([ 0.79112751,  0.23602157])

In [36]:
# Default averaging ('binary'): F1 of the positive class, i.e. delayed flights.
f1_score(yy, y_pred)

0.23602157469485988

In [37]:
# Precision of the isolation-forest predictions on the positive (delayed) class.
precision_score(yy, y_pred)

0.23622970211850736

In [39]:
# Support-weighted average of the per-class F1 scores.
f1_score(yy, y_pred, average='weighted')

0.67184046255063168

### Conclusion
The F1 scores, both for the is_delayed==True class and for the weighted average, are only slightly higher than those of the dummy classifier, meaning that our approach did not work with the current features.