<a href="https://colab.research.google.com/github/nv-hiep/flight_delay_prediction/blob/master/step2_read_data_features_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 2: FEATURES SELECTION**

**Open this notebook from google drive**<br>
**Go to "Edit" -> "Notebook settings" and enable GPU.**


**Connect and authorize google drive with google colab:**

In [None]:
# Mount Google Drive at /content/gdrive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/gdrive')
# !ls

# Import Libraries



In [None]:
import os
import numpy   as np
import pandas  as pd
import seaborn as sns

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline  

# Data directory

In [None]:
# Root data folder on the mounted Drive
data_dir    = '/content/gdrive/My Drive/data'
# Make the data folder the working directory (same literal path as data_dir)
%cd '/content/gdrive/My Drive/data'

current_dir = os.getcwd()
print(current_dir)
# The empty last component makes os.path.join end the path with a separator
data_path = os.path.join(data_dir, 'flights', '')
print(data_path)

# Read flight data

In [None]:
# Load the merged flight table from the flights folder and preview the first rows
merged_csv = os.path.join(data_path, 'merged_data_janfeb.csv')
df = pd.read_csv(merged_csv)
df.head()

In [None]:
# Table dimensions as (rows, columns), then the column count and the column labels
print(df.shape)
print(len(df.columns))
print(df.columns)

In [None]:
# Count the null/NaN values in each column
df.isnull().sum()

In [None]:
# The per-column null counts are expected to be all zero at this point
# (NOTE(review): confirm against the output of df.isnull().sum()).
# Check the datatypes
df.info()

In [None]:
# List the column labels before any column is dropped
df.columns

# Drop unnecessary columns 

**Drop the columns 'AIRLINE_NAME', 'YEAR' and 'QUARTER': they carry no extra information. Only January and February 2017 are considered, so 'YEAR' and 'QUARTER' are constant, and 'AIRLINE_NAME' duplicates 'OP_UNIQUE_CARRIER'.**

In [None]:
# Drop the redundant columns. Reassigning instead of using inplace=True, together with
# errors='ignore', keeps the cell safe to re-run after the columns are already gone
# (the in-place version raises KeyError on a second run).
df = df.drop(columns=['AIRLINE_NAME', 'YEAR', 'QUARTER'], errors='ignore')

# Create TARGETS as classes

**Instead of predicting the delay time in minutes, I'll predict in delay intervals:**

**0: ARR_DELAY <= 0 for no delay**

**1: 0 < ARR_DELAY <= 30 (minutes)**

**2: 30 < ARR_DELAY <= 60 (minutes)**

**3: 60 < ARR_DELAY <= 120 (minutes)**

**4: 120 < ARR_DELAY**

In [None]:
# Bin the arrival delay (minutes) into five ordinal classes:
#   0: ARR_DELAY <= 0, 1: (0, 30], 2: (30, 60], 3: (60, 120], 4: > 120
# pd.cut is vectorized and uses right-closed intervals, which matches the boundaries above.
# A missing ARR_DELAY stays NaN instead of silently falling into class 4, which is what the
# chained comparisons did (every comparison with NaN is False).
delay_bin_edges = [-np.inf, 0, 30, 60, 120, np.inf]
df['DELAYED'] = pd.cut(df['ARR_DELAY'], bins=delay_bin_edges, labels=False)

In [None]:
# Preview the table with the new DELAYED column
df.head()

In [None]:
# Column labels after adding DELAYED
df.columns

# Convert time of Departure and arrival

In [None]:
def convert_time(x):
  """
  Convert a clock time written as an HHMM number into decimal hours.

  The hundreds and above are taken as hours and the last two digits as minutes,
  e.g. 1430 -> 14.5 and 945 -> 9.75. The result is rounded to 2 decimals.

  Parameters
  --------
  x : int or float
      Time encoded as HHMM (for example 5 for 00:05, 2400 for 24:00)

  Return
  --------
  float
      Time in hours. NaN input yields NaN instead of raising, so the function
      can be applied to columns that contain missing times.
  """
  # divmod splits hours/minutes exactly, without the float round-trip of x/100.
  hours, minutes = divmod(x, 100)
  return round(hours + minutes / 60, 2)

In [None]:
# Convert the scheduled (CRS_*) and actual departure/arrival times from HHMM to decimal
# hours, then fold 24.0 back to 0 so that midnight has a single representation.
# NOTE: the columns are overwritten, so this cell must run exactly once per data load.
for time_col in ['CRS_DEP_TIME', 'DEP_TIME', 'CRS_ARR_TIME', 'ARR_TIME']:
    hours = df[time_col].apply(convert_time)
    df[time_col] = hours.apply(lambda hour: 0 if hour == 24 else hour)

# Label-Encoding the categorical features

One-hot encoding would be the natural choice here; however, it would produce many more columns and make training much slower.

I will use the Label-encoder.

In [None]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()

# ORIGIN and DEST share one airport vocabulary, so both are encoded with the same mapping.
# The encoder is fit on the union of the two columns: fitting on ORIGIN alone makes
# transform(DEST) raise ValueError for any airport that appears only as a destination.
lb_make.fit(pd.concat([df['ORIGIN'], df['DEST']]))
df['ORIGIN'] = lb_make.transform(df['ORIGIN'])
df['DEST'] = lb_make.transform(df['DEST'])

# The carrier and the tail number each get their own independent mapping (the encoder is refit).
df['OP_UNIQUE_CARRIER'] = lb_make.fit_transform(df['OP_UNIQUE_CARRIER'])
df['TAIL_NUM'] = lb_make.fit_transform(df['TAIL_NUM'])

In [None]:
# Disabled alternative: one-hot encoding with pd.get_dummies (kept for reference only).
# NOTE(review): the last line refers to `df_sub`, which is not defined in this notebook;
# it would presumably have to be `df` if this block is ever re-enabled.
# dummy_fields = ['DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'TAIL_NUM', 'ORIGIN', 'DEST']
# for x in dummy_fields:
#   dummy = pd.get_dummies(df[x], drop_first=False, prefix=x)
#   df = pd.concat( [df, dummy], axis=1)
# df_sub.drop(dummy_fields, axis=1, inplace=True)

# Remove Missing Values

In [None]:
# Share of missing values per column, as a fraction between 0 and 1 (not percent), largest first
df_missing = (df.isnull().sum() / len(df)).sort_values(ascending = False)
df_missing

In [None]:
# Identify missing values above threshold: keep the labels of the columns whose
# missing share is greater than 0.75.
# NOTE: df_missing is rebound from the Series of fractions to an Index of labels, so
# re-running this cell alone would compare the labels themselves to 0.75.
df_missing = df_missing.index[df_missing > 0.75]

In [None]:
# Report how many columns were flagged ('%%' prints a literal percent sign)
print('There are %d columns with more than 75%% missing values' % len(df_missing))

In [None]:
# Remove, in place, the columns flagged as having more than 75% missing values.
df.drop(columns=df_missing, inplace=True)

# Drop Correlated Variables

Collinear variables are those which are highly correlated with one another. They can decrease the model's ability to learn, decrease model interpretability, and decrease generalization performance on the test set. Clearly, these are three things we want to increase, so removing collinear variables is a useful step. We will establish an admittedly arbitrary threshold for removing collinear variables, and then remove one out of any pair of variables whose correlation is above that threshold.

In [None]:
# Threshold for removing correlated variables
threshold = 0.9

# Correlation matrix with absolute values.
# Only numeric/bool columns are correlated. Older pandas dropped the other columns
# silently, while pandas >= 2.0 raises on them, so the selection is made explicit here.
numeric_df = df.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_df.corr().abs()
corr_matrix.head()

In [None]:
# Upper triangle of correlations
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Lower triangle of correlations: entries strictly below the diagonal are kept,
# everything else becomes NaN.
# The mask uses the builtin bool dtype; the np.bool alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24.
lower_mask = np.tril(np.ones(corr_matrix.shape, dtype=bool), k=-1)
lower_matrx = corr_matrix.where(lower_mask)
lower_matrx.head()

In [None]:
plt.figure(figsize=(14,10))

# Generate a mask for the upper triangle
# mask = np.triu(np.ones_like(corr, dtype=bool))

# Single-letter colour codes must be lowercase: current matplotlib rejects 'R'.
plt.title('Pearson Correlation of Features', y=1.05, size=20, color='r')

# lower_matrx holds absolute correlations below the diagonal; the NaN cells are left blank.
sns.heatmap(lower_matrx, linewidths=0.1, vmax=1.0,
            square=True, cmap=plt.cm.RdBu_r, linecolor='white', annot=True)
plt.show()

In [None]:
# Select columns with correlations above threshold.
# Each column of the lower triangle holds its correlations with the columns that come
# after it, so for every correlated pair the earlier column is the one flagged.
# The columns used later to filter the data and build the labels are never flagged;
# dropping one of them here would break the modeling cell.
protected_columns = {'DELAYED', 'ARR_DELAY', 'MONTH'}
features_to_drop = [column for column in lower_matrx.columns
                    if column not in protected_columns
                    and (lower_matrx[column] > threshold).any()]

print('There are %d columns to remove.' % (len(features_to_drop)))
features_to_drop

In [None]:
# Remove the flagged collinear columns in place (raises KeyError if run a second time)
df.drop(features_to_drop, axis=1, inplace=True)

In [None]:
# Preview the table after removing the collinear columns
df.head()

# Feature Selection using Feature Importances

Perform a feature removal by first removing all zero importance features from the model. If this leaves too many features, then we can consider removing the features with the lowest importance. We will use a Gradient Boosted Model from the LightGBM library to assess feature importances. If you're used to the Scikit-Learn library, the LightGBM library has an API that makes deploying the model very similar to using a Scikit-Learn model.

Since the LightGBM model does not need missing values to be imputed, we can directly fit on the training data. We will use Early Stopping to determine the optimal number of iterations and run the model twice, averaging the feature importances to try and avoid overfitting to a certain set of features.

**NOTE:**

**To save time, I will use ONLY 20,000 samples of January 2017 for the feature selection and 5,000 samples of January 2017 for the training in the next step.**

In [None]:
# Modeling
import lightgbm as lgb

# Feature selection uses only the first 20,000 January flights.
N_SELECTION_SAMPLES = 20_000
jan_sample = df[df.MONTH == 1].iloc[:N_SELECTION_SAMPLES]

# Labels and features are built as new objects. Unlike the filter + in-place drop + `del df`
# sequence, this avoids SettingWithCopyWarning and leaves `df` intact, so the cell can be re-run.
# ARR_DELAY is removed because DELAYED is derived from it; MONTH is constant after the filter.
y_train = jan_sample['DELAYED']
X_train = jan_sample.drop(columns=['DELAYED', 'ARR_DELAY', 'MONTH'])
del jan_sample

# Initialize an empty array to hold feature importances
feature_importances = np.zeros(X_train.shape[1])

# Create the model with several hyperparameters.
# The objective is left unset so that LGBMClassifier picks it from the number of classes
# in the target; DELAYED has five classes, so a hard-coded 'binary' objective is misleading.
model = lgb.LGBMClassifier(boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

X_train.head()

In [None]:
# Import
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics         import roc_auc_score

# Start from zero so that re-running this cell does not add to the importances of a previous run
feature_importances = np.zeros(X_train.shape[1])

# Fit the model twice on different train/validation splits; the importances are summed
# here and averaged afterwards to reduce the dependence on one particular split.
nruns = 2
for i in range(nruns):

    # Split into training and validation set (a different seed for every run)
    train_features, valid_features, train_y, valid_y = train_test_split(X_train, y_train, test_size = 0.25, random_state = i)

    # Train using early stopping. Early stopping and logging are passed as callbacks:
    # the early_stopping_rounds/verbose keyword arguments of fit() were removed in LightGBM 4.0.
    model.fit(train_features, train_y, eval_set = [(valid_features, valid_y)],
              eval_metric = 'logloss',
              callbacks = [lgb.early_stopping(stopping_rounds = 100),
                           lgb.log_evaluation(period = 200)])

    # Record the feature importances
    feature_importances += model.feature_importances_

In [None]:
# Average the summed importances over the runs and tabulate them, most important first.
mean_importance = feature_importances / nruns
feature_importances = (
    pd.DataFrame({'feature': list(X_train.columns), 'importance': mean_importance})
    .sort_values('importance', ascending = False)
)

feature_importances.head()

In [None]:
# Collect the names of the features whose averaged importance is exactly zero
has_zero_importance = feature_importances['importance'] == 0.0
zero_features = feature_importances.loc[has_zero_importance, 'feature'].tolist()

print('There are %d features with 0.0 importance' % len(zero_features))
print(zero_features)

feature_importances.tail()

In [None]:
def plot_feature_importances(df, threshold = 0.9):
    """
    Plot the 15 most important features and the cumulative importance of features.
    Prints the number of features needed to reach threshold cumulative importance.
    
    Parameters
    --------
    df : dataframe
        Dataframe of feature importances. Columns must be feature and importance
    threshold : float, default = 0.9
        Threshold for printing information about cumulative importances
        
    Return
    --------
    df : dataframe
        Dataframe ordered by feature importances with a normalized column (sums to 1)
        and a cumulative importance column. The previous index is kept as an
        'index' column by reset_index().
    
    """
    
    # Note: this changes the global matplotlib font size for every later figure as well
    plt.rcParams['font.size'] = 18
    
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = np.cumsum(df['importance_normalized'])

    # The (at most) 15 most important features; positions are reversed so that
    # the most important feature is drawn at the top of the horizontal bar chart
    top_features = df.head(15)
    bar_positions = list(reversed(range(len(top_features))))

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(bar_positions,
            top_features['importance_normalized'],
            align = 'center', edgecolor = 'k')
    
    # Set the yticks and labels
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    
    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()
    
    # Cumulative importance plot
    plt.figure(figsize = (8, 6))
    plt.plot(list(range(len(df))), df['cumulative_importance'], 'r-')
    plt.xlabel('Number of Features'); plt.ylabel('Cumulative Importance')
    plt.title('Cumulative Feature Importance')
    plt.show()
    
    # Position of the first feature whose cumulative importance exceeds the threshold.
    # If the threshold is never exceeded (e.g. threshold >= 1, or all importances are zero),
    # every feature is required; np.min over the empty match would raise ValueError.
    above_threshold = np.flatnonzero(df['cumulative_importance'].to_numpy() > threshold)
    n_required = int(above_threshold[0]) + 1 if above_threshold.size else len(df)
    print('%d features required for %0.2f of cumulative importance' % (n_required, threshold))
    
    return df

In [None]:
# Plot the importances and keep the returned frame, which adds the normalized and
# cumulative importance columns (default threshold of 0.9 for the printed summary)
norm_feature_importances = plot_feature_importances(feature_importances)

In [None]:
# Remove the zero-importance features in place (raises KeyError if run a second time)
X_train.drop( zero_features, axis=1, inplace=True )
print('Training shape: ', X_train.shape)

At this point, we can re-run the model to see if it identifies any more features with zero importance. In a way, we are implementing our own form of recursive feature elimination. Since we are repeating work, we should probably put the zero feature importance identification code in a function.

In [None]:
def identify_zero_importance_features(X, y, iterations = 2):
    """
    Identify zero importance features in a training dataset based on the 
    feature importances from a gradient boosting model. 
    
    Parameters
    --------
    X : dataframe
        Training features
        
    y : np.array
        Labels for training data
        
    iterations : integer, default = 2
        Number of random train/validation splits to fit on; the feature
        importances are averaged over these fits

    Return
    --------
    zero_features : list
        Names of the features whose averaged importance is exactly 0.0
    feature_importances : dataframe
        Columns feature and importance, sorted by decreasing importance
    """
    
    # Initialize an empty array to hold feature importances
    feature_importances = np.zeros(X.shape[1])

    # Create the model with several hyperparameters.
    # The objective is left unset so that LGBMClassifier picks it from the number of
    # classes in y instead of a hard-coded 'binary'.
    model = lgb.LGBMClassifier(boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')
    
    # Fit the model multiple times to reduce the dependence on one particular split
    for i in range(iterations):

        # Split into training and validation set (a different seed for every fit)
        train_features, valid_features, train_y, valid_y = train_test_split(X, y, test_size = 0.25, random_state = i)

        # Train using early stopping. Early stopping and logging are passed as callbacks:
        # the early_stopping_rounds/verbose keyword arguments of fit() were removed in LightGBM 4.0.
        model.fit(train_features, train_y, eval_set = [(valid_features, valid_y)],
                  eval_metric = 'logloss',
                  callbacks = [lgb.early_stopping(stopping_rounds = 100),
                               lgb.log_evaluation(period = 200)])

        # Record the feature importances (each fit contributes 1/iterations of the average)
        feature_importances += model.feature_importances_ / iterations
    
    feature_importances = pd.DataFrame({'feature': list(X.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)
    
    # Find the features with zero importance
    zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
    print('\nThere are %d features with 0.0 importance' % len(zero_features))
    
    return zero_features, feature_importances

In [None]:
# Second pass on the reduced feature set; feature_importances is replaced by the new table
second_round_zero_features, feature_importances = identify_zero_importance_features(X_train, y_train)

In [None]:
# Plot the second-round importances and report how many features reach 95% cumulative importance
norm_feature_importances = plot_feature_importances(feature_importances, threshold = 0.95)

In [None]:
# Display the second-round importance table
feature_importances

In [None]:
# Display the table with the normalized and cumulative importance columns
norm_feature_importances

In [None]:
# Threshold for cumulative importance (rebinds `threshold`, which earlier held the correlation cut-off)
threshold = 0.95

# Keep the features whose cumulative importance is still strictly below the threshold.
# NOTE(review): the feature that crosses the threshold is excluded here, whereas the count
# printed by plot_feature_importances includes it - confirm which of the two is intended.
below_threshold = norm_feature_importances['cumulative_importance'] < threshold
features_to_keep = norm_feature_importances.loc[below_threshold, 'feature'].tolist()

print('Number of feautures of keep: ', len(features_to_keep))
print('Features to keep: ', features_to_keep)

# Reduced training frame restricted to the selected features
X_keep = X_train[features_to_keep]
X_keep

In [None]:
# Attach the labels as a DELAYED column; assign() returns a new frame and matches
# y_train to the rows of X_keep by index label
X_keep = X_keep.assign(DELAYED = y_train)
X_keep

In [None]:
# Column labels of the reduced dataset, including DELAYED
X_keep.columns

In [None]:
# Re-arrange
X_keep[ ['DAY_OF_MONTH', 'OP_CARRIER_FL_NUM', 'TAIL_NUM', 'ORIGIN', 'DEST',
       'DEP_TIME', 'ARR_TIME', 'CRS_ARR_TIME', 'DISTANCE', 'TAXI_OUT',
       'TAXI_IN', 'CARRIER_DELAY', 'NAS_DELAY', 'LATE_AIRCRAFT_DELAY', 'DELAYED']]

# Save the cleaned data

In [None]:
# Write the selected features plus the DELAYED label to the flights folder, without the index column
X_keep.to_csv( os.path.join(data_path, 'cleaned_data_jan_20klines.csv'), index=False)