In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import sparse
import scipy.sparse.linalg
import networkx as nx

from sklearn.model_selection import train_test_split
import sklearn

import time,random

---
Creating features.

In [2]:
flights = pd.read_csv('flights_new.csv')

In [3]:
# One-hot encode the categorical flight attributes as sparse dummy columns.
# A prefix is added to every group except the airline codes, which are used
# as column names directly.
month_feature = pd.get_dummies(flights.MONTH,prefix='Month',sparse=True)
hour_feature = pd.get_dummies(flights.time_hour,prefix='Hour',sparse=True)
# Per-airport dummies are disabled; only the departure state is encoded.
# org_airport = pd.get_dummies(flights.ORIGIN_AIRPORT,prefix='Org',sparse=True)
# des_airport = pd.get_dummies(flights.DESTINATION_AIRPORT,prefix='Des',sparse=True)
org_state = pd.get_dummies(flights.DEPARTURE_STATE,prefix='Org',sparse=True)
airline_feature = pd.get_dummies(flights.AIRLINE,sparse=True)
day_of_week_feature = pd.get_dummies(flights.DAY_OF_WEEK,prefix='week',sparse=True)

In [4]:
flights.head()

Unnamed: 0,DATE,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,SCHEDULED_ARRIVAL,DEPARTURE_DELAY,ARRIVAL_DELAY,AIR_TIME,DISTANCE,time_hour,DEPARTURE_CITY,DEPARTURE_STATE,DESTINATION_CITY,DESTINATION_STATE
0,2015-01-01,1,1,4,AS,98,ANC,SEA,00:05:00,04:30:00,-11.0,-22.0,169.0,1448,0,Anchorage,AK,Seattle,WA
1,2015-01-01,1,1,4,AS,108,ANC,SEA,00:45:00,05:09:00,-4.0,-14.0,173.0,1448,0,Anchorage,AK,Seattle,WA
2,2015-01-01,1,1,4,DL,1560,ANC,SEA,00:45:00,05:15:00,-14.0,-24.0,171.0,1448,0,Anchorage,AK,Seattle,WA
3,2015-01-01,1,1,4,AS,134,ANC,SEA,01:55:00,06:33:00,-15.0,-35.0,170.0,1448,1,Anchorage,AK,Seattle,WA
4,2015-01-01,1,1,4,AS,114,ANC,SEA,02:20:00,06:40:00,-11.0,-12.0,176.0,1448,2,Anchorage,AK,Seattle,WA


In [5]:
numeric_feature = flights[['AIR_TIME','DISTANCE']]

---

---
### Linear regression

In [6]:
# Design matrix: month, hour and airline dummies plus the numeric columns.
# (The day-of-week dummies are intentionally not part of this model.)
feature_blocks = [month_feature, hour_feature, airline_feature, numeric_feature]
df = pd.concat(feature_blocks, axis=1)

# The two delay columns go last so the targets can be addressed by position.
delay = flights[['DEPARTURE_DELAY', 'ARRIVAL_DELAY']]
data = pd.concat([df, delay], axis=1)

# Keep August flights only.
is_august = data.Month_8 == 1
data_ = data.loc[is_august].copy()

# Drop columns without a single truthy value (dummies that never occur in August).
has_any_value = data_.any()
data_ = data_.loc[:, has_any_value]

# Month_8 is constant after the August filter, so it carries no information.
data_ = data_.drop(columns=['Month_8'])

# Keep departure delays strictly between -30 and 30.
departure = data_.DEPARTURE_DELAY
data_ = data_.loc[(departure > -30) & (departure < 30)]

In [7]:
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

In [8]:
data_sample = data_#.sample(n=100000)

In [9]:
# Dense NumPy design matrix: every column except the last two (the delay targets).
# np.asarray replaces DataFrame.get_values(), which was deprecated in pandas 0.25
# and removed in pandas 1.0.
X = np.asarray(data_sample.iloc[:,:-2])
#X = sklearn.preprocessing.scale(X)

In [10]:
y = data_sample.iloc[:,-2].copy()

In [11]:
# Expand the features with all degree-2 polynomial terms (squares and pairwise
# products, plus the bias column PolynomialFeatures adds by default) so the
# linear model can pick up feature interactions.
poly = PolynomialFeatures(degree=2)
X_ = poly.fit_transform(X)
#X_ = sklearn.preprocessing.scale(X_)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.2, random_state=42)

In [13]:
# Ordinary least squares on the polynomial features.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this call only works on older releases. Confirm the target
# version before dropping it, since that may change the fitted coefficients.
reg = linear_model.LinearRegression(normalize=True).fit(X_train, y_train)

In [14]:
y_tr_pred = reg.predict(X_train)
y_te_pred = reg.predict(X_test)

In [15]:
mean_squared_error(y_train,y_tr_pred),mean_squared_error(y_test,y_te_pred)

(61.692575895173846, 61.86356495792062)

In [16]:
np.sort(y_te_pred)

array([-13.75, -11.25,  -6.5 , ...,   9.25,   9.25,  21.  ])

In [None]:
# Inspect the fitted coefficients. The original cell held the unfinished
# expression `reg.`, which is a SyntaxError and aborts "Restart & Run All".
reg.coef_

---

In [36]:
X = data_.iloc[:,:-2]
y = data_.iloc[:,-2].copy()

In [22]:
# Binarise the departure delay: 1 = early or on time (delay <= 0), 2 = late (delay > 0).
# Both masks are computed from the untouched delays *before* any label is written.
# Writing the labels one after another on the same Series would turn every row
# into class 2, because the freshly written label 1 also satisfies `y > 0`.
# NOTE(review): the saved value_counts() output shows three classes, so it was
# produced by a different binning than this cell - confirm the intended thresholds.
on_time_mask = y <= 0
late_mask = y > 0
y[on_time_mask] = 1
y[late_mask] = 2

In [23]:
y.value_counts()

1.0    356747
2.0     51295
3.0     35168
Name: DEPARTURE_DELAY, dtype: int64

In [24]:
# Balance the classes: keep every row that is not class 1 and down-sample
# class 1 to at most 50000 rows.
# np.asarray replaces Index.get_values(), which was removed in pandas 1.0.
idx1 = np.asarray(y.loc[y==1].index)
idx2 = np.asarray(y.loc[y!=1].index)
# A dedicated seeded generator makes the drawn subset identical on every run;
# the size is capped so the cell does not fail when class 1 has fewer rows.
n_keep = min(50000, len(idx1))
slice_idx = random.Random(42).sample(list(idx1), n_keep)
idx = list(np.concatenate((idx2,slice_idx)))

In [25]:
# Restrict features and labels to the balanced index set, then standardise the
# features column-wise. sklearn.preprocessing.scale returns a NumPy array, so
# after this cell X is no longer a DataFrame and the cell cannot be re-run on
# its own output (X.loc would fail) - re-run the cell that defines X first.
X = X.loc[idx]
X = sklearn.preprocessing.scale(X)
y = y.loc[idx]

In [26]:
y.value_counts()

2.0    51295
1.0    50000
3.0    35168
Name: DEPARTURE_DELAY, dtype: int64

In [29]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer-code mapping and apply it in a single step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y)

# One-hot encode the integer codes (one column per class) for the softmax output.
dummy_y = np_utils.to_categorical(encoded_Y)

In [27]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [28]:
def NN_sequential_model(X, y, epochs, activation_function, test_size=0.2):
    '''
    Train a small feed-forward classifier and return its hold-out accuracy.

    The network has two hidden Dense layers of 64 units each, both using
    `activation_function`, followed by a softmax output layer with one unit per
    column of `y`. It is trained with the Adam optimizer on categorical
    cross-entropy, using a batch size of 64.

    Input:
        X: 2-D array - Feature matrix, one row per sample.
        y: 2-D array - One-hot encoded class labels, one column per class.
        epochs: int - Number of training epochs.
        activation_function: str - Name of the Keras activation used in both hidden layers.
        test_size: float - Proportion of the data held out for evaluation.

    Return:
        score: float - Accuracy on the held-out test split.
    '''
    # Fixed random_state so every parameter combination is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    n_features = X_train.shape[1]
    n_classes = y.shape[1]

    model = Sequential()

    # Two hidden layers (dropout between them was tried and is left disabled).
    model.add(Dense(64, input_dim=n_features, activation=activation_function))
    model.add(Dense(64, activation=activation_function))

    # Softmax output layer. The layer width is passed positionally, like the
    # hidden layers above: the Keras 1 `output_dim=` keyword is deprecated in Keras 2.
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit silently, then evaluate; evaluate() returns (loss, accuracy) here.
    model.fit(X_train, y_train, epochs=epochs, batch_size=64, verbose=0)
    _, score = model.evaluate(X_test, y_test, batch_size=64, verbose=0)
    return score

In [30]:
# Candidate hyper-parameters for the sweep.
activation_function_list = ['sigmoid', 'tanh', 'relu', 'softmax', 'linear']
epochs_list = [10, 20, 30]

# activation name -> hold-out accuracies, one per entry of epochs_list.
score_dict = {}
for activation_function in activation_function_list:
    t0 = time.time()
    score_list = [
        NN_sequential_model(X, dummy_y, epochs, activation_function)
        for epochs in epochs_list
    ]
    t1 = time.time() - t0
    # Minutes spent on this activation function across all epoch settings.
    print(t1/60)
    score_dict[activation_function] = score_list



1.7764692028363547
1.7869672656059266
1.898898728688558
1.9820945978164672
1.9309935847918192


In [31]:
score_dict

{'sigmoid': [0.4509214817127993, 0.45125123657534855, 0.4505917268109401],
 'tanh': [0.44824680321179805, 0.44447294178212743, 0.44678122595755704],
 'relu': [0.45414575166011817, 0.44960246217197103, 0.44700106255770455],
 'softmax': [0.4507016451246631, 0.4534129630209863, 0.45018869307366743],
 'linear': [0.4468178653889131, 0.4470743414204166, 0.450555087399239]}

----

In [32]:
def baseline_model():
    '''
    Build and compile the single-hidden-layer baseline classifier.

    Reads the notebook globals `X` (its second dimension is the input width)
    and `y` (its number of distinct values is the number of output classes).

    Return:
        model: compiled Keras Sequential model with one 64-unit ReLU hidden
        layer and a softmax output layer.
    '''
    model = Sequential()
    # Layer widths are passed positionally: the Keras 1 `output_dim=` keyword is
    # deprecated in Keras 2 and emits a warning on every model build.
    model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
    model.add(Dense(y.unique().shape[0], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
# scikit-learn compatible wrapper so the model can be passed to cross_val_score.
estimator = KerasClassifier(build_fn=baseline_model, epochs=30, batch_size=64, verbose=0)

In [33]:
# fix random seed for reproducibility
# This seeds NumPy's global generator; `seed` is also reused below as the
# random_state of the KFold splitter and of train_test_split.
seed = 7
np.random.seed(seed)

In [34]:
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

In [35]:
# Cross-validate the baseline network with the shuffled 3-fold splitter and
# time the whole run.
t0 = time.time()
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
# Mean and standard deviation of the per-fold scores, shown as percentages.
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
t1 = time.time() - t0
# Elapsed wall-clock time in minutes.
print(t1/60)

  after removing the cwd from sys.path.
  """


Baseline: 36.18% (0.89%)
2.213956348101298


In [81]:
X_train, X_test, Y_train, Y_test = train_test_split(X, dummy_y, test_size=0.1, random_state=seed)
estimator.fit(X_train, Y_train)

<keras.callbacks.History at 0x1a3f212978>

In [88]:
pred = estimator.predict(X_test)

---
### Decision tree

In [83]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score


In [153]:
# Flights whose departure delay lies in [-5, 30].
in_range = (flights.DEPARTURE_DELAY >= -5) & (flights.DEPARTURE_DELAY <= 30)
data_ = flights.loc[in_range]

# August departure delays, binned into three classes:
#   1: delay < 5    2: 5 <= delay < 15    3: delay >= 15
y = data_.loc[data_.MONTH == 8, 'DEPARTURE_DELAY'].copy()
class1_mask = y < 5
class2_mask = (y >= 5) & (y < 15)
class3_mask = y >= 15
y[class1_mask] = 1
y[class2_mask] = 2
y[class3_mask] = 3
y.value_counts()


1.0    264869
2.0     51295
3.0     36796
Name: DEPARTURE_DELAY, dtype: int64

In [154]:
# Balance the classes: keep every row that is not class 1 and down-sample
# class 1 to at most 50000 rows.
# np.asarray replaces Index.get_values(), which was removed in pandas 1.0.
idx1 = np.asarray(y.loc[y==1].index)
idx2 = np.asarray(y.loc[y!=1].index)
# A dedicated seeded generator makes the drawn subset identical on every run;
# the size is capped so the cell does not fail when class 1 has fewer rows.
n_keep = min(50000, len(idx1))
slice_idx = random.Random(42).sample(list(idx1), n_keep)
idx = list(np.concatenate((idx2,slice_idx)))

In [155]:
data_ = data_.loc[idx]

In [173]:
# Rebuild the dummy features on data_, which at this point holds only the
# balanced August subset selected through idx.
# The month dummies are left disabled - presumably because every remaining row
# comes from the same month; confirm before re-enabling.
#month_feature = pd.get_dummies(data_.MONTH,prefix='Month',sparse=True)
hour_feature = pd.get_dummies(data_.time_hour,prefix='Hour',sparse=True)
org_state = pd.get_dummies(data_.DEPARTURE_STATE,prefix='Org',sparse=True)
airline_feature = pd.get_dummies(data_.AIRLINE,sparse=True)
day_of_week_feature = pd.get_dummies(data_.DAY_OF_WEEK,prefix='week',sparse=True)
# Numeric columns are taken over as they are.
numeric_feature = data_[['AIR_TIME','DISTANCE']]

In [174]:
df = pd.concat([hour_feature,org_state,airline_feature,day_of_week_feature,numeric_feature], axis=1)

In [178]:
# Standardised copy of the assembled feature frame (df itself stays untouched);
# the result is a NumPy array.
X = sklearn.preprocessing.scale(df.copy())
# Raw departure delays of the same rows; they are binned into classes next.
y = data_['DEPARTURE_DELAY'].copy()

In [179]:
# Bin the departure delay into three classes:
#   1: delay < 5    2: 5 <= delay < 15    3: delay >= 15
class1_mask = y < 5
class2_mask = (y >= 5) & (y < 15)
class3_mask = y >= 15
y[class1_mask] = 1
y[class2_mask] = 2
y[class3_mask] = 3
y.value_counts()

2.0    51295
1.0    50000
3.0    36796
Name: DEPARTURE_DELAY, dtype: int64

In [127]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer-code mapping and apply it in a single step.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y)

# One-hot encode the integer codes (one column per class).
dummy_y = np_utils.to_categorical(encoded_Y)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [192]:
# Grid search over the decision tree's split threshold, impurity criterion and
# class weighting, scored by plain accuracy with 3-fold cross-validation.
scoring = {'Accuracy': make_scorer(accuracy_score)}
param_grid = {
    'min_samples_split': range(500, 2000, 100),
    'criterion': ('gini', 'entropy'),
    'class_weight': (None, 'balanced'),
}
base_tree = tree.DecisionTreeClassifier(random_state=42)

# refit=False: only the cross-validation table is needed, not a refitted model.
gs = GridSearchCV(base_tree,
                  param_grid=param_grid,
                  scoring=scoring,
                  cv=3,
                  refit=False,
                  return_train_score=True)
gs.fit(X_train, y_train)
results = gs.cv_results_

In [190]:
gs

GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': range(400, 600, 10), 'criterion': ('gini', 'entropy')},
       pre_dispatch='2*n_jobs', refit=False, return_train_score=True,
       scoring={'Accuracy': make_scorer(accuracy_score)}, verbose=0)

In [169]:
results.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_min_samples_split', 'params', 'split0_test_Accuracy', 'split1_test_Accuracy', 'split2_test_Accuracy', 'mean_test_Accuracy', 'std_test_Accuracy', 'rank_test_Accuracy', 'split0_train_Accuracy', 'split1_train_Accuracy', 'split2_train_Accuracy', 'mean_train_Accuracy', 'std_train_Accuracy'])

In [198]:
# Parameter set with the highest mean cross-validated accuracy. The position is
# derived from the scores rather than hard-coded, so the cell stays correct when
# the grid or the data changes.
results['params'][np.argmax(results['mean_test_Accuracy'])]

{'class_weight': None, 'criterion': 'entropy', 'min_samples_split': 1600}

In [197]:
np.argmax(results['mean_test_Accuracy'])

26

In [194]:
results['mean_train_Accuracy']

array([0.46352924, 0.45962326, 0.45723351, 0.45541858, 0.45362173,
       0.4516846 , 0.45015932, 0.44785558, 0.44666523, 0.44548846,
       0.44399487, 0.44278642, 0.44096695, 0.44013416, 0.43949147,
       0.46299064, 0.4594377 , 0.45674923, 0.4543414 , 0.45245404,
       0.44991947, 0.44848471, 0.44704997, 0.44552017, 0.44408541,
       0.44286791, 0.4421709 , 0.44075424, 0.43940096, 0.43878542,
       0.44756141, 0.44270044, 0.43904339, 0.43650431, 0.43426842,
       0.4307155 , 0.4290092 , 0.42950708, 0.42752469, 0.4256147 ,
       0.42498558, 0.42461897, 0.42146427, 0.41958598, 0.4180788 ,
       0.445837  , 0.43994407, 0.43632326, 0.43569413, 0.43281559,
       0.43070645, 0.42897297, 0.42809041, 0.42630714, 0.42237853,
       0.42095281, 0.42091661, 0.41815571, 0.41688842, 0.4152726 ])

In [171]:
results['rank_test_Accuracy']

array([20, 18, 19, 17, 14, 15, 13, 16, 11, 12, 10,  9,  8,  7,  4,  3,  1,
        5,  6,  2], dtype=int32)

In [229]:
# Refit the tree with the parameters selected by the grid search.
# random_state matches the grid-search estimator (42) so the refit is
# reproducible and comparable with the cross-validated scores.
clf = tree.DecisionTreeClassifier(min_samples_split=1600,
                                  min_samples_leaf=1,
                                  criterion='entropy',
                                  random_state=42).fit(X_train,y_train)

In [230]:
clf.score(X_train,y_train)

0.44623071909624157

In [231]:
clf.score(X_test,y_test)

0.4354610956225787

In [234]:
# Pair every feature column name with the importance the fitted tree assigned to it.
# np.asarray replaces Index.get_values(), which was removed in pandas 1.0.
feature_score = pd.DataFrame({'feature_name': np.asarray(df.columns),
                              'score': clf.feature_importances_})
feature_score = feature_score[['feature_name', 'score']]

In [238]:
feature_score.sort_values(by='score',ascending=False)

Unnamed: 0,feature_name,score
6,Hour_6,0.164742
7,Hour_7,0.117648
8,Hour_8,0.077441
5,Hour_5,0.076764
80,DL,0.046981
98,DISTANCE,0.046552
97,AIR_TIME,0.046518
87,UA,0.046209
89,WN,0.035562
9,Hour_9,0.033870


In [None]:
# RFE is imported here because its import cell sits below this one in the
# notebook, so a fresh "Restart & Run All" would otherwise raise NameError.
from sklearn.feature_selection import RFE

# Recursive feature elimination down to 5 features, removing one per iteration.
# n_features_to_select is passed by keyword: current scikit-learn releases no
# longer accept it positionally.
selector = RFE(clf, n_features_to_select=5, step=1).fit(X_train,y_train)


In [202]:
from sklearn.feature_selection import RFE