In [1]:
import pandas as pd
import time

In [2]:
# CSV holding the dataset; the path is relative to the working directory.
path = '../datasets/resampledDF.csv'

# The first CSV column becomes the row index instead of a regular column.
df = pd.read_csv(
    path,
    index_col=0,
)

In [3]:
# Display the first five rows as a quick look at the loaded columns.
df.head(n=5)


Unnamed: 0,SCHEDULED_DEPATURE_HR,SCHEDULED_ARRIVAL_HR,AIRLINE_CODE,PRESSURE,PRESSURE_DEST,ORIGIN_AIRPORT_CODE,RH_DEST,RH,DESTINATION_AIRPORT_CODE,DEWPT_DEST,DEWPT,CATEGORY
0,0,5,3,29.39,30.04,5,46.0,62.0,5,17.0,25.0,0
1,0,5,0,29.39,30.04,5,46.0,62.0,5,17.0,25.0,0
2,0,5,3,29.39,30.5,5,54.0,62.0,7,19.0,25.0,0
3,0,6,0,30.51,29.26,6,77.0,56.0,0,13.0,20.0,0
4,0,8,0,29.92,29.5,0,64.0,20.0,2,20.0,18.0,0


In [4]:
# Column that holds the class label.
target_name = 'CATEGORY'
y = df[target_name]

# Every remaining column is used as an input feature.
features_names = df.columns.drop(target_name)
X = df[features_names]

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [6]:

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest classifier with its hyperparameters written out explicitly.
rf = RandomForestClassifier(
    # --- ensemble construction ---
    n_estimators=100,              # number of trees in the forest
    bootstrap=True,                # build each tree on a bootstrap sample
    max_samples=None,              # no cap on the samples drawn from X per tree
    max_features='sqrt',           # number of features considered when splitting a node
    # --- growth of the individual trees ---
    criterion='gini',              # measure of split quality
    max_depth=None,                # no depth limit
    max_leaf_nodes=None,           # no limit on the number of leaf nodes
    min_samples_split=2,           # samples required to split an internal node
    min_samples_leaf=1,            # samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # min weighted fraction of the total sample weight at a leaf
    min_impurity_decrease=0.0,     # split only if impurity decreases by at least this value
    ccp_alpha=0.0,                 # complexity parameter for minimal cost-complexity pruning
    # --- weighting, reproducibility, parallelism ---
    class_weight=None,             # no class weights
    random_state=42,               # fixed seed for reproducibility
    n_jobs=-1,                     # number of jobs to run in parallel
)

# Wall-clock time spent fitting on the training split.
start = time.time()
rf.fit(X_train, y_train)
fit_time = time.time() - start

# Wall-clock time spent predicting the held-out split.
start = time.time()
y_pred = rf.predict(X_test)
predict_time = time.time() - start

# Compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print(f'fit_time:  {fit_time}')
print(f'predict_time:  {predict_time}')
print(f'accuracy:  {accuracy}')
print(f'classification_report:  {cls_report}')


fit_time:  171.88824033737183
predict_time:  12.05019736289978
accuracy:  0.8662186962121989
classification_report:                precision    recall  f1-score   support

           0       0.87      0.76      0.81    263450
           1       0.86      0.93      0.90    438382

    accuracy                           0.87    701832
   macro avg       0.87      0.84      0.85    701832
weighted avg       0.87      0.87      0.86    701832



In [8]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with its hyperparameters written out explicitly.
dt = DecisionTreeClassifier(
    # --- how splits are chosen ---
    criterion='gini',              # measure of split quality
    splitter='best',               # strategy used to choose the split at each node
    max_features=None,             # None: all features are considered at each split
    # --- limits on tree growth ---
    max_depth=None,                # no depth limit
    max_leaf_nodes=None,           # no limit on the number of leaf nodes
    min_samples_split=2,           # samples required to split an internal node
    min_samples_leaf=1,            # samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # min weighted fraction of the total sample weight at a leaf
    min_impurity_decrease=0.0,     # split only if impurity decreases by at least this value
    ccp_alpha=0.0,                 # complexity parameter for minimal cost-complexity pruning
    # --- weighting and reproducibility ---
    class_weight=None,             # no class weights
    random_state=42,               # fixed seed for reproducibility
)

# Wall-clock time spent fitting on the training split.
start = time.time()
dt.fit(X_train, y_train)
fit_time = time.time() - start

# Wall-clock time spent predicting the held-out split.
start = time.time()
y_pred = dt.predict(X_test)
predict_time = time.time() - start

# Compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print(f'fit_time:  {fit_time}')
print(f'predict_time:  {predict_time}')
print(f'accuracy:  {accuracy}')
print(f'classification_report:  {cls_report}')

fit_time:  30.369096517562866
predict_time:  0.49315714836120605
accuracy:  0.7792776618905949
classification_report:                precision    recall  f1-score   support

           0       0.71      0.69      0.70    263450
           1       0.82      0.83      0.82    438382

    accuracy                           0.78    701832
   macro avg       0.76      0.76      0.76    701832
weighted avg       0.78      0.78      0.78    701832



In [9]:
# k-nearest neighbours

from sklearn.neighbors import KNeighborsClassifier

# KNN classifier with its hyperparameters written out explicitly.
knn = KNeighborsClassifier(
    # --- voting ---
    n_neighbors=3,        # number of neighbours used for each query
    weights='uniform',    # weight function used in prediction
    # --- distance ---
    metric='minkowski',   # distance metric
    p=2,                  # power parameter of the Minkowski metric
    metric_params=None,   # no extra keyword arguments for the metric
    # --- neighbour search ---
    algorithm='auto',     # algorithm used to compute the nearest neighbours
    leaf_size=30,         # leaf size passed to BallTree or KDTree
    n_jobs=-1,            # number of parallel jobs to run
)

# Wall-clock time spent fitting on the training split.
start = time.time()
knn.fit(X_train, y_train)
fit_time = time.time() - start

# Wall-clock time spent predicting the held-out split.
start = time.time()
y_pred = knn.predict(X_test)
predict_time = time.time() - start

# Compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print(f'fit_time:  {fit_time}')
print(f'predict_time:  {predict_time}')
print(f'accuracy:  {accuracy}')
print(f'classification_report:  {cls_report}')


fit_time:  10.31286907196045
predict_time:  46.872979402542114
accuracy:  0.8743816183930058
classification_report:                precision    recall  f1-score   support

           0       0.90      0.74      0.82    263450
           1       0.86      0.95      0.90    438382

    accuracy                           0.87    701832
   macro avg       0.88      0.85      0.86    701832
weighted avg       0.88      0.87      0.87    701832



In [10]:
# Logistic regression

from sklearn.linear_model import LogisticRegression

# Logistic regression with its hyperparameters written out explicitly.
lr = LogisticRegression(
    # --- regularization ---
    penalty='l2',          # norm used in the penalization
    C=1.0,                 # inverse of regularization strength
    l1_ratio=None,         # Elastic-Net mixing parameter, 0 <= l1_ratio <= 1
    # --- model form ---
    fit_intercept=True,    # whether to fit an intercept term
    intercept_scaling=1,   # with fit_intercept=True, x becomes [x, intercept_scaling]
    multi_class='auto',    # with 'ovr', a binary problem is fit for each label
    class_weight=None,     # no class weights
    # --- optimization ---
    solver='lbfgs',        # algorithm used in the optimization problem
    dual=False,            # choose between the dual and primal optimization problem
    max_iter=100,          # maximum iterations allowed for the solver to converge
    tol=0.0001,            # tolerance for the stopping criteria
    warm_start=False,      # True would reuse the previous fit as initialization
    # --- runtime ---
    random_state=42,       # fixed seed for reproducibility
    verbose=0,             # any positive number enables verbosity for liblinear and lbfgs
    n_jobs=-1,             # CPU cores used when parallelizing over classes if multi_class='ovr'
)

# Wall-clock time spent fitting on the training split.
start = time.time()
lr.fit(X_train, y_train)
fit_time = time.time() - start

# Wall-clock time spent predicting the held-out split.
start = time.time()
y_pred = lr.predict(X_test)
predict_time = time.time() - start

# Compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print(f'fit_time:  {fit_time}')
print(f'predict_time:  {predict_time}')
print(f'accuracy:  {accuracy}')
print(f'classification_report:  {cls_report}')

fit_time:  17.89913010597229
predict_time:  0.047102928161621094
accuracy:  0.6525920733166912
classification_report:                precision    recall  f1-score   support

           0       0.57      0.29      0.38    263450
           1       0.67      0.87      0.76    438382

    accuracy                           0.65    701832
   macro avg       0.62      0.58      0.57    701832
weighted avg       0.63      0.65      0.62    701832



In [12]:
# Neural network (multi-layer perceptron)

from sklearn.neural_network import MLPClassifier

# MLP classifier with its hyperparameters written out explicitly.
nn = MLPClassifier(
    # --- architecture ---
    hidden_layer_sizes=(100, ),   # i-th element = number of neurons in the i-th hidden layer
    activation='relu',            # activation function for the hidden layer
    # --- optimizer ---
    solver='adam',                # solver for weight optimization
    learning_rate_init=0.001,     # initial learning rate
    beta_1=0.9,                   # adam: decay rate for the first moment estimate, in [0, 1)
    beta_2=0.999,                 # adam: decay rate for the second moment estimate, in [0, 1)
    epsilon=1e-08,                # adam: value for numerical stability
    batch_size='auto',            # minibatch size for the stochastic optimizers
    shuffle=True,                 # whether to shuffle samples in each iteration
    alpha=0.0001,                 # L2 penalty (regularization term)
    # --- settings for other solvers (not used with solver='adam') ---
    learning_rate='constant',     # learning-rate schedule for weight updates (sgd)
    power_t=0.5,                  # exponent for the inverse-scaling learning rate (sgd)
    momentum=0.9,                 # momentum for the gradient descent update (sgd)
    nesterovs_momentum=True,      # whether to use Nesterov's momentum (sgd)
    max_fun=15000,                # maximum number of function evaluations (lbfgs)
    # --- stopping ---
    max_iter=200,                 # maximum number of iterations
    tol=0.0001,                   # tolerance for the optimization
    n_iter_no_change=10,          # maximum number of epochs without a tol improvement
    early_stopping=False,         # no early stopping on a validation score
    validation_fraction=0.1,      # share of training data held out for early stopping
    # --- runtime ---
    warm_start=False,             # True would reuse the previous fit as initialization
    random_state=42,              # fixed seed for reproducibility
    verbose=False,                # do not print progress messages to stdout
)

# Wall-clock time spent fitting on the training split.
start = time.time()
nn.fit(X_train, y_train)
fit_time = time.time() - start

# Wall-clock time spent predicting the held-out split.
start = time.time()
y_pred = nn.predict(X_test)
predict_time = time.time() - start

# Compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print(f'fit_time:  {fit_time}')
print(f'predict_time:  {predict_time}')
print(f'accuracy:  {accuracy}')
print(f'classification_report:  {cls_report}')


fit_time:  1139.9256253242493
predict_time:  1.216982364654541
accuracy:  0.6849944146177432
classification_report:                precision    recall  f1-score   support

           0       0.60      0.49      0.54    263450
           1       0.72      0.80      0.76    438382

    accuracy                           0.68    701832
   macro avg       0.66      0.65      0.65    701832
weighted avg       0.68      0.68      0.68    701832



In [15]:
# Gradient boosting

from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting classifier with its hyperparameters written out explicitly.
gb = GradientBoostingClassifier(
    # --- boosting procedure ---
    loss='log_loss',               # loss function to be optimized
    learning_rate=0.1,             # shrinks the contribution of each tree
    n_estimators=100,              # number of boosting stages to perform
    subsample=1.0,                 # fraction of samples used to fit each base learner
    init=None,                     # no custom estimator for the initial predictions
    # --- individual trees ---
    criterion='friedman_mse',      # function measuring the quality of a split
    max_depth=3,                   # maximum depth of the individual regression estimators
    max_features=None,             # number of features considered when looking for the best split
    max_leaf_nodes=None,           # no limit on the number of leaf nodes
    min_samples_split=2,           # samples required to split an internal node
    min_samples_leaf=1,            # samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # min weighted fraction of the total sample weight at a leaf
    min_impurity_decrease=0.0,     # split only if impurity decreases by at least this value
    ccp_alpha=0.0,                 # complexity parameter for minimal cost-complexity pruning
    # --- early stopping ---
    n_iter_no_change=None,         # decides whether early stopping is used; None here
    validation_fraction=0.1,       # share of training data held out for early stopping
    tol=0.0001,                    # threshold on validation-score improvement for stopping
    # --- runtime ---
    warm_start=False,              # True would reuse the previous fit and add more estimators
    random_state=42,               # fixed seed for reproducibility
    verbose=0,                     # no verbose output
)

# Wall-clock time spent fitting on the training split.
start = time.time()
gb.fit(X_train, y_train)
fit_time = time.time() - start

# Wall-clock time spent predicting the held-out split.
start = time.time()
y_pred = gb.predict(X_test)
predict_time = time.time() - start

# Compare the predictions with the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print(f'fit_time:  {fit_time}')
print(f'predict_time:  {predict_time}')
print(f'accuracy:  {accuracy}')
print(f'classification_report:  {cls_report}')

fit_time:  584.1369001865387
predict_time:  1.0987548828125
accuracy:  0.7140184545589258
classification_report:                precision    recall  f1-score   support

           0       0.68      0.44      0.54    263450
           1       0.72      0.88      0.79    438382

    accuracy                           0.71    701832
   macro avg       0.70      0.66      0.67    701832
weighted avg       0.71      0.71      0.70    701832



In [None]:
# Save each fitted model to its own pickle file in the working directory.
# All files written by one run of this cell share the same timestamp suffix.
# NOTE: only load these files back from a trusted location, because
# unpickling can execute arbitrary code.

import pickle

# Whole-second Unix timestamp used as the common filename suffix.
ts = round(time.time())

# Filename prefix -> fitted estimator. Keeping the pairs in one mapping
# guarantees that every file contains the model its name refers to.
models_to_save = {
    'lr': lr,
    'dt': dt,
    'rf': rf,
    'nn': nn,
    'gb': gb,
    'knn': knn,
}

for model_name, model in models_to_save.items():
    with open(f'{model_name}_{ts}.pkl', 'wb') as f:
        pickle.dump(model, f)

import os

# On-disk size in bytes of each pickle file carrying the `ts` suffix,
# read in the order lr, dt, rf, nn, gb, knn.
size_lr, size_dt, size_rf, size_nn, size_gb, size_knn = (
    os.path.getsize(f'{prefix}_{ts}.pkl')
    for prefix in ('lr', 'dt', 'rf', 'nn', 'gb', 'knn')
)
