In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, normalize
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras_tuner.tuners import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
import time
import pickle
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import sklearn
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc

 # Loading the data

In [2]:
def creating_dataset():
    df = pd.read_csv('dataset/datasetcleaned.csv', index_col=[0])
    X = df.drop('DEP_DEL15', axis=1).drop('MONTH',axis=1).drop("LONGITUDE",axis=1).drop("LATITUDE",axis=1)
    y = df['DEP_DEL15']
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,shuffle=True)
    x_train,x_test = scaleNorm(x_train,x_test)
    return x_train,x_test,y_train,y_test

def redoSplit(x,y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42,shuffle=True)
    return x_train,x_test,y_train,y_test

def balancing_dataset(x_train,y_train,drop_per):
    idx = np.where(y_train == 0)[0]
    x_train = (x_train.drop(x_train.index[idx[:int(len(idx)*drop_per)]]))
    y_train = (y_train.drop(y_train.index[idx[:int(len(idx)*drop_per)]]))
    return x_train,y_train

def standardize(x):
    x = x.to_numpy()
    x = (x- np.min(x,axis=0))/np.max(x,axis=0)
    return x

def scaleNorm(X_train,X_test):
    X_train = standardize(X_train)
    X_test = standardize(X_test)
    return pd.DataFrame(X_train),pd.DataFrame(X_test)

In [3]:
X,x_real_test,Y,y_real_test = creating_dataset()
cols = pd.read_csv('dataset/datasetcleaned.csv', index_col=[0]).drop('DEP_DEL15', axis=1).drop('MONTH',axis=1).drop("LONGITUDE",axis=1).drop("LATITUDE",axis=1).columns
x_real_train,x_test,y_real_train,y_test = redoSplit(X,Y)
x,y = balancing_dataset(x_real_train,y_real_train,0.8)
x_train,x_test,y_train,y_test = redoSplit(x,y)

# Random model

In [4]:
#baseline
y_pred = np.random.randint(0,2,y_train.shape)
cm = confusion_matrix(y_train, y_pred)
print(cm /np.sum(cm,axis=0))

[[0.48636716 0.47966301]
 [0.51363284 0.52033699]]


 # Models

In [6]:
weights = y_train.to_numpy()*4 + 1
print(weights)

[5 5 5 ... 1 5 5]


In [7]:
clf = xgb.XGBClassifier(weight=weights)
clf.fit(x_real_train, y_real_train)
train_acc,t_acc = clf.score(x_real_train, y_real_train), clf.score(x_real_test, y_real_test)
y_hat = clf.predict(x_real_test)
cm = confusion_matrix(y_real_test, y_hat)
print(cm /np.sum(cm,axis=0))

Parameters: { "weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[[0.84515874 0.33462282]
 [0.15484126 0.66537718]]


In [5]:
xgb_model = xgb.XGBClassifier(learning_rate=0.1,
                                  max_depth=10,
                                  eval_metric='logloss')
xgb_model.fit(x_train,y_train)
y_pred_test = xgb_model.predict(x_test)
cm_test = confusion_matrix(y_pred_test,y_test.to_numpy())
print(cm_test /np.sum(cm_test,axis=0))



[[0.648341   0.33749322]
 [0.351659   0.66250678]]


In [None]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(x_train, y_train)
print("Train accuracy", clf.score(x_train, y_train))
print("Test accuracy", clf.score(x_test, y_test))
y_hat = clf.predict(x_test)
cm = confusion_matrix(y_test, y_hat)
print(cm /np.sum(cm,axis=0))

In [None]:
clf = CatBoostClassifier()
clf.fit(x_train, y_train, verbose=False)
print("Train accuracy", clf.score(x_train, y_train))
print("Test accuracy", clf.score(x_test, y_test))
y_hat = clf.predict(x_test)
cm = confusion_matrix(y_test, y_hat)
print(cm /np.sum(cm,axis=0))

### Deep Learning

#### AutoML

In [None]:
LOG_DIR = f"{int(time.time())}"

# tensorboard = TensorBoard(log_dir=LOG_DIR)

def build_model(hp):  # random search passes this hyperparameter() object 
    model = keras.models.Sequential()

    model.add(Dense(hp.Int('input_units',min_value=50, max_value=200, step=50), 
                    input_shape=x_train.shape[1:],
                    activation='relu'))

    model.add( Dropout(hp.Float('dropout_rate',
                                    min_value=0.0,
                                    max_value=0.5,
                                    step=0.1) ) )
    
    for i in range(hp.Int('n_layers', 1, 2)):  # adding variation of layers.
        model.add(Dense(hp.Int(f'layer_{i}_units',
                                min_value=10,
                                max_value=150,
                                step=30), 
                                activation='relu'))
    
        model.add( Dropout(hp.Float(f'dropout_{i}_rate',
                                    min_value=0.0,
                                    max_value=0.5,
                                    step=0.1) ) )
    
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', 
              optimizer=tensorflow.keras.optimizers.Adam(),
              metrics =['accuracy'])
              
    return model
                  

tuner = RandomSearch(
    build_model,
    objective='accuracy',
    max_trials=10,  # how many variations on model?
    executions_per_trial=1) # how many trials per variation? (same model could perform differently)


tuner.search_space_summary()

tuner.search(x=x_train,
             y=y_train,
             epochs=5,
             batch_size=64)

tuner.results_summary()

#### DL model

In [None]:
def build_model():  # random search passes this hyperparameter() object 
    model = keras.models.Sequential()

    model.add(Dense(200, input_shape=x_train.shape[1:], activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add( Dropout( 0.1 ) )
    model.add(Dense(70, activation='relu'))
    model.add( Dropout( 0.4 ) )
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', 
              optimizer=tensorflow.keras.optimizers.Adam(),
              metrics =['accuracy'])
              
    return model
                  

model = build_model()

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10, verbose=1)
reducer = tf.keras.callbacks.ReduceLROnPlateau( monitor='loss', factor=0.5, patience=3, verbose=1, mode='min')


model.fit(x_train,y_train, callbacks=[early_stopping, reducer], batch_size=256,verbose=1,epochs=1000 )

In [None]:
y_hat = model.predict_classes(x_real_test)
# y_test
print("Test accuracy", sklearn.metrics.accuracy_score(y_real_test, y_hat))

### Confusion Matrix

In [None]:
y_hat = clf.predict(x_real_test)
cm = confusion_matrix(y_real_test, y_hat)
cm /np.sum(cm,axis=0)

### Precision Recall Curve


In [None]:
y_train_pred = model.predict(x_train)
train_precision, train_recall, _ = precision_recall_curve(y_train, y_train_pred)
train_ap = average_precision_score(y_train, y_train_pred)
y_test_pred = model.predict(x_real_test)
test_precision, test_recall, _ = precision_recall_curve(y_real_test, y_test_pred)
test_ap = average_precision_score(y_real_test, y_test_pred)


plt.figure(figsize=(20, 15))

axs = plt.subplot(2,2,1)
axs.set_title('Train Precision Recall Curve')
axs.set_xlabel('Recall')
axs.set_ylabel('Precission')
axs.plot(train_recall, train_precision, label = 'AP = {:.4f}'.format(train_ap))
axs.legend()

axs = plt.subplot(2,2,2)
axs.set_title('Test Precision Recall Curve')
axs.set_xlabel('Recall')
axs.set_ylabel('Precission')
axs.plot(test_recall, test_precision, label = 'AP = {:.4f}'.format(test_ap))
axs.legend()

### ROC curve

In [None]:
train_fpr, train_tpr, _ = roc_curve(y_train, y_train_pred)
train_auc = auc(train_fpr, train_tpr)

test_fpr, test_tpr, _ = roc_curve(y_real_test, y_test_pred)
test_auc = auc(test_fpr, test_tpr)

plt.figure(figsize=(20, 15))

axs = plt.subplot(2,2,1)
axs.set_title('Train ROC Curve')
axs.set_xlabel('False Positive Rate')
axs.set_ylabel('True Positive Rate')
axs.plot(train_fpr, train_tpr, label = 'AUC = {:.4f}'.format(train_auc))
axs.legend()

axs = plt.subplot(2,2,2)
axs.set_title('Test ROC Curve')
axs.set_xlabel('False Positive Rate')
axs.set_ylabel('True Positive Rate')
axs.plot(test_fpr, test_tpr, label = 'AUC = {:.4f}'.format(test_auc))
axs.legend()
plt.show()

# Trash

In [None]:
# tr_acc = []
# test_acc = []
# X = df.drop('DEP_DEL15', axis=1)
# y = df['DEP_DEL15']
# X_tmp, X_test, y_tmp, y_test = train_test_split(X, y, test_size=0.20, random_state=42,shuffle=True)
# for i in range(1,9):
#     print("curr percentage",i/10)
#     X_train,X_val,y_train,y_val = train_test_split(X_tmp, y_tmp, test_size=0.33, random_state=42,shuffle=True)
#     idx = np.where(y_train == 0)[0]
#     drop_per = int(len(idx)*i/10)
#     X_train = X_train.drop(X_train.index[idx[:drop_per]])
#     y_train = y_train.drop(y_train.index[idx[:drop_per]])

#     clf = xgb.XGBClassifier()
#     clf.fit(X_train, y_train)
#     train_acc,t_acc = clf.score(X_train, y_train), clf.score(X_val, y_val)
#     tr_acc.append(train_acc)
#     test_acc.append(t_acc)
#     print("Train accuracy",train_acc)
#     print("Test accuracy",t_acc)
#     y_hat = clf.predict(X_val)
#     cm = confusion_matrix(y_val, y_hat)
#     print(cm /np.sum(cm,axis=0))
    
# y_train_pred = clf.predict_proba(X_train)
# y_test_pred = clf.predict_proba(X_test)
