In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.feature_selection import SelectKBest
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.preprocessing import normalize

# Preprocessing

In [2]:
def creating_dataset():
    """Load the cleaned dataset and return a scaled train/test split.

    Reads 'dataset/datasetcleaned.csv' using its first column as the index,
    takes 'DEP_DEL15' as the target, removes 'DEP_DEL15', 'MONTH',
    'LONGITUDE' and 'LATITUDE' from the features, performs a seeded 67/33
    shuffled split and passes both feature sets through scaleNorm.

    Returns:
        tuple: (x_train, x_test, y_train, y_test); the feature sets are
        whatever scaleNorm returns, the targets are the un-scaled Series.
    """
    data = pd.read_csv('dataset/datasetcleaned.csv', index_col=[0])
    target = data['DEP_DEL15']
    features = data.drop(columns=['DEP_DEL15', 'MONTH', 'LONGITUDE', 'LATITUDE'])
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.33, random_state=42, shuffle=True
    )
    x_train, x_test = scaleNorm(x_train, x_test)
    return x_train, x_test, y_train, y_test

def redoSplit(x,y):
    """Split (x, y) into train and test parts with a fixed seed.

    The split is shuffled, keeps 33% of the rows for testing and always uses
    random_state=42, so calling it twice on the same input gives the same
    partition.

    Returns:
        tuple: (x_train, x_test, y_train, y_test)
    """
    parts = train_test_split(x, y, test_size=0.33, random_state=42, shuffle=True)
    return tuple(parts)

def balancing_dataset(x_train,y_train,drop_per):
    """Under-sample class 0 by removing a leading fraction of its rows.

    Args:
        x_train: feature DataFrame, row-aligned (by position) with y_train.
        y_train: label Series; rows equal to 0 are the candidates for removal.
        drop_per: fraction (0..1) of the class-0 rows to remove. The rows
            removed are the first ones in positional order.

    Returns:
        tuple: (x_train, y_train) with the selected rows dropped.
    """
    # Positions (not labels) of the class-0 rows.
    zero_positions = np.where(y_train == 0)[0]
    n_drop = int(len(zero_positions) * drop_per)
    doomed = zero_positions[:n_drop]
    # Each object is dropped through its own index labels, so x and y may
    # carry different indexes as long as they are positionally aligned.
    x_kept = x_train.drop(x_train.index[doomed])
    y_kept = y_train.drop(y_train.index[doomed])
    return x_kept, y_kept

def standardize(x):
    """Min-max scale every column of x into [0, 1].

    Args:
        x: DataFrame of numeric features.

    Returns:
        numpy.ndarray with the same shape as x, where each column has been
        mapped through (value - min) / (max - min).

    The previous implementation divided by max instead of (max - min), which
    only reaches 1 when the column minimum is 0. Constant columns (zero
    range) are mapped to 0 instead of producing NaN/inf.
    """
    x = x.to_numpy()
    lo = np.min(x, axis=0)
    span = np.max(x, axis=0) - lo
    # Avoid a division by zero for columns that hold a single value.
    span = np.where(span == 0, 1, span)
    return (x - lo) / span

def scaleNorm(X_train,X_test):
    """Min-max scale the train and test features with the TRAIN statistics.

    Args:
        X_train: DataFrame of training features.
        X_test: DataFrame of test features with the same columns.

    Returns:
        tuple: (train, test) as new DataFrames with default integer index and
        column labels (the original labels are not kept, as before).

    The column minimum and range are computed on X_train only and applied to
    both sets. Scaling each set with its own min/max, as was done before,
    maps the same raw value to different scaled values in train and test.
    Test values outside the training range therefore fall outside [0, 1].
    Columns that are constant in the training set are mapped with a range
    of 1 to avoid a division by zero.
    """
    train = X_train.to_numpy()
    test = X_test.to_numpy()
    lo = np.min(train, axis=0)
    span = np.max(train, axis=0) - lo
    span = np.where(span == 0, 1, span)
    return pd.DataFrame((train - lo) / span), pd.DataFrame((test - lo) / span)

 # Current best model

In [3]:
def bestFittingMethod(x,y):
    """Fit the reference XGBoost classifier on (x, y) and return it.

    The hyper-parameters are fixed: learning_rate=0.1, max_depth=10 and
    'logloss' as evaluation metric.
    """
    hyper_params = {
        "learning_rate": 0.1,
        "max_depth": 10,
        "eval_metric": 'logloss',
    }
    classifier = xgb.XGBClassifier(**hyper_params)
    classifier.fit(x, y)
    return classifier

In [None]:
# Build the working sets: X/Y is the training part of the dataset,
# x_real_test/y_real_test the held-out part.
X, x_real_test, Y, y_real_test = creating_dataset()
# Feature names in the same order as the columns used by creating_dataset.
cols = (
    pd.read_csv('dataset/datasetcleaned.csv', index_col=[0])
    .drop(columns=['DEP_DEL15', 'MONTH', 'LONGITUDE', 'LATITUDE'])
    .columns
)
x_real_train, x_test, y_real_train, y_test = redoSplit(X, Y)
# Under-sample class 0 by 80%, then split the balanced data again.
# NOTE: this overwrites the x_test / y_test assigned two lines above.
x, y = balancing_dataset(x_real_train, y_real_train, 0.8)
x_train, x_test, y_train, y_test = redoSplit(x, y)

In [4]:
# Baseline: train the reference model on an 80%-under-sampled training set
# and report the per-class hit rate on the (unbalanced) validation split.
X,x_real_test,y,y_real_test = creating_dataset()
# Only the header is needed for the feature names, so no data rows are read.
cols = pd.read_csv('dataset/datasetcleaned.csv', index_col=[0], nrows=0).drop(
    columns=['DEP_DEL15', 'MONTH', 'LONGITUDE', 'LATITUDE']).columns
x_train,x_test,y_train,y_test = redoSplit(X,y)
x_train,y_train = balancing_dataset(x_train,y_train,0.8)
clf = bestFittingMethod(x_train,y_train)
# Predict once (the prediction used to be computed twice).
y_pred_test = clf.predict(x_test)
# Predictions are passed first, so rows are predicted classes and columns
# are true classes.
cm_test = confusion_matrix(y_pred_test,y_test.to_numpy())

# Dividing by the column sums turns each column into rates over one true
# class, so the diagonal holds the per-class hit rate.
n_cm_test = cm_test/np.sum(cm_test,axis=0)
print(n_cm_test)
print("acc non delay:",n_cm_test[0,0],"acc delay:",n_cm_test[1,1])



[[0.66199705 0.3376075 ]
 [0.33800295 0.6623925 ]]
acc non delay: 0.6619970519189824 acc delay: 0.6623925021885781


In [None]:
# Sweep the fraction of class-0 (non-delayed) training rows that is dropped,
# from 0.1 to 0.9, and record the per-class hit rate on the validation split.
delay_acc = []
not_delay_acc = []
# The split is seeded, so it is identical for every fraction: do it once.
x_train_full,x_test,y_train_full,y_test = redoSplit(X,y)
for i in range(1,10):
    x_train,y_train = balancing_dataset(x_train_full,y_train_full,i/10)
    clf = bestFittingMethod(x_train,y_train)
    y_pred_test = clf.predict(x_test)
    # Rows are predicted classes, columns are true classes.
    cm_test = confusion_matrix(y_pred_test,y_test.to_numpy())
    print(cm_test)
    n_cm_test = cm_test /np.sum(cm_test,axis=0)
    print("acc non delay:",n_cm_test[0,0],"acc delay:",n_cm_test[1,1])
    # [0,0] is the non-delay rate and [1,1] the delay rate (see the print
    # above); the two lists used to receive each other's value.
    not_delay_acc.append(n_cm_test[0,0])
    delay_acc.append(n_cm_test[1,1])

In [None]:
# Per-class accuracy against the fraction of class-0 rows dropped from the
# training set. balancing_dataset removes rows with label 0, i.e. the
# non-delayed flights, so the title names that class.
plt.plot(np.arange(1,10)*0.1,delay_acc,label="delay")
plt.plot(np.arange(1,10)*0.1,not_delay_acc,label="not delayed")
plt.ylim(0,1)
plt.xlim(0,1)
plt.xlabel("Fraction of 'not delayed' training instances dropped")
plt.ylabel("accuracy")
plt.title("Accuracy of the model with respect to the fraction of dropped 'not delayed' instances")
plt.legend()
plt.show()

# Feature selection

## F scores

In [None]:
def computingFscoresAccuracies(verbose = False):
    """Evaluate the reference model on the k best ANOVA-F features, k = 1..14.

    Uses the module-level X and y. The data is split with redoSplit, class 0
    of the training part is under-sampled by 80% with balancing_dataset, and
    for each k a SelectKBest(f_classif) selector is fitted on the training
    part, applied to both parts, and a model is trained with
    bestFittingMethod.

    Args:
        verbose: when True, print accuracies, F1 scores and column-normalised
            confusion matrices for every k.

    Returns:
        tuple of four lists, one entry per k:
        (train F1 scores, test F1 scores, delay hit rates, non-delay hit rates).
        The hit rates are the diagonal of the test confusion matrix divided
        by its column sums.
    """
    f_scores_train = []
    f_scores_test = []
    delay_hits = []
    nondelay_hits = []
    # The split is seeded and the drop fraction is fixed, so the data is the
    # same for every k: prepare it once instead of on every iteration.
    x_train,x_test,y_train,y_test = redoSplit(X,y)
    x_train,y_train = balancing_dataset(x_train,y_train,0.8)
    y_train_np = y_train.to_numpy()
    y_test_np = y_test.to_numpy()
    for k_f in tqdm(range(1,15)):
        Kbest_f = SelectKBest(f_classif, k=k_f).fit(x_train, y_train)
        Xf = Kbest_f.transform(x_train)
        Xf_test = Kbest_f.transform(x_test)
        #fitting
        clf = bestFittingMethod(Xf,y_train)
        #train
        y_pred_train = clf.predict(Xf)
        Train_f_score = f1_score(y_train_np,y_pred_train)
        f_scores_train.append(Train_f_score)
        #test
        y_pred_test = clf.predict(Xf_test)
        f_score_test = f1_score(y_test_np,y_pred_test)
        f_scores_test.append(f_score_test)
        # Predictions first: rows are predicted classes, columns true classes.
        cm_test = confusion_matrix(y_pred_test,y_test_np)
        n_cm_test = cm_test /np.sum(cm_test,axis=0)
        # Class 1 is "delayed": [1,1] is the delay hit rate and [0,0] the
        # non-delay one (these two used to be swapped).
        delay_hits.append(n_cm_test[1,1])
        nondelay_hits.append(n_cm_test[0,0])
        if verbose:
            train_acc,t_acc = clf.score(Xf, y_train), clf.score(Xf_test, y_test)
            cm = confusion_matrix(y_pred_train,y_train_np)
            print(f"---k={k_f}-----------------------------------------------------")
            print("Train accuracy",train_acc)
            print("Test accuracy",t_acc)
            print("confusion matrix train \n",cm /np.sum(cm,axis=0))
            print("train score:",train_acc)
            print("train f-score:",Train_f_score)
            print("confusion matrix test \n",n_cm_test)
            print("test score:",t_acc)
            print("test f-score:",f_score_test)
    return f_scores_train,f_scores_test,delay_hits,nondelay_hits

In [None]:
# Run the F-score sweep without per-k printing and keep its four result
# lists; `df` and `ndf` receive the third and fourth returned lists.
f_scores_train,f_scores_test,df,ndf = computingFscoresAccuracies(verbose=False)

In [None]:
# Plot the two per-class accuracy curves of the F-score sweep against k.
plt.plot(np.arange(1, 15), df, label="delay accuracy")
plt.plot(np.arange(1, 15), ndf, label="not delayed accuracy")
# Axis labels and the fixed 0..1 accuracy range, set in one call.
plt.gca().set(xlabel="Value of k", ylabel="accuracy", ylim=(0, 1))
plt.legend()
plt.show()

In [None]:
# Print every feature with the log of its ANOVA F-score, best feature first.
# Uses whatever x_train / y_train are currently defined in the kernel.
num_to_col = dict(enumerate(cols))
scores,p_values = f_classif(x_train,y_train)
k = len(scores)
# Ascending ranking, computed once instead of re-sorting on every iteration.
order = np.argsort(scores)
for i in reversed(range(k)):
    print(num_to_col[order[i]],np.log(scores[order[i]]))

 ## $\chi^2$ feature selection

In [None]:
def computingChi2scoresAccuracies(verbose = False):
    """Evaluate the reference model on the k best chi-squared features, k = 1..14.

    Uses the module-level X and y. The data is split with redoSplit, class 0
    of the training part is under-sampled by 80% with balancing_dataset, and
    for each k a SelectKBest(chi2) selector is fitted on the training part,
    applied to both parts, and a model is trained with bestFittingMethod.
    The features must be non-negative for the chi-squared test.

    Args:
        verbose: when True, print F1 scores and column-normalised confusion
            matrices for every k.

    Returns:
        tuple of four lists, one entry per k:
        (train F1 scores, test F1 scores, delay hit rates, non-delay hit rates).
        The hit rates are the diagonal of the test confusion matrix divided
        by its column sums.
    """
    #X need to be non negative to apply this method
    chi2_scores_train = []
    chi2_scores_test = []
    delay_hits = []
    nondelay_hits = []
    # The split is seeded and the drop fraction is fixed, so the data is the
    # same for every k: prepare it once instead of on every iteration.
    x_train,x_test,y_train,y_test = redoSplit(X,y)
    x_train,y_train = balancing_dataset(x_train,y_train,0.8)
    y_train_np = y_train.to_numpy()
    y_test_np = y_test.to_numpy()
    for k in tqdm(range(1,15)):
        # The selector returns new arrays, so no defensive copies are needed.
        Kbest_chi2 = SelectKBest(chi2, k=k).fit(x_train, y_train)
        Xchi2 = Kbest_chi2.transform(x_train)
        XTchi2 = Kbest_chi2.transform(x_test)
        #fitting
        clf = bestFittingMethod(Xchi2,y_train)
        #train
        y_pred_train = clf.predict(Xchi2)
        Train_chi2_score = f1_score(y_train_np,y_pred_train)
        chi2_scores_train.append(Train_chi2_score)
        #test
        y_pred_test = clf.predict(XTchi2)
        Test_chi2_score = f1_score(y_test_np,y_pred_test)
        chi2_scores_test.append(Test_chi2_score)
        # Predictions first: rows are predicted classes, columns true classes.
        cm_test = confusion_matrix(y_pred_test,y_test_np)
        n_cm_test = cm_test /np.sum(cm_test,axis=0)
        # Class 1 is "delayed": [1,1] is the delay hit rate and [0,0] the
        # non-delay one (these two used to be swapped).
        delay_hits.append(n_cm_test[1,1])
        nondelay_hits.append(n_cm_test[0,0])
        if verbose:
            # This branch used to reference undefined names (k_f,
            # Train_f_score, np.cm_test) and crashed when enabled.
            cm = confusion_matrix(y_pred_train,y_train_np)
            print(f"---k={k}-----------------------------------------------------")
            print("confusion matrix train \n",cm /np.sum(cm,axis=0))
            print("train f-score:",Train_chi2_score)
            print("confusion matrix test \n",n_cm_test)
            print("test chi2-score:",Test_chi2_score)
    return chi2_scores_train,chi2_scores_test,delay_hits,nondelay_hits

In [None]:
# Run the chi-squared sweep with the default (non-verbose) setting and keep
# its four result lists; `d_chi2` and `nd_chi2` receive the third and fourth.
chi2_scores_train,chi2_scores_test,d_chi2,nd_chi2 = computingChi2scoresAccuracies()

In [None]:
# Plot the two per-class accuracy curves of the chi-squared sweep against k.
plt.plot(np.arange(1, 15), d_chi2, label="delay accuracy")
plt.plot(np.arange(1, 15), nd_chi2, label="not delayed accuracy")
# Axis labels and the fixed 0..1 accuracy range, set in one call.
plt.gca().set(xlabel="Value of k", ylabel="accuracy", ylim=(0, 1))
plt.legend()
plt.show()

In [None]:
# Print every feature with the log of its chi-squared score, best first.
# Uses whatever x_train / y_train are currently defined in the kernel.
scores,p_values = chi2(x_train,y_train)
k = len(scores)
print("printing log scores")
# Ascending ranking, computed once instead of re-sorting on every iteration.
order = np.argsort(scores)
for i in reversed(range(k)):
    print(cols[order[i]],np.log(scores[order[i]]))

# PCA 

In [None]:
def computingPCAscoresAccuracies(verbose = False):
    """Evaluate the reference model on the first k principal components, k = 1..14.

    Uses the module-level X and y. The data is split with redoSplit and
    class 0 of the training part is under-sampled by 80% with
    balancing_dataset. For each k a PCA is fitted on the training part only;
    both parts are projected with it and min-max rescaled with the training
    projection's minimum and range, then a model is trained with
    bestFittingMethod.

    Args:
        verbose: when True, print scores and column-normalised confusion
            matrices for every k.

    Returns:
        tuple of four lists, one entry per k:
        (train scores, test scores, delay hit rates, non-delay hit rates).
        The scores are the values returned by clf.score; the hit rates are
        the diagonal of the test confusion matrix divided by its column sums.
    """
    scores_train = []
    scores_test = []
    delay_hits = []
    nondelay_hits = []
    # The split is seeded and the drop fraction is fixed, so the data is the
    # same for every k: prepare it once instead of on every iteration.
    x_train,x_test,y_train,y_test = redoSplit(X,y)
    x_train,y_train = balancing_dataset(x_train,y_train,0.8)
    y_train_np = y_train.to_numpy()
    y_test_np = y_test.to_numpy()
    for k in tqdm(range(1,15)):
        #transforming
        pca = PCA(n_components=k)
        pca.fit(x_train)
        Xpca = pca.transform(x_train)
        lo = np.min(Xpca,axis=0)
        Xpca -= lo
        span = np.max(Xpca,axis=0)
        # Guard against a component with no spread on the training data.
        span = np.where(span == 0, 1, span)
        Xpca /= span
        # Project the test data with the PCA fitted on the training data and
        # rescale it with the training statistics. Refitting the PCA on the
        # test set (fit_transform) produced components unrelated to the ones
        # the model was trained on.
        Xtestpca = (pca.transform(x_test) - lo) / span
        #fitting
        clf = bestFittingMethod(Xpca,y_train)
        #train
        y_pred_train = clf.predict(Xpca)
        score_train = clf.score(Xpca,y_train_np)
        scores_train.append(score_train)
        #test
        y_pred_test = clf.predict(Xtestpca)
        score_test = clf.score(Xtestpca,y_test_np)
        scores_test.append(score_test)
        # Predictions first: rows are predicted classes, columns true classes.
        cm_test = confusion_matrix(y_pred_test,y_test_np)
        n_cm_test = cm_test /np.sum(cm_test,axis=0)
        # Class 1 is "delayed": [1,1] is the delay hit rate and [0,0] the
        # non-delay one (these two used to be swapped).
        delay_hits.append(n_cm_test[1,1])
        nondelay_hits.append(n_cm_test[0,0])
        if verbose:
            cm = confusion_matrix(y_pred_train,y_train_np)
            print(f"---k={k}-----------------------------------------------------")
            print("confusion matrix train \n",cm /np.sum(cm,axis=0))
            print("train score:",score_train)
            print("confusion matrix test \n",n_cm_test)
            print("test score:",score_test)
    return scores_train,scores_test,delay_hits,nondelay_hits

In [None]:
# Run the PCA sweep with per-k printing enabled and keep its four result
# lists; `d_pca` and `nd_pca` receive the third and fourth returned lists.
pcaScoresTrain,pcaScoresTest,d_pca,nd_pca = computingPCAscoresAccuracies(verbose = True)

In [None]:
# Plot the two per-class accuracy curves of the PCA sweep against k.
plt.plot(np.arange(1, 15), d_pca, label="delay accuracy")
plt.plot(np.arange(1, 15), nd_pca, label="not delayed accuracy")
# Axis labels and the fixed 0..1 accuracy range, set in one call.
plt.gca().set(xlabel="Value of k", ylabel="accuracy", ylim=(0, 1))
plt.legend()
plt.show()