### Import libraries

In [None]:
import pandas as pd
import numpy as np 
import seaborn as sns 
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as ss

### Import data

In [None]:
# Read the 2008 CSV directly from its URL.
# NOTE: the bz2 archive is fetched and decompressed on every run, so this cell takes time.
url = "http://stat-computing.org/dataexpo/2009/2008.csv.bz2"
df = pd.read_csv(url, compression='bz2')

In [None]:
# Reduce rows: keep a random 35% of the data.
# The seed is fixed so the subsample, and every statistic or model score
# derived from it, is reproducible across runs (the train/test split later
# in the notebook is seeded the same way).
df = df.sample(frac=0.35, random_state=0)

In [None]:
df.head()

Variable descriptions: http://stat-computing.org/dataexpo/2009/the-data.html

Reference: https://www.kaggle.com/fabiendaniel/predicting-flight-delays-tutorial

### Data pre-processing

In [None]:
df.count()

In [None]:
# A Date column could be assembled from Year/Month/DayofMonth; currently disabled.
#df['Date'] = pd.to_datetime(df[['Year', 'Month', 'DayofMonth']])

# Remove every column that is not kept as a feature, in a single call.
df.drop(
    columns=[
        # Year: the dataset only contains data in 2008.
        'Year',
        # Cancelled / diverted markers: a cancelled flight does not have a delay label.
        'Cancelled', 'CancellationCode', 'Diverted',
        # Delay details: only 20% of rows have these features.
        'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
        # Time: only CRSArrTime and CRSElapsedTime are kept.
        'DepTime', 'ArrTime', 'CRSDepTime', 'ActualElapsedTime',
        # Airport: only the destination airport is kept.
        'Origin',
        # Carrier: FlightNum and TailNum are dropped.
        'FlightNum', 'TailNum',
    ],
    inplace=True,
)

# Discard rows with missing values first, then exact duplicate rows.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

df.count()

In [None]:
# Convert CRSArrTime to a part-of-the-day code (PotD):
#   1 = Morning   [500, 1201)
#   2 = Afternoon [1201, 1701)
#   3 = Evening   [1701, 2101)
#   4 = Night     (every other value)
# The thresholds presumably read CRSArrTime as an hhmm number - confirm against
# the variable descriptions.
arrival = df['CRSArrTime']
conditions = [
    arrival.ge(500) & arrival.lt(1201),
    arrival.ge(1201) & arrival.lt(1701),
    arrival.ge(1701) & arrival.lt(2101),
]
choices = [1, 2, 3]
df['PotD'] = np.select(conditions, choices, default=4)

In [None]:
# Convert Month to Quarter: months below 4 fall through to the default (1),
# [4, 7) -> 2, [7, 10) -> 3, 10 and above -> 4.
month = df['Month']
conditions = [
    month.ge(4) & month.lt(7),
    month.ge(7) & month.lt(10),
    month.ge(10),
]
choices = [2, 3, 4]
df['Quarter'] = np.select(conditions, choices, default=1)

In [None]:
# Convert DayofMonth to a week-of-month code: days below 8 fall through to the
# default (1), [8, 15) -> 2, [15, 22) -> 3, 22 and above -> 4.
day_of_month = df['DayofMonth']
conditions = [
    day_of_month.ge(8) & day_of_month.lt(15),
    day_of_month.ge(15) & day_of_month.lt(22),
    day_of_month.ge(22),
]
choices = [2, 3, 4]
df['Week'] = np.select(conditions, choices, default=1)

In [None]:
# Convert DayOfWeek to a 0/1 Weekend flag: 1 when DayOfWeek is 6 or more.
# Presumably 6 and 7 stand for Saturday and Sunday - confirm against the
# variable descriptions.
is_weekend = df['DayOfWeek'].ge(6)
conditions = [is_weekend]
choices = [1]
df['Weekend'] = np.select(conditions, choices, default=0)

In [None]:
# Convert ArrDelay to the binary Delay label.
# A flight only counts as late (1) if it is more than 30 minutes late;
# everything else is labelled 0.
is_late = df['ArrDelay'].gt(30)
conditions = [is_late]
choices = [1]
df['Delay'] = np.select(conditions, choices, default=0)

In [None]:
# The raw calendar/time columns are now replaced by Quarter, Week, Weekend and PotD.
df.drop(columns=['Month', 'DayofMonth', 'DayOfWeek', 'CRSArrTime'], inplace=True)
df.dtypes

In [None]:
df.head()

### Data analysis

In [None]:
#Check basic stats
df.describe()

In [None]:
# Check correlation between the numeric columns only.
# Non-numeric columns (UniqueCarrier and Dest are still present here) make
# DataFrame.corr() raise on pandas >= 2.0, so they are filtered out explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Group by carrier and average the numeric columns.
# numeric_only=True keeps the remaining non-numeric column (Dest) out of the
# mean, which would otherwise raise on pandas >= 2.0.
df.groupby(['UniqueCarrier']).mean(numeric_only=True)

In [None]:
# Group by destination airport and average the numeric columns.
# numeric_only=True keeps the remaining non-numeric column (UniqueCarrier) out
# of the mean, which would otherwise raise on pandas >= 2.0.
df.groupby(['Dest']).mean(numeric_only=True)

In [None]:
# Class-balance check: plot how many rows fall under each Delay label (0 / 1).
sns.countplot(x="Delay", data=df)

In [None]:
# Check Correlation
def cramers_corrected_stat(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Uses the bias correction from Wicher Bergsma, "A bias-correction for
    Cramér's V and Tschuprow's T", Journal of the Korean Statistical
    Society 42 (2013): 323-328.

    Parameters
    ----------
    x, y : pandas.Series
        The two categorical variables; they are cross-tabulated with
        ``pd.crosstab``.

    Returns
    -------
    float or int
        The statistic rounded to 6 decimals, or the sentinel ``-1`` (after
        printing the reason) when it cannot be computed: one of the variables
        is constant, or the contingency table is too small for the corrected
        denominator to be positive.
    """
    if len(x.value_counts()) == 1:
        print("First variable is constant")
        return -1
    if len(y.value_counts()) == 1:
        print("Second variable is constant")
        return -1

    conf_matrix = pd.crosstab(x, y)
    r, k = conf_matrix.shape
    n = int(conf_matrix.to_numpy().sum())

    # With fewer than two paired observations every (n - 1) divisor is <= 0.
    if n < 2:
        print("Not enough observations")
        return -1

    # Bias-corrected table dimensions. When the smaller one collapses to 1
    # the statistic would be 0/0, so report the sentinel instead of NaN.
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        print("Not enough observations")
        return -1

    # Yates' continuity correction stays off: scipy only applies it to tables
    # with one degree of freedom (2x2), and Cramér's V uses the plain chi2.
    chi2 = ss.chi2_contingency(conf_matrix, correction=False)[0]

    phi2 = chi2 / n
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    return round(np.sqrt(phi2corr / denominator), 6)

#### Quarter

In [None]:
# normalize='index' scales each Delay row to sum to 1, so the heatmap shows
# how the rows of each Delay class are distributed across quarters.
sns.heatmap(pd.crosstab(df['Delay'], df['Quarter'], normalize='index'))

In [None]:
cramers_corrected_stat(df['Delay'], df['Quarter'])

#### Week

In [None]:
# normalize='index' scales each Delay row to sum to 1, so the heatmap shows
# how the rows of each Delay class are distributed across weeks of the month.
sns.heatmap(pd.crosstab(df['Delay'], df['Week'], normalize='index'))

In [None]:
cramers_corrected_stat(df['Delay'], df['Week'])

#### Weekend

In [None]:
# normalize='index' scales each Delay row to sum to 1, so the heatmap shows
# the weekday/weekend split within each Delay class.
sns.heatmap(pd.crosstab(df['Delay'], df['Weekend'], normalize='index'))

In [None]:
cramers_corrected_stat(df['Delay'], df['Weekend'])

#### Parts of the Day

In [None]:
# normalize='index' scales each Delay row to sum to 1, so the heatmap shows
# how the rows of each Delay class are distributed across parts of the day.
sns.heatmap(pd.crosstab(df['Delay'], df['PotD'], normalize='index'))

In [None]:
cramers_corrected_stat(df['Delay'], df['PotD'])

### Encoding

In [None]:
# Dummy encoding of the Dest column is disabled for now (line kept for later use).

#df = pd.concat([df, pd.get_dummies(df['Dest'], prefix='Dest',dummy_na=True)],axis=1).drop(['Dest'],axis=1)
# Temp: remove Dest entirely instead of encoding it.
df.drop(columns=['Dest'], inplace=True)

In [None]:
# Dummy-encode UniqueCarrier: build the indicator columns (including the
# extra NaN indicator requested by dummy_na=True), append them to the frame,
# then remove the original column.
carrier_dummies = pd.get_dummies(df['UniqueCarrier'], prefix='UniqueCarrier', dummy_na=True)
df = pd.concat([df, carrier_dummies], axis=1)
df = df.drop(['UniqueCarrier'], axis=1)
df.head()

In [None]:
# http://www.insightsbot.com/blog/McTKK/python-one-hot-encoding-with-scikit-learn
#from sklearn.preprocessing import LabelBinarizer

#UniqueCarrier_lb = LabelBinarizer()
#X = UniqueCarrier_lb.fit_transform(df.UniqueCarrier.values)

### Model evaluation

In [None]:
#features = df.drop(['Delay'], axis=1)
# Why?
# Airline:
# Dest Airport + CRSArrTime: 
# Departure features:

# Holdout
from sklearn.model_selection import train_test_split

# Delay is computed directly from ArrDelay (ArrDelay > 30), so ArrDelay must
# not be a feature: keeping it leaks the label into the model inputs.
# NOTE(review): any other column that is only known after landing would leak
# in the same way - confirm the remaining feature list.
X_data = df.drop(['Delay', 'ArrDelay'], axis=1)
y_data = df['Delay']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=0)

In [None]:
X_train.head()

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes on the training split, then predict the held-out split.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Metrics
# https://medium.com/thalus-ai/performance-metrics-for-classification-problems-in-machine-learning-part-i-b085d432082b
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true labels against the current predictions.
print(confusion_matrix(y_test, y_pred))

# Text report of the main classification metrics for the same predictions.
print(classification_report(y_test, y_pred))

#### SVC

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with default settings, then predict the
# held-out split.
# NOTE(review): SVC training presumably becomes very slow as the number of
# rows grows - confirm the run time on this dataset or subsample first.
model = SVC().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true labels against the current predictions.
print(confusion_matrix(y_test, y_pred))

# Text report of the main classification metrics for the same predictions.
print(classification_report(y_test, y_pred))

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the estimator without fitting it; it is fitted exactly once below, on
# the training split only, so the test split stays unseen.
# NOTE(review): `multi_class` looks deprecated in recent scikit-learn
# releases - confirm against the installed version.
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true labels against the current predictions.
print(confusion_matrix(y_test, y_pred))

# Text report of the main classification metrics for the same predictions.
print(classification_report(y_test, y_pred))

#### Ensemble learning
https://medium.com/@aravanshad/gradient-boosting-versus-random-forest-cfa3fa8f0d80

### PCA

In [None]:
# Standard scaler

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, so nothing from
# the test split influences them.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# PCA
from sklearn.decomposition import PCA

# Fit a single principal component on the training split only.
pca = PCA(n_components=1)
pca.fit(X_train)

# Project both splits with the same fitted PCA.
X_train, X_test = pca.transform(X_train), pca.transform(X_test)

# TODO

1. Edit threshold / class weight
2. PCA: Dimension reducing
3. RFE: Features selection
4. GridsearchCV for tuning
- Tuning with 3 values
- Goal: Precision ~ Recall
5. Compare performances among models

#### Note
- Label Encoder: For algorithms that do not care about the distance between data points (Naive Bayes, ...)