In [1]:
pip install yellowbrick

Note: you may need to restart the kernel to use updated packages.


In [37]:
import matplotlib
matplotlib.rcParams['figure.figsize'] = (30,30)
%matplotlib inline

import numpy as np

import pandas as pd
pd.set_option('display.max_columns', None)
import scipy.stats as stats

import seaborn as sns
sns.set_style('darkgrid')

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

from statsmodels.graphics.gofplots import qqplot

from statsmodels.stats.outliers_influence import variance_inflation_factor

from yellowbrick.regressor import ResidualsPlot

In [10]:
df = pd.read_csv('wisconsin.csv')
df.head(2)
df.drop(df.columns[0], axis=1, inplace=True)
del df['YEAR']
departure_delay = df['DEPARTURE_DELAY']
df['CANCELLATION_REASON'] = df['CANCELLATION_REASON'].replace('nan', np.nan).fillna(0)
df['AIR_SYSTEM_DELAY'] = df['AIR_SYSTEM_DELAY'].replace('nan', np.nan).fillna(0)
df['SECURITY_DELAY'] = df['SECURITY_DELAY'].replace('nan', np.nan).fillna(0)
df['AIRLINE_DELAY'] = df['AIRLINE_DELAY'].replace('nan', np.nan).fillna(0)
df['LATE_AIRCRAFT_DELAY'] = df['LATE_AIRCRAFT_DELAY'].replace('nan', np.nan).fillna(0)
df['WEATHER_DELAY'] = df['WEATHER_DELAY'].replace('nan', np.nan).fillna(0)
df['DEPARTURE_TIME'] = df['DEPARTURE_TIME'].replace('nan', np.nan).fillna(0)
df['DEPARTURE_DELAY'] = df['DEPARTURE_DELAY'].replace('nan', np.nan).fillna(0)
df['TAXI_OUT'] = df['TAXI_OUT'].replace('nan', np.nan).fillna(0)
df['WHEELS_OFF'] = df['WHEELS_OFF'].replace('nan', np.nan).fillna(0)
df['ELAPSED_TIME'] = df['ELAPSED_TIME'].replace('nan', np.nan).fillna(0)
df['AIR_TIME'] = df['AIR_TIME'].replace('nan', np.nan).fillna(0)
df['WHEELS_ON'] = df['WHEELS_ON'].replace('nan', np.nan).fillna(0)
df['TAXI_IN'] = df['TAXI_IN'].replace('nan', np.nan).fillna(0)
df['ARRIVAL_TIME'] = df['ARRIVAL_TIME'].replace('nan', np.nan).fillna(0)
df['ARRIVAL_DELAY'] = df['ARRIVAL_DELAY'].replace('nan', np.nan).fillna(0)
cancelled_2 = df[df['CANCELLED'] != 0]
delayed = df[df['DEPARTURE_DELAY'] > 0]
df['DELAYED_TARGET'] = np.where(df['DEPARTURE_DELAY'] > 0, 1, 0)

In [31]:
features = ['MONTH',                    
            'DAY',                      
            'DAY_OF_WEEK',                 
            'SCHEDULED_DEPARTURE',            
            'AIR_TIME',
            'DISTANCE',
            'DELAYED_TARGET',
            'DEPARTURE_DELAY'
           ]


dfFeatures = df[features]
dff = dfFeatures.iloc[:1100]
dff.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,SCHEDULED_DEPARTURE,AIR_TIME,DISTANCE,DELAYED_TARGET,DEPARTURE_DELAY
0,7,1,3,1605,231.0,1694,0,-2.0
1,7,2,4,1605,234.0,1694,0,-5.0
2,7,3,5,1605,257.0,1694,0,0.0
3,7,4,6,1605,224.0,1694,0,-7.0
4,7,5,7,1605,221.0,1694,0,-5.0


In [32]:
labels = dfFeatures['DELAYED_TARGET']
dfFeatures.drop('DELAYED_TARGET', axis=1, inplace=True)

In [34]:
X_train, X_test, y_train, y_test = train_test_split(dfFeatures, labels, test_size=0.25, random_state=42)

In [35]:
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Instantiate StandardScaler
scaler = StandardScaler()

# Transform the training and test sets
scaled_data_train = scaler.fit_transform(X_train)
scaled_data_test = scaler.transform(X_test)

# Convert into a DataFrame
scaled_df_train = pd.DataFrame(scaled_data_train, columns=dfFeatures.columns)
scaled_df_train.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,SCHEDULED_DEPARTURE,AIR_TIME,DISTANCE,DEPARTURE_DELAY
0,1.713005,1.645209,-0.44038,-1.300284,-0.349273,-0.487311,1.342896
1,-1.252615,0.615319,1.069651,-0.698114,-0.527995,-0.672621,-0.193731
2,-0.659491,0.500886,-1.447067,0.441708,-0.474378,-0.503326,-0.09129
3,-0.362929,-0.757869,1.069651,1.22668,1.223478,0.835021,-0.270563
4,1.713005,0.958615,0.062964,-0.255089,-1.028416,-1.061542,-0.449836


In [36]:
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Instantiate KNeighborsClassifier
clf = KNeighborsClassifier()

# Fit the classifier
clf.fit(scaled_data_train, y_train)

# Predict on the test set
test_preds = clf.predict(scaled_data_test)

In [38]:
def print_metrics(labels, preds):
    print("Precision Score: {}".format(precision_score(labels, preds)))
    print("Recall Score: {}".format(recall_score(labels, preds)))
    print("Accuracy Score: {}".format(accuracy_score(labels, preds)))
    print("F1 Score: {}".format(f1_score(labels, preds)))
    
print_metrics(y_test, test_preds)

Precision Score: 0.9356749894202285
Recall Score: 0.6412412993039444
Accuracy Score: 0.8918561195889131
F1 Score: 0.76097057305111
