# KNN Model

## Set Up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Import and Preprocess Data

In [2]:
# Load the raw flight records; the path is relative to the notebook's working directory.
data = pd.read_csv('1987.csv')

In [3]:
def func(row):
    """Return the delay-class label for one flight record.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose an 'ArrDelay' entry.

    Returns
    -------
    str
        'Cancelled' when ArrDelay is missing, 'Very Late' when it is above 60,
        'Late' when it is above 30, and 'On time' otherwise.
    """
    delay = row['ArrDelay']
    # A missing delay is checked first so that NaN never reaches the comparisons.
    if pd.isnull(delay):
        return 'Cancelled'
    if delay > 60:
        return 'Very Late'
    if delay > 30:
        return 'Late'
    return 'On time'
            
# Label every flight in one vectorized pass instead of calling func() once per
# row: data.apply(..., axis=1) builds a Series for each row, which is very slow
# on a frame with over a million rows.
# Same rules as func(), evaluated in the same order (first match wins):
#   missing ArrDelay -> 'Cancelled', > 60 -> 'Very Late', > 30 -> 'Late',
#   anything else -> 'On time'.
arr_delay = data['ArrDelay']
data['DelayClass'] = np.select(
    [arr_delay.isna(), arr_delay > 60, arr_delay > 30],
    ['Cancelled', 'Very Late', 'Late'],
    default='On time',
)

In [4]:
# Drop the columns listed below. Judging by their names they hold values that
# are recorded during or after the flight (actual times, delays, cancellation /
# diversion flags, delay causes) — presumably removed so the outcome does not
# leak into the features; confirm.
columns_to_drop = [
    'DepTime', 'ActualElapsedTime', 'ArrTime', 'AirTime',
    'ArrDelay', 'DepDelay', 'TaxiIn', 'TaxiOut',
    'Cancelled', 'CancellationCode', 'Diverted',
    'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
    'LateAircraftDelay',
]
data = data.drop(columns=columns_to_drop)

## Data Split and Normalization

In [5]:
# Build the modelling frame: remove the carrier / tail number / airport /
# flight-number columns, discard every row that still has a missing value,
# and renumber the remaining rows from 0.
identifier_columns = ['UniqueCarrier', 'TailNum', 'Origin', 'Dest', 'FlightNum']
num_data = (
    data
    .drop(identifier_columns, axis=1)
    .dropna()
    .reset_index(drop=True)
)

In [6]:
# Preview the cleaned frame (remaining feature columns plus the DelayClass label).
num_data

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance,DelayClass
0,1987,10,14,3,730,849,79,447.0,On time
1,1987,10,15,4,730,849,79,447.0,On time
2,1987,10,17,6,730,849,79,447.0,On time
3,1987,10,18,7,730,849,79,447.0,On time
4,1987,10,19,1,730,849,79,447.0,Late
...,...,...,...,...,...,...,...,...,...
1310806,1987,12,11,5,1530,1823,113,719.0,On time
1310807,1987,12,13,7,1530,1823,113,719.0,On time
1310808,1987,12,14,1,1530,1823,113,719.0,On time
1310809,1987,12,1,2,1525,1638,73,200.0,On time


In [7]:
# Feature matrix: every column of num_data except the label.
x = num_data.drop('DelayClass', axis=1)

In [8]:
# Target vector: the delay-class label for each row of num_data.
y = num_data['DelayClass']

In [9]:
# Preview the feature matrix before scaling.
x

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance
0,1987,10,14,3,730,849,79,447.0
1,1987,10,15,4,730,849,79,447.0
2,1987,10,17,6,730,849,79,447.0
3,1987,10,18,7,730,849,79,447.0
4,1987,10,19,1,730,849,79,447.0
...,...,...,...,...,...,...,...,...
1310806,1987,12,11,5,1530,1823,113,719.0
1310807,1987,12,13,7,1530,1823,113,719.0
1310808,1987,12,14,1,1530,1823,113,719.0
1310809,1987,12,1,2,1525,1638,73,200.0


In [10]:
# Min-max scale every feature column (default output range is [0, 1]) so that
# columns with large raw magnitudes do not dominate the KNN distance.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test rows influence the min/max used for scaling.
# Consider fitting on the training rows only and transforming the test rows.
MinMaxScaler = preprocessing.MinMaxScaler()
x_data_minmax = MinMaxScaler.fit_transform(x)
# Reuse x's own column labels and index instead of a hard-coded column list:
# the scaled frame then follows any change to the feature set automatically
# and stays row-aligned with y.
x = pd.DataFrame(x_data_minmax, columns=x.columns, index=x.index)

In [11]:
# Preview the feature matrix after min-max scaling.
x

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,CRSElapsedTime,Distance
0,0.0,0.0,0.433333,0.333333,0.309160,0.353481,0.074608,0.089705
1,0.0,0.0,0.466667,0.500000,0.309160,0.353481,0.074608,0.089705
2,0.0,0.0,0.533333,0.833333,0.309160,0.353481,0.074608,0.089705
3,0.0,0.0,0.566667,1.000000,0.309160,0.353481,0.074608,0.089705
4,0.0,0.0,0.600000,0.000000,0.309160,0.353481,0.074608,0.089705
...,...,...,...,...,...,...,...,...
1310806,0.0,1.0,0.333333,0.666667,0.648431,0.759483,0.095925,0.144291
1310807,0.0,1.0,0.400000,1.000000,0.648431,0.759483,0.095925,0.144291
1310808,0.0,1.0,0.433333,0.000000,0.648431,0.759483,0.095925,0.144291
1310809,0.0,1.0,0.000000,0.166667,0.646310,0.682368,0.070846,0.040136


In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified, while the class supports in the
# classification report at the end of the notebook are heavily skewed towards
# 'On time' — consider passing stratify=y.
TEST_SIZE = 0.2
RANDOM_STATE = 12345
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

## KNN GridSearch

In [13]:
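# NOTE: the grid search over k is disabled (commented out). The model cell
# further down uses n_neighbors=19, which is the upper edge of the range
# searched here — if that value came from this search, the best k may lie
# beyond 19 and the range should be widened. TODO confirm.

# knn = KNeighborsClassifier()

# # candidate values of k: 1 through 19
# k_range = list(range(1, 20))
# param_grid = dict(n_neighbors=k_range)
  
# # 10-fold cross-validated grid search over k, scored on accuracy
# grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', return_train_score=False,verbose=1)
  
# # fitting the model for grid search
# grid_search=grid.fit(x_train, y_train)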

In [14]:
# print(grid_search.best_params_)

In [15]:
# accuracy = grid_search.best_score_ *100
# print("Accuracy for our training dataset with tuning is : {:.2f}%".format(accuracy) )

## KNN Model

In [16]:
# Fit the final classifier on the training split with k=19 — the largest value
# in the commented-out grid-search range above; presumably the k that search
# selected (TODO confirm).
knn = KNeighborsClassifier(n_neighbors=19)
knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=19)

In [17]:
# Predict a delay class for every row of the held-out test features.
ypred=knn.predict(x_test)

In [18]:
# Score the held-out predictions: compute all three metrics first, then print.
# In the confusion matrix, rows are the true classes and columns the predicted
# classes, both in sorted label order (Cancelled, Late, On time, Very Late).
result = confusion_matrix(y_test, ypred)
result1 = classification_report(y_test, ypred)
result2 = accuracy_score(y_test, ypred)

print('Confusion Matrix:', result, sep='\n')
print('Classification Report:', result1, sep='\n')
print('Accuracy:', result2)

Confusion Matrix:
[[   195     38   4389     62]
 [    32    227  16588     84]
 [   160    358 231370    198]
 [    49    166   7995    252]]
Classification Report:
              precision    recall  f1-score   support

   Cancelled       0.45      0.04      0.08      4684
        Late       0.29      0.01      0.03     16931
     On time       0.89      1.00      0.94    232086
   Very Late       0.42      0.03      0.06      8462

    accuracy                           0.89    262163
   macro avg       0.51      0.27      0.27    262163
weighted avg       0.83      0.89      0.84    262163

Accuracy: 0.8851134599466745
