# KNN Modeling

## Importing Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import os

## Connect to directory and get list of files

In [None]:
# Switch the working directory so the relative CSV file names used in the
# cells below resolve.
# NOTE(review): hard-coded, user-specific path; adjust when running elsewhere.
os.chdir('/home/lmh2ur/DS5110')

In [None]:
# One CSV file name per year, newest first; 2001 and 2002 are not in the list.
years = [
    str(season) + '.csv'
    for season in (*range(2008, 2002, -1), *range(2000, 1986, -1))
]

In [None]:
# Echo the file list so the notebook shows which files will be read.
years

## Data Preprocessing

In [None]:
def clean_df(df):
    """Keep the modelling columns and add a ``DelayClass`` label column.

    The label is derived from ``ArrDelay``:

    * missing value -> ``'Cancelled'``
    * greater than 60 -> ``'Very Delayed'``
    * greater than 30 -> ``'Delayed'``
    * anything else -> ``'On time'``

    Args:
        df: DataFrame containing at least the columns ``Year``, ``Month``,
            ``DayofMonth``, ``DayOfWeek``, ``CRSDepTime``, ``CRSArrTime``,
            ``CRSElapsedTime``, ``Distance`` and ``ArrDelay``.

    Returns:
        A new DataFrame with those nine columns plus ``DelayClass`` and a
        fresh 0..n-1 index. The input frame is not modified.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime',
               'CRSArrTime', 'CRSElapsedTime', 'Distance', 'ArrDelay']

    # Explicit copy: the new column is written to an independent frame, not
    # to a slice of the caller's data (avoids SettingWithCopyWarning).
    df = df[columns].copy()

    arr_delay = df['ArrDelay']
    # Vectorised classification; np.select takes the first condition that
    # matches, so the order below mirrors an if/elif chain. This replaces a
    # row-by-row apply and also works on an empty frame.
    conditions = [arr_delay.isna(), arr_delay > 60, arr_delay > 30]
    labels = ['Cancelled', 'Very Delayed', 'Delayed']
    df['DelayClass'] = np.select(conditions, labels, default='On time')

    return df.reset_index(drop=True)

In [None]:
# Load and clean every yearly file, then stack them into one frame.
# Each file in `years` is read exactly once, so no year is double-counted,
# and the frames are concatenated in a single call instead of re-copying the
# accumulated data on every iteration.
frames = []
for year in years:
    print(year)
    try:
        data_new = clean_df(pd.read_csv(year, dtype={'CancellationCode': 'object'}))
    except (OSError, ValueError, KeyError) as err:
        # Best-effort load: a missing/unreadable file (OSError), a malformed
        # or empty CSV (ValueError subclasses) or a file lacking a required
        # column (KeyError) is reported and skipped.
        print(year, 'fail', err)
    else:
        frames.append(data_new)

# pd.concat raises ValueError if no file could be loaded at all.
data = pd.concat(frames, axis=0)

## Normalize and split data

In [None]:
# Remove the raw ArrDelay column (DelayClass is derived from it in clean_df),
# discard rows that still contain missing values, and renumber the index.
num_data = (
    data
    .drop(columns=['ArrDelay'])
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Feature matrix: every remaining column except the class label.
x = num_data.drop('DelayClass', axis='columns')

In [None]:
# Target vector: the delay class label of each row.
y = num_data['DelayClass']

In [None]:
# Rescale every feature with a min-max scaler (default output range [0, 1])
# and wrap the resulting array back into a DataFrame.
# Column labels are taken from `x` itself rather than re-typed by hand, so
# this cell cannot drift out of sync with the feature list.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows influence the scaling; consider fitting on the training split
# only.
MinMaxScaler = preprocessing.MinMaxScaler()
x_data_minmax = MinMaxScaler.fit_transform(x)
x = pd.DataFrame(x_data_minmax, columns=x.columns)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=12345, test_size=0.2)

## KNN Classification Model

In [None]:
# 19-nearest-neighbour classifier backed by a ball tree.
# n_jobs=-1 lets scikit-learn use however many processors the host has,
# instead of a worker count tied to one specific machine.
knn = KNeighborsClassifier(n_neighbors=19, algorithm='ball_tree', n_jobs=-1)
knn.fit(x_train, y_train)

In [None]:
# Predict a delay class for every held-out row.
ypred = knn.predict(x_test)

In [None]:
# Score the held-out predictions three ways, then print each result.
result = confusion_matrix(y_test, ypred)
result1 = classification_report(y_test, ypred)
result2 = accuracy_score(y_test, ypred)

print('Confusion Matrix:')
print(result)
print('Classification Report:')
print(result1)
print('Accuracy:', result2)

## KNN Model Per Year

In [None]:
# Train and evaluate a separate KNN model for each yearly file.
# The accuracy of every model is stored in `accuracy_by_year` (keyed by file
# name) so it can be reused later without copying numbers from the printed
# output by hand.
# NOTE: the loop rebinds the notebook-level names `data`, `num_data`, `x`,
# `y`, `knn`, ... so after it finishes they refer to the last file in `years`.
accuracy_by_year = {}
for year in years:
    print(year)

    data = clean_df(pd.read_csv(year, dtype={'CancellationCode': 'object'}))
    # Year is dropped as a feature here; presumably it is constant within a
    # single yearly file and so carries no information.
    num_data = data.drop(columns=['ArrDelay', 'Year']).dropna().reset_index(drop=True)
    x = num_data.drop(columns=['DelayClass'])
    y = num_data.DelayClass

    # NOTE(review): the scaler is fitted before the train/test split, so test
    # rows influence the scaling; consider fitting on the training split only.
    MinMaxScaler = preprocessing.MinMaxScaler()
    x_data_minmax = MinMaxScaler.fit_transform(x)
    # Reuse the labels of `x` instead of a hand-typed column list.
    x = pd.DataFrame(x_data_minmax, columns=x.columns)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=12345)

    # n_jobs=-1: use all available processors for the neighbour search.
    knn = KNeighborsClassifier(n_neighbors=19, algorithm='ball_tree', n_jobs=-1)
    knn.fit(x_train, y_train)

    ypred = knn.predict(x_test)

    result = confusion_matrix(y_test, ypred)
    print('Confusion Matrix:' + year)
    print(result)
    result1 = classification_report(y_test, ypred)
    print('Classification Report:' + year)
    print(result1)
    result2 = accuracy_score(y_test, ypred)
    print('Accuracy:' + year, result2)
    accuracy_by_year[year] = result2

## Plot Accuracies

In [None]:
# Labels for the bar chart: 1987-2000 and 2003-2008 (2001 and 2002 are
# absent), paired position-by-position with the accuracy values below.
year_list = [str(season) for season in (*range(1987, 2001), *range(2003, 2009))]
acc_list = [
    0.885, 0.91, 0.884, 0.903, 0.921,
    0.923, 0.914, 0.911, 0.889, 0.858,
    0.882, 0.865, 0.858, 0.829, 0.896,
    0.869, 0.864, 0.85, 0.833, 0.827,
]

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Bar chart of the per-year accuracy values.
fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(year_list, acc_list, width=0.4, color='maroon')
ax.set_xlabel('Year')
# Label the value axis: it does not start at zero, so the scale must be
# readable from the chart itself.
ax.set_ylabel('Accuracy')
ax.set_title('KNN Prediction Accuracy Per Year')
ax.set_ylim([0.7, 1])

plt.show()

## EDA

In [None]:
# Count the rows for every (Month, DelayClass) pair.
# NOTE(review): `data` is also rebound inside the per-year KNN loop, so which
# rows are counted here depends on the cells run before this one — confirm
# whether the combined frame or a single year is intended.
months = data.groupby(['Month', 'DelayClass']).agg({'DelayClass':'count'})

In [None]:
# Share (in percent) of each delay class within every month.
# Dividing by grp.sum() directly avoids calling float() on a one-element
# Series, which newer pandas versions deprecate.
months.groupby(level=0).apply(lambda grp: 100 * grp / grp.sum())

In [None]:
# Count the rows for every (DayOfWeek, DelayClass) pair.
# NOTE(review): `data` is also rebound inside the per-year KNN loop, so which
# rows are counted here depends on the cells run before this one — confirm
# whether the combined frame or a single year is intended.
week_day = data.groupby(['DayOfWeek', 'DelayClass']).agg({'DelayClass':'count'})

In [None]:
# Share (in percent) of each delay class within every day of the week.
# Dividing by grp.sum() directly avoids calling float() on a one-element
# Series, which newer pandas versions deprecate.
week_day.groupby(level=0).apply(lambda grp: 100 * grp / grp.sum())

On time = 6, delayed = 5, very delayed = 5, cancelled = 2

In [None]:
# Mean of the Distance column of `data`.
data['Distance'].mean()

In [None]:
# Mean of the CRSElapsedTime column of `data`.
data['CRSElapsedTime'].mean()