In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
import warnings

# Location of the input CSV; change this to point at a different dataset
file_path = 'output_dataset.csv'

# Read the CSV and, in the same step, discard the columns that are not
# used as model inputs further down.
df = pd.read_csv(file_path).drop(
    columns=[
        'DayOfWeek', 'Date', 'UniqueCarrier', 'Airline', 'FlightNum',
        'TailNum', 'Origin', 'Org_Airport', 'Dest', 'Dest_Airport',
        'Cancelled', 'CancellationCode', 'Diverted',
    ]
)

# Define range-specific encoding function with 10 partitions
def range_encoding(value):
    """Map a numeric value to the label of its 100-wide bucket.

    The ten labels are '0-100', '101-200', ..., '801-900', '901-1000'.

    Parameters
    ----------
    value : int or float
        The number to bucket.

    Returns
    -------
    str
        The bucket label. Anything <= 100 (including negative values) maps
        to '0-100'. Anything that is not <= 900 -- values above 900,
        including those above 1000, and NaN, which compares False against
        every bound -- maps to the catch-all '901-1000'.

    Notes
    -----
    Buckets are selected by their inclusive upper bound only, so fractional
    values such as 100.5 land in the next bucket up ('101-200') instead of
    slipping between the integer bounds and ending up in '901-1000'.
    """
    # Inclusive upper bounds of the first nine buckets: 100, 200, ..., 900.
    for upper in range(100, 1000, 100):
        if value <= upper:
            # The first bucket starts at 0; every other one starts at upper - 99.
            lower = 0 if upper == 100 else upper - 99
            return f'{lower}-{upper}'
    return '901-1000'

# Columns that get bucketed into ranges: the five delay-cause columns plus
# the target column 'ArrDelay'.
delay_columns = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'ArrDelay']

# Apply range-specific encoding to each delay column
for col in delay_columns:
    df[col] = df[col].apply(range_encoding)

# Convert the bucket labels to integer codes using Label Encoding.
# fit_transform refits the encoder on every column, so after this loop
# `label_encoder` holds only the classes of the last column ('ArrDelay').
label_encoder = LabelEncoder()
for col in delay_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Separate features (X) and target variable (y)
# NOTE(review): the five delay-cause columns remain in X while 'ArrDelay' is
# the target; if 'ArrDelay' is derived from them the scores below will be
# optimistic -- confirm against the dataset definition.
X = df.drop('ArrDelay', axis=1)
y = df['ArrDelay']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Suppress ConvergenceWarnings only while the models are fitted and evaluated.
# catch_warnings() restores the previous filter state on exit, whereas
# warnings.resetwarnings() would discard every filter in the process,
# including defaults and filters installed elsewhere in the notebook.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Iterate over different k values
    for k in [3, 5, 10]:
        # Create and train the KNN model
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = model.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        # zero_division=1 makes undefined precision/recall values show as 1.0
        classification_rep = classification_report(y_test, y_pred, zero_division=1)

        # Display the results for each k
        print(f"\nResults for k = {k}:")
        print("Accuracy:", accuracy)
        print("Confusion Matrix:")
        print(conf_matrix)
        print("Classification Report:")
        print(classification_rep)



Results for k = 3:
Accuracy: 0.9723454751831596
Confusion Matrix:
[[16243    62     0     0     0     0     0     0     0]
 [  338  2189    14     0     0     0     0     0     0]
 [    0    85   330     4     0     0     0     0     0]
 [    0     0    23    66     0     0     0     0     0]
 [    0     0     1     6    12     0     0     0     0]
 [    0     0     0     1     0     5     0     0     0]
 [    0     0     0     0     0     0     0     1     0]
 [    0     0     0     0     0     0     0     0     1]
 [    0     0     0     0     0     0     0     0     1]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     16305
           1       0.94      0.86      0.90      2541
           2       0.90      0.79      0.84       419
           3       0.86      0.74      0.80        89
           4       1.00      0.63      0.77        19
           5       1.00      0.83      0.91         6
           7     