In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
import warnings

# Read the dataset CSV (path is relative to the notebook's working directory).
file_path = 'output_dataset.csv'  # adjust if the CSV lives elsewhere

# Columns excluded from the feature set before modelling.
columns_to_drop = [
    'DayOfWeek', 'Date', 'UniqueCarrier', 'Airline', 'FlightNum', 'TailNum',
    'Origin', 'Org_Airport', 'Dest', 'Dest_Airport',
    'Cancelled', 'CancellationCode', 'Diverted',
]
df = pd.read_csv(file_path).drop(columns=columns_to_drop)

# Re-code each delay-cause column as integer codes with a LabelEncoder.
# NOTE(review): scikit-learn documents LabelEncoder as a tool for target
# labels, not input features. If these columns already hold numeric values,
# this step replaces each value by its rank among the distinct values and the
# original spacing is lost -- confirm this is intended.
delay_columns = [
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
]
label_encoder = LabelEncoder()
for delay_column in delay_columns:
    # The single encoder is refit for every column, so after the loop it only
    # remembers the classes of the last column processed.
    df[delay_column] = label_encoder.fit_transform(df[delay_column])

# ArrDelay is the target; every remaining column is used as a feature.
y = df['ArrDelay']
X = df.drop(columns=['ArrDelay'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train a K-Nearest Neighbors classifier (default hyperparameters) and predict
# on the held-out rows.
#
# ConvergenceWarning is silenced only inside the `with` block:
# warnings.catch_warnings() snapshots the current filter list and restores it
# on exit. The previous warnings.resetwarnings() call did not "re-enable" one
# category -- it discarded every warning filter in the process, including the
# interpreter/kernel defaults.
#
# NOTE(review): the output recorded with this cell shows accuracy of about
# 0.047 and one class per distinct ArrDelay value (15, 16, 17, ...), which
# suggests the target is a numeric quantity. A regressor or a binned target
# may be what is wanted here -- confirm the modelling intent.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)

    # Create and train the K-Nearest Neighbors model
    model = KNeighborsClassifier()
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

# Evaluate the model on the held-out rows.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# zero_division=1 reports 1 instead of warning when a metric's denominator
# is zero.
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Display the results; only the first few per-class rows of the report are shown.
max_classes_shown = 8
print("Accuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report (for the first 8 classes):")
report_lines = classification_rep.split('\n')

# Layout of the text report: line 0 is the column header, line 1 is blank,
# per-class rows start at line 2 and end at the next blank line, after which
# the summary rows (accuracy / macro avg / weighted avg) follow.
# Collecting rows up to that blank line means that with fewer than
# `max_classes_shown` classes only real class rows are printed, instead of
# blank/summary rows or an IndexError from fixed line offsets.
class_rows = []
for report_row in report_lines[2:]:
    if not report_row.strip():
        break
    class_rows.append(report_row)

for report_row in class_rows[:max_classes_shown]:
    print(report_row)


Accuracy: 0.04695077907336704

Confusion Matrix:
[[178 108  84 ...   0   0   0]
 [154  96  78 ...   0   0   0]
 [155  97  83 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   1   0   0]
 [  0   0   0 ...   0   0   0]]

Classification Report (for the first 8 classes):
          15       0.13      0.33      0.19       543
          16       0.10      0.20      0.13       488
          17       0.09      0.17      0.12       478
          18       0.06      0.09      0.07       481
          19       0.06      0.08      0.07       487
          20       0.05      0.06      0.05       451
          21       0.07      0.07      0.07       439
          22       0.06      0.07      0.06       407
