In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.exceptions import ConvergenceWarning
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv("./output_dataset.csv")

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [None]:
# Report the dimensions of the DataFrame and list every column name in the dataset:
rows = len(df.index)
cols = len(df.columns)
print(f"SHAPE BREAKDOWN\n{rows} rows and {cols} columns")

print("\nCOLUMN BREAKDOWN")
print(df.columns)

In [None]:
# Report, column by column, how many entries are exactly zero
for column_name, column in df.items():
    count = column.eq(0).sum()
    print("Total Zero in ",column_name," is: ",count)

<h2> Data Cleaning </h2>

In [None]:
# Display the number of distinct values in each column (nunique returns counts,
# not the values themselves):
print("Unique Value in Each Columns")
df.nunique()

In [None]:
# Summarise Dataset:
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.describe())

In [None]:
# Histogram (30 bins, KDE overlay) of the target variable 'ArrDelay'
plt.figure(figsize=(10, 6))
sns.histplot(df['ArrDelay'], bins=30, kde=True).set(
    title='Distribution of Arrival Delays (After Data Cleaning)',
    xlabel='Arrival Delay (minutes)',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Correlation matrix of the numeric columns.
# The frame still contains the columns that a later cell drops as non-numeric
# (e.g. 'Date', 'UniqueCarrier', 'Origin'); since pandas 2.0 DataFrame.corr() no
# longer skips such columns silently and raises instead, so restrict the input to
# numeric dtypes explicitly (same selector the histogram cell uses).
correlation_after_cleaning = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))
# fmt=".0%" renders each coefficient as a whole-number percentage.
sns.heatmap(correlation_after_cleaning, annot=True, cmap='coolwarm', fmt=".0%")
plt.title('Correlation Matrix After Data Cleaning')
plt.show()

In [None]:
# Plot histograms of the DataFrame's columns in a single figure (figsize 20x20).
df.hist(figsize=(20, 20))
plt.show()

In [None]:
# Pick the first 10 numeric columns
numerical_columns = df.select_dtypes(include='number').columns[:10]

# Build a 5x2 grid of axes, one slot per selected column
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(20, 20))
fig.suptitle('Histograms for Numerical Columns', fontsize=16)

# Fill the grid row by row: flat position i is the same cell as axes[i // 2, i % 2]
for i, column in enumerate(numerical_columns):
    sns.histplot(df[column], ax=axes.flat[i], kde=True).set_title(column)

# Tighten the layout inside the given rectangle, then render
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Remove the columns that are not needed for prediction
df = df.drop(
    columns=[
        'DayOfWeek',
        'Date',
        'UniqueCarrier',
        'Airline',
        'FlightNum',
        'TailNum',
        'Origin',
        'Org_Airport',
        'Dest',
        'Dest_Airport',
        'Cancelled',
        'CancellationCode',
        'Diverted',
    ]
)

<h2> Implementing Models </h2>

In [None]:
# Define range-specific encoding function with 10 partitions
def range_encoding(value):
    """Map a numeric delay value to the label of its 100-wide bucket.

    Buckets are contiguous: everything up to and including 100 is '0-100',
    anything above 100 and up to 200 is '101-200', and so on. Using only
    upper bounds (instead of closed integer intervals such as 101..200)
    means fractional values like 100.5 land in the next bucket rather than
    falling through every branch.

    Args:
        value: A number (int or float) to bucket.

    Returns:
        One of '0-100', '101-200', ..., '801-900', '901-1000'.
        Values above 900 - including those above 1000 - and values that
        compare False against every bound (e.g. NaN) get '901-1000'.
    """
    # Upper bounds 100, 200, ..., 900; the first bound that holds picks the bucket.
    for upper in range(100, 1000, 100):
        if value <= upper:
            # The first bucket is labelled from 0, the others from upper - 99.
            return '0-100' if upper == 100 else f'{upper - 99}-{upper}'
    return '901-1000'

# Bucket 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay'
# and 'ArrDelay' with range_encoding, then replace each bucket label with an integer
# code. The single LabelEncoder is re-fitted per column, so after the loop it only
# remembers the classes of the last column ('ArrDelay').
label_encoder = LabelEncoder()
for col in ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'ArrDelay']:
    df[col] = label_encoder.fit_transform(df[col].apply(range_encoding))

# Target is the encoded 'ArrDelay'; every other remaining column is a feature
y = df['ArrDelay']
X = df.drop(columns='ArrDelay')

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training features only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Silence scikit-learn ConvergenceWarning messages
warnings.filterwarnings("ignore", category=ConvergenceWarning)

<h2> K-Nearest Neighbour </h2>

In [None]:
# Train and score a KNN classifier for every candidate neighbour count
for k in [10]: #can use [3 and 5] too
    # Fit on the training split, then predict the held-out split
    model = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred, zero_division=1)

    # Report the metrics for this k
    print(f"\nResults for k = {k}:")
    print("Accuracy:", accuracy)
    print("Confusion Matrix:", conf_matrix, sep="\n")
    print("Classification Report:", classification_rep, sep="\n")

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()

<h2> Decision Tree </h2>

In [None]:
# Create and train the Decision Tree model.
# random_state is pinned (same seed as the train/test split) so the fitted tree and
# the metrics below are reproducible across runs; without it the tree's internal
# randomness can change the result on every execution.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# zero_division=1 reports 1 for metrics whose denominator is zero
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Display the results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

<h2> Naive Bayes </h2>

In [None]:
# Fit Gaussian Naive Bayes on the training split and predict the held-out split
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Report the metrics
print("\nNaive Bayes Results:")
print("Accuracy:", accuracy)
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()


<h2> SVM </h2>

In [None]:
# Train and score an SVM for every (kernel, C) combination
kernels = ['linear', 'poly', 'rbf']
C_values = [20]

for kernel in kernels:
    for C in C_values:
        # Fit on the training split, then predict the held-out split
        model = SVC(kernel=kernel, C=C).fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Score the predictions
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred, zero_division=1)

        # Report the metrics for this configuration
        print(f"\nSVM with Kernel={kernel} and C={C} Results:")
        print("Accuracy:", accuracy)
        print("\nConfusion Matrix:", conf_matrix, sep="\n")
        print("\nClassification Report:", classification_rep, sep="\n")

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()

<h2> Random Forest </h2>

In [None]:
# Create and train the Random Forest model.
# random_state is pinned (same seed as the train/test split) so the bootstrap
# samples and feature subsets - and therefore the metrics below - are reproducible
# across runs instead of changing on every execution.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# zero_division=1 reports 1 for metrics whose denominator is zero
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Display the results
print("\nRandom Forest Results:")
print("Accuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()

<h2> Linear Regression </h2>

In [None]:
# Fit ordinary least squares on the training split and predict the held-out split.
# Note: the target here is the integer bucket code of 'ArrDelay', so the regression
# metrics below are measured on those codes.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report the metrics
print("\nLinear Regression Results:")
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()


<h2> XGBoost </h2>

In [None]:
# Fit an XGBoost classifier on the training split and predict the held-out split
model = XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Report the metrics
print("\nXGBoost Results:")
print("Accuracy:", accuracy)
print("\nConfusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()

<h2> Artificial Neural Network </h2>

In [None]:
# Create and train the ANN model.
# The target is the label-encoded 'ArrDelay' bucket, i.e. an integer class code that
# can take more than two values. A single sigmoid unit trained with binary
# cross-entropy and thresholded at 0.5 can only ever output class 0 or 1, so the
# network instead ends in one softmax unit per class and is trained with sparse
# categorical cross-entropy (which accepts integer class codes directly).
num_classes = int(y_train.max()) + 1  # codes start at 0, so max code + 1 units

model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

# Make predictions on the test set: pick the class with the highest probability
y_pred = model.predict(X_test).argmax(axis=1)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
# zero_division=1 reports 1 for metrics whose denominator is zero
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Display the results
print("\nANN Results:")
print("Accuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(classification_rep)

# Reset the warnings filter list, undoing any earlier filterwarnings() call
warnings.resetwarnings()