In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
import warnings

# Load the dataset
file_path = 'output_dataset.csv'  # Replace 'your_dataset.csv' with the actual file path
df = pd.read_csv(file_path)

# Drop non-numeric columns that are not needed for prediction
df = df.drop(['DayOfWeek', 'Date', 'UniqueCarrier', 'Airline', 'FlightNum', 'TailNum', 'Origin', 'Org_Airport', 'Dest', 'Dest_Airport', 'Cancelled', 'CancellationCode', 'Diverted'], axis=1)

# Convert categorical variables to numeric using Label Encoding
label_encoder = LabelEncoder()
df['CarrierDelay'] = label_encoder.fit_transform(df['CarrierDelay'])
df['WeatherDelay'] = label_encoder.fit_transform(df['WeatherDelay'])
df['NASDelay'] = label_encoder.fit_transform(df['NASDelay'])
df['SecurityDelay'] = label_encoder.fit_transform(df['SecurityDelay'])
df['LateAircraftDelay'] = label_encoder.fit_transform(df['LateAircraftDelay'])

# Separate features (X) and target variable (y)
X = df.drop('ArrDelay', axis=1)
y = df['ArrDelay']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Suppress ConvergenceWarnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Create and train the logistic regression model with increased max_iter
model = LogisticRegression(max_iter=8)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Re-enable warnings after fitting the model
warnings.resetwarnings()

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred, zero_division=1)

# Display the results for the first 8 classes
target_classes = [str(i) for i in range(8)]
print("Accuracy:", accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report (for the first 8 classes):")
report_lines = classification_rep.split('\n')
for class_label in target_classes:
    print(report_lines[int(class_label) + 2])


Accuracy: 0.033536270766690746

Confusion Matrix:
[[264 115  98 ...   0   0   0]
 [230  88 103 ...   0   0   0]
 [241  86  82 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]

Classification Report (for the first 8 classes):
          15       0.08      0.49      0.14       543
          16       0.09      0.18      0.12       488
          17       0.07      0.17      0.10       478
          18       0.06      0.06      0.06       481
          19       0.04      0.05      0.04       487
          20       0.03      0.05      0.04       451
          21       0.02      0.03      0.02       439
          22       0.00      0.00      1.00       407
