In [1]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')

data=pd.read_csv(r'D:\UMBC\602\Project\2006.csv')
data.shape

(7141922, 29)

In [3]:
#dropping columns which are unnecessary 
data.drop(columns=['Year','FlightNum','TailNum','ArrDelay',
       'DepDelay','TaxiIn', 'TaxiOut','Diverted',
       'AirTime','CancellationCode', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay','DepTime','ArrTime','ActualElapsedTime','CRSElapsedTime'],inplace=True)

In [4]:
# copying a dataframe into other dataframe
df=data.copy()
df.shape

(7141922, 10)

In [5]:
# Columns remaining after dropping the columns which are not necessary
df.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
       'UniqueCarrier', 'Origin', 'Dest', 'Distance', 'Cancelled'],
      dtype='object')

In [6]:
# check the null values in the dataframe
df.isna().sum()

Month            0
DayofMonth       0
DayOfWeek        0
CRSDepTime       0
CRSArrTime       0
UniqueCarrier    0
Origin           0
Dest             0
Distance         0
Cancelled        0
dtype: int64

In [7]:
# Count number of records for the target class
df.Cancelled.value_counts()

0    7019988
1     121934
Name: Cancelled, dtype: int64

- Since the class is unbalanced, we try to downsample the majority class, but before that since there are large number of dataset, we randomly sample the data of each carrier for each month. 

In [9]:
#ignore
random_records = pd.DataFrame()

for carrier in df['UniqueCarrier'].unique():
    carrier_data = df[df['UniqueCarrier'] == carrier]
    grouped_data = carrier_data.groupby('Month')
    
    for month, group in grouped_data:
        if len(group) >= 800:
            sampled_group = group.sample(n=800)
        else:
            sampled_group = group
        random_records = pd.concat([random_records, sampled_group])

In [10]:
# Converting the categorical values into numerical values using OneHotEncoding

categorical_features = ['UniqueCarrier','Origin', 'Dest']
df_encoded = pd.get_dummies(random_records, columns=categorical_features, dtype='int')
df_encoded.shape

(189600, 599)

In [11]:
# top 5 records
df_encoded.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,CRSDepTime,CRSArrTime,Distance,Cancelled,UniqueCarrier_AA,UniqueCarrier_AQ,UniqueCarrier_AS,...,Dest_TXK,Dest_TYR,Dest_TYS,Dest_VLD,Dest_VPS,Dest_WRG,Dest_WYS,Dest_XNA,Dest_YAK,Dest_YUM
19560,1,24,2,1614,2030,1276,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
281599,1,4,3,855,1108,2136,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4000,1,13,5,1145,1314,336,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29206,1,31,2,1955,2200,762,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27419,1,30,1,1940,2056,329,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Value counts of the target class
df_encoded.Cancelled.value_counts()

0    186508
1      3092
Name: Cancelled, dtype: int64

In [13]:
# Downsampling the majority class

from sklearn.utils import resample

minority_class = df_encoded[df_encoded["Cancelled"] == 1]
majority_class = df_encoded[df_encoded["Cancelled"] == 0]

# Calculate the desired sample size
desired_sample_size = len(minority_class)

# Downsample the majority class
downsampled_majority = resample(majority_class,
                                replace=False,  # Set to True if the sample size is larger than the majority class
                                n_samples=desired_sample_size,
                                random_state=42)

# Combine the minority class with the downsampled majority class
balanced_df = pd.concat([minority_class, downsampled_majority])

# Shuffling the balanced dataset
balanced_df = balanced_df.sample(frac=1, random_state=42)

print(balanced_df["Cancelled"].value_counts())


0    3092
1    3092
Name: Cancelled, dtype: int64


In [14]:
# Train-test-split the data for modelling

from sklearn.model_selection import train_test_split
target_col = 'Cancelled'
y = balanced_df[target_col]

# Drop the 'DepDelay' column
X = balanced_df.drop('Cancelled', axis=1)
X.shape

(6184, 598)

In [15]:
# PCA to reduce the dimentions 

from sklearn.decomposition import PCA

pca = PCA(n_components=10)
X_pca = pca.fit_transform(X)
# Shape of data frame after applying PCA
print(X_pca.shape)
print("Total number of components after applying PCA",len(pca.components_))

(6184, 10)
Total number of components after applying PCA 10


In [16]:

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# smote = SMOTE(random_state=42)
# X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# print(X_train_resampled.shape)
print(X_pca.shape)
print(y.shape)

(6184, 10)
(6184,)


In [63]:

y_train_resampled.value_counts()

0    2462
1    2462
Name: Cancelled, dtype: int64

In [64]:
# Random Forest Classifier 

rf_classifier = RandomForestClassifier()

rf_classifier.fit(X_train_resampled, y_train_resampled)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6682926829268293


In [65]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.66      0.67      0.67       612
           1       0.67      0.66      0.67       618

    accuracy                           0.67      1230
   macro avg       0.67      0.67      0.67      1230
weighted avg       0.67      0.67      0.67      1230



In [66]:
# Logistic Regression 

from sklearn.linear_model import LogisticRegression

lr_classifier = LogisticRegression()

lr_classifier.fit(X_train_resampled, y_train_resampled)

y_pred_lr = lr_classifier.predict(X_test)

accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy_lr)

print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.6073170731707317
              precision    recall  f1-score   support

           0       0.61      0.59      0.60       612
           1       0.61      0.63      0.62       618

    accuracy                           0.61      1230
   macro avg       0.61      0.61      0.61      1230
weighted avg       0.61      0.61      0.61      1230



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4918, 10)
(1230, 10)
(4918,)
(1230,)


In [19]:
# XG Boost Classifier 

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

# Define the parameter grid for hyperparameter tuning
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.1, 0.01],
    "max_depth": [3, 4],
}

# Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))




Accuracy: 65.80%


In [20]:
# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.70      0.66       596
           1       0.69      0.62      0.65       641

    accuracy                           0.66      1237
   macro avg       0.66      0.66      0.66      1237
weighted avg       0.66      0.66      0.66      1237



In [17]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Initialize the individual classifiers
log_reg = LogisticRegression()
random_forest = RandomForestClassifier()
xgb_classifier = XGBClassifier()

# Train the individual classifiers
log_reg.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
log_reg_pred = log_reg.predict(X_test)
random_forest_pred = random_forest.predict(X_test)
xgb_pred = xgb_classifier.predict(X_test)

# Combine predictions using majority voting
ensemble_pred = []
for i in range(len(X_test)):
    predictions = [log_reg_pred[i], random_forest_pred[i], xgb_pred[i]]
    ensemble_pred.append(max(set(predictions), key=predictions.count))

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_classification_report = classification_report(y_test, ensemble_pred)

# Print the ensemble model's performance
print("Ensemble Accuracy: {:.2f}%".format(ensemble_accuracy * 100))
print("Ensemble Classification Report:")
print(ensemble_classification_report)


Ensemble Accuracy: 66.53%
Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.70      0.67       596
           1       0.69      0.63      0.66       641

    accuracy                           0.67      1237
   macro avg       0.67      0.67      0.67      1237
weighted avg       0.67      0.67      0.67      1237



In [21]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report


# Define the parameter grids for hyperparameter tuning
log_reg_param_grid = {
    "C": [0.1, 1, 10],
    "penalty": ["l1", "l2"]
}

rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 5, 10]
}

xgb_param_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.1, 0.01, 0.001],
    "max_depth": [3, 4, 5]
}

# Initialize the individual classifiers
log_reg = LogisticRegression()
random_forest = RandomForestClassifier()
xgb_classifier = XGBClassifier()

# Perform grid search with cross-validation for each classifier
log_reg_grid_search = GridSearchCV(estimator=log_reg, param_grid=log_reg_param_grid, cv=5)
log_reg_grid_search.fit(X_train, y_train)

random_forest_grid_search = GridSearchCV(estimator=random_forest, param_grid=rf_param_grid, cv=5)
random_forest_grid_search.fit(X_train, y_train)

xgb_grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=xgb_param_grid, cv=5)
xgb_grid_search.fit(X_train, y_train)

# Get the best models for each classifier
best_log_reg = log_reg_grid_search.best_estimator_
best_random_forest = random_forest_grid_search.best_estimator_
best_xgb_classifier = xgb_grid_search.best_estimator_

# Make predictions on the test set for each classifier
log_reg_pred = best_log_reg.predict(X_test)
random_forest_pred = best_random_forest.predict(X_test)
xgb_pred = best_xgb_classifier.predict(X_test)

# Combine predictions using majority voting
ensemble_pred = []
for i in range(len(X_test)):
    predictions = [log_reg_pred[i], random_forest_pred[i], xgb_pred[i]]
    ensemble_pred.append(max(set(predictions), key=predictions.count))

# Evaluate the ensemble model
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)
ensemble_classification_report = classification_report(y_test, ensemble_pred)

# Print the ensemble model's performance
print("Ensemble Accuracy: {:.2f}%".format(ensemble_accuracy * 100))
print("Ensemble Classification Report:")
print(ensemble_classification_report)












Ensemble Accuracy: 66.29%
Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.69      0.67       596
           1       0.69      0.63      0.66       641

    accuracy                           0.66      1237
   macro avg       0.66      0.66      0.66      1237
weighted avg       0.67      0.66      0.66      1237



- The accuracy remains the same for ensemble modeling even after hyperparameter tuning and 5 Fold Cross validation. 