In [18]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder


In [19]:
df = pd.read_csv('E:/Capstone_ICR_Project/Dataset/State_wise_Data_final/Illinois_10_years_data.csv')

In [20]:
df.head()

Unnamed: 0,Year,Quarter,Month,Day_of_Month,Day_of_Week,Operating_Carrier_Code,Tail_Number,Origin_Airport_ID,Origin_Airport_Code,Origin_State_Name,...,Wind_Direction_Degrees,Wind_Speed_Knots,Hourly_Precipitation_Inches,Pressure_Altimeter_Inches,Sea_Level_Pressure_Millibar,Visibility_Miles,Sky_Cover_Level_1,Sky_Level_1_Altitude_Feet,Apparent_Temperature_Fahrenheit,Target
0,2014,2,4,1.0,2.0,AA,N359AA,13930.0,ORD,Illinois,...,260.0,15.0,0.0,30.0,1014.2,10.0,SCT,22000.0,40.28,0.0
1,2014,2,4,1.0,2.0,AA,N3ADAA,13930.0,ORD,Illinois,...,260.0,24.0,0.0,30.0,1008.9,10.0,SCT,2800.0,20.44,0.0
2,2014,2,4,1.0,2.0,AA,N3AKAA,13930.0,ORD,Illinois,...,220.0,15.0,0.0,30.0,1004.2,10.0,SCT,4300.0,55.04,0.0
3,2014,2,4,1.0,2.0,AA,N3AKAA,13930.0,ORD,Illinois,...,250.0,20.0,0.0,30.0,1011.5,10.0,OVC,2600.0,21.74,0.0
4,2014,2,4,1.0,2.0,AA,N3ANAA,13930.0,ORD,Illinois,...,220.0,15.0,0.0,30.0,1004.2,10.0,CLR,4700.0,55.26,0.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2969937 entries, 0 to 2969936
Data columns (total 31 columns):
 #   Column                            Dtype  
---  ------                            -----  
 0   Year                              int64  
 1   Quarter                           int64  
 2   Month                             int64  
 3   Day_of_Month                      float64
 4   Day_of_Week                       float64
 5   Operating_Carrier_Code            object 
 6   Tail_Number                       object 
 7   Origin_Airport_ID                 float64
 8   Origin_Airport_Code               object 
 9   Origin_State_Name                 object 
 10  Destination_Airport_Code          object 
 11  Destination_State_Name            object 
 12  Scheduled_Departure_Time          float64
 13  Departure_Delay_Minutes           float64
 14  Taxi_Out_Time_Minutes             float64
 15  Flight_Distance_Miles             float64
 16  Departure_Datetime                ob

In [22]:
df.isna().sum()

Year                                0
Quarter                             0
Month                               0
Day_of_Month                        0
Day_of_Week                         0
Operating_Carrier_Code              0
Tail_Number                         0
Origin_Airport_ID                   0
Origin_Airport_Code                 0
Origin_State_Name                   0
Destination_Airport_Code            0
Destination_State_Name              0
Scheduled_Departure_Time            0
Departure_Delay_Minutes             0
Taxi_Out_Time_Minutes               0
Flight_Distance_Miles               0
Departure_Datetime                  0
Scheduled_Departure_Time_Minutes    0
Air_Temperature_Fahrenheit          0
Dew_Point_Temperature_Fahrenheit    0
Relative_Humidity_Percent           0
Wind_Direction_Degrees              0
Wind_Speed_Knots                    0
Hourly_Precipitation_Inches         0
Pressure_Altimeter_Inches           0
Sea_Level_Pressure_Millibar         0
Visibility_M

In [23]:
df.dropna(inplace= True)

In [24]:
# Convert integer features to int32
int_columns = [
    'Year', 
    'Quarter', 
    'Month', 
    'Day_of_Month', 
    'Day_of_Week', 
    'Scheduled_Departure_Time', 
    'Scheduled_Departure_Time_Minutes',
    'Target'
]

for col in int_columns:
    df[col] = df[col].astype(np.int32)

# Convert continuous numeric features to float32
float_columns = [
    'Departure_Delay_Minutes',
    'Taxi_Out_Time_Minutes',
    'Flight_Distance_Miles',
    'Air_Temperature_Fahrenheit',
    'Dew_Point_Temperature_Fahrenheit',
    'Relative_Humidity_Percent',
    'Wind_Direction_Degrees',
    'Wind_Speed_Knots',
    'Hourly_Precipitation_Inches',
    'Pressure_Altimeter_Inches',
    'Sea_Level_Pressure_Millibar',
    'Visibility_Miles',
    'Sky_Level_1_Altitude_Feet',
    'Apparent_Temperature_Fahrenheit',
    
]

for col in float_columns:
    df[col] = df[col].astype(np.float32)

# Convert categorical features to 'category' dtype
categorical_columns = [
    'Operating_Carrier_Code',
    'Tail_Number',
    'Origin_Airport_ID',
    'Origin_Airport_Code',
    'Origin_State_Name',
    'Destination_Airport_Code',
    'Destination_State_Name',
    'Sky_Cover_Level_1'
]

for col in categorical_columns:
    df[col] = df[col].astype('category')

In [25]:
df = df.drop([ 'Origin_State_Name','Departure_Datetime','Departure_Delay_Minutes'], axis=1)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2969937 entries, 0 to 2969936
Data columns (total 28 columns):
 #   Column                            Dtype   
---  ------                            -----   
 0   Year                              int32   
 1   Quarter                           int32   
 2   Month                             int32   
 3   Day_of_Month                      int32   
 4   Day_of_Week                       int32   
 5   Operating_Carrier_Code            category
 6   Tail_Number                       category
 7   Origin_Airport_ID                 category
 8   Origin_Airport_Code               category
 9   Destination_Airport_Code          category
 10  Destination_State_Name            category
 11  Scheduled_Departure_Time          int32   
 12  Taxi_Out_Time_Minutes             float32 
 13  Flight_Distance_Miles             float32 
 14  Scheduled_Departure_Time_Minutes  int32   
 15  Air_Temperature_Fahrenheit        float32 
 16  Dew_Point_Temperat

In [27]:
df['Year'].value_counts()

Year
2019    426951
2020    420073
2015    395345
2014    362592
2017    301209
2023    300540
2016    292813
2021    254563
2018    166021
2024     25193
2022     24637
Name: count, dtype: int64

In [28]:
df1= df[df['Month'] == 1]

In [30]:
train_df['Year'].value_counts()

Year
2018    32116
2019    30290
2015    30099
2017    25477
2016    25412
2014    15129
Name: count, dtype: int64

In [None]:
df1['Year'] = df1['Year'].astype(int)

train_years = [2014, 2015, 2016, 2017, 2018, 2019]
val_years = [2020, 2021, 2022]
test_years = [2023, 2024]


In [35]:
# Create the training set
train_df = df1[df1['Year'].isin(train_years)].reset_index(drop=True)

# Create the validation set
val_df = df1[df1['Year'].isin(val_years)].reset_index(drop=True)

# Create the test set
test_df = df1[df1['Year'].isin(test_years)].reset_index(drop=True)

# Separate features and target
X_train = train_df.drop('Target', axis=1)
y_train = train_df['Target']

X_val = val_df.drop('Target', axis=1)
y_val = val_df['Target']

X_test = test_df.drop('Target', axis=1)
y_test = test_df['Target']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Year'] = df1['Year'].astype(int)


In [36]:
cat_features = X_train.select_dtypes(include=['category']).columns.tolist()

for cat_col in cat_features:
    X_train[cat_col] = X_train[cat_col].astype(str)
    X_val[cat_col] = X_val[cat_col].astype(str)
    X_test[cat_col] = X_test[cat_col].astype(str)


In [45]:
cat_features

['Operating_Carrier_Code',
 'Tail_Number',
 'Origin_Airport_ID',
 'Origin_Airport_Code',
 'Destination_Airport_Code',
 'Destination_State_Name',
 'Sky_Cover_Level_1']

In [37]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(zip(np.unique(y_train), class_weights))

print(f"Class Weights: {class_weights}")



Class Weights: {0: 0.6878547253319448, 1: 1.8308156052941584}


In [38]:
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    eval_metric='F1',
    random_seed=42,
    use_best_model=True,
    class_weights=[class_weights[0], class_weights[1]],
    early_stopping_rounds=100,
    l2_leaf_reg=3,
    border_count=254,
    depth=6
)

model.fit(
    train_pool,
    eval_set=val_pool,
    verbose=100
)

0:	learn: 0.6210115	test: 0.4449313	best: 0.4449313 (0)	total: 354ms	remaining: 5m 53s
100:	learn: 0.6631658	test: 0.4559437	best: 0.4736184 (2)	total: 31.7s	remaining: 4m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.4736184048
bestIteration = 2

Shrink model to first 3 iterations.


<catboost.core.CatBoostClassifier at 0x235556dd350>

In [39]:
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

y_pred = model.predict(test_pool)
y_pred_proba = model.predict_proba(test_pool)[:, 1]

# Evaluate the model
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {test_accuracy}\n')

print('Classification Report:')
print(classification_report(y_test, y_pred))

print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))



Test Accuracy: 0.6501409121581392

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.70      0.74     18140
           1       0.40      0.51      0.45      7053

    accuracy                           0.65     25193
   macro avg       0.59      0.61      0.60     25193
weighted avg       0.68      0.65      0.66     25193

Confusion Matrix:
[[12768  5372]
 [ 3442  3611]]


In [42]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Prepare data for cross-validation
X_cv = pd.concat([X_train, X_val], axis=0)
y_cv = pd.concat([y_train, y_val], axis=0)

for cat_col in cat_features:
    X_cv[cat_col] = X_cv[cat_col].astype(str)

# Initialize Stratified K-Fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model_cv = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    eval_metric='F1',
    random_seed=42,
    use_best_model=False,
    class_weights=[class_weights[0], class_weights[1]],
    l2_leaf_reg=3,
    border_count=254,
    depth=6
)

cv_scores = []

for train_index, test_index in skf.split(X_cv, y_cv):
    X_fold_train, X_fold_val = X_cv.iloc[train_index], X_cv.iloc[test_index]
    y_fold_train, y_fold_val = y_cv.iloc[train_index], y_cv.iloc[test_index]
    
    train_pool_cv = Pool(data=X_fold_train, label=y_fold_train, cat_features=cat_features)
    val_pool_cv = Pool(data=X_fold_val, label=y_fold_val, cat_features=cat_features)
    
    model_cv.fit(
        train_pool_cv,
        eval_set=val_pool_cv,
        verbose=100,
        early_stopping_rounds=100
    )
    
    y_fold_pred = model_cv.predict(val_pool_cv)
    fold_f1 = classification_report(y_fold_val, y_fold_pred, output_dict=True)['1']['f1-score']
    cv_scores.append(fold_f1)
    print(f'Fold F1-Score: {fold_f1}\n')

print(f'Mean F1-Score from Cross-Validation: {np.mean(cv_scores)}')



0:	learn: 0.5740489	test: 0.5798735	best: 0.5798735 (0)	total: 299ms	remaining: 4m 58s
100:	learn: 0.6122586	test: 0.6171229	best: 0.6171229 (100)	total: 31.5s	remaining: 4m 40s
200:	learn: 0.6244373	test: 0.6301543	best: 0.6302337 (195)	total: 1m 6s	remaining: 4m 25s
300:	learn: 0.6320269	test: 0.6365470	best: 0.6369849 (278)	total: 1m 40s	remaining: 3m 53s
400:	learn: 0.6390320	test: 0.6440725	best: 0.6440725 (400)	total: 2m 14s	remaining: 3m 20s
500:	learn: 0.6435561	test: 0.6476493	best: 0.6481933 (495)	total: 2m 49s	remaining: 2m 48s
600:	learn: 0.6475939	test: 0.6504881	best: 0.6509319 (573)	total: 3m 24s	remaining: 2m 16s
700:	learn: 0.6505656	test: 0.6536980	best: 0.6536980 (700)	total: 3m 58s	remaining: 1m 41s
800:	learn: 0.6537453	test: 0.6568823	best: 0.6570540 (793)	total: 4m 32s	remaining: 1m 7s
900:	learn: 0.6566541	test: 0.6587277	best: 0.6588743 (890)	total: 5m 6s	remaining: 33.7s
999:	learn: 0.6592877	test: 0.6591633	best: 0.6601312 (946)	total: 5m 40s	remaining: 0us

