# Imports

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report

# Data Loading

In [None]:
!unzip dataset.zip

Archive:  dataset.zip
  inflating: flight-delays-fall-2018/flight_delays_test.csv  
  inflating: flight-delays-fall-2018/flight_delays_train.csv  


In [None]:
# Directory the dataset archive is extracted into; both CSV files are read from here
PATH_TO_DATA = Path('flight-delays-fall-2018/')

In [None]:
# Labelled training data (includes the 'dep_delayed_15min' target column)
train_df = pd.read_csv(PATH_TO_DATA.joinpath('flight_delays_train.csv'))

In [None]:
# Unlabelled test data that the submission is predicted for
test_df = pd.read_csv(PATH_TO_DATA.joinpath('flight_delays_test.csv'))

# Data understanding

In [None]:
# Preview the first rows of the raw training data
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [None]:
# Count how many training rows fall into each target class (N = not delayed, Y = delayed)
class_counts = train_df.loc[:, 'dep_delayed_15min'].value_counts()

# Header and counts are printed on separate lines
print("Number of data per class:", class_counts, sep="\n")

Number of data per class:
dep_delayed_15min
N    80956
Y    19044
Name: count, dtype: int64


In [None]:
test_df.head()
# Idea: try some data cleaning here, e.g. DepTime should not contain any 25 (hour) values

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


# Feature Engineering

In [None]:
# Pull the target out of the training frame and encode it as 0 (N) / 1 (Y)
train_y = train_df.pop('dep_delayed_15min').map({'N': 0, 'Y': 1})

# Stack train on top of test so both receive identical preprocessing;
# train_split remembers where the training rows end
train_split = len(train_df)
full_df = pd.concat((train_df, test_df))

# DepTime // 100 is the hour and DepTime % 100 the minute.
# Hours 24 and 25 are wrapped around to 0 and 1.
dep_time = full_df['DepTime']
full_df['hour'] = (dep_time // 100).replace({24: 0, 25: 1})
full_df['minute'] = dep_time % 100

# One 0/1 indicator column per season, derived from the raw 'c-<month>' strings
season_months = {
    'summer': ['c-6', 'c-7', 'c-8'],
    'autumn': ['c-9', 'c-10', 'c-11'],
    'winter': ['c-12', 'c-1', 'c-2'],
    'spring': ['c-3', 'c-4', 'c-5'],
}
for season, months in season_months.items():
    full_df[season] = full_df['Month'].isin(months).astype(np.int32)

# Part of the day: the hour bucketed into four intervals (lowest edge included)
full_df['daytime'] = pd.cut(full_df['hour'], bins=[0, 6, 12, 18, 23], include_lowest=True)

# Flag departures before 17:00
full_df['DepHour<17'] = full_df['hour'].lt(17).astype(np.int32)

In [None]:
# 'c-<n>' strings -> zero-based int32 values (the part after the dash, minus one)
for col in ['Month', 'DayofMonth', 'DayOfWeek']:
    numeric_part = full_df[col].str.split('-').str[1]
    full_df[col] = numeric_part.astype(np.int32) - 1

# Label-encode the remaining non-numeric columns as integer codes
for col in ['Origin', 'Dest', 'UniqueCarrier', 'daytime']:
    full_df[col] = full_df[col].factorize()[0]

# Columns to be stored with the pandas 'category' dtype
cat_cols = ['Month', 'DayofMonth', 'DayOfWeek', 'Origin', 'Dest', 'UniqueCarrier', 'hour', 'summer', 'autumn', 'winter', 'spring', 'daytime']
full_df = full_df.astype({c: 'category' for c in cat_cols})

# Undo the concatenation: the first train_split rows are train, the rest are test
train_df = full_df.iloc[:train_split]
test_df = full_df.iloc[train_split:]
train_df.shape, train_y.shape, test_df.shape

((100000, 16), (100000,), (100000, 16))

In [None]:
# Display the processed training frame (pandas truncates the middle rows in the output)
train_df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,minute,summer,autumn,winter,spring,daytime,DepHour<17
0,7,20,6,1934,0,0,0,732,19,34,1,0,0,0,0,0
1,3,19,2,1548,1,1,1,834,15,48,0,0,0,1,1,1
2,8,1,4,1422,2,2,2,416,14,22,0,1,0,0,1,1
3,10,24,5,1015,3,3,3,872,10,15,0,1,0,0,2,1
4,9,6,5,1828,4,4,4,423,18,28,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,4,3,2,1618,3,28,143,199,16,18,0,0,0,1,1,1
99996,0,17,2,804,11,60,163,884,8,4,0,0,1,0,2,1
99997,0,23,1,1901,5,66,12,1076,19,1,0,0,1,0,0,0
99998,3,26,3,1515,10,12,226,140,15,15,0,0,0,1,1,1


In [None]:
# Positional indices of every column whose dtype is 'object' or 'category'
is_categorical = [dt == 'object' or dt == 'category' for dt in train_df.dtypes]
categ_feat_idx = np.flatnonzero(is_categorical)
categ_feat_idx

array([ 0,  1,  2,  4,  5,  6,  8, 10, 11, 12, 13, 14])

In [None]:
# Preview the first rows of the processed training features
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,minute,summer,autumn,winter,spring,daytime,DepHour<17
0,7,20,6,1934,0,0,0,732,19,34,1,0,0,0,0,0
1,3,19,2,1548,1,1,1,834,15,48,0,0,0,1,1,1
2,8,1,4,1422,2,2,2,416,14,22,0,1,0,0,1,1
3,10,24,5,1015,3,3,3,872,10,15,0,1,0,0,2,1
4,9,6,5,1828,4,4,4,423,18,28,0,1,0,0,1,0


# Handling imbalance

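**Why?** The target is imbalanced: about 81% of the training flights are not delayed (`N`) and only about 19% are delayed (`Y`).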
```
Number of data per class:
dep_delayed_15min
N    80956
Y    19044
Name: count, dtype: int64
```

# Training

In [None]:
# Hold out 30% of the labelled data for validation; the seed keeps the split reproducible
split_parts = train_test_split(train_df, train_y, test_size=0.3, random_state=17)
X_train_part, X_valid, y_train_part, y_valid = split_parts

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from imblearn.combine import SMOTEENN

# SMOTE oversampling followed by Edited-Nearest-Neighbours cleaning.
# random_state is fixed (same seed as the rest of the notebook) so the
# resampled set is identical on every run.
smote_enn = SMOTEENN(sampling_strategy='not majority', random_state=17)

# The resampler works on a purely numeric matrix, so every column is ordinal-encoded first.
# The encoder is fitted on the training part only.
encoder = OrdinalEncoder()
X_train_part_encoded = encoder.fit_transform(X_train_part)

# NOTE(review): X_train_balanced / y_train_balanced are only inspected in the
# class-count cell; the CatBoost model is not trained on them.
X_train_balanced, y_train_balanced = smote_enn.fit_resample(X_train_part_encoded, y_train_part)

In [None]:
# Alternative resampler (disabled): ADASYN oversampling of the minority class
# from sklearn.preprocessing import OrdinalEncoder
# from imblearn.over_sampling import ADASYN

# adasyn = ADASYN(sampling_strategy='minority')
# encoder = OrdinalEncoder()
# X_train_part_encoded = encoder.fit_transform(X_train_part)
# X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train_part_encoded, y_train_part)

In [None]:
# Report how many rows each class has after resampling
print("Number of data per class:")
for label, n_rows in zip(*np.unique(y_train_balanced, return_counts=True)):
    print(f"Class {label}: {n_rows}")

# For comparison, the counts in the full training set before any split or resampling:
# N    80956
# Y    19044

Number of data per class:
Class 0: 33115
Class 1: 46584


In [None]:
from catboost import CatBoostClassifier, Pool

# Gradient-boosted classifier optimised with Logloss; AUC is the metric
# reported on the evaluation set and used to pick the best iteration.
ctb = CatBoostClassifier(random_state=17,
                         silent=True,
                        #  learning_rate=0.01,
                         iterations=500,
                        #  bagging_temperature=1,
                        #  depth=6,
                         loss_function='Logloss',
                         eval_metric='AUC',
                         l2_leaf_reg=1.5
                        )

# Held-out validation rows, with the categorical columns marked by position
valid_set = Pool(data=X_valid,
                 label=y_valid,
                 cat_features=categ_feat_idx
                 )

# Fit on the training part only. X_valid/y_valid were split out of train_df,
# so fitting on the full train_df would let the model see the validation rows
# and make the early-stopping / best-model AUC optimistically biased.
ctb.fit(X_train_part, y_train_part,
        cat_features=categ_feat_idx,
        use_best_model=True,
        early_stopping_rounds=20,
        plot=True,
        eval_set=valid_set,
        verbose=True);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	test: 0.6509472	best: 0.6509472 (0)	total: 407ms	remaining: 3m 23s
1:	test: 0.6638556	best: 0.6638556 (1)	total: 582ms	remaining: 2m 24s
2:	test: 0.6704403	best: 0.6704403 (2)	total: 1.03s	remaining: 2m 50s
3:	test: 0.6762358	best: 0.6762358 (3)	total: 1.34s	remaining: 2m 46s
4:	test: 0.6800338	best: 0.6800338 (4)	total: 1.78s	remaining: 2m 56s
5:	test: 0.6799768	best: 0.6800338 (4)	total: 1.96s	remaining: 2m 41s
6:	test: 0.6844099	best: 0.6844099 (6)	total: 2.35s	remaining: 2m 45s
7:	test: 0.6880126	best: 0.6880126 (7)	total: 2.67s	remaining: 2m 44s
8:	test: 0.6898593	best: 0.6898593 (8)	total: 3.04s	remaining: 2m 46s
9:	test: 0.6914502	best: 0.6914502 (9)	total: 3.48s	remaining: 2m 50s
10:	test: 0.6926915	best: 0.6926915 (10)	total: 4.04s	remaining: 2m 59s
11:	test: 0.6929927	best: 0.6929927 (11)	total: 4.47s	remaining: 3m 1s
12:	test: 0.6936339	best: 0.6936339 (12)	total: 4.83s	remaining: 3m
13:	test: 0.6944818	best: 0.6944818 (13)	total: 5.33s	remaining: 3m 4s
14:	test: 0.695046

In [None]:
# predict_proba returns one column per class; column 1 is the probability of class 1 (delayed)
test_proba = ctb.predict_proba(test_df)
ctb_test_pred = test_proba[:, 1]
ctb_test_pred

# array([0.01751595, 0.04798026, 0.03079552, ..., 0.43106122, 0.06908342,
#        0.15010744])

# array([0.01354074, 0.0532012 , 0.02625807, ..., 0.50822915, 0.07478639,
#        0.15840951])

# array([0.01266802, 0.05563983, 0.02666962, ..., 0.52194224, 0.0763849 ,
#        0.16049282])

# array([0.02114111, 0.04496973, 0.03002092, ..., 0.36789419, 0.07159183,
#        0.1456582 ]) + learning rate 0.1

# array([0.01806567, 0.04811216, 0.02953581, ..., 0.41583744, 0.06877989,
#        0.14860559])

array([0.01751595, 0.04798026, 0.03079552, ..., 0.43106122, 0.06908342,
       0.15010744])

# Submit Result

In [None]:
# Build the submission frame: 'id' is the row position of each test prediction,
# 'dep_delayed_15min' the predicted probability of a delay
submission = (
    pd.Series(ctb_test_pred, name='dep_delayed_15min')
    .rename_axis('id')
    .reset_index()
)

# Write the file without the DataFrame index
submission.to_csv('submission.csv', index=False)

# Show the first few rows as a sanity check
print(submission.head())

   id  dep_delayed_15min
0   0           0.017516
1   1           0.047980
2   2           0.030796
3   3           0.344056
4   4           0.357655


In [None]:
# Sanity-check the submission: row count, column dtypes and absence of nulls
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 100000 non-null  int64  
 1   dep_delayed_15min  100000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB
