In [18]:
import pandas as pd
from joblib import dump, load
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC

In [19]:
# Load the flights table that the delay models in this notebook are trained on.
flights_path = "../data/flights.parquet"
dataset = pd.read_parquet(flights_path)

### Departure Delay Model

In [20]:
# Preprocessing definition: standardise the three numeric date-part columns
# and one-hot encode the two IATA columns. Columns not listed here are dropped
# by the transformer's default `remainder` setting.
numeric_columns = ["flight_day", "flight_month", "flight_year"]
categorical_columns = ["airline_iata", "iata_arrival"]

se = StandardScaler()
# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
ohe = OneHotEncoder(handle_unknown="ignore")

column_transformer = make_column_transformer(
    (se, numeric_columns),
    (ohe, categorical_columns),
)

#### Logistic Regression

In [21]:
# Logistic-regression pipeline: preprocessing followed by the classifier.
#
# The preprocessing step is cloned so this pipeline owns its own unfitted
# ColumnTransformer. make_pipeline stores the object it is handed without
# copying it, so passing the shared `column_transformer` directly would let any
# other pipeline built from the same object refit this pipeline's
# preprocessing in place.
lr = LogisticRegression()
lr_pipeline = make_pipeline(clone(column_transformer), lr)

In [22]:
# Model inputs and the departure-delay target column.
features = [
    "airline_iata", "iata_arrival",
    "flight_day", "flight_month", "flight_year",
]
target = "delayed_departure"

X = dataset[features]
y = dataset[target]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

In [23]:
# Fit preprocessing and classifier together on the training split only.
lr_pipeline.fit(X=X_train, y=y_train)

In [24]:
# Score the fitted logistic-regression pipeline on the held-out split.
y_pred = lr_pipeline.predict(X_test)
# Last predict_proba column, i.e. the probability of the last class label.
delay_prob = lr_pipeline.predict_proba(X_test)[:, -1]

acc, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# NOTE: ":2f" is a minimum field width of 2, not a precision, so every value
# is printed with the default six decimal places.
print(f"Accuracy: {acc:2f}")
print(f"Precision: {precision:2f}")
print(f"Recall: {recall:2f}")
print(f"F1: {f1:2f}")

Accuracy: 0.833333
Precision: 0.833333
Recall: 0.365854
F1: 0.508475


#### SVC

In [25]:
# Support-vector classifier pipeline, trained on the same split as the
# logistic-regression model.
#
# probability=True makes SVC fit an additional cross-validated calibration
# whose data shuffling is random. Pinning random_state (same seed as the
# train/test split) makes predict_proba reproducible across runs.
svc = SVC(probability=True, random_state=21)
# Clone the preprocessing so this pipeline owns its own unfitted
# ColumnTransformer: make_pipeline keeps the object it is given, so using the
# shared `column_transformer` directly would refit, in place, the very same
# transformer instance held by every other pipeline built from it.
svc_pipeline = make_pipeline(clone(column_transformer), svc)
svc_pipeline.fit(X_train, y_train)

In [26]:
# Score the fitted SVC pipeline on the held-out split.
y_pred = svc_pipeline.predict(X_test)
# Last predict_proba column, i.e. the probability of the last class label.
delay_prob = svc_pipeline.predict_proba(X_test)[:, -1]

acc, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# NOTE: ":2f" is a minimum field width of 2, not a precision, so every value
# is printed with the default six decimal places.
print(f"Accuracy: {acc:2f}")
print(f"Precision: {precision:2f}")
print(f"Recall: {recall:2f}")
print(f"F1: {f1:2f}")

Accuracy: 0.793103
Precision: 0.727273
Recall: 0.195122
F1: 0.307692


#### Save Best

In [27]:
# Persist the logistic-regression pipeline as the departure model; it scored
# higher than the SVC on accuracy, precision, recall and F1 in the runs above.
departure_model_path = "../models/departure.joblib"
dump(lr_pipeline, departure_model_path)

['../models/departure.joblib']

### Arrival Delay Model

#### Logistic Regression

In [28]:
# Model inputs and the arrival-delay target column.
features = [
    "airline_iata", "iata_arrival",
    "flight_day", "flight_month", "flight_year",
]
target = "delayed_arrival"

X = dataset[features]
y = dataset[target]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

In [29]:
# Fresh logistic-regression pipeline for the arrival-delay target.
#
# The preprocessing step is cloned so this pipeline starts from an unfitted
# ColumnTransformer of its own. make_pipeline stores the object it is handed
# without copying it, so passing the shared `column_transformer` directly would
# tie this pipeline to a transformer instance that other pipelines built from
# the same object also fit in place.
lr = LogisticRegression()
lr_pipeline = make_pipeline(clone(column_transformer), lr)

In [30]:
# Fit preprocessing and classifier together on the training split only.
lr_pipeline.fit(X=X_train, y=y_train)

In [31]:
# Score the fitted logistic-regression pipeline on the held-out split.
y_pred = lr_pipeline.predict(X_test)
# Last predict_proba column, i.e. the probability of the last class label.
delay_prob = lr_pipeline.predict_proba(X_test)[:, -1]

acc, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# NOTE: ":2f" is a minimum field width of 2, not a precision, so every value
# is printed with the default six decimal places.
print(f"Accuracy: {acc:2f}")
print(f"Precision: {precision:2f}")
print(f"Recall: {recall:2f}")
print(f"F1: {f1:2f}")

Accuracy: 0.804598
Precision: 0.817647
Recall: 0.978873
F1: 0.891026


#### SVC

In [32]:
# Support-vector classifier pipeline for the arrival-delay target.
#
# probability=True makes SVC fit an additional cross-validated calibration
# whose data shuffling is random. Pinning random_state (same seed as the
# train/test split) makes predict_proba, and therefore the model saved from
# this pipeline, reproducible across runs.
svc = SVC(probability=True, random_state=21)
# Clone the preprocessing so this pipeline owns its own unfitted
# ColumnTransformer: make_pipeline keeps the object it is given, so using the
# shared `column_transformer` directly would refit, in place, the very same
# transformer instance held by every other pipeline built from it.
svc_pipeline = make_pipeline(clone(column_transformer), svc)
svc_pipeline.fit(X_train, y_train)

In [33]:
# Score the fitted SVC pipeline on the held-out split.
y_pred = svc_pipeline.predict(X_test)
# Last predict_proba column, i.e. the probability of the last class label.
delay_prob = svc_pipeline.predict_proba(X_test)[:, -1]

acc, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# NOTE: ":2f" is a minimum field width of 2, not a precision, so every value
# is printed with the default six decimal places.
print(f"Accuracy: {acc:2f}")
print(f"Precision: {precision:2f}")
print(f"Recall: {recall:2f}")
print(f"F1: {f1:2f}")

Accuracy: 0.816092
Precision: 0.819767
Recall: 0.992958
F1: 0.898089


#### Save Best

In [34]:
# Persist the SVC pipeline as the arrival model; it scored slightly higher
# than logistic regression on accuracy and F1 in the runs above.
arrival_model_path = "../models/arrival.joblib"
dump(svc_pipeline, arrival_model_path)

['../models/arrival.joblib']