In [10]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import joblib
import numpy as np
from tqdm import tqdm

DATA_FILE = "/kaggle/input/flight-delay-dataset-20182022/raw/Flights_2018_10.csv"

print("Reading file:", DATA_FILE)
df = pd.read_csv(DATA_FILE)

df = df[['CRSDepTime', 'DayOfWeek', 'Distance',
         'OriginAirportID', 'DestAirportID', 'DepDel15']]

# Убираем строки с пропусками в целевой
df = df[df['DepDel15'].notnull()]

df['CRSDepHour'] = df['CRSDepTime'] // 100
df.drop('CRSDepTime', axis=1, inplace=True)

df['DayOfWeek'] = df['DayOfWeek'].astype(str)
df['OriginAirportID'] = df['OriginAirportID'].astype(str)
df['DestAirportID'] = df['DestAirportID'].astype(str)

# Целевая переменная
df['DepDel15'] = df['DepDel15'].astype(int)

# Разделяем на train и поток для имитации реальных данных
df_train, df_stream = train_test_split(df, test_size=0.2, random_state=42, stratify=df['DepDel15'])

OUTPUT_DIR = "/kaggle/working/processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)
df_train.to_parquet(os.path.join(OUTPUT_DIR, "flight_train.parquet"), index=False)
df_stream.to_parquet(os.path.join(OUTPUT_DIR, "flight_stream.parquet"), index=False)

print("Preprocessing finished!")
print(f"Train samples: {len(df_train)}, Stream samples: {len(df_stream)}")
print("Sample data:")
print(df.head())

Reading file: /kaggle/input/flight-delay-dataset-20182022/raw/Flights_2018_10.csv


  df = pd.read_csv(DATA_FILE)


Preprocessing finished!
Train samples: 532115, Stream samples: 133029
Sample data:
  DayOfWeek  Distance OriginAirportID DestAirportID  DepDel15  CRSDepHour
0         5     834.0           12892         14057         0          23
1         5     937.0           14747         10800         0           8
2         5     859.0           14057         14908         0          12
3         5    1107.0           14107         14747         0          13
4         5     987.0           14747         14262         0           9


In [11]:
TRAIN_FILE = "/kaggle/working/processed/flight_train.parquet"
OUTPUT_DIR = "/kaggle/working/model"
BATCH_SIZE = 1024
EPOCHS = 10
LR = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs(OUTPUT_DIR, exist_ok=True)
df = pd.read_parquet(TRAIN_FILE)
print("Train samples:", len(df))
print(df.head())

Train samples: 532115
  DayOfWeek  Distance OriginAirportID DestAirportID  DepDel15  CRSDepHour
0         4    1044.0           14107         13198         0           7
1         1     164.0           11049         11298         0          16
2         4     436.0           15048         13930         0          12
3         2     338.0           14107         14908         0          21
4         5     987.0           14262         14747         0          13


In [12]:
categorical_features = ['DayOfWeek', 'OriginAirportID', 'DestAirportID']
numerical_features = ['Distance', 'CRSDepHour']
target = 'DepDel15'

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(df[categorical_features])

scaler = StandardScaler()
X_num = scaler.fit_transform(df[numerical_features])

X = np.hstack([X_cat, X_num])
y = df[target].values

joblib.dump(ohe, os.path.join(OUTPUT_DIR, "ohe.joblib"))
joblib.dump(scaler, os.path.join(OUTPUT_DIR, "scaler.joblib"))

['/kaggle/working/model/scaler.joblib']

In [13]:
class FlightDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = FlightDataset(X, y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [14]:
class FlightDelayModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

model = FlightDelayModel(X.shape[1]).to(DEVICE)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for epoch in tqdm(range(EPOCHS)):
    model.train()
    epoch_loss = 0
    correct = 0
    total = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE).unsqueeze(1)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
        correct += ((preds>0.5)==yb).sum().item()
        total += xb.size(0)
    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {epoch_loss/total:.4f}, Accuracy: {correct/total:.4f}")

MODEL_PATH = os.path.join(OUTPUT_DIR, "flight_delay_model.pt")
torch.save(model.state_dict(), MODEL_PATH)
print("Model saved at:", MODEL_PATH)

 10%|█         | 1/10 [00:12<01:49, 12.21s/it]

Epoch 1/10 - Loss: 0.4240, Accuracy: 0.8459


 20%|██        | 2/10 [00:22<01:30, 11.28s/it]

Epoch 2/10 - Loss: 0.4050, Accuracy: 0.8461


 30%|███       | 3/10 [00:34<01:20, 11.46s/it]

Epoch 3/10 - Loss: 0.4019, Accuracy: 0.8463


 40%|████      | 4/10 [00:45<01:07, 11.20s/it]

Epoch 4/10 - Loss: 0.3996, Accuracy: 0.8465


 50%|█████     | 5/10 [00:56<00:55, 11.05s/it]

Epoch 5/10 - Loss: 0.3978, Accuracy: 0.8465


 60%|██████    | 6/10 [01:06<00:42, 10.72s/it]

Epoch 6/10 - Loss: 0.3965, Accuracy: 0.8466


 70%|███████   | 7/10 [01:17<00:32, 10.77s/it]

Epoch 7/10 - Loss: 0.3955, Accuracy: 0.8466


 80%|████████  | 8/10 [01:27<00:21, 10.63s/it]

Epoch 8/10 - Loss: 0.3946, Accuracy: 0.8467


 90%|█████████ | 9/10 [01:37<00:10, 10.59s/it]

Epoch 9/10 - Loss: 0.3939, Accuracy: 0.8469


100%|██████████| 10/10 [01:47<00:00, 10.78s/it]

Epoch 10/10 - Loss: 0.3930, Accuracy: 0.8468
Model saved at: /kaggle/working/model/flight_delay_model.pt



