In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from math import radians, sin, cos, sqrt, atan2

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
import os

In [None]:
DATA_PATH = 'C:/Users/OlesiaBrusentseva/PyProjects/2025MLModels/MLmodels2/lab1Data/train.csv' # Assuming train.csv is in the same directory
df = pd.read_csv(DATA_PATH)
TEST_SIZE = 0.2
RANDOM_STATE = 42

In [4]:
# --- 1. Haversine Distance Calculation ---
# Function to calculate Haversine distance between two sets of lat/lon coordinates
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371 # Radius of Earth in kilometers

    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c
    return distance


In [None]:
# --- 3. Initial Data Cleaning and Outlier Handling ---
df['trip_duration'] = pd.to_numeric(df['trip_duration'], errors='coerce')
df.dropna(subset=['trip_duration'], inplace=True)

initial_rows = df.shape[0]
df = df[(df['trip_duration'] >= 60) & (df['trip_duration'] <= 3600 * 6)]
print(f"Removed {initial_rows - df.shape[0]} outliers from trip_duration. New shape: {df.shape}")


Removed 10656 outliers from trip_duration. New shape: (1447988, 11)


In [6]:
# Log transform trip_duration to handle its skewed distribution
df['log_trip_duration'] = np.log1p(df['trip_duration'])
print("Log-transformed 'trip_duration' to 'log_trip_duration'.")


Log-transformed 'trip_duration' to 'log_trip_duration'.


In [7]:
# --- 4. Feature Engineering ---
print("Performing feature engineering...")
# Convert datetime columns
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])


Performing feature engineering...


In [8]:
# Extract time-based features from pickup_datetime
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_day_of_week'] = df['pickup_datetime'].dt.dayofweek # Monday=0, Sunday=6
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_day_of_year'] = df['pickup_datetime'].dt.dayofyear
df['pickup_weekday'] = df['pickup_day_of_week'].apply(lambda x: 1 if x < 5 else 0) # 1 for weekday, 0 for weekend


In [None]:
# Calculate Haversine distance
df['haversine_distance'] = df.apply(
    lambda row: haversine_distance(
        row['pickup_latitude'], row['pickup_longitude'],
        row['dropoff_latitude'], row['dropoff_longitude']
    ), axis=1
)
print("Calculated 'haversine_distance'.")


Calculated 'haversine_distance'.


In [None]:
# Handle 'store_and_fwd_flag' - convert 'Y'/'N' to 1/0
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].map({'Y': 1, 'N': 0}).fillna(0) # Fillna for any potential NaNs

In [11]:
# Drop original datetime columns and trip_duration
df_processed = df.drop(columns=['id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration'])
print("Dropped original 'id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration' columns.")


Dropped original 'id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration' columns.


In [None]:
# --- 5. Define Features and Target ---
# Features to be used in the model
numerical_features = [
    'passenger_count',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'haversine_distance',
    'pickup_hour', 'pickup_day_of_week', 'pickup_month', 'pickup_day_of_year'
]
categorical_features = [
    'vendor_id',
    'store_and_fwd_flag', # binary
    'pickup_weekday' # binary
]
target = 'log_trip_duration'


In [13]:
# Ensure all selected features exist in the dataframe
for col in numerical_features + categorical_features:
    if col not in df_processed.columns:
        print(f"Warning: Feature '{col}' not found in processed DataFrame. Please check feature engineering steps.")

X = df_processed[numerical_features + categorical_features]
y = df_processed[target]


In [None]:
# --- 6. Preprocessing Pipelines for Numerical and Categorical Features ---
# Numerical pipeline: just scaling
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Identify columns for scaling
X_numerical = X[numerical_features]
X_categorical = X[categorical_features]

scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)
X_numerical_scaled_df = pd.DataFrame(X_numerical_scaled, columns=numerical_features, index=X.index)

# Combine scaled numerical features with original categorical features
# We'll convert categorical features to appropriate types for PyTorch embeddings later
X_preprocessed = pd.concat([X_numerical_scaled_df, X_categorical], axis=1)

print("Numerical features scaled. Categorical features prepared for direct use.")
print(f"Preprocessed X shape: {X_preprocessed.shape}")
print(X_preprocessed.head())


Numerical features scaled. Categorical features prepared for direct use.
Preprocessed X shape: (1447988, 13)
   passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  \
0        -0.506088         -0.120654         0.519149           0.126373   
1        -0.506088         -0.096079        -0.379503          -0.367716   
2        -0.506088         -0.076462         0.396839          -0.450677   
3        -0.506088         -0.514607        -0.948338          -0.548997   
4        -0.506088          0.007933         1.292340           0.008800   

   dropoff_latitude  haversine_distance  pickup_hour  pickup_day_of_week  \
0          0.384921           -0.456214     0.529638           -1.560836   
1         -0.579200           -0.384789    -2.127957            1.510127   
2         -1.168711            0.680717    -0.408337           -1.049009   
3         -1.262977           -0.459243     0.842297           -0.537182   
4          0.858387           -0.528324    -0.095678  

In [None]:
# --- 7. Split Data into Training and Validation Sets ---
X_train, X_val, y_train, y_val = train_test_split(
    X_preprocessed, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

print(f"\nTraining set shape (X_train): {X_train.shape}, (y_train): {y_train.shape}")
print(f"Validation set shape (X_val): {X_val.shape}, (y_val): {y_val.shape}")

# Save processed data for the transformer model
# Using .values to convert to numpy arrays for PyTorch
np.save('X_train_numerical.npy', X_train[numerical_features].values)
np.save('X_val_numerical.npy', X_val[numerical_features].values)
np.save('y_train.npy', y_train.values)
np.save('y_val.npy', y_val.values)

# For categorical features, we need to ensure they are integer encoded for embedding lookup.

vendor_id_mapping = {id: i for i, id in enumerate(df['vendor_id'].unique())}
df_processed['vendor_id_encoded'] = df_processed['vendor_id'].map(vendor_id_mapping)

# Update X_preprocessed with the encoded vendor_id
X_preprocessed['vendor_id'] = df_processed['vendor_id_encoded']

# Re-split with the updated X_preprocessed
X_train, X_val, y_train, y_val = train_test_split(
    X_preprocessed, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)




Training set shape (X_train): (1158390, 13), (y_train): (1158390,)
Validation set shape (X_val): (289598, 13), (y_val): (289598,)


In [16]:
# Save categorical features as numpy arrays
np.save('X_train_categorical.npy', X_train[categorical_features].values)
np.save('X_val_categorical.npy', X_val[categorical_features].values)

# Save the mapping for vendor_id and unique counts for embedding layers
np.save('vendor_id_mapping.npy', np.array(list(vendor_id_mapping.items()), dtype=object))
print(f"Unique vendor_ids: {len(vendor_id_mapping)}")

print("\nData preprocessing complete. Data saved as .npy files.")
print("Ready for Transformer model training.")

Unique vendor_ids: 2

Data preprocessing complete. Data saved as .npy files.
Ready for Transformer model training.


Transformer

In [None]:
# --- Configuration ---
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
NUM_EPOCHS = 20 
EMBEDDING_DIM = 32 # Dimension for categorical embeddings
D_MODEL = 128 # Dimension of the transformer's input/output features
N_HEAD = 4 # Number of attention heads
NUM_ENCODER_LAYERS = 2 # Number of transformer encoder layers
DROPOUT = 0.1
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {DEVICE}")


Using device: cpu


In [35]:

# --- 1. Custom Dataset Class ---
class NYCTaxiDataset(Dataset):
    def __init__(self, numerical_data, categorical_data, targets):
        self.numerical_data = torch.tensor(numerical_data, dtype=torch.float32)
        self.categorical_data = torch.tensor(categorical_data, dtype=torch.long)
        self.targets = torch.tensor(targets, dtype=torch.float32).unsqueeze(1) # Add a dimension for regression target

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return self.numerical_data[idx], self.categorical_data[idx], self.targets[idx]


In [None]:

# --- 2. Transformer Model Definition ---
class TaxiTransformer(nn.Module):
    def __init__(self, num_numerical_features, num_vendor_ids, embedding_dim,
                 d_model, n_head, num_encoder_layers, dropout):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.d_model = d_model

        # Embedding for vendor_id
        self.vendor_embedding = nn.Embedding(num_vendor_ids, embedding_dim)

        # Linear layer to project numerical features to d_model space
    
        self.numerical_projection = nn.Linear(num_numerical_features, d_model - (embedding_dim * 3))

        # Linear layers for store_and_fwd_flag and pickup_weekday (binary, treated as categorical)
        
        self.store_flag_embedding = nn.Embedding(2, embedding_dim) # 2 classes: 0 or 1
        self.weekday_embedding = nn.Embedding(2, embedding_dim) # 2 classes: 0 or 1

        # Transformer Encoder Layer
        # The transformer will process a single combined feature vector per trip.
        # The self-attention mechanism will learn interactions between the components of this vector.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_head,
            dim_feedforward=d_model * 4, # Standard practice
            dropout=dropout,
            batch_first=True # Input and output tensors are (batch, sequence, feature)
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)

        # Output regression head
        self.regressor = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, 1) # Output a single regression value
        )

    def forward(self, numerical_features, categorical_features):
        # Unpack categorical features
        vendor_ids = categorical_features[:, 0] # vendor_id is the first categorical feature
        store_flags = categorical_features[:, 1] # store_and_fwd_flag is the second
        weekdays = categorical_features[:, 2] # pickup_weekday is the third

        # Embed categorical features
        vendor_embed = self.vendor_embedding(vendor_ids)
        store_flag_embed = self.store_flag_embedding(store_flags)
        weekday_embed = self.weekday_embedding(weekdays)

        # Project numerical features
        numerical_proj = self.numerical_projection(numerical_features)

        # Concatenate all features to form the combined input vector for the transformer
        # The sequence length is 1, as each sample is a single combined vector
        combined_features = torch.cat((numerical_proj, vendor_embed, store_flag_embed, weekday_embed), dim=1)
        
        # Add a sequence dimension (batch_size, 1, d_model) for the transformer
        combined_features = combined_features.unsqueeze(1)

        # Pass through transformer encoder
        transformer_output = self.transformer_encoder(combined_features)

        # Take the output for the first (and only) token in the sequence
        # and pass it to the regressor
        output = self.regressor(transformer_output.squeeze(1)) # Remove the sequence dimension

        return output


In [37]:

# --- 3. Load Preprocessed Data ---
print("Loading preprocessed data...")
try:
    X_train_numerical = np.load('X_train_numerical.npy')
    X_val_numerical = np.load('X_val_numerical.npy')
    X_train_categorical = np.load('X_train_categorical.npy')
    X_val_categorical = np.load('X_val_categorical.npy')
    y_train = np.load('y_train.npy')
    y_val = np.load('y_val.npy')
    vendor_id_mapping_items = np.load('vendor_id_mapping.npy', allow_pickle=True)
    num_vendor_ids = len(vendor_id_mapping_items)
    print("Preprocessed data loaded successfully.")
except FileNotFoundError:
    print("Error: Preprocessed .npy files not found. Please run the data_preprocessing_normalization.py script first.")
    exit()


Loading preprocessed data...
Preprocessed data loaded successfully.


In [38]:

# Create Dataset and DataLoader instances
train_dataset = NYCTaxiDataset(X_train_numerical, X_train_categorical, y_train)
val_dataset = NYCTaxiDataset(X_val_numerical, X_val_categorical, y_val)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- 4. Initialize Model, Loss, and Optimizer ---
num_numerical_features = X_train_numerical.shape[1]

model = TaxiTransformer(
    num_numerical_features=num_numerical_features,
    num_vendor_ids=num_vendor_ids,
    embedding_dim=EMBEDDING_DIM,
    d_model=D_MODEL,
    n_head=N_HEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    dropout=DROPOUT
).to(DEVICE)

criterion = nn.MSELoss() # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(f"\nModel initialized:\n{model}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")



Model initialized:
TaxiTransformer(
  (vendor_embedding): Embedding(2, 32)
  (numerical_projection): Linear(in_features=10, out_features=32, bias=True)
  (store_flag_embedding): Embedding(2, 32)
  (weekday_embedding): Embedding(2, 32)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (regressor): Sequential(
    (0): Linear(in_f

In [39]:

# --- 5. Training Loop ---
print("\nStarting training...")
best_val_loss = float('inf')

for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    for batch_idx, (numerical_data, categorical_data, targets) in enumerate(train_loader):
        numerical_data, categorical_data, targets = numerical_data.to(DEVICE), categorical_data.to(DEVICE), targets.to(DEVICE)

        optimizer.zero_grad()
        outputs = model(numerical_data, categorical_data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # --- Validation ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for numerical_data, categorical_data, targets in val_loader:
            numerical_data, categorical_data, targets = numerical_data.to(DEVICE), categorical_data.to(DEVICE), targets.to(DEVICE)
            outputs = model(numerical_data, categorical_data)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Save the best model based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_taxi_transformer_model.pth')
        print(f"Saved best model with Val Loss: {best_val_loss:.4f}")

print("\nTraining complete.")



Starting training...
Epoch [1/20], Train Loss: 0.3912, Val Loss: 0.1512
Saved best model with Val Loss: 0.1512
Epoch [2/20], Train Loss: 0.2574, Val Loss: 0.1276
Saved best model with Val Loss: 0.1276
Epoch [3/20], Train Loss: 0.2262, Val Loss: 0.1190
Saved best model with Val Loss: 0.1190
Epoch [4/20], Train Loss: 0.2062, Val Loss: 0.1157
Saved best model with Val Loss: 0.1157
Epoch [5/20], Train Loss: 0.1904, Val Loss: 0.1168
Epoch [6/20], Train Loss: 0.1760, Val Loss: 0.1231
Epoch [7/20], Train Loss: 0.1640, Val Loss: 0.1116
Saved best model with Val Loss: 0.1116
Epoch [8/20], Train Loss: 0.1546, Val Loss: 0.1100
Saved best model with Val Loss: 0.1100
Epoch [9/20], Train Loss: 0.1463, Val Loss: 0.1109
Epoch [10/20], Train Loss: 0.1392, Val Loss: 0.1074
Saved best model with Val Loss: 0.1074
Epoch [11/20], Train Loss: 0.1337, Val Loss: 0.1082
Epoch [12/20], Train Loss: 0.1288, Val Loss: 0.1053
Saved best model with Val Loss: 0.1053
Epoch [13/20], Train Loss: 0.1246, Val Loss: 0.1060

In [None]:

# --- 6. Evaluation (Load best model and evaluate on validation set) ---
print("\nEvaluating best model on validation set...")
model.load_state_dict(torch.load('best_taxi_transformer_model.pth'))
model.eval()

total_val_loss = 0
all_preds = []
all_targets = []

with torch.no_grad():
    for numerical_data, categorical_data, targets in val_loader:
        numerical_data, categorical_data, targets = numerical_data.to(DEVICE), categorical_data.to(DEVICE), targets.to(DEVICE)
        outputs = model(numerical_data, categorical_data)
        loss = criterion(outputs, targets)
        total_val_loss += loss.item()
        all_preds.extend(outputs.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

final_val_loss = total_val_loss / len(val_loader)
print(f"Final Validation MSE: {final_val_loss:.4f}")

# Calculate RMSE (Root Mean Squared Error)
rmse = np.sqrt(final_val_loss)
print(f"Final Validation RMSE (log-transformed): {rmse:.4f}")



Evaluating best model on validation set...
Final Validation MSE: 0.1033
Final Validation RMSE (log-transformed): 0.3214


In [None]:
all_preds_original_scale = np.expm1(np.array(all_preds))
all_targets_original_scale = np.expm1(np.array(all_targets))

rmse_original_scale = np.sqrt(np.mean((all_preds_original_scale - all_targets_original_scale)**2))
print(f"Approximate Final Validation RMSE (original scale): {rmse_original_scale:.2f} seconds")

# Example of a single prediction (using the first validation sample)
print("\nExample prediction:")
model.eval()
with torch.no_grad():
    sample_numerical, sample_categorical, sample_target = val_dataset[0]
    sample_numerical = sample_numerical.unsqueeze(0).to(DEVICE) # Add batch dimension
    sample_categorical = sample_categorical.unsqueeze(0).to(DEVICE) # Add batch dimension

    predicted_log_duration = model(sample_numerical, sample_categorical).item()
    predicted_duration = np.expm1(predicted_log_duration)
    actual_duration = np.expm1(sample_target.item())

    print(f"Predicted log duration: {predicted_log_duration:.4f}")
    print(f"Actual log duration: {sample_target.item():.4f}")
    print(f"Predicted duration (seconds): {predicted_duration:.2f}")
    print(f"Actual duration (seconds): {actual_duration:.2f}")

Approximate Final Validation RMSE (original scale): 306.09 seconds

Example prediction:
Predicted log duration: 6.5311
Actual log duration: 6.6503
Predicted duration (seconds): 685.17
Actual duration (seconds): 772.00
