In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import seaborn as sns
import matplotlib.pyplot as plt
from class_models import early_stop
import time
from torch.optim.lr_scheduler import StepLR

In [3]:
# Select the compute device and seed PyTorch for reproducibility.
# `is_available()` is used instead of `is_built()`: a PyTorch build compiled with
# MPS support can still run on a machine where no MPS device is usable, in which
# case "mps" must not be selected.
has_mps = torch.backends.mps.is_available()
device = "mps" if has_mps else "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42)
print(f"Using device: {device}")

Using device: cuda


## Learning rate schedule: finding the best learning rate with the *StepLR* scheduler

In [9]:
# Read the raw CSV; the strings "NA" and "?" are parsed as missing values.
data_frame = pd.read_csv(
    "data/NYCTaxiFares.csv",
    na_values=["NA", "?"],
)

In [10]:
# function to calculate the distance of the travel
def haversine_distance(dat_f, lat1, lon1, lat2, lon2, radius=6371):
    """
    Haversine (great-circle) distance between two coordinate pairs, row by row.
    :param dat_f: pandas dataframe holding the coordinate columns (in degrees)
    :param lat1: name of the column with the first latitude
    :param lon1: name of the column with the first longitude
    :param lat2: name of the column with the second latitude
    :param lon2: name of the column with the second longitude
    :param radius: radius of the sphere; the default 6371 is the average radius of the Earth in km
    :return: the distances, in the same unit as radius
    """
    phi1 = np.radians(dat_f[lat1])
    phi2 = np.radians(dat_f[lat2])
    delta_phi = np.radians(dat_f[lat2] - dat_f[lat1])
    delta_lambda = np.radians(dat_f[lon2] - dat_f[lon1])

    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2

    # Floating-point rounding can leave `a` marginally outside [0, 1]; without the
    # clip, sqrt(1 - a) would then produce NaN instead of a distance.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

def preprocessing(df_n, cat_cols):
    """
    Derive the model features: trip distance, outlier filter, shifted time
    fields and pandas categorical columns.
    :param df_n: pandas dataframe; a 'dist_km' column is written into it in place
    :param cat_cols: list of fields to convert to the pandas 'category' dtype
    :return: new pandas dataframe (filtered copy, without 'pickup_datetime')
    """
    coord_cols = ('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

    # new 'dist_km' feature, added to the incoming dataframe itself
    df_n['dist_km'] = haversine_distance(df_n, *coord_cols)

    # rows with a fare of exactly 49.57 or 45.00 are treated as outliers and dropped
    fares = df_n['fare_amount']
    is_outlier = (fares == 49.57) | (fares == 45.00)
    dfd = df_n[~is_outlier].copy()

    # parse the pickup timestamps
    timestamps = pd.to_datetime(dfd['pickup_datetime'])
    dfd['pickup_datetime'] = timestamps

    # shift back 4 hours (daylight-saving correction for April, per the original note)
    # NOTE(review): presumably the raw timestamps are UTC; confirm against the data source
    shifted = timestamps - pd.Timedelta(hours=4)
    dfd['EDTdate'] = shifted

    # time fields derived from the shifted timestamps
    hours = shifted.dt.hour
    dfd['Hour'] = hours
    dfd['AMorPM'] = np.where(hours < 12, 'am', 'pm')
    dfd['Weekday'] = shifted.dt.strftime("%a")

    # requested fields become pandas categoricals
    dfd = dfd.astype({cat: 'category' for cat in cat_cols})

    return dfd.drop(columns=['pickup_datetime'])

def model_tensors(df, cat_cols, cont_cols, y_col):
    """
    Build the categorical, continuous and label tensors fed to the model.
    :param df: pd dataframe whose cat_cols are pandas categoricals
    :param cat_cols: list of categorical fields (their integer codes are used)
    :param cont_cols: list of continuous fields
    :param y_col: list with the labels
    :return: cats (int64), conts (float32), y (float32, one column) tensors
    """
    # one column per field, one row per sample
    code_columns = []
    for name in cat_cols:
        code_columns.append(df[name].cat.codes.values)
    cat_array = np.stack(code_columns, axis=1)

    cont_columns = []
    for name in cont_cols:
        cont_columns.append(df[name].values)
    cont_array = np.stack(cont_columns, axis=1)

    # labels as a single column
    label_array = df[y_col].values.reshape(-1, 1)

    # numpy -> torch, with the dtypes the model expects
    return (
        torch.tensor(cat_array, dtype=torch.int64),
        torch.tensor(cont_array, dtype=torch.float32),
        torch.tensor(label_array, dtype=torch.float32),
    )

def create_embedding_sizes(df, cat_cols):
    """
    Compute the (number of categories, embedding dimension) pair of each categorical field.
    :param df: pandas dataframe whose cat_cols are pandas categoricals
    :param cat_cols: list of categorical fields
    :return: emb_sizes list of tuples, in the order of cat_cols
    """
    emb_sizes = []
    for col in cat_cols:
        n_categories = len(df[col].cat.categories)
        # half the number of categories (rounded up), capped at 50
        emb_dim = min(50, (n_categories + 1) // 2)
        emb_sizes.append((n_categories, emb_dim))

    return emb_sizes



In [11]:
# Preprocess the raw frame, then derive the tensors and embedding sizes from it.
df = preprocessing(data_frame, ['Hour', 'AMorPM', 'Weekday'])

cats, conts, y = model_tensors(
    df,
    ['Hour', 'AMorPM', 'Weekday'],
    ['dist_km', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude'],
    ['fare_amount'],
)

emb_sizes = create_embedding_sizes(df, ['Hour', 'AMorPM', 'Weekday'])

# number of continuous fields (columns of the conts tensor)
n_cont = conts.shape[1]

### Model definition

In [12]:
# Define the model
class TabularModel(nn.Module):
    """
    Feed-forward network for tabular data: every categorical field goes through
    its own embedding, the continuous fields through batch normalisation, and
    the concatenation of both through a stack of fully connected layers.
    """

    def __init__(self, emb_sizes, n_cont, out_size, layers, p=0.5):
        """
        :param emb_sizes: list of (number of categories, embedding dimension) tuples
        :param n_cont: int, number of continuous fields
        :param out_size: int, number of outputs
        :param layers: list of int, width of each hidden layer (may be empty)
        :param p: float, dropout probability used after the embeddings and after each hidden layer
        """
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_sizes])
        self.emb_drop = nn.Dropout(p)
        self.batch_norm_cont = nn.BatchNorm1d(n_cont)

        layer_list = []
        n_emb = sum(nf for _, nf in emb_sizes)
        n_in = n_emb + n_cont
        for i in layers:
            layer_list.append(nn.Linear(n_in, i))
            layer_list.append(nn.ReLU(inplace=True))
            layer_list.append(nn.BatchNorm1d(i))
            layer_list.append(nn.Dropout(p))
            n_in = i

        # n_in is the width of the last hidden layer, or the input width when
        # `layers` is empty (indexing layers[-1] would raise IndexError there)
        layer_list.append(nn.Linear(n_in, out_size))
        self.layers = nn.Sequential(*layer_list)

    def forward(self, x_cat, x_cont):
        """
        :param x_cat: tensor of category codes, one column per embedding (column i feeds embedding i)
        :param x_cont: tensor of continuous features with n_cont columns
        :return: output of the fully connected stack
        """
        embeddings = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        x_cont = self.batch_norm_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = self.layers(x)
        return x

### Split Function
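Function to split the dataset into six tensors: the categorical, continuous, and target tensors for both the training set and the test set.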

In [39]:
def get_train_test(categoricals, continuous, y_train, test_size=0.2):
    """
    Split the categorical, continuous and target data into train and test tensors.
    :param categoricals: 2-D array-like with the categorical codes
    :param continuous: 2-D array-like with the continuous features
    :param y_train: 2-D array-like with the target; only the last column of the combined data is used as label
    :param test_size: fraction of the rows assigned to the test set
    :return: cat_train, con_train, y_train, cat_test, con_test, y_test tensors, all moved to the global `device`
    """
    assert categoricals.shape[0] == continuous.shape[0] == y_train.shape[0], "Input arrays must have the same number of rows"

    # stack everything side by side so that a single split keeps the rows aligned
    stacked = np.hstack((categoricals, continuous, y_train))
    train_data, test_data = train_test_split(stacked, test_size=test_size, random_state=42)

    # column boundaries inside the stacked array
    n_cat_cols = categoricals.shape[1]
    cont_end = n_cat_cols + continuous.shape[1]

    def to_tensors(rows):
        # leading columns: categorical codes; middle columns: continuous features;
        # last column: target, reshaped to a single column
        cat_part = torch.tensor(rows[:, :n_cat_cols], dtype=torch.int64).to(device)
        con_part = torch.tensor(rows[:, n_cat_cols:cont_end], dtype=torch.float32).to(device)
        y_part = torch.tensor(rows[:, -1], dtype=torch.float32).unsqueeze(1).to(device)
        return cat_part, con_part, y_part

    return (*to_tensors(train_data), *to_tensors(test_data))

# train tensors first, then the matching test tensors
(
    cat_train, con_train, y_train,
    cat_test, con_test, y_test,
) = get_train_test(cats, conts, y)

### Model and training parameters
The aim is to contrast the performance of training with a constant learning rate against the performance of training with a learning rate schedule.

In [43]:
# Instantiate the network (out_size=1 for the regression task) and move it to the device.
model = TabularModel(
    emb_sizes=emb_sizes,
    n_cont=conts.shape[1],
    out_size=1,
    layers=[400, 300, 200, 100],
    p=0.4,
)
model = model.to(device)
print(model.layers)

Sequential(
  (0): Linear(in_features=22, out_features=400, bias=True)
  (1): ReLU(inplace=True)
  (2): BatchNorm1d(400, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (3): Dropout(p=0.4, inplace=False)
  (4): Linear(in_features=400, out_features=300, bias=True)
  (5): ReLU(inplace=True)
  (6): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (7): Dropout(p=0.4, inplace=False)
  (8): Linear(in_features=300, out_features=200, bias=True)
  (9): ReLU(inplace=True)
  (10): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (11): Dropout(p=0.4, inplace=False)
  (12): Linear(in_features=200, out_features=100, bias=True)
  (13): ReLU(inplace=True)
  (14): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (15): Dropout(p=0.4, inplace=False)
  (16): Linear(in_features=100, out_features=1, bias=True)
)


### General Configuration for all trainings

In [44]:
# loss criterion shared by every training run
criterion = nn.SmoothL1Loss()

batch_size = 128

# training data: dataset plus a loader that reshuffles the batches
train_dataset = TensorDataset(cat_train, con_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# test data: dataset plus a loader that keeps the original order
test_dataset = TensorDataset(cat_test, con_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# function to reset weights
def reset_weights(m):
    """
    Re-initialise a module through its own `reset_parameters` method.
    Objects without that method are left untouched.
    :param m: the module (or any object) to reset
    """
    if not hasattr(m, 'reset_parameters'):
        return
    m.reset_parameters()



### Defining a training function for learning schedules 

In [45]:
# Training function
def train(epochs, train_schedule, start_lr, early_s_patience, print_rate):
    """
    Train the global `model` once per entry of `train_schedule`, resetting its
    weights before each run, and collect the per-epoch metrics of all runs.

    Uses the notebook globals `model`, `criterion`, `train_loader` and
    `test_loader`. The reported losses are the mean over batches of the square
    root of `criterion`.

    :param epochs: int, number of training epochs
    :param train_schedule: list of lists, [[mode, scheduler_step_size, reduction_factor_gamma]];
        a StepLR scheduler is only used when mode is "LR Scheduler", any other mode keeps the learning rate constant
    :param start_lr: float, starting learning rate
    :param early_s_patience: int, early stopping patience
    :param print_rate: int, printing rate
    :return: Pandas.DataFrame, the results of the training
    """
    results = []
    # The clock starts once, so the time printed after each run is cumulative
    # over all the runs executed so far.
    start_time = time.time()
    for mode, step_size, gamma in train_schedule:
        print(f"Training mode: {mode}")
        use_scheduler = mode == "LR Scheduler"

        # restart optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=start_lr)

        if use_scheduler:
            # restart learning rate scheduler
            scheduler = StepLR(optimizer, step_size=step_size, gamma=gamma)

        # reset model parameters
        model.apply(reset_weights)

        # reset early stop
        early_stopping = early_stop.EarlyStopping(patience=early_s_patience)
        for epoch in range(epochs):

            # Training
            model.train()
            epoch_losses = []
            for cat_batch, con_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(cat_batch, con_batch).flatten()
                loss = torch.sqrt(criterion(y_pred, y_batch.flatten()))
                epoch_losses.append(loss.item())
                loss.backward()
                optimizer.step()
            train_loss = np.mean(epoch_losses)

            # Validation
            model.eval()
            epoch_val_loss = []
            with torch.no_grad():
                for cat_batch, con_batch, y_batch in test_loader:
                    y_val = model(cat_batch, con_batch).flatten()
                    batch_val_loss = torch.sqrt(criterion(y_val, y_batch.flatten()))
                    epoch_val_loss.append(batch_val_loss.item())
            val_loss = np.mean(epoch_val_loss)

            # learning rate used during this epoch (read before the scheduler steps)
            current_lr = optimizer.param_groups[0]["lr"]

            # The message is assembled without nesting quotes inside the f-strings,
            # which is a SyntaxError on Python versions older than 3.12.
            if epoch % print_rate == 0 or epoch == epochs - 1:
                message = f"Epoch {epoch + 1}/{epochs}, T. Loss: {train_loss:.4f}, V. Loss: {val_loss:.4f}, Early S: {early_stopping.status}"
                if use_scheduler:
                    message += f", Gamma: {gamma}, S. Size: {step_size}"
                print(f"{message}, L.R. {current_lr:.6f}")

            results.append({
                "Mode": mode,
                "Gamma": gamma,
                "Step": step_size,
                "Epoch": epoch + 1,
                "Learning Rate": current_lr,
                "Train Loss": train_loss,
                "Validation Loss": val_loss,
            })

            if use_scheduler:
                scheduler.step()

            # Check early stopping: a truthy result ends the current run
            if early_stopping(model, val_loss):
                print(early_stopping.status)
                break

        print(f'Training completed in {time.time() - start_time:.2f} seconds')

    return pd.DataFrame(results)

### Train Schedule 1: 

In [46]:
training_epochs = 100
starting_lr = 0.01
early_stop_patience = 40
printing_rate = 20

# one run per StepLR step size, all with the same decay factor gamma = 0.9
train_lr_schedule = [["LR Scheduler", step, 0.9] for step in (25, 20, 15, 10, 5)]

results_lr_09 = train(
    training_epochs,
    train_lr_schedule,
    starting_lr,
    early_stop_patience,
    printing_rate,
)

Training mode: LR Scheduler
Epoch 1/100, T. Loss: 1.2868, V. Loss: 1.0874, Early S: , Gamma: 0.9, S. Size: 25, L.R. 0.010000
Epoch 21/100, T. Loss: 1.0651, V. Loss: 0.9531, Early S: Improvement!!!, actual counter 3, Gamma: 0.9, S. Size: 25, L.R. 0.010000
Epoch 41/100, T. Loss: 1.0591, V. Loss: 0.9533, Early S: Improvement!!!, actual counter 6, Gamma: 0.9, S. Size: 25, L.R. 0.009000
Epoch 61/100, T. Loss: 1.0522, V. Loss: 0.9344, Early S: NO improvement in the last 8 epochs, Gamma: 0.9, S. Size: 25, L.R. 0.008100
Epoch 81/100, T. Loss: 1.0380, V. Loss: 0.9265, Early S: NO improvement in the last 11 epochs, Gamma: 0.9, S. Size: 25, L.R. 0.007290
Epoch 100/100, T. Loss: 1.0418, V. Loss: 0.9821, Early S: NO improvement in the last 16 epochs, Gamma: 0.9, S. Size: 25, L.R. 0.007290
Training completed in 380.13 seconds
Training mode: LR Scheduler
Epoch 1/100, T. Loss: 1.2817, V. Loss: 1.0213, Early S: , Gamma: 0.9, S. Size: 20, L.R. 0.010000
Epoch 21/100, T. Loss: 1.0725, V. Loss: 0.9597, Ear