Merge pull request #2 from nick-roberson/nick/test-train-score
update some small things
nick-roberson committed Apr 29, 2024
2 parents 2445432 + 87a1223 commit a4a4d58
Showing 5 changed files with 37 additions and 32 deletions.
Binary file modified: TaxiFareRegrModel.pt (binary content not shown).
16 changes: 9 additions & 7 deletions example.py
@@ -16,7 +16,7 @@

# Params specifically for the taxi fare example
TAXI_DATA = "data/NYCTaxiFares.csv"
MODEL_NAME = "TaxiFareRegrModel.pt"
MODEL_FILE = "TaxiFareRegrModel.pt"
CATEGORICAL_COLUMNS = ["hour", "am_or_pm", "weekday"]
CONTINUOUS_COLUMNS = [
"pickup_latitude",
@@ -34,19 +34,21 @@
MODEL_DROPOUT = 0.4


def init_logging():
def init_logging(verbose: bool = False):
logging.basicConfig(
level=logging.INFO,
level=logging.DEBUG if verbose else logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)


@app.command()
def train_taxi():
def train_taxi(
verbose: bool = False,
):
"""Run a simple example of a computation workflow."""
# Initialize logging and load the data
init_logging()
init_logging(verbose=verbose)
test_data = pd.read_csv(TAXI_DATA)
print(f"Loaded data with {len(test_data)} records.")

@@ -55,7 +57,7 @@ def train_taxi():

# Create params for the training block
train_params = TrainModelParams(
model_name=MODEL_NAME,
model_file=MODEL_FILE,
cat_cols=CATEGORICAL_COLUMNS,
cont_cols=CONTINUOUS_COLUMNS,
y_col=TARGET_COLUMN,
@@ -68,7 +70,7 @@

# Create params for the prediction block
predict_params = PredictModelParams(
model_name=MODEL_NAME,
model_file=MODEL_FILE,
cat_cols=CATEGORICAL_COLUMNS,
cont_cols=CONTINUOUS_COLUMNS,
model_layers=MODEL_LAYERS[:2],
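The example.py changes thread a `verbose` flag from the CLI command down into the logging setup, so DEBUG output becomes opt-in. Below is a minimal, self-contained sketch of that pattern; it assumes `app` is a Typer application (its construction is not part of this diff), and the names mirror the diff for readability rather than being the repository's code.

```python
# Minimal sketch of the --verbose / logging pattern; assumes a Typer app.
import logging

import typer

app = typer.Typer()


def init_logging(verbose: bool = False) -> None:
    # DEBUG when --verbose is passed, INFO otherwise.
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


@app.command()
def train_taxi(verbose: bool = False):
    """Train the taxi fare model; pass --verbose for debug logs."""
    init_logging(verbose=verbose)
    logging.getLogger(__name__).debug("Debug logging enabled")


if __name__ == "__main__":
    app()
```

With Typer, a boolean parameter with a default surfaces as a `--verbose` flag, so the same command runs at INFO level by default and at DEBUG level only when the flag is supplied.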
18 changes: 8 additions & 10 deletions src/blocks/predict/predict_taxi.py
@@ -1,21 +1,20 @@
import logging
import os
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from rich import print
from torch import nn

from src.block_base import BlockBase
from src.blocks.train.models.tabular_model import TabularModel
from src.params_base import BlockParamBase

logger = logging.getLogger(__name__)

class PredictModelParams(BlockParamBase):

# Path to the trained model file
model_name: str = "TaxiFareRegrModel.pt"
class PredictModelParams(BlockParamBase):

# Categorical columns to use for the model
cat_cols: list = ["hour", "am_or_pm", "weekday"]
@@ -29,14 +28,13 @@ class PredictModelParams(BlockParamBase):
]

# Model architecture parameters
model_file: str = "TaxiFareRegrModel.pt"
model_layers: list = [200, 100]
model_dropout: float = 0.4

# Target column
# Target column and prediction columns
target_col: str = "fare_amount"
# Prediction column
prediction_col: str = "predictions"
# Difference column
difference_col: str = "difference"


@@ -55,7 +53,7 @@ def load_model(self, input_df: pd.DataFrame) -> nn.Module:
nn.Module: The loaded PyTorch model.
"""
# Get the path to the model file
model_path = os.path.join(os.getcwd(), self.params.model_name)
model_path = os.path.join(os.getcwd(), self.params.model_file)

# Get the number of unique categories for each categorical feature, and set the embedding sizes
cat_szs = [
@@ -111,8 +109,8 @@ def predict(
Args:
model (nn.Module): The trained model.
cats (torch.Tensor): Categorical features tensor.
conts (torch.Tensor): Continuous features tensor.
cats (torch.Tensor): Categorical features' tensor.
conts (torch.Tensor): Continuous features' tensor.
Returns:
np.ndarray: Predicted values.
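The prediction block resolves `model_file` against the current working directory and rebuilds the network before loading weights. The sketch below shows one plausible load path under stated assumptions: the checkpoint is a `state_dict` saved by the training block, `TabularModel` takes the `(emb_szs, n_cont, out_sz, layers, p)` arguments logged in tabular_model.py, and the `min(50, (n + 1) // 2)` embedding-size rule is a common heuristic assumed here, not something this diff shows.

```python
# Illustrative sketch of rebuilding and loading the saved model; the
# embedding-size heuristic, out_sz=1, and the helper name are assumptions.
import os
from typing import List

import pandas as pd
import torch

from src.blocks.train.models.tabular_model import TabularModel


def load_trained_model(
    df: pd.DataFrame,
    model_file: str,
    cat_cols: List[str],
    n_cont: int,
    layers: List[int],
    dropout: float,
) -> TabularModel:
    # Resolve the checkpoint path the same way the block does.
    model_path = os.path.join(os.getcwd(), model_file)

    # Embedding sizes from each categorical column's cardinality
    # (min(50, (n + 1) // 2) is a common rule of thumb, assumed here).
    cat_szs = [df[col].astype("category").cat.categories.size for col in cat_cols]
    emb_szs = [(size, min(50, (size + 1) // 2)) for size in cat_szs]

    # out_sz=1: a single predicted fare value (assumption).
    model = TabularModel(emb_szs, n_cont, out_sz=1, layers=layers, p=dropout)
    model.load_state_dict(torch.load(model_path))
    model.eval()  # inference mode: disables dropout / batch-norm updates
    return model
```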
7 changes: 5 additions & 2 deletions src/blocks/train/models/tabular_model.py
@@ -1,8 +1,11 @@
import logging
from typing import List, Tuple

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


class TabularModel(nn.Module):
"""
@@ -35,7 +38,7 @@ def __init__(
layers (List[int]): List of integers where each integer specifies the number of neurons in a hidden layer.
p (float, optional): Dropout probability used in the embedding dropout and each hidden layer. Defaults to 0.5.
"""
print(
logger.debug(
f"Initializing TabularModel with {emb_szs}, {n_cont}, {out_sz}, {layers}, {p}"
)
super().__init__()
@@ -50,7 +53,7 @@ def __init__(
]
layerlist.append(nn.Linear(layers[-1], out_sz))
self.layers = nn.Sequential(*layerlist)
print(
logger.debug(
f"TabularModel initialized with {self.embeds}, {self.emb_drop}, {self.bn_cont}, {self.layers}"
)

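For reference, here is a compact stand-in for the kind of network the debug messages above describe: per-column embeddings, dropout on the embedded block, batch-norm on the continuous inputs, and a `Sequential` stack ending in `nn.Linear(layers[-1], out_sz)`. The constructor signature follows the logged arguments, while the Linear -> ReLU -> BatchNorm1d -> Dropout recipe for hidden layers is an assumption rather than the repository's exact code.

```python
# Simplified stand-in for the TabularModel described in the diff; the
# hidden-layer recipe is an assumption.
import logging
from typing import List, Tuple

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


class SketchTabularModel(nn.Module):
    def __init__(
        self,
        emb_szs: List[Tuple[int, int]],
        n_cont: int,
        out_sz: int,
        layers: List[int],
        p: float = 0.5,
    ):
        super().__init__()
        logger.debug(
            "Initializing with %s, %s, %s, %s, %s", emb_szs, n_cont, out_sz, layers, p
        )
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerlist = []
        n_in = sum(nf for _, nf in emb_szs) + n_cont
        for n_out in layers:
            layerlist += [
                nn.Linear(n_in, n_out),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(n_out),
                nn.Dropout(p),
            ]
            n_in = n_out
        layerlist.append(nn.Linear(layers[-1], out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat: torch.Tensor, x_cont: torch.Tensor) -> torch.Tensor:
        # Embed each categorical column, then concatenate with normalized continuous features.
        x = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)], dim=1)
        x = self.emb_drop(x)
        x = torch.cat([x, self.bn_cont(x_cont)], dim=1)
        return self.layers(x)
```

Note that `__init__` logs through the module-level logger at DEBUG, matching the diff's move away from `rich.print`, so construction details only appear when debug logging is enabled.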
28 changes: 15 additions & 13 deletions src/blocks/train/train_taxi.py
@@ -1,22 +1,23 @@
import logging
import os.path
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from rich import print
from typing_extensions import override

from src.block_base import BlockBase
from src.blocks.train.models.tabular_model import TabularModel
from src.params_base import BlockParamBase

logger = logging.getLogger(__name__)


class TrainModelParams(BlockParamBase):
"""Parameters for the TrainTaxiModel."""

model_name: str = "TaxiFareRegrModel.pt"
id_col: str = "id"
cat_cols: list = ["hour", "am_or_pm", "weekday"]
cont_cols: list = [
@@ -33,6 +34,7 @@ class TrainModelParams(BlockParamBase):
epochs: int = 300

# Model Params
model_file: str = "TaxiFareRegrModel.pt"
model_layers: List[int] = [200, 100]
model_dropout: float = 0.4

@@ -83,20 +85,20 @@ def run(self, input_df: pd.DataFrame) -> pd.DataFrame:
Returns:
pd.DataFrame: The processed DataFrame, potentially with modifications or additional columns.
"""
print("******************************************** VALIDATE")
logger.debug("******************************************** VALIDATE")
# Validate and convert columns to categories
self.validate(input_df=input_df)
self.convert_columns_to_categories(input_df=input_df)

# Prepare tensors and setup model
print(
logger.debug(
"******************************************** PREPARE TENSORS AND SETUP MODEL"
)
cats, conts, y = self.prepare_tensors(input_df=input_df)
model, criterion, optimizer = self.setup_model(input_df=input_df, conts=conts)

# Train
print("******************************************** TRAIN THE MODEL")
logger.debug("******************************************** TRAIN THE MODEL")
losses = self.train_model(
model=model,
criterion=criterion,
@@ -107,7 +109,7 @@ def run(self, input_df: pd.DataFrame) -> pd.DataFrame:
)

# Evaluate
print("******************************************** EVALUATE THE MODEL")
logger.debug("******************************************** EVALUATE THE MODEL")
self.evaluate_model(
model=model, criterion=criterion, cats=cats, conts=conts, y=y, losses=losses
)
@@ -220,7 +222,7 @@ def train_model(
optimizer.step()
losses.append(loss.item())
if i % 25 == 0:
print(f"Epoch {i}: Loss = {loss.item():.8f}")
logger.debug(f"Epoch {i}: Loss = {loss.item():.8f}")

return losses

@@ -251,7 +253,7 @@ def evaluate_model(
with torch.no_grad():
y_val = model(cat_test, con_test)
loss = torch.sqrt(criterion(y_val, y_test))
print(f"Final RMSE: {loss:.8f}")
logger.debug(f"Final RMSE: {loss:.8f}")

# Print some predictions
predictions = []
@@ -262,17 +264,17 @@

# Sort the predictions by the difference between the predicted and actual values
predictions.sort(key=lambda x: x[2])
print("Predictions:")
logger.debug("Predictions:")
for i, (pred, actual, diff) in enumerate(predictions):
print(
logger.debug(
f"{i + 1:2}. Predicted: {pred:.4f}, Actual: {actual:.4f}, Diff: {diff:.4f}"
)

# Save the model if training is complete
if len(losses) == self.params.epochs:
model_fp = os.path.abspath(self.params.model_name)
model_fp = os.path.abspath(self.params.model_file)
torch.save(model.state_dict(), model_fp)
print(f"Model saved successfully to path '{model_fp}'")
logger.debug(f"Model saved successfully to path '{model_fp}'")
else:
print("Model training incomplete.")
logger.debug("Model training incomplete.")
raise ValueError("Model training incomplete.")

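Putting the train_taxi.py pieces together, the flow traced by the new `logger.debug` calls is roughly: train for a fixed number of epochs while recording losses, report the final RMSE, and save the `state_dict` to `model_file` only when every epoch completed. The condensed sketch below assumes an MSE criterion (implied by the RMSE computation), an Adam optimizer with a nominal learning rate, and pre-built `cats`/`conts`/`y` tensors; unlike the block's held-out evaluation split, it reuses the training tensors for brevity. None of those specifics are shown in this diff.

```python
# Condensed sketch of the train / evaluate / save flow the diff logs; the
# criterion, optimizer, learning rate, and tensor shapes are assumptions.
import logging
import os

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


def train_and_save(model, cats, conts, y, model_file: str, epochs: int = 300):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    losses = []
    for i in range(epochs):
        y_pred = model(cats, conts)
        loss = torch.sqrt(criterion(y_pred, y))  # RMSE
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if i % 25 == 0:
            logger.debug(f"Epoch {i}: Loss = {loss.item():.8f}")

    # Evaluate without gradient tracking.
    with torch.no_grad():
        final_rmse = torch.sqrt(criterion(model(cats, conts), y))
    logger.debug(f"Final RMSE: {final_rmse:.8f}")

    # Save only when the full number of epochs ran, mirroring the diff.
    if len(losses) == epochs:
        model_fp = os.path.abspath(model_file)
        torch.save(model.state_dict(), model_fp)
        logger.debug(f"Model saved successfully to path '{model_fp}'")
    else:
        logger.debug("Model training incomplete.")
        raise ValueError("Model training incomplete.")
    return losses
```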