Merge pull request #2 from nick-roberson/nick/test-train-score
update some small things
nick-roberson committed Apr 29, 2024
2 parents 2445432 + 87a1223 commit a4a4d58
Showing 5 changed files with 37 additions and 32 deletions.
Binary file modified: TaxiFareRegrModel.pt (binary content not shown).
16 changes: 9 additions & 7 deletions example.py
@@ -16,7 +16,7 @@

# Params specifically for the taxi fare example
TAXI_DATA = "data/NYCTaxiFares.csv"
MODEL_NAME = "TaxiFareRegrModel.pt"
MODEL_FILE = "TaxiFareRegrModel.pt"
CATEGORICAL_COLUMNS = ["hour", "am_or_pm", "weekday"]
CONTINUOUS_COLUMNS = [
"pickup_latitude",
@@ -34,19 +34,21 @@
MODEL_DROPOUT = 0.4


def init_logging():
def init_logging(verbose: bool = False):
logging.basicConfig(
level=logging.INFO,
level=logging.DEBUG if verbose else logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)


@app.command()
def train_taxi():
def train_taxi(
verbose: bool = False,
):
"""Run a simple example of a computation workflow."""
# Initialize logging and load the data
init_logging()
init_logging(verbose=verbose)
test_data = pd.read_csv(TAXI_DATA)
print(f"Loaded data with {len(test_data)} records.")

@@ -55,7 +57,7 @@ def train_taxi():

# Create params for the training block
train_params = TrainModelParams(
model_name=MODEL_NAME,
model_file=MODEL_FILE,
cat_cols=CATEGORICAL_COLUMNS,
cont_cols=CONTINUOUS_COLUMNS,
y_col=TARGET_COLUMN,
@@ -68,7 +70,7 @@

# Create params for the prediction block
predict_params = PredictModelParams(
model_name=MODEL_NAME,
model_file=MODEL_FILE,
cat_cols=CATEGORICAL_COLUMNS,
cont_cols=CONTINUOUS_COLUMNS,
model_layers=MODEL_LAYERS[:2],
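The example.py changes thread a `verbose` flag from the CLI command down into the logging setup, so DEBUG output becomes opt-in. Below is a minimal, self-contained sketch of that pattern; it assumes `app` is a Typer application (its construction is not part of this diff), and the names mirror the diff for readability rather than being the repository's code.

```python
# Minimal sketch of the --verbose / logging pattern; assumes a Typer app.
import logging

import typer

app = typer.Typer()


def init_logging(verbose: bool = False) -> None:
    # DEBUG when --verbose is passed, INFO otherwise.
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


@app.command()
def train_taxi(verbose: bool = False):
    """Train the taxi fare model; pass --verbose for debug logs."""
    init_logging(verbose=verbose)
    logging.getLogger(__name__).debug("Debug logging enabled")


if __name__ == "__main__":
    app()
```

With Typer, a boolean parameter with a default surfaces as a `--verbose` flag, so the same command runs at INFO level by default and at DEBUG level only when the flag is supplied.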
18 changes: 8 additions & 10 deletions src/blocks/predict/predict_taxi.py
@@ -1,21 +1,20 @@
import logging
import os
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from rich import print
from torch import nn

from src.block_base import BlockBase
from src.blocks.train.models.tabular_model import TabularModel
from src.params_base import BlockParamBase

logger = logging.getLogger(__name__)

class PredictModelParams(BlockParamBase):

# Path to the trained model file
model_name: str = "TaxiFareRegrModel.pt"
class PredictModelParams(BlockParamBase):

# Categorical columns to use for the model
cat_cols: list = ["hour", "am_or_pm", "weekday"]
@@ -29,14 +28,13 @@ class PredictModelParams(BlockParamBase):
]

# Model architecture parameters
model_file: str = "TaxiFareRegrModel.pt"
model_layers: list = [200, 100]
model_dropout: float = 0.4

# Target column
# Target column and prediction columns
target_col: str = "fare_amount"
# Prediction column
prediction_col: str = "predictions"
# Difference column
difference_col: str = "difference"


@@ -55,7 +53,7 @@ def load_model(self, input_df: pd.DataFrame) -> nn.Module:
nn.Module: The loaded PyTorch model.
"""
# Get the path to the model file
model_path = os.path.join(os.getcwd(), self.params.model_name)
model_path = os.path.join(os.getcwd(), self.params.model_file)

# Get the number of unique categories for each categorical feature, and set the embedding sizes
cat_szs = [
@@ -111,8 +109,8 @@ def predict(
Args:
model (nn.Module): The trained model.
cats (torch.Tensor): Categorical features tensor.
conts (torch.Tensor): Continuous features tensor.
cats (torch.Tensor): Categorical features' tensor.
conts (torch.Tensor): Continuous features' tensor.
Returns:
np.ndarray: Predicted values.
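The prediction block resolves `model_file` against the current working directory and rebuilds the network before loading weights. The sketch below shows one plausible load path under stated assumptions: the checkpoint is a `state_dict` saved by the training block, `TabularModel` takes the `(emb_szs, n_cont, out_sz, layers, p)` arguments logged in tabular_model.py, and the `min(50, (n + 1) // 2)` embedding-size rule is a common heuristic assumed here, not something this diff shows.

```python
# Illustrative sketch of rebuilding and loading the saved model; the
# embedding-size heuristic, out_sz=1, and the helper name are assumptions.
import os
from typing import List

import pandas as pd
import torch

from src.blocks.train.models.tabular_model import TabularModel


def load_trained_model(
    df: pd.DataFrame,
    model_file: str,
    cat_cols: List[str],
    n_cont: int,
    layers: List[int],
    dropout: float,
) -> TabularModel:
    # Resolve the checkpoint path the same way the block does.
    model_path = os.path.join(os.getcwd(), model_file)

    # Embedding sizes from each categorical column's cardinality
    # (min(50, (n + 1) // 2) is a common rule of thumb, assumed here).
    cat_szs = [df[col].astype("category").cat.categories.size for col in cat_cols]
    emb_szs = [(size, min(50, (size + 1) // 2)) for size in cat_szs]

    # out_sz=1: a single predicted fare value (assumption).
    model = TabularModel(emb_szs, n_cont, out_sz=1, layers=layers, p=dropout)
    model.load_state_dict(torch.load(model_path))
    model.eval()  # inference mode: disables dropout / batch-norm updates
    return model
```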
7 changes: 5 additions & 2 deletions src/blocks/train/models/tabular_model.py
@@ -1,8 +1,11 @@
import logging
from typing import List, Tuple

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


class TabularModel(nn.Module):
"""
@@ -35,7 +38,7 @@ def __init__(
layers (List[int]): List of integers where each integer specifies the number of neurons in a hidden layer.
p (float, optional): Dropout probability used in the embedding dropout and each hidden layer. Defaults to 0.5.
"""
print(
logger.debug(
f"Initializing TabularModel with {emb_szs}, {n_cont}, {out_sz}, {layers}, {p}"
)
super().__init__()
@@ -50,7 +53,7 @@ def __init__(
]
layerlist.append(nn.Linear(layers[-1], out_sz))
self.layers = nn.Sequential(*layerlist)
print(
logger.debug(
f"TabularModel initialized with {self.embeds}, {self.emb_drop}, {self.bn_cont}, {self.layers}"
)

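For reference, here is a compact stand-in for the kind of network the debug messages above describe: per-column embeddings, dropout on the embedded block, batch-norm on the continuous inputs, and a `Sequential` stack ending in `nn.Linear(layers[-1], out_sz)`. The constructor signature follows the logged arguments, while the Linear -> ReLU -> BatchNorm1d -> Dropout recipe for hidden layers is an assumption rather than the repository's exact code.

```python
# Simplified stand-in for the TabularModel described in the diff; the
# hidden-layer recipe is an assumption.
import logging
from typing import List, Tuple

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


class SketchTabularModel(nn.Module):
    def __init__(
        self,
        emb_szs: List[Tuple[int, int]],
        n_cont: int,
        out_sz: int,
        layers: List[int],
        p: float = 0.5,
    ):
        super().__init__()
        logger.debug(
            "Initializing with %s, %s, %s, %s, %s", emb_szs, n_cont, out_sz, layers, p
        )
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        layerlist = []
        n_in = sum(nf for _, nf in emb_szs) + n_cont
        for n_out in layers:
            layerlist += [
                nn.Linear(n_in, n_out),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(n_out),
                nn.Dropout(p),
            ]
            n_in = n_out
        layerlist.append(nn.Linear(layers[-1], out_sz))
        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat: torch.Tensor, x_cont: torch.Tensor) -> torch.Tensor:
        # Embed each categorical column, then concatenate with normalized continuous features.
        x = torch.cat([emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)], dim=1)
        x = self.emb_drop(x)
        x = torch.cat([x, self.bn_cont(x_cont)], dim=1)
        return self.layers(x)
```

Note that `__init__` logs through the module-level logger at DEBUG, matching the diff's move away from `rich.print`, so construction details only appear when debug logging is enabled.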
28 changes: 15 additions & 13 deletions src/blocks/train/train_taxi.py
@@ -1,22 +1,23 @@
import logging
import os.path
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from rich import print
from typing_extensions import override

from src.block_base import BlockBase
from src.blocks.train.models.tabular_model import TabularModel
from src.params_base import BlockParamBase

logger = logging.getLogger(__name__)


class TrainModelParams(BlockParamBase):
"""Parameters for the TrainTaxiModel."""

model_name: str = "TaxiFareRegrModel.pt"
id_col: str = "id"
cat_cols: list = ["hour", "am_or_pm", "weekday"]
cont_cols: list = [
@@ -33,6 +34,7 @@ class TrainModelParams(BlockParamBase):
epochs: int = 300

# Model Params
model_file: str = "TaxiFareRegrModel.pt"
model_layers: List[int] = [200, 100]
model_dropout: float = 0.4

@@ -83,20 +85,20 @@ def run(self, input_df: pd.DataFrame) -> pd.DataFrame:
Returns:
pd.DataFrame: The processed DataFrame, potentially with modifications or additional columns.
"""
print("******************************************** VALIDATE")
logger.debug("******************************************** VALIDATE")
# Validate and convert columns to categories
self.validate(input_df=input_df)
self.convert_columns_to_categories(input_df=input_df)

# Prepare tensors and setup model
print(
logger.debug(
"******************************************** PREPARE TENSORS AND SETUP MODEL"
)
cats, conts, y = self.prepare_tensors(input_df=input_df)
model, criterion, optimizer = self.setup_model(input_df=input_df, conts=conts)

# Train
print("******************************************** TRAIN THE MODEL")
logger.debug("******************************************** TRAIN THE MODEL")
losses = self.train_model(
model=model,
criterion=criterion,
@@ -107,7 +109,7 @@ def run(self, input_df: pd.DataFrame) -> pd.DataFrame:
)

# Evaluate
print("******************************************** EVALUATE THE MODEL")
logger.debug("******************************************** EVALUATE THE MODEL")
self.evaluate_model(
model=model, criterion=criterion, cats=cats, conts=conts, y=y, losses=losses
)
@@ -220,7 +222,7 @@ def train_model(
optimizer.step()
losses.append(loss.item())
if i % 25 == 0:
print(f"Epoch {i}: Loss = {loss.item():.8f}")
logger.debug(f"Epoch {i}: Loss = {loss.item():.8f}")

return losses

@@ -251,7 +253,7 @@ def evaluate_model(
with torch.no_grad():
y_val = model(cat_test, con_test)
loss = torch.sqrt(criterion(y_val, y_test))
print(f"Final RMSE: {loss:.8f}")
logger.debug(f"Final RMSE: {loss:.8f}")

# Print some predictions
predictions = []
@@ -262,17 +264,17 @@

# Sort the predictions by the difference between the predicted and actual values
predictions.sort(key=lambda x: x[2])
print("Predictions:")
logger.debug("Predictions:")
for i, (pred, actual, diff) in enumerate(predictions):
print(
logger.debug(
f"{i + 1:2}. Predicted: {pred:.4f}, Actual: {actual:.4f}, Diff: {diff:.4f}"
)

# Save the model if training is complete
if len(losses) == self.params.epochs:
model_fp = os.path.abspath(self.params.model_name)
model_fp = os.path.abspath(self.params.model_file)
torch.save(model.state_dict(), model_fp)
print(f"Model saved successfully to path '{model_fp}'")
logger.debug(f"Model saved successfully to path '{model_fp}'")
else:
print("Model training incomplete.")
logger.debug("Model training incomplete.")
raise ValueError("Model training incomplete.")

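Putting the train_taxi.py pieces together, the flow traced by the new `logger.debug` calls is roughly: train for a fixed number of epochs while recording losses, report the final RMSE, and save the `state_dict` to `model_file` only when every epoch completed. The condensed sketch below assumes an MSE criterion (implied by the RMSE computation), an Adam optimizer with a nominal learning rate, and pre-built `cats`/`conts`/`y` tensors; unlike the block's held-out evaluation split, it reuses the training tensors for brevity. None of those specifics are shown in this diff.

```python
# Condensed sketch of the train / evaluate / save flow the diff logs; the
# criterion, optimizer, learning rate, and tensor shapes are assumptions.
import logging
import os

import torch
import torch.nn as nn

logger = logging.getLogger(__name__)


def train_and_save(model, cats, conts, y, model_file: str, epochs: int = 300):
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    losses = []
    for i in range(epochs):
        y_pred = model(cats, conts)
        loss = torch.sqrt(criterion(y_pred, y))  # RMSE
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        if i % 25 == 0:
            logger.debug(f"Epoch {i}: Loss = {loss.item():.8f}")

    # Evaluate without gradient tracking.
    with torch.no_grad():
        final_rmse = torch.sqrt(criterion(model(cats, conts), y))
    logger.debug(f"Final RMSE: {final_rmse:.8f}")

    # Save only when the full number of epochs ran, mirroring the diff.
    if len(losses) == epochs:
        model_fp = os.path.abspath(model_file)
        torch.save(model.state_dict(), model_fp)
        logger.debug(f"Model saved successfully to path '{model_fp}'")
    else:
        logger.debug("Model training incomplete.")
        raise ValueError("Model training incomplete.")
    return losses
```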