In [None]:
import os

In [None]:
# Change the process working directory to the parent of the current one.
os.chdir('../')

In [None]:
# NOTE(review): a bare `pwd` is not valid Python; presumably the IPython
# automagic that shows the current working directory (this file looks like a
# notebook export) — confirm before running as a plain script.
pwd

In [None]:
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict

@dataclass(frozen=True)
class PCAConfig:
    """Immutable settings for the PCA step of the modelling pipeline."""

    # Handed unchanged to ``PCA(n_components=...)`` when the pipeline is built.
    n_components: float

@dataclass(frozen=True)
class ModelConfig:
    """Immutable hyper-parameter candidates for the regressor."""

    # Maps a KNeighborsRegressor hyper-parameter name to the list of values to
    # try; the pipeline builder reads the ``'n_neighbors'`` entry for its grid.
    KNeighborsRegressor: Dict[str, List[int]]

@dataclass(frozen=True)
class GridSearchConfig:
    """Immutable settings forwarded to ``GridSearchCV``."""

    # Passed as the ``cv`` argument of the grid search.
    cv: int
    # Passed as the ``scoring`` argument of the grid search.
    scoring: str

@dataclass(frozen=True)
class TrainTestSplitConfig:
    """Immutable settings for the train/test split."""

    # Passed as ``test_size`` when the data is split.
    test_size: float
    # Passed as ``random_state`` when the data is split.
    random_state: int


@dataclass(frozen=True)
class PrepareDataConfig:
    """Immutable bundle of every setting the model-preparation stage reads.

    ``PrepareBaseModel`` receives one instance of this class and pulls the
    data location, preprocessing rules, PCA size, model grid, grid-search
    options and split options from it.
    """

    # Directory associated with this stage's files.
    root_dir: Path
    # File that ``PrepareBaseModel.load_data`` loads the dataset from.
    data_file_path: Path
    # Quoted forward reference: ``PreprocessingConfig`` is not defined in the
    # visible part of this file, and an unquoted annotation would be evaluated
    # (and fail with NameError) as soon as the class body runs.
    preprocessing: "PreprocessingConfig"
    pca: PCAConfig
    model: ModelConfig
    grid_search: GridSearchConfig
    train_test_split: TrainTestSplitConfig


# The configuration factory and ``PrepareBaseModel`` both annotate this type
# as ``PrepareBaseModelConfig``; the alias makes that name resolve while
# keeping ``PrepareDataConfig`` available to existing callers.
PrepareBaseModelConfig = PrepareDataConfig

In [None]:

    def get_model_training_config(self) -> PrepareBaseModelConfig:
        """Assemble the configuration consumed by the model-preparation stage.

        Paths are read from the ``prepare_base_model`` section of the main
        configuration; PCA, model, grid-search and split settings are read
        from ``self.params``.  The stage directories are created before the
        configuration object is returned.

        Returns:
            PrepareBaseModelConfig: the populated configuration.

        Raises:
            KeyError: (or the mapping type's equivalent) when the
                ``prepare_base_model`` section or an expected key is missing.
        """
        section = self.config['prepare_base_model']

        # The section used to be fetched and then ignored while the paths were
        # read from the top level of the configuration; they are read from the
        # stage's own section here.
        # NOTE(review): confirm that ``root_dir`` / ``data_file_path`` live
        # under ``prepare_base_model`` in the YAML.
        root_dir = Path(section['root_dir'])
        data_file_path = root_dir / section['data_file_path']

        params = self.params
        grid_params = params['grid_search']
        split_params = params['train_test_split']

        preprocessing_config = self.get_preprocessing_config()
        pca_config = PCAConfig(n_components=params['PCA']['n_components'])
        model_config = ModelConfig(KNeighborsRegressor=params['KNeighborsRegressor'])
        grid_search_config = GridSearchConfig(
            cv=grid_params['cv'],
            scoring=grid_params['scoring'],
        )
        train_test_split_config = TrainTestSplitConfig(
            test_size=split_params['test_size'],
            random_state=split_params['random_state'],
        )

        # Ensure creation of necessary directories
        create_directories([root_dir, data_file_path.parent])

        # The original built every piece above and then fell off the end of
        # the function, handing ``None`` to the caller.
        return PrepareBaseModelConfig(
            root_dir=root_dir,
            data_file_path=data_file_path,
            preprocessing=preprocessing_config,
            pca=pca_config,
            model=model_config,
            grid_search=grid_search_config,
            train_test_split=train_test_split_config,
        )

In [None]:
from uwsoundspeed import logger
class PrepareBaseModel:
    """Build, tune and evaluate a KNN regression pipeline from a config object.

    ``run()`` drives the whole process: load the data, assemble the
    preprocessing + PCA + ``KNeighborsRegressor`` pipeline, split the data,
    then grid-search and report test metrics.
    """

    def __init__(self, config: PrepareBaseModelConfig):
        """Store the configuration; nothing is loaded or built yet."""
        self.config = config

    def load_data(self) -> None:
        """
        Loads data from the configured file path into ``self.data``.
        """
        # NOTE(review): ``load_from_pkl`` presumably unpickles the file; if so,
        # only point ``data_file_path`` at trusted files — confirm.
        self.data = load_from_pkl(self.config.data_file_path)
        logger.info(f"Data loaded successfully from {self.config.data_file_path}.")

    def prepare_pipeline(self) -> None:
        """
        Builds ``self.pipeline`` (preprocessing, PCA, model) and ``self.param_grid``.
        """
        # Extracting configurations
        preprocessing_config = self.config.preprocessing
        pca_config = self.config.pca
        model_config = self.config.model.KNeighborsRegressor

        # Numerical features: impute with the configured strategy, then scale.
        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=preprocessing_config.numerical_transformer['imputer'])),
            ('scaler', StandardScaler())
        ])

        # Categorical features: impute with the configured strategy, then
        # one-hot encode, ignoring categories not seen during fitting.
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy=preprocessing_config.categorical_transformer['imputer'])),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, preprocessing_config.numerical_features),
                ('cat', categorical_transformer, preprocessing_config.categorical_features)
            ])

        # Creating the pipeline
        self.pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('pca', PCA(n_components=pca_config.n_components)),
            ('model', KNeighborsRegressor())
        ])

        # The ``model__`` prefix routes the values to the pipeline step named
        # 'model' during the grid search.
        self.param_grid = {
            'model__n_neighbors': model_config['n_neighbors']
        }

    def train_test_split(self, target_column: str = 'target_column') -> None:
        """
        Splits the dataset into training and testing sets.

        Args:
            target_column: Name of the column in ``self.data`` to predict.
                The default is the original placeholder name; pass the real
                column name when calling this method directly.
        """
        X = self.data.drop(target_column, axis=1)
        y = self.data[target_column]

        # Inside a method body the bare name ``train_test_split`` resolves to
        # the module-level function, not to this method.
        split_config = self.config.train_test_split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=split_config.test_size, random_state=split_config.random_state)

    def train_and_evaluate(self) -> None:
        """
        Trains the model using GridSearchCV and prints R^2 and RMSE on the test set.

        The fitted search object is kept on ``self.fitted_search`` so the tuned
        model remains available after this method returns.
        """
        search_config = self.config.grid_search
        grid_search = GridSearchCV(self.pipeline, self.param_grid, cv=search_config.cv, scoring=search_config.scoring, verbose=1)
        grid_search.fit(self.X_train, self.y_train)
        self.fitted_search = grid_search

        y_pred = grid_search.predict(self.X_test)

        # ``squared=False`` was deprecated and later removed from
        # ``mean_squared_error``; taking the root of the MSE gives the same
        # RMSE for the single target column used here and works across
        # scikit-learn versions.
        rmse = mean_squared_error(self.y_test, y_pred) ** 0.5

        print(f"R^2: {r2_score(self.y_test, y_pred)}")
        print(f"RMSE: {rmse}")

    def run(self) -> None:
        """
        Executes the entire model preparation and training process.

        Any exception is logged and then re-raised to the caller.
        """
        try:
            self.load_data()
            self.prepare_pipeline()
            self.train_test_split()
            self.train_and_evaluate()
            logger.info("Model preparation and training completed successfully.")
        except Exception as e:
            logger.error(f"Error during model preparation and training: {e}")
            raise