In [1]:
import os

In [2]:
os.chdir('../')

In [3]:
pwd

'd:\\Resarch\\2024\\UW-SoundSpeed'

In [11]:
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict
from uwsoundspeed.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from uwsoundspeed.utils.common import read_yaml, create_directories

@dataclass(frozen=True)
class PreprocessingConfig:
    numerical_features: List[str]
    numerical_transformer: Dict[str, any]
    categorical_features: List[str]
    categorical_transformer: Dict[str, any]


@dataclass(frozen=True)
class PrepareDataConfig:
    root_dir: Path
    data_file_path: Path
    preprocessing: PreprocessingConfig

@dataclass(frozen=True)
class PCAConfig:
    n_components: float

@dataclass(frozen=True)
class ModelConfig:
    KNeighborsRegressor: Dict[str, List[int]]

@dataclass(frozen=True)
class GridSearchConfig:
    cv: int
    scoring: str

@dataclass(frozen=True)
class TrainTestSplitConfig:
    test_size: float
    random_state: int


@dataclass(frozen=True)
class PrepareBaseModelConfig:
    root_dir: Path
    data_file_path: Path
    preprocessing: PreprocessingConfig
    pca: PCAConfig
    model: ModelConfig
    grid_search: GridSearchConfig
    train_test_split: TrainTestSplitConfig
    knn_model_path: Path

In [28]:

class ConfigurationManager:
    def __init__(
        self, 
        config_filepath = CONFIG_FILE_PATH ,
        params_filepath = PARAMS_FILE_PATH):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        create_directories([self.config.artifacts_root])

    def get_preprocessing_config(self) -> PreprocessingConfig:
        config = self.params.preprocessing 
        return PreprocessingConfig(
            numerical_features=config['numerical_features'],
            numerical_transformer=config['numerical_transformer'],
            categorical_features=config['categorical_features'],
            categorical_transformer=config['categorical_transformer']
        )
    
    def get_pca_config(self) -> PCAConfig:
        pca = self.params.pca
        return PCAConfig(n_components=pca['n_components'])

    def get_model_config(self) -> ModelConfig:
        model = self.params.model
        return ModelConfig(KNeighborsRegressor=model['KNeighborsRegressor'])

    def get_grid_search_config(self) -> GridSearchConfig:
        grid_search = self.params.grid_search
        return GridSearchConfig(cv=grid_search['cv'], scoring=grid_search['scoring'])

    def get_train_test_split_config(self) -> TrainTestSplitConfig:
        split_config = self.params.train_test_split
        return TrainTestSplitConfig(test_size=split_config['test_size'], random_state=split_config['random_state'])
    

    def get_prepare_base_model_config(self) -> PrepareBaseModelConfig:
    # Assuming 'prepare_knn_model' contains all necessary configuration paths and settings
        config = self.config['prepare_knn_model']
        
        # Create necessary directories specified in the configuration
        create_directories([Path(config['root_dir'])])

        # Construct and return the PrepareBaseModelConfig object with appropriate paths and settings
        prepare_base_model_config = PrepareBaseModelConfig(
            root_dir=Path(config['root_dir']),
            data_file_path=Path(config['data_file_path']),
            preprocessing=self.get_preprocessing_config(),
            pca=self.get_pca_config(),
            model=self.get_model_config(),
            grid_search=self.get_grid_search_config(),
            train_test_split=self.get_train_test_split_config(),
            knn_model_path=Path(config['knn_model_path'])
        )

        return prepare_base_model_config
    
    # def get_prepare_base_model_config(self) -> PrepareBaseModelConfig:
        
    #     return PrepareBaseModelConfig(
    #         root_dir=self.config.prepare_knn_model['root_dir'],
    #         data_file_path=self.config.prepare_knn_model['data_file_path'],
    #         preprocessing=self.get_preprocessing_config(),
    #         pca=self.get_pca_config(),
    #         model=self.get_model_config(),
    #         grid_search=self.get_grid_search_config(),
    #         train_test_split=self.get_train_test_split_config(),
    #         knn_model_path=self.config.prepare_knn_model['knn_model_path'],  
    #     )
    
    # def print_all_configs(self):
    #     print("Preprocessing Configuration:")
    #     preprocessing_config = self.get_preprocessing_config()
    #     print(f"  Numerical Features: {preprocessing_config.numerical_features}")
    #     print(f"  Numerical Transformer: {preprocessing_config.numerical_transformer}")
    #     print(f"  Categorical Features: {preprocessing_config.categorical_features}")
    #     print(f"  Categorical Transformer: {preprocessing_config.categorical_transformer}\n")
        
    #     print("PCA Configuration:")
    #     pca_config = self.get_pca_config()
    #     print(f"  N Components: {pca_config.n_components}\n")
        
    #     print("Model Configuration:")
    #     model_config = self.get_model_config()
    #     for model_name, params in model_config.__dict__.items():
    #         print(f"  {model_name}: {params}")
    #     print("\n")
        
    #     print("Grid Search Configuration:")
    #     grid_search_config = self.get_grid_search_config()
    #     print(f"  CV: {grid_search_config.cv}")
    #     print(f"  Scoring: {grid_search_config.scoring}\n")
        
    #     print("Train Test Split Configuration:")
    #     split_config = self.get_train_test_split_config()
    #     print(f"  Test Size: {split_config.test_size}")
    #     print(f"  Random State: {split_config.random_state}\n")
        
    #     print("KNN Model Path (if applicable):")
    #     base_model_config = self.get_prepare_base_model_config()
    #     print(f"  KNN Model Path: {base_model_config.knn_model_path}\n")



In [15]:
config_manager = ConfigurationManager(config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH)
config_manager.print_all_configs()

[2024-03-11 21:37:09,733: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-03-11 21:37:09,739: INFO: common: YAML file: params.yaml loaded successfully]
[2024-03-11 21:37:09,742: INFO: common: Directory created at: artifacts]
Preprocessing Configuration:
  Numerical Features: ['Lat', 'Lon', 'SSS', 'SST', 'z']
  Numerical Transformer: {'imputer_strategy': 'median', 'scaler': 'StandardScaler'}
  Categorical Features: ['Month']
  Categorical Transformer: {'imputer_strategy': 'most_frequent', 'onehot': 'OneHotEncoder', 'handle_unknown': 'ignore'}

PCA Configuration:
  N Components: 0.9

Model Configuration:
  KNeighborsRegressor: {'n_neighbors': [1, 2, 3, 4, 5]}


Grid Search Configuration:
  CV: 2
  Scoring: neg_mean_squared_error

Train Test Split Configuration:
  Test Size: 0.2
  Random State: 42

KNN Model Path (if applicable):
  KNN Model Path: artifacts/prepare_knn_model/knn_model.h5



In [32]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from uwsoundspeed import logger

class KnnModelTrain:

    def __init__(self, config_manager):
        self.config_manager = config_manager  # Correct initialization
        self.config = self.config_manager.get_prepare_base_model_config()
        self.data = None
        self.preprocessor = None
        self.best_model = None   
        
    def load_data(self):
        base_model_config = self.config_manager.get_prepare_base_model_config()
        self.data = pd.read_pickle(base_model_config.data_file_path)
        logger.info(f"Data loaded successfully from {base_model_config.data_file_path}.")

    def prepare_pipeline(self):
        prep_config = self.config_manager.get_preprocessing_config()
        pca_config = self.config_manager.get_pca_config()
        model_config = self.config_manager.get_model_config().KNeighborsRegressor  


        # Numerical features pipeline
        numerical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy=prep_config.numerical_transformer['imputer_strategy'])),
            ('scaler', StandardScaler())
        ])

        # Categorical features pipeline
        categorical_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy=prep_config.categorical_transformer['imputer_strategy'])),
            ('onehot', OneHotEncoder(handle_unknown=prep_config.categorical_transformer['handle_unknown']))
        ])

        # Combined preprocessing pipeline
        preprocessor = ColumnTransformer(transformers=[
            ('num', numerical_pipeline, prep_config.numerical_features),
            ('cat', categorical_pipeline, prep_config.categorical_features)
        ])

        # Full pipeline with PCA and KNeighborsRegressor placeholder
        self.pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=pca_config.n_components)),
        ('knn', KNeighborsRegressor(n_neighbors=model_config['n_neighbors'][0]))  
        ])


        
    def train_test_split(self):
    # Retrieve train-test split configuration using the config_manager
        split_config = self.config_manager.get_train_test_split_config()
        X = self.data.drop('c', axis=1)  # Make sure 'target_column' matches your actual target column name
        y = self.data['c']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=split_config.test_size, random_state=split_config.random_state
        )
        logger.info("Train-test split completed.")


    def train_and_evaluate(self):

        model_config = self.config_manager.get_model_config().KNeighborsRegressor
        grid_search_config = self.config_manager.get_grid_search_config()

  
        
        grid_search = GridSearchCV(
            self.pipeline,
            param_grid={'knn__n_neighbors': model_config['n_neighbors']},
            cv=grid_search_config.cv,
            scoring=grid_search_config.scoring,
            verbose=1
        )

        grid_search.fit(self.X_train, self.y_train)
        self.best_model = grid_search.best_estimator_

        y_pred = self.best_model.predict(self.X_test)
        logger.info(f"R^2: {r2_score(self.y_test, y_pred)}")
        logger.info(f"RMSE: {mean_squared_error(self.y_test, y_pred, squared=False)}")

    def save_model(self):
        knn_model_path = self.config_manager.get_prepare_base_model_config().knn_model_path
        joblib.dump(self.best_model, knn_model_path)
        logger.info(f"Best KNN model saved to {knn_model_path}")


    # def train_and_evaluate(self):
    #     model_config = self.config.get_model_config().KNeighborsRegressor
    #     grid_search_config = self.config.get_grid_search_config()
        
    #     grid_search = GridSearchCV(
    #         self.pipeline,
    #         param_grid={'knn__n_neighbors': model_config['n_neighbors']},
    #         cv=grid_search_config.cv,
    #         scoring=grid_search_config.scoring,
    #         verbose=1
    #     )

    #     grid_search.fit(self.X_train, self.y_train)
    #     self.best_model = grid_search.best_estimator_

    #     y_pred = self.best_model.predict(self.X_test)
    #     print(f"R^2: {r2_score(self.y_test, y_pred)}")
    #     print(f"RMSE: {mean_squared_error(self.y_test, y_pred, squared=False)}")
        

            # def train_test_split(self):
    #     split_config = self.config.get_train_test_split_config()
    #     X = self.data.drop('c', axis=1)  
    #     y = self.data['c']  
    #     self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
    #         X, y, test_size=split_config.test_size, random_state=split_config.random_state
    #     )
        

    # def save_model(self):
    #     knn_model_path = self.config.get_prepare_base_model_config().knn_model_path
    #     joblib.dump(self.best_model, knn_model_path)
    #     print(f"Best KNN model saved to {knn_model_path}")

    # def run(self):
    #     self.load_data()
    #     self.prepare_pipeline()
    #     self.train_test_split()
    #     self.train_and_evaluate()
    #     self.save_model()


In [34]:
try:
    config_manager = ConfigurationManager(config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH)  # Ensure CONFIG_FILE_PATH and PARAMS_FILE_PATH are correctly defined
    prepare_data = KnnModelTrain(config_manager)  # Passing config_manager correctly

    prepare_data.load_data()
    prepare_data.prepare_pipeline()
    prepare_data.train_test_split()
    prepare_data.train_and_evaluate()
    prepare_data.save_model()
    # Any additional methods
except Exception as e:
    logger.error(f"Failed to prepare data and model due to: {e}", exc_info=True)
    raise e

[2024-03-12 02:03:45,723: INFO: common: YAML file: config\config.yaml loaded successfully]
[2024-03-12 02:03:45,818: INFO: common: YAML file: params.yaml loaded successfully]
[2024-03-12 02:03:45,824: INFO: common: Directory created at: artifacts]
[2024-03-12 02:03:45,828: INFO: common: Directory created at: artifacts\prepare_knn_model]
[2024-03-12 02:03:46,837: INFO: common: Directory created at: artifacts\prepare_knn_model]
[2024-03-12 02:03:48,948: INFO: 3961613561: Data loaded successfully from artifacts\data_ingestion\raw\complete_dataset_sample.pkl.]
[2024-03-12 02:03:49,271: INFO: 3961613561: Train-test split completed.]
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[2024-03-12 02:04:53,117: INFO: 3961613561: R^2: 0.9775781232291486]
[2024-03-12 02:04:53,136: INFO: 3961613561: RMSE: 3.295134050491929]
[2024-03-12 02:04:53,138: INFO: common: Directory created at: artifacts\prepare_knn_model]
[2024-03-12 02:04:53,314: INFO: 3961613561: Best KNN model saved to artifact