In [1]:
%pwd

'/Users/dkcentral/99_repos/mlops_msr/notebooks'

In [2]:
import os

In [3]:
os.chdir("..")
%pwd

'/Users/dkcentral/99_repos/mlops_msr'

In [4]:
# initialize mlflow server on DagsHub
# Connect this session to the MLflow server hosted on DagsHub.
# Replace repo_owner / repo_name with your own DagsHub account and repository.
# NOTE(review): with mlflow=True, dagshub presumably configures the MLflow
# tracking URI and credentials for that repository — confirm in the dagshub docs.
import dagshub
dagshub.init(repo_owner='micheldpd24', repo_name='mlflow_tracking', mlflow=True)


In [5]:
import mlflow
# mask_envs=True hides the values of MLflow environment variables so that
# MLFLOW_TRACKING_PASSWORD is never written into the saved notebook output.
mlflow.doctor(mask_envs=True)

[34mSystem information[0m: Darwin Darwin Kernel Version 21.6.0: Mon Jun 24 00:56:10 PDT 2024; root:xnu-8020.240.18.709.2~1/RELEASE_X86_64
[34mPython version[0m: 3.11.5
[34mMLflow version[0m: 2.17.2
[34mMLflow module location[0m: /Users/dkcentral/99_repos/mlops_msr/.venv/lib/python3.11/site-packages/mlflow/__init__.py
[34mTracking URI[0m: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow
[34mRegistry URI[0m: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow
[34mMLflow environment variables[0m: 
  MLFLOW_TRACKING_PASSWORD: ***
  MLFLOW_TRACKING_URI: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow
  MLFLOW_TRACKING_USERNAME: micheldpd24
[34mMLflow dependencies[0m: 
  Flask: 3.0.3
  Jinja2: 3.1.4
  aiohttp: 3.10.10
  alembic: 1.14.0
  boto3: 1.35.36
  botocore: 1.35.36
  docker: 7.1.0
  graphene: 3.4.1
  gunicorn: 23.0.0
  markdown: 3.7
  matplotlib: 3.9.2
  mlflow-skinny: 2.17.2
  numpy: 2.1.3
  pandas: 2.2.3
  pyarrow:

In [6]:
from src.common_utils import read_yaml, create_directories

In [7]:
from src.config import CONFIG_FILE_PATH, PARAMS_FILE_PATH, SCHEMA_FILE_PATH

### Step 1: Defining configuration classes for each stage

In [8]:
from dataclasses import dataclass
from pathlib import Path

#### entity.py

In [9]:
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings consumed by the DataIngestion stage."""
    root_dir: Path  # stage directory; get_data() looks for song_df.csv in it
    source_url: str  # URL handed to urlretrieve when downloading the archive
    local_data_file: Path  # local path of the downloaded zip archive
    unzip_dir: Path  # directory the archive is extracted into

@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings consumed by the DataValidation stage."""
    root_dir: Path  # stage directory, created by ConfigurationManager
    STATUS_FILE: str  # text file that receives "Validation status: <bool>"
    unzip_data_dir: Path  # despite the name, passed to pd.read_csv, i.e. a CSV file path
    all_schema: dict  # mapping whose keys are the accepted column names

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings consumed by the DataTransformation stage."""
    root_dir: Path  # directory receiving X_train/y_train/X_test/y_test CSV files
    data_path: Path  # input CSV, read with its first column as index
    reco_dir: Path  # directory receiving genres.txt and the to_rec_<i>.csv files
    genres: list  # genre names; a genre's position is the <i> used in file names

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings consumed by the ModelTrainer stage."""
    root_dir: Path  # directory the fitted model is dumped into
    X_train_path: Path  # CSV of training features
    y_train_path: Path  # CSV of training labels
    X_test_path: Path  # carried in the config; not read by ModelTrainer.train
    y_test_path: Path  # carried in the config; not read by ModelTrainer.train
    model_name: str  # file name of the dumped model inside root_dir
    learning_rate: float  # forwarded to GradientBoostingClassifier
    max_depth: int  # forwarded to GradientBoostingClassifier
    n_estimators: int  # forwarded to GradientBoostingClassifier

@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings consumed by the ModelEvaluation stage."""
    root_dir: Path  # stage directory, created by ConfigurationManager
    X_test_path: Path  # CSV of test features
    y_test_path: Path  # CSV of test labels
    model_path: Path  # joblib file of the trained classifier
    metric_file_name: Path  # JSON file that receives accuracy and prediction time
    all_params: dict  # classifier hyper-parameters, logged to MLflow as run params
    mlflow_uri: str  # URI passed to mlflow.set_registry_uri

@dataclass(frozen=True)
class UnsModelFitConfig:
    """Immutable settings consumed by the UnsModelFit (clustering) stage."""
    root_dir: Path  # directory the fitted per-genre .joblib models are dumped into
    features_path_prefix: str  # "<prefix><i>.csv" is the feature file of genre i
    genres_path: Path  # text file listing the genres, read with load_txt
    model_dir: Path  # carried in the config; not read by UnsModelFit in this notebook
    model_name_prefix: str  # "<prefix><i>.joblib" is the model file name of genre i
    metrics_path_prefix: str  # "<prefix><i>.json" receives the clustering scores of genre i
    all_params: dict  # GaussianMixture settings: n_components (one per genre), covariance_type, random_state
    mlflow_uri: str  # URI passed to mlflow.set_registry_uri


### Step 2: Creation of a configuration manager which will create the configuration objects of each class for each step

### config_manager.py

In [None]:
class ConfigurationManager:
    """Build the frozen per-stage configuration objects from the project YAML files.

    Every ``get_*_config`` method reads its own section of the parsed
    configuration (plus model parameters or the column schema where needed),
    creates the stage's working directory and returns the matching dataclass.
    """

    # Tracking server used when the MLFLOW_TRACKING_URI environment variable
    # is not set. Point this at your own repository.
    DEFAULT_MLFLOW_URI = "https://dagshub.com/micheldpd24/mlflow_tracking.mlflow"

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Parse the configuration, parameters and schema YAML files.

        Args:
            config_filepath: Path of the stage configuration file.
            params_filepath: Path of the model hyper-parameters file.
            schema_filepath: Path of the dataset schema file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

    def _resolve_mlflow_uri(self) -> str:
        """Return the MLflow URI: the environment value if set, else the default."""
        return os.environ.get("MLFLOW_TRACKING_URI") or self.DEFAULT_MLFLOW_URI

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Create the ingestion directory and return the ingestion settings."""
        config = self.config.data_ingestion

        create_directories([config.root_dir])

        return DataIngestionConfig(
            root_dir=config.root_dir,
            source_url=config.source_URL,
            local_data_file=config.local_data_file,
            unzip_dir=config.unzip_dir,
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Create the validation directory and return the validation settings."""
        config = self.config.data_validation
        schema = self.schema.COLUMNS

        create_directories([config.root_dir])

        return DataValidationConfig(
            root_dir=config.root_dir,
            STATUS_FILE=config.STATUS_FILE,
            # The config key is "unzip_dir"; the validation step reads it as a CSV.
            unzip_data_dir=config.unzip_dir,
            all_schema=schema,
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Create the transformation directories and return the settings."""
        config = self.config.data_transformation

        create_directories([config.root_dir, config.reco_dir])

        return DataTransformationConfig(
            root_dir=config.root_dir,
            data_path=config.data_path,
            reco_dir=config.reco_dir,
            genres=config.genres,
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the trainer directory and return paths plus hyper-parameters."""
        config = self.config.model_trainer
        params = self.params.GradientBoostingClassifier

        create_directories([config.root_dir])

        return ModelTrainerConfig(
            root_dir=config.root_dir,
            X_train_path=config.X_train_path,
            y_train_path=config.y_train_path,
            X_test_path=config.X_test_path,
            y_test_path=config.y_test_path,
            model_name=config.model_name,
            learning_rate=params.learning_rate,
            max_depth=params.max_depth,
            n_estimators=params.n_estimators,
        )

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Create the evaluation directory and return the evaluation settings."""
        config = self.config.model_evaluation
        params = self.params.GradientBoostingClassifier

        create_directories([config.root_dir])

        return ModelEvaluationConfig(
            root_dir=config.root_dir,
            X_test_path=config.X_test_path,
            y_test_path=config.y_test_path,
            model_path=config.model_path,
            metric_file_name=config.metric_file_name,
            all_params=params,
            mlflow_uri=self._resolve_mlflow_uri(),
        )

    def get_unsmodel_fit_config(self) -> UnsModelFitConfig:
        """Create the clustering directory and return the clustering settings."""
        config = self.config.unsmodel_fit
        params = self.params.GaussianMixture

        create_directories([config.root_dir])

        return UnsModelFitConfig(
            root_dir=config.root_dir,
            features_path_prefix=config.features_path_prefix,
            genres_path=config.genres_path,
            model_dir=config.model_dir,
            model_name_prefix=config.model_name_prefix,
            metrics_path_prefix=config.metrics_path_prefix,
            all_params=params,
            mlflow_uri=self._resolve_mlflow_uri(),
        )

### Step 3: Creation of each module for each step using their configuration class to instantiate them.

#### Data Ingestion step

In [11]:
import urllib.request as request
import os
from pathlib import Path
from custom_logger import logger
import zipfile
import pandas as pd
from datetime import datetime
import shutil
from src.common_utils import move_file_to_archives, delete_csv_files

class DataIngestion:
    """Download, extract and pre-filter the raw songs dataset."""

    # Name of the prepared dataset inside ``config.root_dir``. Shared by
    # make_data() (writer) and get_data() (existence check) so they cannot drift.
    PREPARED_FILE_NAME = "song_df.csv"

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def _prepared_data_path(self) -> str:
        """Return the path of the prepared dataset inside the stage directory."""
        return os.path.join(self.config.root_dir, self.PREPARED_FILE_NAME)

    def download_file(self):
        """Download the archive from ``source_url`` unless it is already on disk."""
        if os.path.exists(self.config.local_data_file):
            logger.info("File already exists.")
            return

        filename, headers = request.urlretrieve(
            url=self.config.source_url,
            filename=self.config.local_data_file,
        )
        logger.info(f"{filename} download! With following info: \n{headers}")

    def extract_zip_file(self):
        """Extract the downloaded archive into ``unzip_dir`` (created if missing).

        Returns None.
        """
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)
        with zipfile.ZipFile(self.config.local_data_file, 'r') as zip_ref:
            zip_ref.extractall(unzip_path)

    def make_data(self):
        """Build the prepared dataset from the extracted ``dataset.csv``.

        Renames the id and genre columns, drops the unused columns and rows
        with missing values, keeps only the supported genres, removes
        duplicated tracks and writes the result into the stage directory.
        When the raw file is missing, the error is logged and nothing is written.
        """
        raw_data_path = os.path.join(self.config.unzip_dir, "dataset.csv")

        try:
            raw = pd.read_csv(raw_data_path, index_col=0)
        except FileNotFoundError as e:
            logger.error(f"Raw data file not found at {raw_data_path}: {e}")
            return

        columns_to_drop = ["artists", "album_name", "track_name", "popularity", "explicit",
                           "time_signature", "duration_ms", "mode", "liveness"]
        genres = ['alternative', 'classical', 'country', 'edm', 'hip-hop',
                  'jazz', 'latin', 'pop', 'r-n-b', 'rock']

        songs = (
            raw
            .rename(columns={"track_id": "uri", "track_genre": "genre"})
            .drop(columns=columns_to_drop)
            .dropna(ignore_index=True)
        )
        songs = songs[songs["genre"].isin(genres)]
        songs = songs.drop_duplicates(subset=["uri"], ignore_index=True)

        logger.info(f"Dataset shape after processing: {songs.shape}")
        logger.info(f"First few records:\n{songs.head()}")

        # Written where get_data() looks for it (previously a hard-coded
        # "data/raw/song_df.csv", which only matched when root_dir was "data/raw").
        songs.to_csv(self._prepared_data_path())

    def get_data(self):
        """Main function to trigger the download, extraction, and data processing.

        Skips all three steps when the prepared dataset already exists.
        """
        if Path(self._prepared_data_path()).exists():
            print("Retraining mode - No need to download and extract the dataset again.")
            logger.info("Retraining mode - No need to download and extract the dataset again.")
            return

        print("Extracting and preparing dataset")
        logger.info("Extracting and preparing dataset")
        self.download_file()
        self.extract_zip_file()
        self.make_data()






#### Data Validation step  -> data_validation.py

In [12]:
import pandas as pd
from src.config_manager import DataValidationConfig

class DataValidation:
    """Check the dataset's columns against the declared schema."""

    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_all_columns(self) -> bool:
        """Validate that every column of the dataset is declared in the schema.

        The status is written once to ``STATUS_FILE`` as
        ``"Validation status: <bool>"``; the transformation step parses the
        last space-separated token, so that format must not change.

        Returns:
            True when all columns are declared in the schema, False as soon
            as at least one column is not. (Previously the result reflected
            only the last column checked.)
        """
        # Only the header is needed to list the columns.
        # NOTE(review): the file is read without index_col, so a CSV saved with
        # its index exposes an "Unnamed: 0" column that the schema must declare.
        data = pd.read_csv(self.config.unzip_data_dir, nrows=0)
        allowed_columns = set(self.config.all_schema.keys())

        unexpected_columns = [col for col in data.columns if col not in allowed_columns]
        validation_status = not unexpected_columns

        if unexpected_columns:
            print(f"Columns missing from the schema: {unexpected_columns}")

        with open(self.config.STATUS_FILE, 'w') as f:
            f.write(f"Validation status: {validation_status}")

        return validation_status

#### Data Transformation step --> data_transformation.py

In [13]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from custom_logger import logger
from src.entity import DataTransformationConfig

class DataTransformation:
    """Prepare the classification split and the per-genre recommendation files."""

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def train_test_splitting(self):
        """Encode the genre labels and write an 80/20 train/test split.

        Reads ``data_path``, drops the ``uri`` and ``genre`` columns to build
        the features, label-encodes ``genre`` and writes X_train, y_train,
        X_test and y_test as CSV files into ``root_dir``.
        """
        data = pd.read_csv(self.config.data_path, index_col=0)

        X = data.drop(columns=["uri", "genre"])

        label_encoder = LabelEncoder()
        encoded = label_encoder.fit_transform(data["genre"])
        # Kept as a one-column frame (column name 0) so the CSV layout is unchanged.
        y = pd.DataFrame(encoded)

        # Control table of the genre -> label mapping. It is built from plain
        # arrays so it does not depend on the index of `data` matching y's.
        check = pd.DataFrame({"genre": data["genre"].to_numpy(), "label": encoded})
        print(check.value_counts())

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        splits = {
            "X_train.csv": X_train,
            "y_train.csv": y_train,
            "X_test.csv": X_test,
            "y_test.csv": y_test,
        }
        for file_name, frame in splits.items():
            frame.to_csv(os.path.join(self.config.root_dir, file_name), index=False)

        logger.info("Splitted data into training and test datasets")
        logger.info(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        logger.info(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

        print(X_train.shape, y_train.shape)
        print(X_test.shape, y_test.shape)

    def split_by_genre(self):
        """Write the genre list and one feature file per genre into ``reco_dir``.

        ``genres.txt`` holds one genre per line; ``to_rec_<i>.csv`` holds the
        rows of genre number ``i`` (its position in the configured list)
        without the ``genre`` column.
        """
        data = pd.read_csv(self.config.data_path, index_col=0)
        genres = self.config.genres

        with open(os.path.join(self.config.reco_dir, "genres.txt"), "w") as f:
            f.write("\n".join(genres))

        logger.info("List of music genres saved into genres.txt")

        for i, genre in enumerate(genres):
            data_genre = (
                data[data["genre"] == genre]
                .drop(columns=["genre"])
                .reset_index(drop=True)
            )
            data_genre.to_csv(os.path.join(self.config.reco_dir, f"to_rec_{i}.csv"), index=False)

            logger.info(f"{genre} music data extracted")
            logger.info(f"{genre}_data shape: {data_genre.shape}")

        logger.info("Songs data splitted by music genres")


#### Model Trainer step  --> model_trainer.py

In [14]:
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from src.entity import ModelTrainerConfig

class ModelTrainer:
    """Fit the gradient-boosting genre classifier and persist it with joblib."""

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self):
        """Train on the configured training split and dump the fitted model.

        The model is written to ``root_dir / model_name``. No scaling step is
        applied: the classifier is fitted directly on the raw feature values.
        """
        cfg = self.config

        features = pd.read_csv(cfg.X_train_path).values
        # Labels are stored as a one-column CSV; flatten them to a 1-D array.
        labels = pd.read_csv(cfg.y_train_path).values.ravel()

        hyper_params = {
            "learning_rate": cfg.learning_rate,
            "max_depth": cfg.max_depth,
            "n_estimators": cfg.n_estimators,
            "random_state": 42,
        }
        classifier = GradientBoostingClassifier(**hyper_params)
        classifier.fit(features, labels)

        destination = os.path.join(cfg.root_dir, cfg.model_name)
        joblib.dump(classifier, destination)

#### Model Evaluation step

In [15]:
# import numpy as np
from time import time
import mlflow
from mlflow import MlflowClient
import mlflow.sklearn
# import dagshub
import joblib
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, classification_report
from src.entity import ModelEvaluationConfig
from src.common_utils import save_json
from mlflow.models import infer_signature

# make sure to register to DagsHub and create you own repo
# dagshub.init(repo_owner='micheldpd24', repo_name='mlops_music_recsys', mlflow=True)

class ModelEvaluation:
    """Score the trained classifier, log the run to MLflow and export the best run."""

    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    def eval_metrics(self, actual, pred):
        """Print the classification report and return the accuracy.

        Args:
            actual: True labels.
            pred: Predicted labels.

        Returns:
            The accuracy score of ``pred`` against ``actual``.
        """
        accuracy = accuracy_score(actual, pred)
        cl_report = classification_report(actual, pred)

        print("Classification Report:")
        print(cl_report)

        return accuracy

    def log_into_mlflow(self):
        """Evaluate the model on the test split and log the run to MLflow.

        Accuracy and prediction time are saved to ``metric_file_name`` and
        logged as metrics; the hyper-parameters are logged as params and the
        model is logged as the ``GB_model`` artifact of the ``music_clf``
        experiment.
        """
        X_test = pd.read_csv(self.config.X_test_path).values
        y_test = pd.read_csv(self.config.y_test_path).values.ravel()

        model = joblib.load(self.config.model_path)

        mlflow.set_registry_uri(self.config.mlflow_uri)

        mlflow.set_experiment(experiment_name="music_clf")
        mlflow.set_experiment_tag('mlflow.note.content', "Songs classification by music genre")

        with mlflow.start_run():
            t0 = time()
            predicted_genres = model.predict(X_test)
            time_predict = time() - t0

            accuracy = self.eval_metrics(y_test, predicted_genres)

            # Saving metrics
            scores = {"accuracy": accuracy, "time_predict": time_predict}
            save_json(path=Path(self.config.metric_file_name), data=scores)

            mlflow.log_params(self.config.all_params)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("time_predict", time_predict)

            # Reuse the predictions computed above instead of predicting twice.
            signature = infer_signature(X_test, predicted_genres)

            print("*** starting log_model: Music Classification ***")  # Control
            mlflow.sklearn.log_model(
                model,
                "GB_model",
                signature=signature
            )

    def get_best_clf_model(self):
        """Export the model of the best-accuracy run of the ``music_clf`` experiment.

        The experiment name, artifact name (``GB_model``) and metric
        (``accuracy``) are fixed. The model of the best run is loaded through
        ``mlflow.pyfunc`` and dumped to ``models_best/gb_model.joblib``; a JSON
        summary of the run is written under ``data/models_best``.

        Returns:
            None.

        Raises:
            ValueError: If the experiment or runs are not found.
        """
        experiment_name = "music_clf"
        model = "GB_model"
        metric = "accuracy"

        # Initialize the MLflow client
        client = MlflowClient()

        # Get the experiment details
        experiment = client.get_experiment_by_name(experiment_name)
        if not experiment:
            raise ValueError(f"Experiment '{experiment_name}' not found!")

        # Only the top run is used, so ask the server for a single result.
        runs = client.search_runs(
            experiment_ids=[experiment.experiment_id],
            filter_string="",
            run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
            max_results=1,
            order_by=[f"metrics.{metric} DESC"]  # Highest metric value first
        )

        if not runs:
            raise ValueError(f"No runs found for experiment '{experiment_name}'!")

        # Extract the best run
        best_run = runs[0]
        best_run_id = best_run.info.run_id
        best_metric_value = best_run.data.metrics[metric]

        print(f"Best Run ID: {best_run_id}")
        print(f"Best {metric.capitalize()}: {best_metric_value}")

        # Load the model associated with the best run
        model_uri = f"runs:/{best_run_id}/{model}"
        loaded_model = mlflow.pyfunc.load_model(model_uri)

        print(f"Best {experiment_name} model loaded successfully!")

        # Save the model; create the target directory on a fresh checkout.
        # NOTE(review): this pickles the MLflow pyfunc wrapper, not the bare
        # scikit-learn estimator — confirm that is what consumers expect.
        best_model_path = "models_best/gb_model.joblib"
        os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
        joblib.dump(loaded_model, best_model_path)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        best_data = {
            "timestamp": timestamp,
            "experiment_name": experiment_name,
            "run_id": best_run_id,
            "model": model,
            "model_uri": model_uri,
            "metric": metric,
            "metric_value": best_metric_value
        }
        best_data_dir = "data/models_best"
        os.makedirs(best_data_dir, exist_ok=True)
        best_data_path = os.path.join(best_data_dir, f"{experiment_name}_best_model_{timestamp}.json")
        save_json(Path(best_data_path), best_data)
        print(f"Best {experiment_name} model saved successfully!")
        




### Unsupervised Model Fit

In [None]:
import os
import pandas as pd
import joblib
from pathlib import Path
from urllib.parse import urlparse
from time import time
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from mlflow.pyfunc import PythonModel
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    silhouette_score,
    calinski_harabasz_score,
    davies_bouldin_score
)
from src.entity import UnsModelFitConfig
from src.common_utils import load_txt, save_json, delete_folder


class UnsModelFit:
    """Fit one Gaussian Mixture model per genre and track the runs in MLflow."""

    def __init__(self, config: UnsModelFitConfig):
        self.config = config

    def unsfit(self):
        """Fit, wrap, score and log a Gaussian Mixture model for every genre.

        For genre number ``i`` the features are read from
        ``features_path_prefix + "<i>.csv"`` (the genre is skipped when the
        file is missing), the fitted model is dumped into ``root_dir``, a
        pyfunc wrapper is saved under ``models/pyfunc_gm_model_<i>`` and the
        clustering scores are written to ``metrics_path_prefix + "<i>.json"``
        and logged to the ``clst_<i>_<genre>`` experiment. A failure on one
        genre is printed and does not stop the remaining genres.
        """
        genres = load_txt(Path(self.config.genres_path))

        class ModelWrapper(PythonModel):
            """Pyfunc wrapper choosing the prediction method through ``params``."""

            def __init__(self):
                self.model = None

            def load_context(self, context):
                from joblib import load
                self.model = load(context.artifacts["model_path"])

            def predict(self, context, model_input, params=None):
                params = params or {"predict_method": "predict"}
                predict_method = params.get("predict_method")

                if predict_method == "predict":
                    return self.model.predict(model_input)
                elif predict_method == "predict_proba":
                    return self.model.predict_proba(model_input)
                elif predict_method == "predict_log_proba":
                    return self.model.predict_log_proba(model_input)
                else:
                    raise ValueError(f"The prediction method '{predict_method}' is not supported.")

        for element, genre in enumerate(genres):
            try:
                # Load feature file
                features_path = Path(self.config.features_path_prefix+f"{element}.csv")
                if not features_path.exists():
                    print(f"File not found: {features_path}")
                    continue  # Skip this iteration if the file does not exist

                features = pd.read_csv(features_path)
                features = features.drop(columns=["uri"], errors="ignore")
                features = features.values

                # One source of truth for the fitted and the logged parameters.
                model_params = {
                    "n_components": self.config.all_params.n_components[element],
                    "covariance_type": self.config.all_params.covariance_type,
                    "random_state": self.config.all_params.random_state
                }

                # Initialize and fit the Gaussian Mixture Model
                model = GaussianMixture(**model_params)
                model.fit(features)

                # Save the raw fitted model once; the wrapper loads it as an artifact.
                model_filename = f"{self.config.model_name_prefix}{element}.joblib"
                model_path = os.path.join(self.config.root_dir, model_filename)
                joblib.dump(model, model_path)

                # Define artifacts for wrapped model
                artifacts = {"model_path": model_path}

                # Save wrapped PyFunc model
                pyfunc_path = f"models/pyfunc_gm_model_{element}"
                signature = infer_signature(features, params={"predict_method": "predict_proba"})
                mlflow.set_registry_uri(self.config.mlflow_uri)
                mlflow.set_experiment(experiment_name=f"clst_{element}_{genre}")
                mlflow.set_experiment_tag('mlflow.note.content', f"Clustering {genre} songs with Gaussian Mixture Model")

                # The previous export is removed before saving a new one.
                delete_folder(pyfunc_path)
                with mlflow.start_run():
                    mlflow.pyfunc.save_model(
                        path=pyfunc_path,
                        python_model=ModelWrapper(),
                        input_example=features,
                        signature=signature,
                        artifacts=artifacts,
                        # "sklearn" is a deprecated PyPI alias; the real
                        # distribution is "scikit-learn".
                        pip_requirements=["joblib", "scikit-learn"],
                    )

                    wrapped_model = mlflow.pyfunc.load_model(pyfunc_path)

                    # Prediction and metrics
                    t0 = time()
                    prediction = wrapped_model.predict(features, params={"predict_method": "predict"})
                    time_predict = time() - t0

                    silh_score = silhouette_score(features, prediction)
                    chi_score = calinski_harabasz_score(features, prediction)
                    dab_score = davies_bouldin_score(features, prediction)

                    scores = {
                        "silhouette_score": silh_score,
                        "calinski_harabasz_score": chi_score,
                        "davies_bouldin_score": dab_score,
                        "time_predict": time_predict
                    }

                    metrics_file_path = Path(f"{self.config.metrics_path_prefix}{element}.json")
                    save_json(metrics_file_path, data=scores)

                    # Log metrics and parameters
                    mlflow.log_params(model_params)
                    mlflow.log_metric("silhouette_score", silh_score)
                    mlflow.log_metric("calinski_harabasz_score", chi_score)
                    mlflow.log_metric("davies_bouldin_score", dab_score)
                    mlflow.log_metric("time_predict", time_predict)

                    # Deliberately a separate call without params: its output is
                    # not assumed to equal `prediction` above.
                    signature = infer_signature(features, wrapped_model.predict(features))

                    # NOTE(review): a pyfunc wrapper is logged through the sklearn
                    # flavor here; mlflow.pyfunc.log_model looks like the intended
                    # API — confirm before changing, since get_best_clst_model
                    # reads this artifact back.
                    mlflow.sklearn.log_model(
                        wrapped_model,
                        "GM_model",
                        signature=signature,
                    )
                    print(f"Logged model for {genre}: {element}")

            except Exception as e:
                # Best effort per genre: report the failure and move on.
                print(f"Error in processing element {element}: {e}")

    def get_best_clst_model(self):
        """Export, for every genre, the model of its best-silhouette run.

        For genre number ``i`` the ``GM_model`` artifact of the run with the
        highest ``silhouette_score`` in the ``clst_<i>_<genre>`` experiment is
        loaded and saved under ``models_best/gm_model_<i>``; a JSON summary of
        the run is written under ``data/models_best``.

        Raises:
            ValueError: If the experiment or runs are not found.
        """
        genres = load_txt(Path(self.config.genres_path))

        # Initialize the MLflow client
        client = MlflowClient()

        model = "GM_model"
        metric = "silhouette_score"
        best_data_dir = "data/models_best"

        for element, genre in enumerate(genres):

            experiment_name = f"clst_{element}_{genre}"

            # Get the experiment details
            experiment = client.get_experiment_by_name(experiment_name)
            if not experiment:
                raise ValueError(f"Experiment '{experiment_name}' not found!")

            # Only the top run is used, so ask the server for a single result.
            runs = client.search_runs(
                experiment_ids=[experiment.experiment_id],
                filter_string="",
                run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
                max_results=1,
                order_by=[f"metrics.{metric} DESC"]  # Highest metric value first
            )

            if not runs:
                raise ValueError(f"No runs found for experiment '{experiment_name}'!")

            # Extract the best run
            best_run = runs[0]
            best_run_id = best_run.info.run_id
            best_metric_value = best_run.data.metrics[metric]

            print(f"Best Run ID: {best_run_id}")
            print(f"Best {metric.capitalize()}: {best_metric_value}")

            # Load the model associated with the best run
            model_uri = f"runs:/{best_run_id}/{model}"
            loaded_model = mlflow.pyfunc.load_model(model_uri)

            print(f"Best {experiment_name} model loaded successfully!")

            # Save the model, replacing any previous export of this genre.
            best_model_path = Path(f"models_best/gm_model_{element}")
            delete_folder(best_model_path)
            mlflow.sklearn.save_model(loaded_model, best_model_path)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            best_data = {
                "timestamp": timestamp,
                "experiment_name": experiment_name,
                "run_id": best_run_id,
                "model": model,
                "model_uri": model_uri,
                "metric": metric,
                "metric_value": best_metric_value
            }
            # Make sure the summary directory exists on a fresh checkout.
            os.makedirs(best_data_dir, exist_ok=True)
            save_json(Path(f"{best_data_dir}/{experiment_name}_best_{timestamp}.json"), best_data)


### Step 4: Pipeline steps to instantiate the classes and call each process

#### Data Ingestion step

In [17]:
# Build the ingestion settings and run the stage.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config = data_ingestion_config)
# get_data() chains the three steps below and skips them all when
# song_df.csv already exists; uncomment to run a single step by hand.
# data_ingestion.download_file()
# data_ingestion.extract_zip_file()
# data_ingestion.make_data()
data_ingestion.get_data()

Retraining mode - No need to download and extract the dataset again.


#### Data Validation step

In [18]:
# Check the dataset columns against the schema. The result is displayed as the
# cell output and also written to the status file read by the next step.
config = ConfigurationManager()
data_validation_config = config.get_data_validation_config()
data_validation = DataValidation(config=data_validation_config)
data_validation.validate_all_columns()

True

#### Data Transformation step

In [19]:
# Gate the transformation on the status written by the validation step.
# strip() tolerates a trailing newline in the status file.
with open(Path("data/status.txt"), 'r') as f:
    status = f.read().strip().split(" ")[-1]

# Fail loudly: a swallowed error here would let the training cells below run
# on stale or missing split files.
if status != "True":
    raise ValueError("Your data schema is not valid")

config = ConfigurationManager()
data_transformation_config = config.get_data_transformation_config()
data_transformation = DataTransformation(config = data_transformation_config)
data_transformation.train_test_splitting()
data_transformation.split_by_genre()

***
***
genre        label
edm          3        1003
alternative  0         999
jazz         5         999
country      2         994
hip-hop      4         991
r-n-b        8         989
classical    1         933
latin        6         810
pop          7         798
rock         9         759
Name: count, dtype: int64
(7420, 9) (7420, 1)
(1855, 9) (1855, 1)


#### Model trainer step

In [20]:
# Model training stage: build the trainer from its stage configuration and run train().
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config= model_trainer_config)
model_trainer.train()

#### Model evaluation step

In [22]:
# Model evaluation stage: build the evaluator from its stage configuration,
# log the evaluation to MLflow, then retrieve the best classifier model.
config = ConfigurationManager()
model_evaluation_config = config.get_model_evaluation_config()
model_evaluation = ModelEvaluation(config = model_evaluation_config)
# NOTE(review): logging runs first, presumably so that the run created here is
# among the candidates examined by get_best_clf_model() - confirm.
model_evaluation.log_into_mlflow()
model_evaluation.get_best_clf_model()

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.63      0.59       195
           1       0.88      0.85      0.86       191
           2       0.56      0.66      0.60       192
           3       0.65      0.70      0.67       199
           4       0.63      0.60      0.61       204
           5       0.73      0.76      0.75       203
           6       0.73      0.69      0.71       177
           7       0.58      0.49      0.53       152
           8       0.47      0.43      0.45       195
           9       0.80      0.67      0.73       147

    accuracy                           0.65      1855
   macro avg       0.66      0.65      0.65      1855
weighted avg       0.65      0.65      0.65      1855

*** starting log_model: Music Classification ***


2024/11/17 20:46:00 INFO mlflow.tracking._tracking_service.client: 🏃 View run mysterious-horse-759 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/0/runs/c3861f0ad49a45a9b5133a154c2b93da.
2024/11/17 20:46:00 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/0.


Best Run ID: c3861f0ad49a45a9b5133a154c2b93da
Best Accuracy: 0.6506738544474393


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best music_clf model loaded successfully!
Best music_clf model saved successfully!


#### Unsupervised model fit step

In [23]:
# Unsupervised stage: build the fitter from its stage configuration, run
# unsfit(), then retrieve the best clustering model(s).
config = ConfigurationManager()
unsmodel_fit_config = config.get_unsmodel_fit_config()
unsmodel_fit = UnsModelFit(config= unsmodel_fit_config)
# NOTE(review): fitting runs first, presumably so that the runs created here are
# among the candidates examined by get_best_clst_model() - confirm.
unsmodel_fit.unsfit()
unsmodel_fit.get_best_clst_model()

The folder models/pyfunc_gm_model_0 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

2024/11/17 20:46:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run clumsy-frog-615 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/11/runs/4681a737b4d24117951ad34417553082.
2024/11/17 20:46:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/11.


Logged model for alternative: 0
The folder models/pyfunc_gm_model_1 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for classical: 1


2024/11/17 20:46:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run blushing-lamb-393 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/12/runs/97415837942749379345006f19971a32.
2024/11/17 20:46:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/12.


The folder models/pyfunc_gm_model_2 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for country: 2


2024/11/17 20:47:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run respected-shark-469 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/13/runs/d9fc74043f73455f8f6335e2cf9cabe3.
2024/11/17 20:47:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/13.


The folder models/pyfunc_gm_model_3 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for edm: 3


2024/11/17 20:47:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run stylish-trout-626 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/14/runs/790370eb496e41df9f762743b99077f6.
2024/11/17 20:47:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/14.


The folder models/pyfunc_gm_model_4 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for hip-hop: 4


2024/11/17 20:47:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run popular-shark-106 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/15/runs/f47e3d84fff949b39ba6de4204cf0f4c.
2024/11/17 20:47:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/15.


The folder models/pyfunc_gm_model_5 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for jazz: 5


2024/11/17 20:47:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run kindly-crow-58 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/16/runs/f3de68727af944478e75695b681197e8.
2024/11/17 20:47:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/16.


The folder models/pyfunc_gm_model_6 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for latin: 6


2024/11/17 20:48:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run welcoming-zebra-640 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/17/runs/138b73f5fc804df7838d383a838e07ae.
2024/11/17 20:48:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/17.


The folder models/pyfunc_gm_model_7 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for pop: 7


2024/11/17 20:48:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run big-loon-596 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/18/runs/c95fd747f41047f495d8c34ba936bb85.
2024/11/17 20:48:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/18.


The folder models/pyfunc_gm_model_8 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for r-n-b: 8


2024/11/17 20:48:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-bass-410 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/19/runs/92ccefc1efd842c99d0ccd215fdb4153.
2024/11/17 20:48:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/19.


The folder models/pyfunc_gm_model_9 does not exist or is not a valid directory.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Logged model for rock: 9


2024/11/17 20:48:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-smelt-628 at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/20/runs/d8e05924f82a4ffdb13665220a7a8c01.
2024/11/17 20:48:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/micheldpd24/mlflow_tracking.mlflow/#/experiments/20.


Best Run ID: 4681a737b4d24117951ad34417553082
Best Silhouette_score: 0.5495264854082571


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_0_alternative model loaded successfully!
The folder models_best/pyfunc_clst_0_alternative_model does not exist or is not a valid directory.
Best Run ID: 97415837942749379345006f19971a32
Best Silhouette_score: 0.5490066926843931


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_1_classical model loaded successfully!
The folder models_best/pyfunc_clst_1_classical_model does not exist or is not a valid directory.
Best Run ID: d9fc74043f73455f8f6335e2cf9cabe3
Best Silhouette_score: 0.5560450067624154


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_2_country model loaded successfully!
The folder models_best/pyfunc_clst_2_country_model does not exist or is not a valid directory.
Best Run ID: 790370eb496e41df9f762743b99077f6
Best Silhouette_score: 0.5882012550391456


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_3_edm model loaded successfully!
The folder models_best/pyfunc_clst_3_edm_model does not exist or is not a valid directory.
Best Run ID: f47e3d84fff949b39ba6de4204cf0f4c
Best Silhouette_score: 0.5758989107410665


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_4_hip-hop model loaded successfully!
The folder models_best/pyfunc_clst_4_hip-hop_model does not exist or is not a valid directory.
Best Run ID: f3de68727af944478e75695b681197e8
Best Silhouette_score: 0.5766833876471491


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_5_jazz model loaded successfully!
The folder models_best/pyfunc_clst_5_jazz_model does not exist or is not a valid directory.
Best Run ID: 138b73f5fc804df7838d383a838e07ae
Best Silhouette_score: 0.6589549301935923


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_6_latin model loaded successfully!
The folder models_best/pyfunc_clst_6_latin_model does not exist or is not a valid directory.
Best Run ID: c95fd747f41047f495d8c34ba936bb85
Best Silhouette_score: 0.5684911205654885


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_7_pop model loaded successfully!
The folder models_best/pyfunc_clst_7_pop_model does not exist or is not a valid directory.
Best Run ID: 92ccefc1efd842c99d0ccd215fdb4153
Best Silhouette_score: 0.5489443609907552


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_8_r-n-b model loaded successfully!
The folder models_best/pyfunc_clst_8_r-n-b_model does not exist or is not a valid directory.
Best Run ID: d8e05924f82a4ffdb13665220a7a8c01
Best Silhouette_score: 0.533007744907941


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best clst_9_rock model loaded successfully!
The folder models_best/pyfunc_clst_9_rock_model does not exist or is not a valid directory.


## Find and load the best model

In [None]:
# DISABLED CELL: helper to find the best run of an experiment and load its model.
# Everything below is commented out, so nothing in this cell is defined at run time.
# NOTE(review): the get_best_clf_model() / get_best_clst_model() calls above print
# the same "Best Run ID" / "loaded successfully" messages, so this helper looks
# superseded - confirm, then delete the cell or move the helper into a module.

# import mlflow
# from mlflow.tracking import MlflowClient

# def load_best_model_from_experiment(experiment_name: str = "music_clf", model: str = "GB_model", metric: str = "accuracy"):
#     """
#     Retrieves the MLflow run with the best score for a specified metric and loads the associated model.
    
#     Args:
#         experiment_name (str): The name of the MLflow experiment. Default is "music_clf".
#         model (str): Artifact path of the model inside the run; used to build
#             the "runs:/<run_id>/<model>" URI. Default is "GB_model".
#         metric (str): The metric used to determine the best model. Default is "accuracy".
#             Runs are ordered by this metric in descending order, so higher is treated as better.
    
#     Returns:
#         tuple: A tuple containing:
#             - loaded_model: The MLflow model object (loaded through mlflow.pyfunc).
#             - best_run_id (str): The ID of the best run.
#             - best_metric_value (float): The value of the best metric.
    
#     Raises:
#         ValueError: If the experiment is not found or it has no runs.
#         KeyError: Presumably, if the top-ranked run did not log `metric` (direct dict lookup below).
#     """
#     # Initialize the MLflow client
#     client = MlflowClient()
    
#     # Get the experiment details
#     experiment = client.get_experiment_by_name(experiment_name)
#     if not experiment:
#         raise ValueError(f"Experiment '{experiment_name}' not found!")
    
#     # Fetch all active runs for the experiment, best metric value first
#     runs = client.search_runs(
#         experiment_ids=[experiment.experiment_id],
#         filter_string="",
#         run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
#         order_by=[f"metrics.{metric} DESC"]  # Order by the specified metric in descending order
#     )
    
#     if not runs:
#         raise ValueError(f"No runs found for experiment '{experiment_name}'!")
    
#     # Extract the best run (first element after the descending sort)
#     best_run = runs[0]
#     best_run_id = best_run.info.run_id
#     best_metric_value = best_run.data.metrics[metric]
    
#     print(f"Best Run ID: {best_run_id}")
#     print(f"Best {metric.capitalize()}: {best_metric_value}")
    
#     # Load the model associated with the best run
#     model_uri = f"runs:/{best_run_id}/{model}"
#     loaded_model = mlflow.pyfunc.load_model(model_uri)
    
#     print(f"Best {experiment_name} model loaded successfully!")

#     return loaded_model, best_run_id, best_metric_value
    

## Find the best GM clustering model

In [None]:

# DISABLED CELL: for every genre listed in data/to_rec/genres.txt, load the best
# "GM_model" run of experiment "clst_<index>_<genre>" (ranked by silhouette_score)
# and save it under models_best/gm_model_<index>.
# NOTE(review): depends on load_best_model_from_experiment, which is itself
# commented out in this notebook; `dump` is imported but never used in this cell.
# NOTE(review): that helper loads through mlflow.pyfunc.load_model, so
# `loaded_model` is a pyfunc wrapper - confirm mlflow.sklearn.save_model accepts
# it before re-enabling.
# from joblib import dump
# genres = load_txt(Path("data/to_rec/genres.txt"))
# for element, genre in enumerate(genres):
#     experiment_name = f"clst_{element}_{genre}"
#     model = "GM_model"
#     metric = "silhouette_score"
#     loaded_model, best_run_id, best_metric_value = load_best_model_from_experiment(experiment_name, model, metric)
#     mlflow.sklearn.save_model(loaded_model, f"models_best/gm_model_{element}")

## Find the best GB classifier model

In [None]:
# DISABLED CELL: load the best "GB_model" run of experiment "music_clf" (ranked by
# accuracy) and save it under models_best/gb_model.
# NOTE(review): the call passes model_name="GB_CLF", but the commented-out
# load_best_model_from_experiment signature in this notebook only takes
# (experiment_name, model, metric) - drop or add that parameter before re-enabling.
# loaded_model, best_run_id, best_metric_value = load_best_model_from_experiment(experiment_name="music_clf", model="GB_model", model_name="GB_CLF", metric="accuracy")
# mlflow.sklearn.save_model(loaded_model,f"models_best/gb_model" )