In [1]:
import os 
# IPython magic: display the kernel's current working directory.
%pwd

'c:\\Users\\pachp\\Desktop\\projects\\customer_churn\\research'

In [2]:
os.chdir("../")
%pwd

'c:\\Users\\pachp\\Desktop\\projects\\customer_churn'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable (frozen) settings for the model-training stage."""

    # Stage directory; ConfigurationManager.get_model_trainer_config creates it.
    # NOTE(review): the value is taken straight from the YAML config and may be
    # a plain string rather than a Path — confirm.
    root_dir: Path
    # File path that ModelTrainer hands to save_object for the selected model.
    model_path: Path

In [4]:
from customer_churn.constants import *
from customer_churn.utils.common_utils import read_yaml, create_directories

In [5]:
class ConfigurationManager:
    """Reads the config and params YAML files and builds per-stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Load both YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Create the trainer's directory and return its ``ModelTrainerConfig``.

        Reads the ``model_trainer`` section of the loaded config.
        """
        trainer_section = self.config.model_trainer
        create_directories([trainer_section.root_dir])

        return ModelTrainerConfig(
            root_dir=trainer_section.root_dir,
            model_path=trainer_section.model_path,
        )

In [6]:
import os 
import sys
import numpy as np
import pandas as pd
from dataclasses import dataclass
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from box.exceptions import BoxValueError
from customer_churn import logging
from customer_churn.utils.common_utils import save_object, evaluate_model
from imblearn.combine import SMOTEENN


In [7]:
class ModelTrainer:
    """Evaluates candidate classifiers, saves the best one and reports its test accuracy."""

    # Minimum score (as reported by ``evaluate_model``) the best candidate must
    # reach; below this the run is rejected and nothing is saved.
    MIN_SCORE = 0.6

    def __init__(self, config: ModelTrainerConfig):
        """Keep the trainer configuration; ``config.model_path`` is where the model is saved."""
        self.config = config

    def initiate_model_trainer(self, train_arr, test_arr):
        """Resample the training data, pick the best candidate, save it and score it.

        Args:
            train_arr: 2-D array; every column except the last holds features,
                the last column holds the target.
            test_arr: 2-D array with the same column layout as ``train_arr``.

        Returns:
            Accuracy of the selected model on the test split.

        Raises:
            ValueError: If no candidate was scored, if the best score is below
                ``MIN_SCORE``, or if a ``BoxValueError`` is raised while training
                (the original error is chained as ``__cause__``).
        """
        try:
            logging.info("Split the data into Train and Test...........")
            X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
            X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

            # Only the training split is resampled; the test split is left as-is.
            sme = SMOTEENN(random_state=42)
            x_res, y_res = sme.fit_resample(X_train, y_train)

            # Hyper-parameter grids keyed by model name. Only KNN is active; the
            # commented entries are the other grids that were tried.
            # NOTE(review): some disabled values look invalid on recent
            # scikit-learn ('auto' for max_features, min_samples_split=1) —
            # verify against the installed version before re-enabling.
            params = {
                # 'RandomForestClassifier':{
                #      'criterion':['gini', 'entropy', 'log_loss'],
                #      'max_features':['auto','sqrt','log2',None],
                #      'max_depth':[int(x) for x in np.linspace(10, 1000, 10)],
                #      'min_samples_split':[1,3,4,5,7,8,9,10],
                #      'n_estimators': #[8,16,32,64,128,256,512,1024,2048] ,
                #             [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
                #      'min_samples_leaf':[1,2,3,4,5,6,7,8]
                #  },
                # 'GradientBoostingClassifier':{
                #     'loss':['log_loss', 'exponential'],
                #     'learning_rate':[.1,.01,.05,.001],
                #     'n_estimators': [8,16,32,64,128,256,512,1024],
                #     'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                #     'criterion':['squared_error', 'friedman_mse'],
                #     'max_features':['auto','sqrt','log2'],
                #     },
                # 'LogisticRegression':{
                #     'max_iter':[100, 200, 300]
                #     },
                # 'XGBClassifier':{
                #     'eval_metric':['auc','logloss','error'],
                #     'eta':[0.01,0.02,0.05, 0.08, 0.1, 0.15, 0.19],
                #     'max_depth':[3,4,5,6,7,8,9,10],
                #     'subsample':[0.5,0.6,0.7,0.8,0.9]
                #     },
                # 'CatBoostClassifier':{},
                # 'AdaBoostClassifier':{
                #     'n_estimators': [8,16,32,64,128,256,512,1024]
                #     },
                'KNeighborsClassifier': {
                    # 'n_neighbors':[1,2,3,4,5,6,7,8,9],
                    'weights': ['uniform', 'distance'],
                    # 'algorithm':['auto', 'ball_tree','kd_tree']
                },
            }

            # Candidate estimators; keys must match the keys of `params`.
            models = {
                # 'LogisticRegression':LogisticRegression(),
                'KNeighborsClassifier': KNeighborsClassifier(),
                # 'XGBClassifier':XGBClassifier(),
                # 'CatBoostClassifier':CatBoostClassifier(verbose=True),
                # 'AdaBoostClassifier':AdaBoostClassifier(),
                # 'GradientBoostingClassifier':GradientBoostingClassifier(),
                # 'RandomForestClassifier':RandomForestClassifier()
            }

            # Mapping of model name -> score.
            # NOTE(review): presumably evaluate_model also fits the estimators in
            # `models` in place, since the chosen one is used for predict() below
            # without another fit — confirm against evaluate_model.
            model_report: dict = evaluate_model(
                X_train=x_res, y_train=y_res, X_test=X_test,
                y_test=y_test, models=models, param=params,
            )

            # Fail with a clear message instead of max()'s "empty sequence" error.
            if not model_report:
                raise ValueError("No Best Model Found.....")

            # First name with the highest score (same tie-breaking as a value lookup).
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]

            if best_model_score < self.MIN_SCORE:
                raise ValueError("No Best Model Found.....")

            best_model = models[best_model_name]

            logging.info("Best found model on both train and test dataset")
            logging.info(f"Selected model: {best_model_name} (score: {best_model_score})")

            save_object(
                file_path=self.config.model_path,
                obj=best_model
            )

            predicted = best_model.predict(X_test)
            return accuracy_score(y_test, predicted)

        except BoxValueError as e:
            # Chain the original error so the root cause stays in the traceback.
            raise ValueError("Error occured at initiate model training.....") from e

In [8]:
# Load the training table from disk and take a NumPy copy of its values.
df = pd.read_csv('artifacts/df.csv')
train_arr = df.to_numpy(copy=True)

In [9]:
# Load the test table from disk and take a NumPy copy of its values.
df1 = pd.read_csv('artifacts/df1.csv')
test_arr = df1.to_numpy(copy=True)

In [10]:
# Run the training stage: build the stage config, train/select the model and
# show the returned test accuracy as the cell output. The previous
# `try/except ... raise e` wrapper only re-raised, so errors propagate the same.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
accuracy = model_trainer.initiate_model_trainer(train_arr, test_arr)
accuracy

[2024-03-14 00:47:16,016: INFO: common_utils: yaml file: config\config.yaml loaded successfully]
[2024-03-14 00:47:16,022: INFO: common_utils: yaml file: params.yaml loaded successfully]
[2024-03-14 00:47:16,026: INFO: common_utils: created directory at: artifacts]
[2024-03-14 00:47:16,031: INFO: common_utils: created directory at: artifacts/model_trainer]
[2024-03-14 00:47:16,031: INFO: 2742073462: Split the data into Train and Test...........]
[2024-03-14 00:47:32,340: INFO: 2742073462: Best found model on both train and test dataset]


In [11]:
# Re-read the training table (same file as the earlier load cell) and show
# the resulting array's (rows, columns) shape.
df = pd.read_csv('artifacts/df.csv')
train_arr = df.to_numpy(copy=True)
train_arr.shape

(5634, 1161)