In [1]:
import os
import sys
import pandas as pd

from dataclasses import dataclass

import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, precision_recall_curve, auc
import xgboost as xgb # xgb.XGBClassifier


In [2]:


def evaluate_model_kpi(model, X_train, y_train, X_val, y_val, threshold=0.5, model_name=None):
    ''' Write about the meaning of each metric. '''   

    if model_name == None:
        model_name = type(model)
    
    # Make predictions using the model and the DataFrame
    y_train_pred = (model.predict_proba(X_train)[:,1] >= threshold).astype(bool)
    y_val_pred = (model.predict_proba(X_val)[:,1] >= threshold).astype(bool)
    
    y_train_prob = model.predict_proba(X_train)[:,1]
    y_val_prob = model.predict_proba(X_val)[:,1]
    
    # Calculate accuracy
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_val = accuracy_score(y_val, y_val_pred)
    
    # Calculate precision, recall, and f1 score
    prf_train = precision_recall_fscore_support(y_train, y_train_pred, average=None)
    prf_val = precision_recall_fscore_support(y_val, y_val_pred, average=None)
    
    # Calculate area under the curve
    auc_train = roc_auc_score(y_train, y_train_prob)
    auc_val = roc_auc_score(y_val, y_val_prob)
    
    # Calculate area under the precision-recall curve
    precision_train, recall_train, thresholds_train = precision_recall_curve(y_train, y_train_pred)
    auc_precision_recall_train = auc(recall_train, precision_train)
    
    precision_val, recall_val, thresholds_train = precision_recall_curve(y_val, y_val_prob)
    auc_precision_recall_val = auc(recall_val, precision_val)
    
    # Store the metrics in a DataFrame
    metrics = pd.DataFrame({
        'Algorithm': model_name,
         
        'AUC-ROC Train': [auc_train],
        'AUC-ROC Val': [auc_val],
        'AUC-PRC Train': [auc_precision_recall_train],
        'AUC-PRC Val': [auc_precision_recall_val],
        
        'Accuracy Train': [accuracy_train],
        'Accuracy Val': [accuracy_val],
        
        'Precision Train: 0': [prf_train[0][0]],
        'Precision Val: 0': [prf_val[0][0]],
        'Precision Train: 1': [prf_train[0][1]],
        'Precision Val: 1': [prf_val[0][1]],
        
        'Recall Train: 0': [prf_train[1][0]],
        'Recall Val: 0': [prf_val[1][0]],
        'Recall Train: 1': [prf_train[1][1]],
        'Recall Val: 1': [prf_val[1][1]],
        
        'F1-score Train: 0': [prf_train[2][0]],
        'F1-score Val: 0': [prf_val[2][0]],
        'F1-score Train: 1': [prf_train[2][1]],
        'F1-score Val: 1': [prf_val[2][1]],
    })
    
    return metrics

In [3]:

@dataclass
class DataConfig:
    raw_data_path: str=os.path.join('artifacts',"data.csv")
    train_data_path: str=os.path.join('artifacts',"train.csv")
    test_data_path: str=os.path.join('artifacts',"test.csv")
    val_data_path: str=os.path.join('artifacts',"validation.csv")

class DataIngest():

    def __init__(self):        
        self.config=DataConfig()

    def create_train_and_test(self):

        df=pd.read_csv(self.config.raw_data_path)
        
        #os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)

        train_set, test_set=train_test_split(df,test_size=0.4,random_state=42)
        test_set, val_set=train_test_split(test_set,test_size=0.5,random_state=42)

        train_set.to_csv(self.config.train_data_path,index=False,header=True)
        test_set.to_csv(self.config.test_data_path,index=False,header=True)
        val_set.to_csv(self.config.val_data_path,index=False,header=True)

        return(
            self.config.train_data_path,
            self.config.test_data_path,
            self.config.val_data_path
        )
    


In [4]:
os.path.join('artifacts',"data.csv")

'artifacts\\data.csv'

In [5]:
obj=DataIngest()
train_data,test_data,val_data=obj.create_train_and_test()

In [156]:

class DataTranformTrain():

    def __init__(self, label, drop_labels_list, perform_cross_validation=True):        
        self.config=DataConfig()
        self.perform_cross_validation = perform_cross_validation
        self.label = label
        self.drop_labels_list = drop_labels_list

    def preprocessor_pipeline(self, df):

        numerical_columns = [feature for feature in df.columns if df[feature].dtype != 'O']
        categorical_columns = [feature for feature in df.columns if df[feature].dtype == 'O']

        num_pipeline= Pipeline(
            steps=[
            ("imputer",SimpleImputer(strategy="median")),
            ("scaler",StandardScaler())
            ]
        )

        cat_pipeline=Pipeline(
            steps=[
            ("imputer",SimpleImputer(strategy="most_frequent")),
            ("one_hot_encoder",OneHotEncoder()),
            ("scaler",StandardScaler(with_mean=False))
            ]
        )

        preprocessor=ColumnTransformer(
            [
            ("num_pipeline",num_pipeline,numerical_columns),
            ("cat_pipelines",cat_pipeline,categorical_columns)
            ]
        )

        return preprocessor


    def algorithms_and_grid(self):

        models = {
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(),
            "Logistic Regression": LogisticRegression(),
            "XGBoost": xgb.XGBClassifier(),
        }

        params = {
            "Decision Tree": {
                'model__criterion':['log_loss', 'entropy', 'gini'],
                'model__max_depth': [3, 5, 8, 10, 20],
                # 'model__splitter':['best','random'],
                # 'model__max_features':['sqrt','log2'],
            },
            "Random Forest":{
                'model__bootstrap': [True],
                'model__max_depth': [3, 5, 8, 10, 20],
                #'model__max_features': [2, 3, 5, 10, 20],
                #'model__min_samples_leaf': [3, 4, 5, 10, 20],
                #'model__n_estimators': [10, 50, 100]
            },
            "Gradient Boosting":{
                "model__loss":["log_loss"],
                "model__learning_rate": [0.01, 0.05, 0.1, 0.2, 0.5],
                #"model__min_samples_split": np.linspace(0.1, 0.5, 12),
                #"model__min_samples_leaf": np.linspace(0.1, 0.5, 12),
                "model__max_depth":[3,5,8,10,15],
                #"model__max_features":["log2", "sqrt"],
                #"model__criterion": ["friedman_mse",  "mae"],
                #"model__subsample":[0.5, 0.618, 0.8, 1.0],
                #"model__n_estimators": [10, 15, 20]
            },
            "Logistic Regression":{
                'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'model__penalty': ['l2'],
                'model__max_iter': [10000],
                #'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            },
            "XGBoost":{
                'model__max_depth': [3, 4, 5, 6, 7, 8],
                'model__learning_rate': [0.01, 0.05], #, 0.1, 0.2, 0.5],
                #"model__gamma":[0.5, 1], #, 2],
                #'model__n_estimators': [50, 100], #, 200],
            },     
        }

        return models, params


    def grid_search(self):
        
        models, params = self.algorithms_and_grid()

        train_df=pd.read_csv(self.config.train_data_path)
        test_df=pd.read_csv(self.config.test_data_path)
        val_df=pd.read_csv(self.config.val_data_path)

        X = pd.concat([train_df, test_df]).drop(columns=self.drop_labels_list, axis=1).reset_index(drop=True)
        y = pd.concat([train_df, test_df])[self.label].reset_index(drop=True)

        indices_train = np.arange(train_df.shape[0])
        indices_test = np.arange(test_df.shape[0], train_df.shape[0]+test_df.shape[0])
        cv = [(indices_train, indices_test)]

        if self.perform_cross_validation:
            preprocessor = self.preprocessor_pipeline(df = X)
            indices_train = indices_train.tolist() + indices_test.tolist()
        else:
            preprocessor = self.preprocessor_pipeline(df = train_df.drop(columns=self.drop_labels_list, axis=1))
            indices_train = indices_train.tolist()

        model_list = []
        AUC_ROC_list = []
        algo_best_param = {}
        algo_best_model = []

        for i in range(len(list(models))):
            model = list(models.values())[i]
            param = list(params.values())[i]

            pipeline = Pipeline([
                ('preprocessing', preprocessor),
                ('model', model)
            ])

            if self.perform_cross_validation:
                grid = GridSearchCV(estimator = pipeline, param_grid = param, cv = 3, n_jobs = -1, scoring = 'roc_auc', error_score="raise")
            else:
                grid = GridSearchCV(estimator = pipeline, param_grid = param, cv = cv, scoring = 'roc_auc', error_score="raise")

            grid.fit(X, y)

            bp = grid.best_params_
            nbp = {}
            for k, v in bp.items():
                nbp[k[k.index('__')+2:]] = v

            final_pipeline = Pipeline([
                ('preprocessing', preprocessor),
                ('model', model.set_params(**nbp))
            ])


            final_pipeline.fit(X.iloc[indices_train], y.iloc[indices_train])
            
            # Evaluate Train and Validation dataset
            metric = evaluate_model_kpi(model=final_pipeline, 
                                        X_train=X.iloc[indices_train], 
                                        y_train=y.iloc[indices_train], 
                                        X_val=val_df, 
                                        y_val=val_df[self.label], 
                                        threshold=0.5,
                                        model_name=list(models.keys())[i]
                                        )
            
            model_list.append(list(models.keys())[i])
            AUC_ROC_list.append(metric['AUC-ROC Val'][0])

            algo_best_param[list(models.keys())[i]] = nbp

            algo_best_model.append((list(models.keys())[i], final_pipeline))

        algo_best_model_metric = pd.DataFrame(list(zip(model_list, AUC_ROC_list)), columns=['Model Name', 'AUC_ROC']).sort_values(by=["AUC_ROC"],ascending=False).reset_index(drop=True)

        return (
            algo_best_model_metric, 
            algo_best_param, 
            algo_best_model
        )

In [164]:
tt = DataTranformTrain(label = 'label_1', drop_labels_list = ['label_1', 'label_X', 'label_2'], perform_cross_validation=True)

In [165]:
#p = tt.preprocessor_pipeline(df = pd.read_csv(train_data))

In [166]:
algo_best_model_metric, algo_best_param, algo_best_model = tt.grid_search()

In [167]:
algo_best_model_metric

Unnamed: 0,Model Name,AUC_ROC
0,Logistic Regression,0.718969
1,XGBoost,0.686383
2,Random Forest,0.677399
3,Gradient Boosting,0.649378
4,Decision Tree,0.567283


In [168]:
algo_best_model_metric['Model Name'][0]

'Logistic Regression'

In [169]:
algo_best_param

{'Decision Tree': {'criterion': 'gini', 'max_depth': 5},
 'Random Forest': {'bootstrap': True, 'max_depth': 5},
 'Gradient Boosting': {'learning_rate': 0.05,
  'loss': 'log_loss',
  'max_depth': 3},
 'Logistic Regression': {'C': 0.001, 'max_iter': 10000, 'penalty': 'l2'},
 'XGBoost': {'learning_rate': 0.05, 'max_depth': 3}}

In [170]:
best_algo_name = algo_best_model_metric['Model Name'][0]

best_algo = [model for name, model in algo_best_model if name == best_algo_name][0]


In [171]:
best_algo