In [12]:
import warnings
warnings.filterwarnings('ignore')


import sys
sys.path.append('/home/mshoush/5th/common_files') 
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score, mean_squared_error
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
from catboost import Pool, CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from DatasetManager import DatasetManager
import gc
import hyperopt


from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
from hyperopt import STATUS_OK

def create_and_evaluate_model(args):
    global trial_nr
    if trial_nr % 50 == 0:
        print(trial_nr)
    print("Trial %s out of %s" % (trial_nr, n_iter))
    trial_nr += 1

    score = 0
    for current_train_names, current_test_names in dataset_manager.get_idx_split_generator(dt_for_splitting, n_splits=3):
        train_idxs = case_ids.isin(current_train_names)
        X_train = X_all[train_idxs]
        y_train = y_all[train_idxs]
        X_test = X_all[~train_idxs]
        y_test = y_all[~train_idxs]

        if task_type == "classification":
            if cls_method == "catboost":
                model = CatBoostClassifier(loss_function='Logloss',
                                           learning_rate=args['learning_rate'],
                                           depth=int(args['max_depth']),
                                           subsample=args['subsample'],
                                           bootstrap_type='Bernoulli',
                                           verbose=False,
                                           random_seed=22,
                                           posterior_sampling=True,
                                           thread_count=8)
            elif cls_method == "xgboost":
                model = XGBClassifier(learning_rate=args['learning_rate'],
                                      max_depth=int(args['max_depth']),
                                      subsample=args['subsample'],
                                      verbosity=0,
                                      random_state=22)
            elif cls_method == "lightgbm":
                model = LGBMClassifier(learning_rate=args['learning_rate'],
                                       max_depth=int(args['max_depth']),
                                       subsample=args['subsample'],
                                       verbosity=-1,
                                       random_state=22)
            elif cls_method == "randomforest":
                model = RandomForestClassifier(n_estimators=args['n_estimators'],
                                                max_depth=int(args['max_depth']),
                                                random_state=22)
            else:
                raise ValueError("Invalid cls_method for classification")
            score_function = roc_auc_score
        elif task_type == "regression":
            if cls_method == "catboost":
                model = CatBoostRegressor(loss_function='RMSE',
                                          learning_rate=args['learning_rate'],
                                          depth=int(args['max_depth']),
                                          subsample=args['subsample'],
                                          bootstrap_type='Bernoulli',
                                          verbose=False,
                                          random_seed=22,
                                          thread_count=8)
            elif cls_method == "xgboost":
                model = XGBRegressor(learning_rate=args['learning_rate'],
                                     max_depth=int(args['max_depth']),
                                     subsample=args['subsample'],
                                     verbosity=0,
                                     random_state=22)
            elif cls_method == "lightgbm":
                model = LGBMRegressor(learning_rate=args['learning_rate'],
                                       max_depth=int(args['max_depth']),
                                       subsample=args['subsample'],
                                       verbosity=-1,
                                       random_state=22)
            elif cls_method == "randomforest":
                model = RandomForestRegressor(n_estimators=args['n_estimators'],
                                               max_depth=int(args['max_depth']),
                                               random_state=22)
            else:
                raise ValueError("Invalid cls_method for regression")
            score_function = mean_squared_error
        else:
            raise ValueError("Invalid task_type")
        
        if cls_method!="catboost":            
            model.fit(X_train, y_train)
        else:
            model.fit(X_train, y_train, cat_features=cat_feat_idx)
        preds = model.predict(X_test)
        score += score_function(y_test, preds)

    if task_type == "classification":
        return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': model}
    elif task_type == "regression":
        return {'loss': score / n_splits, 'status': STATUS_OK, 'model': model}



case_id_col = 'case_id'
activity_col = 'activity'
resource_col = 'resource'
timestamp_col = 'timestamp'
label_col = 'label'
treatment_col = "Treatment1"

dataset_ref_to_datasets = {
    #"bpic2012": ["bpic2012"],
    "bpic2017": ["bpic2017"],
}

encoding_dict = {  
    "laststate": ["static", "last"],
    "agg": ["static", "agg"], 
    "index": ["static", "index"],             
    "combined": ["static", "last", "agg"]
}

task_types = ["regression", "classification",]

cls_methods = ['xgboost', 'lightgbm', 'randomforest', 'catboost'] 

for cls_method in cls_methods:       
    for cls_encoding in encoding_dict.keys():        
        
        for task_type in task_types:   
            
            for dataset_name in dataset_ref_to_datasets.keys():
                n_iter = 1  # Update this value as needed
                trial_nr = 0
                n_splits = 3
                print(f"dataset_name: {dataset_name}, task_type: {task_type}, cls_method: {cls_method}, cls_encoding: {cls_encoding}")
                            
                params_dir = f"./../predictive_results/{task_type}/{dataset_name}/"           
                # Check if params_dir exists, otherwise create it
                if not os.path.exists(params_dir):
                    os.makedirs(params_dir)
                
                dataset_manager = DatasetManager(dataset_name, task_type)
                #print(f"Label_col: {str(dataset_manager.label_col)}")
                
                # Load the training data
                if cls_method!="catboost":
                    cls_method_all = "other"
                else:
                    cls_method_all = cls_method
                train = pd.read_parquet(f"./../prepared_data/{task_type}/{dataset_name}/train_{cls_method_all}_{cls_encoding}_encoded_{dataset_name}.parquet")
                cat_feat_idx = np.where((train.dtypes == 'object') & ~train.columns.isin([str(dataset_manager.label_col), "Treatment"]))[0]
                print(cat_feat_idx)                
                
                
                # Load the prefix data
                dt_prefixes = pd.read_parquet(f"./../prepared_data/{task_type}/{dataset_name}/train_prefixes_{dataset_name}.parquet")
                
                y_all = train[dataset_manager.label_col]
                if task_type == "classification":
                    y_all = y_all.astype(int)  # Ensure the target variable is integer type for classification
                elif task_type == "regression":
                    y_all = y_all.astype(float)  # Ensure the target variable is float type for regression
                else:
                    raise ValueError("Invalid task_type")
                                
                X_all = train.drop([str(dataset_manager.label_col)], axis=1)
                
                case_ids = dt_prefixes.groupby(dataset_manager.case_id_col).first()["orig_case_id"].reset_index(drop=True)
                dt_for_splitting = pd.DataFrame({dataset_manager.case_id_col: case_ids, dataset_manager.label_col: y_all}).drop_duplicates().reset_index(drop=True)
                
                #print('Optimizing parameters...')
                
                if cls_method == "catboost":
                    space = {
                        'learning_rate': hp.uniform("learning_rate", 0.01, 0.8),
                        'one_hot_max_size': hp.quniform('one_hot_max_size', 4, 255, 1),
                        'subsample': hp.uniform("subsample", 0.5, 1),
                        'max_depth': hp.quniform('max_depth', 6, 16, 1),
                        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
                        'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
                        'random_strength': hp.uniform('random_strength', 0.0, 100),
                        'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)),
                        'n_estimators': hp.choice('n_estimators', [250, 500, 1000]),
                        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1)
                    }
                elif cls_method == "xgboost":
                    space = {
                        'learning_rate': hp.uniform("learning_rate", 0.01, 0.8),
                        'max_depth': hp.quniform('max_depth', 2, 10, 1),
                        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
                        'subsample': hp.uniform("subsample", 0.5, 1),
                        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
                        'gamma': hp.uniform("gamma", 0, 10),
                        'n_estimators': hp.choice('n_estimators', [100, 250, 500]),
                    }
                elif cls_method == "lightgbm":
                    space = {
                        'learning_rate': hp.uniform("learning_rate", 0.01, 0.8),
                        'max_depth': hp.quniform('max_depth', 2, 10, 1),
                        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
                        'subsample': hp.uniform("subsample", 0.5, 1),
                        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
                        'bagging_fraction': hp.uniform("bagging_fraction", 0.5, 1),
                        'feature_fraction': hp.uniform("feature_fraction", 0.5, 1),
                        'n_estimators': hp.choice('n_estimators', [100, 250, 500]),
                    }
                elif cls_method == "randomforest":
                    space = {
                        'n_estimators': hp.choice('n_estimators', [100, 250, 500]),
                        'max_depth': hp.quniform('max_depth', 2, 10, 1),
                        'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1),
                        'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
                        'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2', None]),
                    }
                else:
                    print("No Valid cls_method")
                    
                trials = Trials()
                best = fmin(create_and_evaluate_model, space, algo=tpe.suggest, max_evals=n_iter, trials=trials)

                best_params = hyperopt.space_eval(space, best)
                
                best_params_df = pd.DataFrame([best_params])
                #print(best_params_df)
                outfile = os.path.join(params_dir, f"optimal_params_{cls_encoding}_{cls_method}_{dataset_name}.parquet")
                best_params_df.to_parquet(outfile)


dataset_name: bpic2012, task_type: regression, cls_method: catboost, cls_encoding: laststate
bpic2012
[17 18]
0                                                    
Trial 0 out of 1                                     
100%|██████████| 1/1 [03:09<00:00, 189.49s/trial, best loss: 61.10785811724414]
dataset_name: bpic2012, task_type: classification, cls_method: catboost, cls_encoding: laststate
bpic2012
[17 18]
0                                                    
Trial 0 out of 1                                     
100%|██████████| 1/1 [01:51<00:00, 111.05s/trial, best loss: -0.6037291591803658]
dataset_name: bpic2012, task_type: regression, cls_method: catboost, cls_encoding: agg
bpic2012
[4 5]
0                                                    
Trial 0 out of 1                                     
  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

In [None]:
import sys
sys.path.append('/home/mshoush/5th/common_files') 
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score, mean_squared_error
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
from catboost import Pool, CatBoostClassifier, CatBoostRegressor
from DatasetManager import DatasetManager
import gc
import hyperopt


def create_and_evaluate_model(args):
    global trial_nr
    if trial_nr % 50 == 0:
        print(trial_nr)
    print("Trial %s out of %s" % (trial_nr, n_iter))
    trial_nr += 1

    score = 0
    for current_train_names, current_test_names in dataset_manager.get_idx_split_generator(dt_for_splitting, n_splits=3):
        train_idxs = case_ids.isin(current_train_names)
        X_train = X_all[train_idxs]
        y_train = y_all[train_idxs]
        X_test = X_all[~train_idxs]
        y_test = y_all[~train_idxs]

        if task_type == "classification":
            model = CatBoostClassifier(loss_function='Logloss',
                                       learning_rate=args['learning_rate'],
                                       depth=int(args['max_depth']),
                                       subsample=args['subsample'],
                                       bootstrap_type='Bernoulli',
                                       verbose=False,
                                       random_seed=22,
                                       posterior_sampling=True,
                                       thread_count=8)
            score_function = roc_auc_score
        elif task_type == "regression":
            model = CatBoostRegressor(loss_function='RMSE',
                                      learning_rate=args['learning_rate'],
                                      depth=int(args['max_depth']),
                                      subsample=args['subsample'],
                                      bootstrap_type='Bernoulli',
                                      verbose=False,
                                      random_seed=22,
                                      thread_count=8)
            score_function = mean_squared_error
        else:
            raise ValueError("Invalid task_type")

        pool_train = Pool(X_train, y_train, cat_features=cat_feat_idx)
        pool_test = Pool(X_test, cat_features=cat_feat_idx)

        model.fit(pool_train)
        preds = model.predict(pool_test)
        score += score_function(y_test, preds)

    if task_type == "classification":
        return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': model}
    elif task_type == "regression":
        return {'loss': score / n_splits, 'status': STATUS_OK, 'model': model}


case_id_col = 'case_id'
activity_col = 'activity'
resource_col = 'resource'
timestamp_col = 'timestamp'
label_col = 'label'
treatment_col = "Treatment1"



dataset_ref_to_datasets = {
    "bpic2012": ["bpic2012"],
    #"bpic2017": ["bpic2017"],
}

encoding_dict = {  
    "index": ["static", "index"],  
    "laststate": ["static", "last"],
    "agg": ["static", "agg"],        
    "combined": ["static", "last", "agg"]
}

task_types = ["regression", "classification",]

cls_methods = ['catboost']

for cls_method in cls_methods:
    print(f"cls_method: {cls_method}")
    
    for cls_encoding in encoding_dict.keys():
        print(f"cls_encoding: {cls_encoding}")
        
        for task_type in task_types:    
            print(f"task_type: {task_type}")
            #for dataset_name in datasets:  
            for dataset_name in dataset_ref_to_datasets.keys():
                n_iter = 1  # Update this value as needed
                trial_nr = 0
                n_splits = 3
                print(f"dataset_name: {dataset_name}")
                            
                params_dir = "./../predictive_results/%s/%s/" % (task_type, dataset_name)            
                # Check if params_dir exists, otherwise create it
                if not os.path.exists(params_dir):
                    os.makedirs(params_dir)
                
                dataset_manager = DatasetManager(dataset_name, task_type)
                print(f"Label_col: {str(dataset_manager.label_col)}")
                
                # Load the training data
                train = pd.read_parquet(f"./../prepared_data/{task_type}/{dataset_name}/train_{cls_method}_{cls_encoding}_encoded_{dataset_name}.parquet")
                #print(train.head())
                cat_feat_idx = np.where((train.dtypes == 'object') & ~train.columns.isin([str(dataset_manager.label_col), "Treatment"]))[0]
                print(f"cat_feat_idx: {cat_feat_idx}")
                
                # Load the prefix data
                dt_prefixes = pd.read_parquet(f"./../prepared_data/{task_type}/{dataset_name}/train_prefixes_{dataset_name}.parquet")
                
                y_all = train[dataset_manager.label_col]
                #print(y_all)
                # Inside the loop where you extract labels and features from the training data
                if task_type == "classification":
                    y_all = y_all.astype(int)  # Ensure the target variable is integer type for classification
                elif task_type == "regression":
                    y_all = y_all.astype(float)  # Ensure the target variable is float type for regression
                else:
                    raise ValueError("Invalid task_type")
                                
                # Extract labels and features from the training data
                
                X_all = train.drop([str(dataset_manager.label_col)], axis=1)
                
                case_ids = dt_prefixes.groupby(dataset_manager.case_id_col).first()["orig_case_id"].reset_index(drop=True)
                dt_for_splitting = pd.DataFrame({dataset_manager.case_id_col: case_ids, dataset_manager.label_col: y_all}).drop_duplicates().reset_index(drop=True)
                
                print('Optimizing parameters...')
                
                if cls_method == "catboost":
                    space = {
                        'learning_rate': hp.uniform("learning_rate", 0.01, 0.8),
                        'one_hot_max_size': hp.quniform('one_hot_max_size', 4, 255, 1),
                        'subsample': hp.uniform("subsample", 0.5, 1),
                        'max_depth': hp.quniform('max_depth', 6, 16, 1),
                        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
                        'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
                        'random_strength': hp.uniform('random_strength', 0.0, 100),
                        'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)),
                        'n_estimators': hp.choice('n_estimators', [250, 500, 1000]),
                        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1)
                    }
                else:
                    print("No Valid cls_method")
                    
                trials = Trials()
                best = fmin(create_and_evaluate_model, space, algo=tpe.suggest, max_evals=n_iter, trials=trials)

                best_params = hyperopt.space_eval(space, best)
                
                best_params_df = pd.DataFrame([best_params])
                print(best_params_df)
                outfile = os.path.join(params_dir, f"optimal_params_{cls_encoding}_{cls_method}_{dataset_name}.parquet")
                best_params_df.to_parquet(outfile)


In [None]:
import sys
sys.path.append('/home/mshoush/5th/common_files') 
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
from catboost import Pool, CatBoostClassifier
from DatasetManager import DatasetManager
import gc
import hyperopt


 


def create_and_evaluate_model(args):
    global trial_nr
    if trial_nr % 50 == 0:
        print(trial_nr)
    print("Trial %s out of %s" % (trial_nr, n_iter))
    trial_nr += 1

    score = 0
    for current_train_names, current_test_names in dataset_manager.get_idx_split_generator(dt_for_splitting, n_splits=3):
        train_idxs = case_ids.isin(current_train_names)
        X_train = X_all[train_idxs]
        y_train = y_all[train_idxs]
        X_test = X_all[~train_idxs]
        y_test = y_all[~train_idxs]

        cls = CatBoostClassifier(loss_function='Logloss',
                                 learning_rate=args['learning_rate'],
                                 depth=int(args['max_depth']),
                                 subsample=args['subsample'],
                                 bootstrap_type='Bernoulli',
                                 verbose=False,
                                 random_seed=22,
                                 posterior_sampling=True,
                                 thread_count=8)

        pool_train = Pool(X_train, y_train, cat_features=cat_feat_idx)
        pool_test = Pool(X_test, cat_features=cat_feat_idx)

        cls.fit(pool_train)
        preds_pos_label_idx = np.where(cls.classes_ == 1)[0][0]
        preds = cls.predict_proba(pool_test)[:, preds_pos_label_idx]
        score += roc_auc_score(y_test, preds)
    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}



case_id_col = 'case_id'
activity_col = 'activity'
resource_col = 'resource'
timestamp_col = 'timestamp'
label_col = 'label'
treatment_col = "Treatment1"


n_iter = 50  # Update this value as needed
trial_nr = 0
n_splits = 3

dataset_ref_to_datasets = {
    "bpic2012": ["bpic2012"],
    #"bpic2017": ["bpic2017"],
    
    
}

encoding_dict = {    
    "laststate": ["static", "last"],
    "agg": ["static", "agg"],    
    "index": ["static", "index"],
    "combined": ["static", "last", "agg"]
    }

task_types = ["classification", ] # "regression"

cls_methods = ['catboost']


for cls_method in cls_methods:
    print(f"cls_method: {cls_method}")
    
    for cls_encoding in encoding_dict.keys():
        print(f"cls_encoding: {cls_encoding}")
        
        for task_type in task_types:    
            print(f"task_type: {task_type}")
            #for dataset_name in datasets:  
            for dataset_name in dataset_ref_to_datasets.keys():
                print(f"dataset_name: {dataset_name}")
                            
                params_dir = "./../predictive_results/%s/%s/" % (task_type, dataset_name)            
                # Check if params_dir exists, otherwise create it
                if not os.path.exists(params_dir):
                    os.makedirs(params_dir)
                
                
                
                dataset_manager = DatasetManager(dataset_name, task_type)
                print(f"Label_col: {str(dataset_manager.label_col)}")
                # Load the training data
                train = pd.read_parquet(f"./../prepared_data/{task_type}/{dataset_name}/train_{cls_encoding}_encoded_{dataset_name}.parquet")
                cat_feat_idx = np.where((train.dtypes == 'object') & ~train.columns.isin([str(dataset_manager.label_col), "Treatment"]))[0]
                print(f"cat_feat_idx: {cat_feat_idx}")
                
                # Load the prefix data
                dt_prefixes = pd.read_parquet(f"./../prepared_data/{task_type}/{dataset_name}/train_prefixes_{dataset_name}.parquet")
                
                # Extract labels and features from the training data
                y_all = train[str(dataset_manager.label_col)]
                X_all = train.drop([str(dataset_manager.label_col)], axis=1)
                
                case_ids = dt_prefixes.groupby(dataset_manager.case_id_col).first()["orig_case_id"].reset_index(drop=True)
                dt_for_splitting = pd.DataFrame({dataset_manager.case_id_col: case_ids, dataset_manager.label_col: y_all}).drop_duplicates().reset_index(drop=True)
                
                print('Optimizing parameters...')
                
                if cls_method=="catboost":
                    space = {
                    'learning_rate': hp.uniform("learning_rate", 0.01, 0.8),
                    'one_hot_max_size': hp.quniform('one_hot_max_size', 4, 255, 1),
                    'subsample': hp.uniform("subsample", 0.5, 1),
                    'max_depth': hp.quniform('max_depth', 6, 16, 1),
                    'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 1),
                    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 100),
                    'random_strength': hp.uniform('random_strength', 0.0, 100),
                    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)),
                    'n_estimators': hp.choice('n_estimators', [250, 500, 1000]),
                    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1)
                    }
                else:
                    print("No Valid cls_method")
                    
                trials = Trials()
                best = fmin(create_and_evaluate_model, space, algo=tpe.suggest, max_evals=n_iter, trials=trials)

                best_params = hyperopt.space_eval(space, best)
                
                best_params_df = pd.DataFrame([best_params])
                print(best_params_df)
                outfile = os.path.join(params_dir, f"optimal_params_{cls_encoding}_{cls_method}_{dataset_name}.parquet")
                best_params_df.to_parquet(outfile)
                
                
                
                # outfile = os.path.join(params_dir, f"optimal_params_{cls_encoding}_{cls_method}_{dataset_name}.parquet")
                # best_params.to_parquet(os.path.join(params_dir, outfile))

                                                

                                   
                
                
                                
