In [None]:
import os

import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.svm import SVC

from mlops.util_funcs import scrub_data, eval_metrics_logs

In [None]:
##########################    READING IN DATASETS    ##########################

In [None]:
# Load the training and validation splits from CSV (train first, then test).
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_df.csv", "data/test_df.csv")
)

In [None]:
# Extract the "Response" target column from each split.
y_train, y_val = df_train["Response"], df_val["Response"]

In [None]:
# Pass both splits through the project's scrub_data helper (train first).
train_data, val_data = map(scrub_data, (df_train, df_val))

In [None]:
# Turn each scrubbed frame into its 'records' representation for DictVectorizer.
train_dict, val_dict = (
    frame.to_dict(orient='records') for frame in (train_data, val_data)
)

In [None]:
# Fit the vectorizer on the training records only, then apply the fitted
# vectorizer to the validation records so both matrices share one feature set.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
##########################    SETTING UP REMOTE MLFLOW ENVIRONMENT   ##########################

In [None]:
# AWS profile (presumably used for the tracking server's artifact store --
# confirm). An AWS_PROFILE already exported in the environment is respected;
# otherwise the project default is used.
os.environ.setdefault("AWS_PROFILE", "demiga-g")

# Tracking server host. The address is hard-coded as a fallback only; export
# TRACKING_SERVER_HOST to point the notebook at a different server without
# editing this cell.
TRACKING_SERVER_HOST = os.environ.get("TRACKING_SERVER_HOST", '54.82.38.62')
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Echo the URI that MLflow will actually use.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

In [None]:
# Select (or create) the experiment that collects the default-parameter runs.
mlflow.set_experiment(experiment_name='all-models-experiment')

# Best-effort removal of the `Default` experiment.
# get_experiment_by_name returns None when no such experiment exists, so the
# id is only read when a record was actually found.
default_expt = mlflow.get_experiment_by_name('Default')
expt_id = default_expt.experiment_id if default_expt is not None else None

if expt_id is None:
    print('`Default` experiment not found; nothing to delete.')
else:
    try:
        mlflow.delete_experiment(expt_id)
    except mlflow.exceptions.MlflowException as exc:
        # Deliberately non-fatal (e.g. the experiment may already be deleted),
        # but report the reason instead of swallowing it silently.
        print(f'`Default` not deleted: {exc}')
    else:
        print('`Default` deleted...')

In [None]:
##########################    LOGGING DEFAULT MODELS    ##########################

In [None]:
# Fit each scikit-learn classifier with its default hyperparameters, one
# MLflow run per model, with scikit-learn autologging switched on.
mlflow.sklearn.autolog(log_datasets=False)

try:
    for model_class in (
        SVC,
        LogisticRegression,
        RandomForestClassifier,
        GradientBoostingClassifier,
    ):
        with mlflow.start_run():
            mlmodel = model_class()
            mlmodel.fit(X_train, y_train)

            y_pred = mlmodel.predict(X_val)

            # Project helper; presumably computes and logs validation metrics.
            eval_metrics_logs(y_val, y_pred.round())
finally:
    # Always switch autologging back off, even when a fit raises, so it does
    # not stay enabled for the manually logged tuning runs further down.
    mlflow.sklearn.autolog(disable=True)

In [None]:
# Baseline XGBoost booster trained through the native API with autologging.

# Wrap the vectorized features and labels in DMatrix objects; `train` and
# `valid` are reused by the tuned xgboost cells later in the notebook.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)
_params = {
    'objective': 'binary:logistic',
    'eval_metric': "logloss",
    'seed': 42,
}

mlflow.xgboost.autolog(log_datasets=False)

try:
    with mlflow.start_run():

        mlflow.log_params(_params)

        booster = xgb.train(
            params=_params,
            dtrain=train,
            num_boost_round=500,
            evals=[(valid, "validation")],
            early_stopping_rounds=250,
            verbose_eval=False
        )
        y_pred = booster.predict(valid)

        # Predictions are rounded before being handed to the metrics helper.
        eval_metrics_logs(y_val, y_pred.round())
finally:
    # Always switch autologging back off, even when training raises, so it
    # does not stay enabled for the tuning runs further down.
    mlflow.xgboost.autolog(disable=True)

In [None]:
##########################    LOGGING TUNED MODELS    ##########################

In [None]:
# Seeded NumPy generator passed as `rstate` to every hyperopt `fmin` call below.
# NOTE(review): one generator object is shared by all searches, so each
# search's draws presumably depend on which searches ran before it -- confirm
# that is acceptable for reproducibility.
random_state = np.random.default_rng(42)

In [None]:
###########   LOGISTIC REGRESSION    ###########

In [None]:
mlflow.set_experiment("log-reg-experiment")


def log_reg_objective(params):
    """Hyperopt objective: fit and score one LogisticRegression configuration.

    Opens an MLflow run, logs ``params``, fits on ``X_train`` / ``y_train``
    and predicts on ``X_val``.

    Args:
        params: Keyword arguments for ``LogisticRegression``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (presumably validation precision -- confirm)
        and ``'status'`` set to ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # max_iter is fixed at 5000 for every trial.
        classifier = LogisticRegression(max_iter=5000, **params)
        classifier.fit(X_train, y_train)
        val_predictions = classifier.predict(X_val)

        score = eval_metrics_logs(y_val, val_predictions.round())

    return {'loss': -score, 'status': STATUS_OK}

# Search space for the logistic-regression tuning run.
space = dict(
    # loguniform bounds are exponents: C is drawn from [e^-20, e^4].
    C=hp.loguniform('C', -20, 4),
    class_weight=hp.choice('class_weight', [None, 'balanced']),
    penalty=hp.choice('penalty', ['l1', 'l2']),
    solver=hp.choice('solver', ['liblinear', 'saga']),
)

# Run 30 TPE trials; every trial opens its own MLflow run in the objective.
best_result = fmin(
    fn=log_reg_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

In [None]:
# Refit logistic regression with fixed hyperparameters under autologging.
# NOTE(review): these values are hard-coded, presumably copied from the best
# trial of an earlier search -- confirm they are still current.
params = dict(
    C=0.08966267017951414,
    class_weight=None,
    penalty='l1',
    solver='liblinear',
)

mlflow.sklearn.autolog()

lr = LogisticRegression(max_iter=5000, **params)
lr.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

In [None]:
###########   RANDOM FOREST CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('rfc-experiment')


def rfc_objective(params):
    """Hyperopt objective: fit and score one RandomForestClassifier configuration.

    Opens an MLflow run, logs ``params``, fits on ``X_train`` / ``y_train``
    and predicts on ``X_val``.

    Args:
        params: Keyword arguments for ``RandomForestClassifier``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (presumably validation precision -- confirm)
        and ``'status'`` set to ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # Use all cores; the forest itself is seeded for repeatability.
        forest = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
        forest.fit(X_train, y_train)

        val_predictions = forest.predict(X_val)
        score = eval_metrics_logs(y_val, val_predictions.round())

    return {'loss': -score, 'status': STATUS_OK}


# Search space for the random-forest tuning run. quniform returns floats, so
# the integer-valued parameters are wrapped in scope.int.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 10, 150, 10)),
    criterion=hp.choice('criterion', ['gini', 'entropy']),
    max_depth=scope.int(hp.quniform('max_depth', 10, 30, 10)),
)

# Run 30 TPE trials; every trial opens its own MLflow run in the objective.
best_result = fmin(
    fn=rfc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    rstate=random_state,
    verbose=False,
)



In [None]:
# Refit the random forest with fixed hyperparameters under autologging.
# NOTE(review): these values are hard-coded, presumably copied from the best
# trial of an earlier search -- confirm they are still current.
params = dict(criterion='entropy', max_depth=20, n_estimators=60)

mlflow.sklearn.autolog()

rfc = RandomForestClassifier(n_jobs=-1, random_state=42, **params)
rfc.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

In [None]:
###########   SKLEARN GRADIENT BOOSTING CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('sk-gbc-experiment')


def sgbc_objective(params):
    """Hyperopt objective: fit and score one GradientBoostingClassifier configuration.

    Opens an MLflow run, logs ``params``, fits on ``X_train`` / ``y_train``
    and predicts on ``X_val``.

    Args:
        params: Keyword arguments for ``GradientBoostingClassifier``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (presumably validation precision -- confirm)
        and ``'status'`` set to ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        booster_clf = GradientBoostingClassifier(random_state=42, **params)
        booster_clf.fit(X_train, y_train)
        val_predictions = booster_clf.predict(X_val)

        score = eval_metrics_logs(y_val, val_predictions.round())

    # The run is closed before the result is handed back to hyperopt.
    return {'loss': -score, 'status': STATUS_OK}


# Search space for the scikit-learn gradient-boosting tuning run; every
# parameter is integer-valued, hence the scope.int wrappers around quniform.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 100, 10)),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 16, 2)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
)

# Run 30 TPE trials; every trial opens its own MLflow run in the objective.
best_result = fmin(
    fn=sgbc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

In [None]:
# Refit the gradient-boosting classifier with fixed hyperparameters under
# autologging.
# NOTE(review): these values are hard-coded, presumably copied from the best
# trial of an earlier search -- confirm they are still current.
params = dict(min_samples_leaf=8, min_samples_split=14, n_estimators=90)

mlflow.sklearn.autolog()

gbc = GradientBoostingClassifier(random_state=42, **params)
gbc.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

In [None]:
###########   XGB CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('xgbc-experiment')


def xgbc_objective(params):
    """Hyperopt objective: fit and score one XGBClassifier configuration.

    Opens an MLflow run, logs ``params``, fits on ``X_train`` / ``y_train``
    with ``(X_val, y_val)`` as the evaluation set, and predicts on ``X_val``.

    Args:
        params: Keyword arguments for ``xgb.XGBClassifier``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (presumably validation precision -- confirm)
        and ``'status'`` set to ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        classifier = xgb.XGBClassifier(
            objective='binary:logistic',
            random_state=42,
            **params,
        )
        # NOTE(review): the search space in this cell caps n_estimators at 80,
        # so a patience of 250 rounds looks like it can never trigger -- confirm.
        classifier.set_params(early_stopping_rounds=250)
        classifier.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            verbose=False,
        )
        val_predictions = classifier.predict(X_val)

        score = eval_metrics_logs(y_val, val_predictions.round())

    return {'loss': -score, 'status': STATUS_OK}



# Search space for the XGBClassifier tuning run.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 80, 10)),
    max_leaves=scope.int(hp.quniform('max_leaves', 1, 100, 10)),
    # loguniform bounds are exponents: drawn from [e^1.5, e^2.5].
    min_child_weight=hp.loguniform('min_child_weight', 1.5, 2.5),
)

# Run 30 TPE trials; every trial opens its own MLflow run in the objective.
best_result = fmin(
    fn=xgbc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

In [None]:
# Refit the XGBClassifier with fixed hyperparameters under xgboost autologging.
# NOTE(review): these values are hard-coded, presumably copied from the best
# trial of an earlier search -- confirm they are still current.
params = dict(
    max_leaves=50,
    min_child_weight=7.727056599504389,
    n_estimators=50,
)

mlflow.xgboost.autolog()

xgbc = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    **params,
)
xgbc.set_params(early_stopping_rounds=250)
xgbc.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

mlflow.xgboost.autolog(disable=True)

In [None]:
###########   SUPPORT VECTOR CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('svc-experiment')


def svc_objective(params):
    """Hyperopt objective: fit and score one SVC configuration.

    Opens an MLflow run, logs ``params``, fits on ``X_train`` / ``y_train``
    and predicts on ``X_val``.

    Args:
        params: Keyword arguments for ``SVC``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (presumably validation precision -- confirm)
        and ``'status'`` set to ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        classifier = SVC(random_state=42, **params)
        classifier.fit(X_train, y_train)
        val_predictions = classifier.predict(X_val)

        score = eval_metrics_logs(y_val, val_predictions.round())

    return {'loss': -score, 'status': STATUS_OK}


# Search space for the SVC tuning run. The kernel, gamma and class_weight
# entries are commented out, so the estimator defaults apply to them.
# NOTE(review): per the scikit-learn docs `degree` only affects the 'poly'
# kernel; with `kernel` untuned, confirm that tuning `degree` has any effect.
search_space = dict(
    C=hp.uniform('C', 0, 10),
    # kernel=hp.choice('kernel', ['poly', 'rbf', 'sigmoid']),
    degree=scope.int(hp.randint('degree', 1, 12)),
    # gamma=hp.choice('gamma', ['scale', 'auto']),
    # class_weight=hp.choice('class_weight', [None, 'balanced']),
)

# Run 30 TPE trials; every trial opens its own MLflow run in the objective.
best_result = fmin(
    fn=svc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    rstate=random_state,
    verbose=False,
)

In [None]:
# Refit the SVC with fixed hyperparameters under autologging.
# NOTE(review): these values are hard-coded, presumably copied from the best
# trial of an earlier search -- confirm they are still current.
params = dict(C=2.521616767609426, degree=2)

mlflow.sklearn.autolog()

svc = SVC(random_state=42, **params)
svc.fit(X_train, y_train)

mlflow.sklearn.autolog(disable=True)

In [None]:
###########   XG-BOOST CLASSIFIER    ###########

In [None]:
mlflow.set_experiment("xgboost-experiment")


def exgb_objective(params):
    """Hyperopt objective: train and score one native-API XGBoost booster.

    Opens an MLflow run tagged ``model=xgboost``, logs ``params``, trains on
    the module-level ``train`` DMatrix with ``valid`` as the evaluation set,
    and predicts on ``valid``.

    Args:
        params: Booster parameters passed straight to ``xgb.train``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (presumably validation precision -- confirm)
        and ``'status'`` set to ``STATUS_OK``.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=500,
            evals=[(valid, "validation")],
            early_stopping_rounds=250,
            verbose_eval=False
        )
        y_pred = booster.predict(valid)

        # Predictions are rounded before being handed to the metrics helper.
        precision = eval_metrics_logs(y_val, y_pred.round())

    return {'loss': -precision, 'status': STATUS_OK}


# The function used to be defined under this misspelled name while the fmin
# call in this cell asked for `exgb_objective`; keep the old name as an alias.
exgobjective = exgb_objective


# Search space for the native-API XGBoost tuning run. The last three entries
# are constants handed to every trial unchanged.
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 20, 70, 10)),
    # loguniform bounds are exponents: drawn from [e^-30, e^-3].
    learning_rate=hp.loguniform('learning_rate', -30, -3),
    # drawn from [e^1.5, e^2.5]
    min_child_weight=hp.loguniform('min_child_weight', 1.5, 2.5),
    # scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    subsample=hp.uniform('subsample', 0.5, 1),
    # colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    # gamma=hp.uniform('gamma', 0, 1),
    objective='binary:logistic',
    eval_metric="logloss",
    seed=42,
)

# Run 30 TPE trials; every trial opens its own MLflow run in the objective.
best_result = fmin(
    fn=exgb_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

In [None]:
# Retrain the native-API booster with fixed hyperparameters under autologging.
# params = space_eval(search_space, best_result)
# NOTE(review): `gamma` and max_depth=149 fall outside the search space defined
# in the previous cell (gamma is commented out there; max_depth spans 20-70),
# so these values look like a snapshot of an older search -- confirm.
params = dict(
    eval_metric='logloss',
    gamma=0.9511548717715149,
    learning_rate=0.014685011379954318,
    max_depth=149,
    min_child_weight=7.668601934406394,
    objective='binary:logistic',
    seed=42,
    subsample=0.51735171792841,
)

mlflow.xgboost.autolog()

# Early-stopping patience here is 20 rounds, not the 250 used during tuning.
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=500,
    evals=[(valid, "validation")],
    early_stopping_rounds=20,
    verbose_eval=False,
)

mlflow.xgboost.autolog(disable=True)

In [None]:
###########   BEST MODEL (GRADIENT BOOSTING CLASSIFIER)    ###########

In [None]:
mlflow.set_experiment('best_model')

# Train the chosen GradientBoostingClassifier in a dedicated run and log its
# parameters, validation metrics, the fitted model and the preprocessor file.
with mlflow.start_run():
    params = dict(min_samples_leaf=8, min_samples_split=14, n_estimators=90)
    mlflow.log_params(params)

    gbc = GradientBoostingClassifier(random_state=42, **params)
    gbc.fit(X_train, y_train)
    y_pred = gbc.predict(X_val)

    # Every metric is computed on the rounded predictions.
    rounded_pred = y_pred.round()
    metrics = {
        'f1': f1_score(y_val, rounded_pred),
        'precision': precision_score(y_val, rounded_pred, zero_division=0),
        'recall': recall_score(y_val, rounded_pred),
        # NOTE(review): the key says 'pr_auc' but the value is roc_auc_score
        # on rounded predictions -- confirm which metric is intended.
        'pr_auc': roc_auc_score(y_val, rounded_pred),
        'accuracy': accuracy_score(y_val, rounded_pred),
    }
    mlflow.log_metrics(metrics)

    # Store the fitted estimator under the run's 'model' artifact path.
    mlflow.sklearn.log_model(gbc, artifact_path='model')

    # NOTE(review): no cell shown in this notebook writes
    # 'models/preprocessor.b'; it must already exist on disk -- confirm.
    mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')
    mlflow.sklearn.autolog(disable=True)