In [1]:
from mlops.util_funcs import scrub_data, eval_metrics_logs
from mlops.project_libraries import *

##########################    READING IN DATASETS    ##########################

In [2]:
# Load the training and validation splits from disk.
# NOTE(review): the validation frame is read from `test_df.csv` - confirm that
# using the test file as the validation split is intended.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_df.csv", "../data/test_df.csv")
)

In [3]:
# Pull the `Response` column out of each raw frame to serve as the target.
y_train, y_val = df_train["Response"], df_val["Response"]

In [4]:
# Run the project's cleaning/preprocessing helper over each split.
# NOTE(review): the targets were extracted from the raw frames before this
# step - confirm that `scrub_data` neither drops rows (which would misalign
# features and targets) nor leaves `Response` among the features.
train_data, val_data = scrub_data(df_train), scrub_data(df_val)

In [5]:
# Turn each cleaned frame into a list of per-row dicts, the input format
# consumed by DictVectorizer.
train_dict, val_dict = (
    frame.to_dict(orient='records') for frame in (train_data, val_data)
)

# Baseline pipeline: DictVectorizer followed by LogisticRegression, both with
# their default settings.
model_docker = make_pipeline(DictVectorizer(), LogisticRegression())
model_docker.fit(train_dict, y_train)

# Persist the fitted pipeline to `model.pkl` in the working directory.
with open('model.pkl', 'wb') as file:
    pickle.dump(model_docker, file)

##########################    SETTING UP REMOTE MLFLOW ENVIRONMENT   ##########################

In [6]:
# Host of the MLflow tracking server (the trailing comment keeps the local
# alternative at hand).
TRACKING_SERVER_HOST = '13.51.13.52' #'127.0.0.1'

# Select the AWS profile through the AWS_PROFILE environment variable.
os.environ["AWS_PROFILE"] = "demiga-g"

# Point the MLflow client at the tracking server on port 5000.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Echo the URI MLflow actually stored, as a sanity check.
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://13.51.13.52:5000'


In [7]:
# Select (or create) the experiment that collects the baseline runs
mlflow.set_experiment(experiment_name='all-models')

# Delete the `Default` experiment.
# `get_experiment_by_name` returns None when no experiment has that name, so
# look the experiment up first and only then read its id.
default_expt = mlflow.get_experiment_by_name('Default')
expt_id = default_expt.experiment_id if default_expt is not None else None

if expt_id is None:
    print('`Default` not found, nothing to delete...')
else:
    try:
        mlflow.delete_experiment(expt_id)
    except mlflow.exceptions.MlflowException as exc:
        # Best effort: report why the deletion failed instead of hiding it,
        # but do not stop the notebook.
        print(f'`Default` could not be deleted: {exc}')
    else:
        # Only reached when the delete call succeeded.
        print('`Default` deleted...')

##########################    LOGGING DEFAULT MODELS    ##########################

In [8]:
# Single DictVectorizer instance shared by the pipelines built further down.
dv = DictVectorizer()

# Seeded NumPy generator; handed to hyperopt's `fmin` as `rstate` in the
# tuning cells.
random_state = np.random.default_rng(seed=42)

In [9]:
# auto-logging scikit-learn models, their parameters, and the dictionary vectorizer
mlflow.sklearn.autolog(log_datasets=False)

for model_class in (SVC,
                    LogisticRegression,
                    RandomForestClassifier,
                    GradientBoostingClassifier
):
    # One MLflow run per untuned estimator.
    with mlflow.start_run():
        pipeline = make_pipeline(
            dv,
            model_class()
        )

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        eval_metrics_logs(y_val, y_pred.round())

        # Log the model and the vectorizer in the pipeline as one
        mlflow.sklearn.log_model(pipeline, artifact_path='model')

# Switch sklearn autologging back off once the baseline runs are done, the same
# way the xgboost baseline cell does, so it cannot leak into later cells.
mlflow.sklearn.autolog(disable=True)



In [10]:
# Vectorise both splits once up front: fit the shared vectorizer on the
# training records, then reuse it to transform the validation records.
train_dict_vect = dv.fit_transform(train_dict)
val_dict_vect = dv.transform(val_dict)

# required xgboost params
params = dict(
    objective='binary:logistic',
    eval_metric="logloss",
    random_state=42,
    early_stopping_rounds=250,
)

In [11]:
# Enable xgboost autologging for the baseline run below; it is switched off
# again right after the run.
mlflow.xgboost.autolog()

with mlflow.start_run():
    mlflow.log_params(params)

    # Fit on the pre-vectorised training matrix, passing the validation
    # matrix as the evaluation set.
    model = XGBClassifier(**params)
    model.fit(
        train_dict_vect,
        y_train,
        eval_set=[(val_dict_vect, y_val)],
        verbose=False,
    )

    y_pred = model.predict(val_dict_vect)
    precision = eval_metrics_logs(y_val, y_pred.round())

    # Bundle the already-fitted vectorizer and model so they are logged as a
    # single sklearn pipeline artifact.
    pipeline = make_pipeline(dv, model)
    mlflow.sklearn.log_model(pipeline, artifact_path="model")

mlflow.xgboost.autolog(disable=True)

<html lang=en>
<title>404 Not Found</title>
<h1>Not Found</h1>
<p>The requested URL was not found on the server. If you entered the URL manually please check your spelling and try again.</p>
'


##########################    LOGGING TUNED MODELS    ##########################

###########   LOGISTIC REGRESSION    ###########

In [12]:
mlflow.set_experiment("log-reg")

# Turn sklearn autologging off before the first trial starts, so every trial
# is logged the same way (explicit params, metrics and model only).
mlflow.sklearn.autolog(disable=True)


# objective function for logistic regression
def log_reg_objective(params):
    """Train, evaluate and log one logistic-regression trial for hyperopt.

    Args:
        params: Hyperparameters sampled by hyperopt; logged to MLflow and
            forwarded as keyword arguments to ``LogisticRegression``.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. The loss is the negated
        value returned by ``eval_metrics_logs`` (bound to ``precision``), so
        minimising the loss maximises that value.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for logistic regression and vectorizer
        # random_state is pinned like in the other tuned models; both sampled
        # solvers ('liblinear', 'saga') shuffle the data.
        pipeline = make_pipeline(
            dv,
            LogisticRegression(**params, max_iter=5000, random_state=42)
        )

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_val, y_pred.round())

        # log the model and the vectorizer as one
        mlflow.sklearn.log_model(pipeline, artifact_path='model')

    return {'loss': -precision, 'status': STATUS_OK}

# Search space for the logistic-regression trials.
# `hp.loguniform` takes its bounds in log space, i.e. C = exp(uniform(-20, 4)).
space = dict(
    C=hp.loguniform('C', -20, 4),
    class_weight=hp.choice('class_weight', [None, 'balanced']),
    penalty=hp.choice('penalty', ['l1', 'l2']),
    solver=hp.choice('solver', ['liblinear', 'saga']),
)


# Run 30 TPE trials; every trial is logged to MLflow by the objective itself.
best_result = fmin(
    log_reg_objective,
    space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

2024/07/26 07:06:23 INFO mlflow.tracking.fluent: Experiment with name 'log-reg' does not exist. Creating a new experiment.


###########   RANDOM FOREST CLASSIFIER    ###########

In [13]:
mlflow.set_experiment('rfc')

# Turn sklearn autologging off before the first trial starts, so every trial
# is logged the same way (explicit params, metrics and model only).
mlflow.sklearn.autolog(disable=True)


# objective function for random forest classifier
def rfc_objective(params):
    """Train, evaluate and log one random-forest trial for hyperopt.

    Args:
        params: Hyperparameters sampled by hyperopt; logged to MLflow and
            forwarded as keyword arguments to ``RandomForestClassifier``.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. The loss is the negated
        value returned by ``eval_metrics_logs`` (bound to ``precision``), so
        minimising the loss maximises that value.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for random forest and vectorizer
        pipeline = make_pipeline(
            dv,
            RandomForestClassifier(**params, n_jobs=-1,
                                   random_state=42)
        )

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_val, y_pred.round())

        # log the model and the vectorizer as one
        mlflow.sklearn.log_model(pipeline, artifact_path='model')

    return {'loss': -precision, 'status': STATUS_OK}


# Search space for the random-forest trials.
# `hp.quniform(label, low, high, q)` yields values rounded to multiples of q;
# `scope.int` casts them to integers.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 10, 150, 10)),
    criterion=hp.choice('criterion', ['gini', 'entropy']),
    max_depth=scope.int(hp.quniform('max_depth', 10, 30, 10)),
)

# Run 30 TPE trials; every trial is logged to MLflow by the objective itself.
best_result = fmin(
    rfc_objective,
    search_space,
    algo=tpe.suggest,
    max_evals=30,
    rstate=random_state,
    verbose=False,
)


2024/07/26 07:14:43 INFO mlflow.tracking.fluent: Experiment with name 'rfc' does not exist. Creating a new experiment.


###########   SKLEARN GRADIENT BOOSTING CLASSIFIER    ###########

In [14]:
mlflow.set_experiment('gbc-sklearn')

# Turn sklearn autologging off before the first trial starts, so every trial
# is logged the same way (explicit params, metrics and model only).
mlflow.sklearn.autolog(disable=True)


# objective function for sklearn gradient boosting classifier
def sgbc_objective(params):
    """Train, evaluate and log one gradient-boosting trial for hyperopt.

    Args:
        params: Hyperparameters sampled by hyperopt; logged to MLflow and
            forwarded as keyword arguments to ``GradientBoostingClassifier``.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. The loss is the negated
        value returned by ``eval_metrics_logs`` (bound to ``precision``), so
        minimising the loss maximises that value.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for gradient boosting and vectorizer
        pipeline = make_pipeline(
            dv,
            GradientBoostingClassifier(**params, random_state=42)
        )

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_val, y_pred.round())

        # log the model and the vectorizer
        mlflow.sklearn.log_model(pipeline, artifact_path='model')

    return {'loss': -precision,'status': STATUS_OK}

# Search space for the gradient-boosting trials.
# `hp.quniform(label, low, high, q)` yields values rounded to multiples of q;
# `scope.int` casts them to integers.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 100, 10)),
    min_samples_split=scope.int(hp.quniform('min_samples_split', 2, 16, 2)),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 10, 1)),
)

# Run 30 TPE trials; every trial is logged to MLflow by the objective itself.
best_result = fmin(
    sgbc_objective,
    search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

2024/07/26 07:21:23 INFO mlflow.tracking.fluent: Experiment with name 'gbc-sklearn' does not exist. Creating a new experiment.


###########   XGB CLASSIFIER    ###########

In [15]:
mlflow.set_experiment('xgbc')

# Make sure xgboost autologging is off before the first trial starts (for
# example when the baseline xgboost cell was interrupted before it could
# disable it), so every trial is logged the same way.
mlflow.xgboost.autolog(disable=True)


# objective function for xgboost classifier
def xgbc_objective(params):
    """Train, evaluate and log one XGBoost trial for hyperopt.

    The model is fitted on the pre-vectorised matrices (``train_dict_vect`` /
    ``val_dict_vect``) and then wrapped together with the shared vectorizer
    ``dv`` so both are logged as one sklearn pipeline.

    Args:
        params: Hyperparameters sampled by hyperopt; logged to MLflow and
            forwarded as keyword arguments to ``XGBClassifier``.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. The loss is the negated
        value returned by ``eval_metrics_logs`` (bound to ``precision``), so
        minimising the loss maximises that value.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        model = XGBClassifier(**params,
                              random_state=42,
                              objective='binary:logistic',
                              early_stopping_rounds=250)

        # The validation matrix is passed as the evaluation set.
        model.fit(train_dict_vect, y_train,
                  eval_set=[(val_dict_vect, y_val)],
                  verbose=False)
        y_pred = model.predict(val_dict_vect)

        precision = eval_metrics_logs(y_val, y_pred.round())

        # make pipeline with the model and dictionary vectorizer
        pipeline = make_pipeline(dv, model)

        # log the pipeline
        mlflow.sklearn.log_model(pipeline, artifact_path="model")

    return {'loss': -precision, 'status': STATUS_OK}



# Search space for the XGBoost trials.
# `hp.loguniform` takes its bounds in log space, so min_child_weight is
# exp(uniform(1.5, 2.5)).
# NOTE(review): `hp.quniform('max_leaves', 1, 100, 10)` rounds to multiples of
# 10, so low samples presumably become 0 - confirm that XGBoost's handling of
# max_leaves=0 is what is wanted here.
search_space = dict(
    n_estimators=scope.int(hp.quniform('n_estimators', 50, 80, 10)),
    max_leaves=scope.int(hp.quniform('max_leaves', 1, 100, 10)),
    min_child_weight=hp.loguniform('min_child_weight', 1.5, 2.5),
)

# Run 30 TPE trials; every trial is logged to MLflow by the objective itself.
best_result = fmin(
    xgbc_objective,
    search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False,
)

2024/07/26 07:28:08 INFO mlflow.tracking.fluent: Experiment with name 'xgbc' does not exist. Creating a new experiment.


###########   SUPPORT VECTOR CLASSIFIER    ###########

In [16]:
mlflow.set_experiment('svc')

# Turn sklearn autologging off before the first trial starts, so every trial
# is logged the same way (explicit params, metrics and model only).
mlflow.sklearn.autolog(disable=True)


# objective function for SVM classifier
def svc_objective(params):
    """Train, evaluate and log one support-vector-classifier trial for hyperopt.

    Args:
        params: Hyperparameters sampled by hyperopt; logged to MLflow and
            forwarded as keyword arguments to ``SVC``.

    Returns:
        dict: ``{'loss': ..., 'status': STATUS_OK}``. The loss is the negated
        value returned by ``eval_metrics_logs`` (bound to ``precision``), so
        minimising the loss maximises that value.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # make pipeline with the model and vectorizer
        pipeline = make_pipeline(
            dv,
            SVC(**params, random_state=42)
        )
        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_val, y_pred.round())

        # log in the model and vectorizer
        mlflow.sklearn.log_model(pipeline, artifact_path='model')

    return {'loss': -precision, 'status': STATUS_OK}


# Search space for the SVC trials. The commented-out entries are candidate
# dimensions that are currently not searched.
# NOTE(review): scikit-learn documents `degree` as used only by the 'poly'
# kernel; with `kernel` not searched here, this dimension presumably has no
# effect on the fitted model - confirm.
search_space = dict(
    C=hp.uniform('C', 0, 10),
    # kernel=hp.choice('kernel', ['poly', 'rbf', 'sigmoid']),
    degree=scope.int(hp.randint('degree', 1, 12)),
    # gamma=hp.choice('gamma', ['scale', 'auto']),
    # class_weight=hp.choice('class_weight', [None, 'balanced'])
)

# Run 30 TPE trials; every trial is logged to MLflow by the objective itself.
best_result = fmin(
    svc_objective,
    search_space,
    algo=tpe.suggest,
    max_evals=30,
    rstate=random_state,
    verbose=False,
)

2024/07/26 07:34:22 INFO mlflow.tracking.fluent: Experiment with name 'svc' does not exist. Creating a new experiment.
