In [1]:
from mlops.util_funcs import clean_data, transform_data, eval_metrics_logs
from mlops.project_libraries import *

##########################    READING IN DATASETS    ##########################

In [2]:
# Load the feature tables for the training and testing splits (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/training_features.csv",
        "../data/testing_features.csv",
    )
)

In [3]:
# Read each target CSV and flatten it straight into a 1-D array,
# so the names never hold an intermediate DataFrame.
y_train = pd.read_csv("../data/training_targets.csv").values.ravel()
y_test = pd.read_csv("../data/testing_targets.csv").values.ravel()

In [4]:
# Apply the project's cleaning/preprocessing helper to each split (train first, then test).
train_clean, test_clean = map(clean_data, (df_train, df_test))

In [5]:
# Normalize and fill null values on each cleaned split (train first, then test).
# NOTE(review): every split goes through its own transform_data call; if the helper
# fits normalization statistics internally, the test split is scaled independently
# of the training split -- confirm in mlops.util_funcs.
train_norm, test_norm = map(transform_data, (train_clean, test_clean))

In [6]:
# Convert each normalized frame into a list of per-row dicts
# (the record format fed to DictVectorizer below).
train_dict, val_dict = (
    frame.to_dict(orient='records') for frame in (train_norm, test_norm)
)

# Vectorizer + default logistic regression; Pipeline.fit returns the fitted
# pipeline itself, so the fitted object is bound directly.
model_docker = make_pipeline(DictVectorizer(), LogisticRegression()).fit(
    train_dict, y_train
)

# Persist the fitted pipeline next to the notebook.
with open('model.pkl', 'wb') as file:
    pickle.dump(model_docker, file)

##########################    SETTING UP REMOTE MLFLOW ENVIRONMENT   ##########################

In [7]:
# AWS profile: an AWS_PROFILE already exported in the environment wins;
# otherwise fall back to the project default.
os.environ.setdefault("AWS_PROFILE", "demiga-g")

# Tracking server host. Defaults to the local server; export TRACKING_SERVER_HOST
# (e.g. the remote server's address, previously 13.51.13.52) to point the notebook
# elsewhere without editing this cell.
TRACKING_SERVER_HOST = os.environ.get("TRACKING_SERVER_HOST", "127.0.0.1")

# Setting tracking uri (uniform resource identifier)
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Check validity of url
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [8]:
# Setting experiment
mlflow.set_experiment(experiment_name='all-models')

# Delete the `Default` experiment (best effort).
# get_experiment_by_name returns None when no such experiment exists, so guard
# before touching its attributes.
default_expt = mlflow.get_experiment_by_name('Default')
expt_id = None if default_expt is None else default_expt.experiment_id

if default_expt is None or default_expt.lifecycle_stage == 'deleted':
    # Nothing left to do: it never existed or was removed on a previous run.
    print('`Default` deleted...')
elif expt_id == "0":
    try:
        mlflow.delete_experiment(expt_id)
        print('`Default` deleted...')
    except mlflow.exceptions.MlflowException as exc:
        # Deletion is optional, so report the failure instead of hiding it or aborting.
        print(f'Could not delete `Default`: {exc}')

##########################    LOGGING DEFAULT MODELS    ##########################

In [9]:
import mlflow.models.signature

# One seed value: bound into the model factories as SEED and used to build the
# NumPy generator that is handed to hyperopt as `rstate`.
SEED = 0
random_state = np.random.default_rng(SEED)

# dictionary vectorizer shared by the pipelines below (dense output)
dv = DictVectorizer(sparse=False)

# model signature inferred from the training records and training targets
signature = mlflow.models.signature.infer_signature(train_dict, y_train)

In [10]:
# Estimator factories with the shared seed pre-bound; calling one with extra
# keyword arguments yields a configured, unfitted estimator.
svc_config = partial(SVC, probability=True, random_state=SEED)
log_reg_config = partial(LogisticRegression, max_iter=1500, random_state=SEED)
rfc_config = partial(RandomForestClassifier, random_state=SEED)
gbc_config = partial(GradientBoostingClassifier, random_state=SEED)
xgb_config = partial(XGBClassifier, random_state=SEED)
dtc_config = partial(DecisionTreeClassifier, random_state=SEED)

# (factory, label) pairs in a fixed order; GaussianNB is used as-is
# because no seed is bound for it.
models = [
    (factory, label)
    for label, factory in {
        "SVC": svc_config,
        "LOG-REG": log_reg_config,
        "RFC": rfc_config,
        "GBC": gbc_config,
        "XG-BOOST": xgb_config,
        "DTC": dtc_config,
        "GAUS": GaussianNB,
    }.items()
]

In [11]:
# auto-logging scikit-learn models, their parameters, and the dictionary vectorizer
mlflow.sklearn.autolog(log_datasets=False)

for model_class, model_name in models:
    # Name the run after the model so the baseline runs can be told apart
    # in the MLflow UI (the label was previously unpacked but never used).
    with mlflow.start_run(run_name=model_name):
        # Vectorizer + freshly constructed estimator for this run.
        pipeline = make_pipeline(dv, model_class())

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        # Evaluate on the held-out targets; the helper's return value is not needed here.
        eval_metrics_logs(y_test, y_pred.round())

        # Log the model and the vectorizer in the pipeline as one
        mlflow.sklearn.log_model(pipeline, artifact_path='model', signature=signature)



##########################    LOGGING TUNED MODELS    ##########################

###########   LOGISTIC REGRESSION    ###########

In [12]:
mlflow.set_experiment("log-reg")

# Turn sklearn autologging off BEFORE the search starts. It used to be disabled
# inside the objective after the fit, so the first trial was still autologged
# while every later trial was not.
mlflow.sklearn.autolog(disable=True)


# objective function for logistic regression
def log_reg_objective(params):
    """Train and log one logistic-regression trial for hyperopt.

    Args:
        params: dict of keyword arguments sampled from the search space; they
            are logged to the run and forwarded to ``log_reg_config``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (bound to ``precision`` here -- presumably the
        precision score; confirm in mlops.util_funcs) and ``'status'`` set to
        ``STATUS_OK``. The value is negated because hyperopt minimizes the loss.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for logistic regression and vectorizer
        pipeline = make_pipeline(dv, log_reg_config(**params))

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_test, y_pred.round())

        # log the model and the vectorizer as one
        mlflow.sklearn.log_model(pipeline, artifact_path='model', signature=signature)

    return {'loss': -precision, 'status': STATUS_OK}


# define parameters to tune
space = {
    # C is drawn log-uniformly between e^-20 and e^4
    'C': hp.loguniform('C', -20, 4),
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'solver': hp.choice('solver', ['liblinear', 'saga'])
}


# run the search; every trial is logged to mlflow by the objective
best_result = fmin(
    fn=log_reg_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False
)



###########   RANDOM FOREST CLASSIFIER    ###########

In [13]:
mlflow.set_experiment('rfc')

# Turn sklearn autologging off BEFORE the search starts. It used to be disabled
# inside the objective after the fit, which is too late for the first trial
# whenever autologging is still active.
mlflow.sklearn.autolog(disable=True)


# objective function for random forest classifier
def rfc_objective(params):
    """Train and log one random-forest trial for hyperopt.

    Args:
        params: dict of keyword arguments sampled from the search space; they
            are logged to the run and forwarded to ``rfc_config`` together
            with ``n_jobs=-1``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (bound to ``precision`` here -- presumably the
        precision score; confirm in mlops.util_funcs) and ``'status'`` set to
        ``STATUS_OK``. The value is negated because hyperopt minimizes the loss.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for random forest and vectorizer
        pipeline = make_pipeline(dv, rfc_config(**params, n_jobs=-1))

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_test, y_pred.round())

        # log the model and the vectorizer as one
        mlflow.sklearn.log_model(pipeline, artifact_path='model', signature=signature)

    return {'loss': -precision, 'status': STATUS_OK}


# defining the hyperparameters (quniform draws are cast to int for sklearn)
search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 40, 120, 10)),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_depth': scope.int(hp.quniform('max_depth', 10, 50, 10)),
}

# run the search; every trial is logged to mlflow by the objective
best_result = fmin(
    fn=rfc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    # explicit fresh Trials store, matching the other searches in this notebook
    trials=Trials(),
    rstate=random_state,
    verbose=False
)


###########   SKLEARN GRADIENT BOOSTING CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('gbc-sklearn')

# Turn sklearn autologging off BEFORE the search starts. It used to be disabled
# inside the objective after the fit, which is too late for the first trial
# whenever autologging is still active.
mlflow.sklearn.autolog(disable=True)


# objective function for sklearn gradient boosting classifier
def sgbc_objective(params):
    """Train and log one gradient-boosting trial for hyperopt.

    Args:
        params: dict of keyword arguments sampled from the search space; they
            are logged to the run and forwarded to ``gbc_config``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (bound to ``precision`` here -- presumably the
        precision score; confirm in mlops.util_funcs) and ``'status'`` set to
        ``STATUS_OK``. The value is negated because hyperopt minimizes the loss.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for gradient boosting and vectorizer
        pipeline = make_pipeline(dv, gbc_config(**params))

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_test, y_pred.round())

        # log the model and the vectorizer
        mlflow.sklearn.log_model(pipeline, artifact_path='model', signature=signature)

    return {'loss': -precision, 'status': STATUS_OK}


# defining the search space (quniform draws are cast to int for sklearn)
search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 10, 70, 10)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 10, 20, 2)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 6, 20, 1)),
}

# run the search; every trial is logged to mlflow by the objective
best_result = fmin(
    fn=sgbc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False
)

###########   SUPPORT VECTOR CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('svc')

# Turn sklearn autologging off BEFORE the search starts. It used to be disabled
# inside the objective after the fit, which is too late for the first trial
# whenever autologging is still active.
mlflow.sklearn.autolog(disable=True)


# objective function for SVM classifier
def svc_objective(params):
    """Train and log one support-vector-classifier trial for hyperopt.

    Args:
        params: dict of keyword arguments sampled from the search space; they
            are logged to the run and forwarded to ``svc_config``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (bound to ``precision`` here -- presumably the
        precision score; confirm in mlops.util_funcs) and ``'status'`` set to
        ``STATUS_OK``. The value is negated because hyperopt minimizes the loss.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # make pipeline with the model and vectorizer
        pipeline = make_pipeline(dv, svc_config(**params))
        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_test, y_pred.round())

        # log in the model and vectorizer
        mlflow.sklearn.log_model(pipeline, artifact_path='model', signature=signature)

    return {'loss': -precision, 'status': STATUS_OK}


search_space = {
    # NOTE(review): the lower bound 0 is the edge of SVC's valid range (C must be
    # strictly positive); consider a small positive lower bound.
    'C': hp.uniform('C', 0, 15),
    'kernel': hp.choice('kernel', ['poly', 'rbf', 'sigmoid']),
    'degree': scope.int(hp.randint('degree', 1, 12)),
    'gamma': hp.choice('gamma', ['scale', 'auto']),
}

# run the search; every trial is logged to mlflow by the objective
best_result = fmin(
    fn=svc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    # explicit fresh Trials store, matching the other searches in this notebook
    trials=Trials(),
    rstate=random_state,
    verbose=False
)

###########   XGB CLASSIFIER    ###########

In [None]:
mlflow.set_experiment('xgbc')

# Turn autologging off BEFORE the search starts. The estimator is fitted through
# a scikit-learn pipeline, so the sklearn flavor is disabled as well as the
# xgboost one; previously only the xgboost flavor was disabled, and only after
# the first fit had already happened.
mlflow.sklearn.autolog(disable=True)
mlflow.xgboost.autolog(disable=True)


# objective function for xgboost classifier
def xgbc_objective(params):
    """Train and log one XGBoost-classifier trial for hyperopt.

    Args:
        params: dict of keyword arguments sampled from the search space; they
            are logged to the run and forwarded to ``xgb_config``.

    Returns:
        dict with ``'loss'`` set to the negated value returned by
        ``eval_metrics_logs`` (bound to ``precision`` here -- presumably the
        precision score; confirm in mlops.util_funcs) and ``'status'`` set to
        ``STATUS_OK``. The value is negated because hyperopt minimizes the loss.
    """
    with mlflow.start_run():
        mlflow.log_params(params)

        # pipeline for xgboost and vectorizer
        pipeline = make_pipeline(dv, xgb_config(**params))

        pipeline.fit(train_dict, y_train)
        y_pred = pipeline.predict(val_dict)

        precision = eval_metrics_logs(y_test, y_pred.round())

        # log the pipeline
        mlflow.sklearn.log_model(pipeline, artifact_path="model", signature=signature)

    return {'loss': -precision, 'status': STATUS_OK}


search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 50, 80, 10)),
    # NOTE(review): quniform rounds to multiples of 10, so draws below 5 become 0;
    # confirm that max_leaves=0 (believed to mean "no limit" in XGBoost) is intended.
    'max_leaves': scope.int(hp.quniform('max_leaves', 1, 100, 10)),
    # min_child_weight is drawn log-uniformly between e^1.5 and e^2.5
    'min_child_weight': hp.loguniform('min_child_weight', 1.5, 2.5),
}

# run the search; every trial is logged to mlflow by the objective
best_result = fmin(
    fn=xgbc_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials(),
    rstate=random_state,
    verbose=False
)