In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.metrics import roc_auc_score, f1_score

import random
import pickle
import os

%matplotlib inline

In [3]:
import mlflow

mlflow.set_tracking_uri("http://10.138.0.5:5000")
mlflow.set_experiment("experiment-002")

<Experiment: artifact_location='gs://project-mlflow-bucket/1', experiment_id='1', lifecycle_stage='active', name='experiment-002', tags={}>

In [4]:
def process_dataframe(filepath):    
    df = pd.read_csv(filepath)
    
    df.drop(['nativeCountry'], axis=1, inplace=True)
    
    target = 'incomeTarget'

    transformed_target = []

    for _, value in df['incomeTarget'].iteritems():
        if value == ' <=50K':
            transformed_target.append(0)
        else:
            transformed_target.append(1)
    df['incomeTarget'] = transformed_target
    
    y = df[target]
    X = df.drop('incomeTarget', axis=1, inplace=True)
    X = pd.get_dummies(df)

    # Upsample using SMOTE
    sm = SMOTE(random_state=12)
    X_train_sm, y_train_sm = sm.fit_resample(X, y)

    
    # df_new = pd.DataFrame(X_upsampled, columns=X.columns)
    df_new = pd.DataFrame(X_train_sm, columns=X.columns)
    
    return df_new, y_train_sm

In [5]:
X_train, y_train = process_dataframe('../data/adult-train.csv')
X_val, y_val = process_dataframe('../data/adult-val.csv')

len(X_train), len(y_train), len(X_val), len(y_val)

(34340, 34340, 15100, 15100)

In [8]:
X_train.to_csv('../tests/df_new_test.csv', index=False)
y_train.to_csv('../tests/y_train_test.csv', index=False)

In [12]:
pd.read_csv('../tests/y_train_test.csv').columns.tolist()

['incomeTarget']

In [14]:
print(y_train.name)

incomeTarget


In [16]:
dv = DictVectorizer()

train_dicts = X_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = X_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [18]:
X_train

(34340, 66)

In [6]:
logreg = LogisticRegression(max_iter=500)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_val)

In [7]:
os.makedirs('models/', exist_ok=True)

with open('models/logreg.bin', 'wb') as f_out:
    pickle.dump((dv, logreg), f_out)

In [8]:
with mlflow.start_run():
    mlflow.set_tag("developer", "enchristos")
    
    mlflow.log_param("train-data-path", "../data/adult-train.csv")
    mlflow.log_param("valid-data-path", "../data/adult-test.csv")
    
    max_iter = 500
    mlflow.log_param("max_iter", max_iter)
    logreg = LogisticRegression(max_iter=max_iter)
    logreg.fit(X_train, y_train)
    
    y_pred = logreg.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    mlflow.log_metric("auc", auc)
    mlflow.log_metric("fi_score", f1)
    mlflow.log_metric("loss_metric", 1-f1)
    
    mlflow.log_artifact(local_path="models/logreg.bin", artifact_path="models_pickle")

In [9]:
with mlflow.start_run():
    mlflow.set_tag("developer", "enchristos")
    
    mlflow.log_param("train-data-path", "../data/adult-train.csv")
    mlflow.log_param("valid-data-path", "../data/adult-test.csv")
    
    learning_rate = 0.2
    mlflow.log_param("learning_rate", learning_rate)
    gbc = GradientBoostingClassifier(learning_rate=learning_rate)
    gbc.fit(X_train, y_train)
    
    y_pred = gbc.predict(X_val)
    
    auc = roc_auc_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    
    mlflow.log_metric("auc", auc)
    mlflow.log_metric("fi_score", f1)
    mlflow.log_metric("loss_metric", 1-f1)
    
    with open('models/gbc.bin', 'wb') as f_out:
        pickle.dump((dv, gbc), f_out)
    
    mlflow.log_artifact(local_path="models/gbc.bin", artifact_path="models_pickle")

In [10]:
with mlflow.start_run():
    mlflow.set_tag("developer", "enchristos")
    
    mlflow.log_param("train-data-path", "../data/adult-train.csv")
    mlflow.log_param("valid-data-path", "../data/adult-test.csv")
    
    n_estimators = 100
    mlflow.log_param("n_estimators", n_estimators)
    rfc = RandomForestClassifier(n_estimators=n_estimators)
    rfc.fit(X_train, y_train)
    
    y_pred = rfc.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    mlflow.log_metric("auc", auc)
    mlflow.log_metric("fi_score", f1)
    mlflow.log_metric("loss_metric", 1-f1)
    
    with open('models/rfc.bin', 'wb') as f_out:
        pickle.dump((dv, gbc), f_out)
    
    mlflow.log_artifact(local_path="models/rfc.bin", artifact_path="models_pickle")

In [11]:
def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("model", "XGBClassifier")
        mlflow.set_tag("developer", "enchristos")
        mlflow.log_params(params)
        booster = xgb.XGBClassifier(
            n_estimators = params['n_estimators'],
            max_depth = int(params['max_depth']),
            learning_rate = params['learning_rate'],
            gamma = params['gamma'],
            min_child_weight = params['min_child_weight'],
            subsample = params['subsample'],
            colsample_bytree = params['colsample_bytree'],
            random_state = params['seed']
        )
        
        booster.fit(X_train, y_train)
        
        y_pred = booster.predict(X_val)
        auc = roc_auc_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric("auc", auc)
        mlflow.log_metric("fi_score", f1)
        mlflow.log_metric("loss_metric", 1-f1)

    return {'loss': 1-f1, 'status': STATUS_OK}

In [12]:
search_space = {
    'max_depth' : scope.int(hp.quniform('max_depth', 4, 10, 0.1)),
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01),
    'n_estimators' : hp.choice('n_estimators', range(0, 50, 1)),
    'gamma' : hp.quniform('gamma', 0.01, 0.50, 0.01),
    'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 0.1),
    'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
    'colsample_bytree' : hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
    'seed': 42
}


best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=500,
    trials=Trials()
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [06:46<00:00,  1.23trial/s, best loss: 0.08421052631578951]


In [13]:
best_result

{'colsample_bytree': 0.26,
 'gamma': 0.46,
 'learning_rate': 0.47000000000000003,
 'max_depth': 7.0,
 'min_child_weight': 2.9000000000000004,
 'n_estimators': 39,
 'subsample': 0.9}

In [16]:
with mlflow.start_run():
    
    best_params = {
         'colsample_bytree': 0.26,
         'gamma': 0.46,
         'learning_rate': 0.47000000000000003,
         'max_depth': 7.0,
         'min_child_weight': 2.9000000000000004,
         'n_estimators': 39,
         'subsample': 0.9,
         'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.XGBClassifier(
            n_estimators = best_params['n_estimators'],
            max_depth = int(best_params['max_depth']),
            learning_rate = best_params['learning_rate'],
            gamma = best_params['gamma'],
            min_child_weight = best_params['min_child_weight'],
            subsample = best_params['subsample'],
            colsample_bytree = best_params['colsample_bytree'],
            random_state = best_params['seed']
        )
        
    booster.fit(X_train, y_train)
        
    y_pred = booster.predict(X_val)
    auc = roc_auc_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    mlflow.log_metric("auc", auc)
    mlflow.log_metric("fi_score", f1)
    mlflow.log_metric("loss_metric", 1-f1)

    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

In [17]:
mlflow.sklearn.autolog()

for model_class in (RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, xgb.XGBClassifier):
    with mlflow.start_run():
        mlflow.set_tag("developer", "enchristos")
        mlflow.log_param("train-data-path", "../data/adult-train.csv")
        mlflow.log_param("valid-data-path", "../data/adult-test.csv")
        
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)
        
        y_pred = mlmodel.predict(X_val)
        auc = roc_auc_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred)
        mlflow.log_metric("auc", auc)
        mlflow.log_metric("fi_score", f1)
        mlflow.log_metric("loss_metric", 1-f1)

In [18]:
mlflow.get_artifact_uri()

'gs://project-mlflow-bucket/1/d90c95eeb7514ad49998ae9c690f9bfb/artifacts'