# XGboost model train

In [28]:
!pip install mlflow



In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import logging
import mlflow
from urllib.parse import urlparse
import xgboost as xgb


## Splitting dataset into train and test

In [30]:
def get_split_train_data():
  """Return a tuple containing split train data into X_train X_test, y_train and y_test."""
  df = pd.read_csv('./preprocessed_application_train.csv')
  train, test = train_test_split(df)
  X_train = train.drop(['TARGET'], axis=1)
  X_test = test.drop(['TARGET'], axis=1)
  y_train = train[['TARGET']]
  y_test = test[['TARGET']]
  return X_train, X_test, y_train, y_test

## Adding MLFLow workflow

### Configuring logs

In [31]:
def get_configured_logger():
  """Return a logger for console outputs configured to print warnings."""
  logging.basicConfig(level=logging.WARN)
  return logging.getLogger(__name__)

### Training model on split data

In [32]:
def train_xgboost_classifier(X_train, y_train):
  """Return GradientBoostingClassifier fit on input ndarrays X_train and y_train.

  Keyword arguments:
  X_train -- ndarray containing all train columns except target column
  y_train -- ndarray target column values to train the model
  """
  clf = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 0.3, learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 10)
  clf = clf.fit(X_train, y_train)
  return clf

### Getting model evaluation metrics

In [33]:
def eval_metrics(actual, pred):
  """Return a tuple containing model classification accuracy and confusion matrix.

  Keyword arguments:
  actual -- ndarray y_test containing true target values
  pred -- ndarray of the predicted target values by the model
  """
  accuracy = accuracy_score(actual, pred)
  conf_matrix = confusion_matrix(actual, pred)
  return accuracy, conf_matrix

In [34]:
def get_model_evaluation_metrics(clf, X_test, y_test):
  """Return a tuple containing model classification accuracy and confusion matrix.
  
  Keyword arguments:
  clf -- classifier model
  X_test -- ndarray containing all test columns except target column
  y_test -- ndarray target column values to test the model
  """
  predicted_repayments = clf.predict(X_test)
  (accuracy, conf_matrix) = eval_metrics(y_test, predicted_repayments)
  return accuracy, conf_matrix

### Tracking model on MLFLow

In [35]:
def track_model_params(clf):
  """Log model params on MLFlow UI.

  Keyword arguments:
  clf -- classifier model
  """
  clf_params = clf.get_params()
  for param in clf_params:
      param_value = clf_params[param]
      mlflow.log_param(param, param_value)

In [36]:
def track_model_metrics(clf, X_test, y_test):
  """Log model metrics on MLFlow UI.
  
  Keyword arguments:
  clf -- classifier model
  X_test -- ndarray containing all test columns except target column
  y_test -- ndarray target column values to test the model
  """
  (accuracy, conf_matrix) = get_model_evaluation_metrics(clf, X_test, y_test)
  mlflow.log_metric('accuracy', accuracy)
  #mlflow.log_metric('conf_matrix', conf_matrix)

In [37]:
def track_model_version(clf):
  """Version model on MLFlow UI.

  Keyword arguments:
  clf -- classifier model
  """
  tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
  if tracking_url_type_store != 'file':
      mlflow.sklearn.log_model(clf, 'model', registered_model_name='GradientBoostingClassifier')
  else:
      mlflow.sklearn.log_model(clf, 'model')

In [38]:
def set_mlflow_run_tags():
  """Set current MLFlow run tags."""
  tags = {'model_name': 'XGBoostClassifier'}
  mlflow.set_tags(tags)

In [39]:
def train_and_track_model_in_mlflow():
  """Train model and track it with MLFLow"""
  (X_train, X_test, y_train, y_test) = get_split_train_data()
  logger = get_configured_logger()
  clf = train_xgboost_classifier(X_train, y_train)
  with mlflow.start_run():
    track_model_params(clf)
    track_model_metrics(clf, X_test, y_test)
    track_model_version(clf)
    set_mlflow_run_tags()

In [40]:
if __name__ == '__main__':
    train_and_track_model_in_mlflow()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
