# 0.0. Imports

In [2]:
import sys
import joblib 

# Put '../src/' at the front of the module search path before the project-local
# imports further down (data, utils, train, evaluation) — presumably that is
# where those packages live.
# NOTE(review): the path is relative, so it depends on the directory the kernel
# was started from — confirm the notebook is always run from its own folder.
sys.path.insert(0, '../src/')

from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from data.data_load import DataLoad
from data.data_validation import DataValidation
from data.data_transformation import DataTransformation
from utils.utils import load_config_file
from data.data_preprocess import DataPreprocess
from train.train import TrainModels
from evaluation.classifier_eval import ModelEvaluation

# 1.0. Data load

In [3]:
# Load the training data through the project's DataLoad helper. The string is
# the dataset name handed to load_data.
dataset_name = 'train_dataset_name'
dl = DataLoad()
df = dl.load_data(dataset_name)

[2m2023-12-04 20:21:36[0m [[32m[1minfo     [0m] [1mComecando a carga dos dados com o nome: train_dataset_name[0m


# 2.0. Data validation

In [4]:
# Run the project's validation over the loaded frame and keep whatever run() returns.
# NOTE(review): is_valid is not checked in any of the visible cells, so the later
# steps run regardless of the outcome — consider asserting on it once the return
# type of DataValidation.run is confirmed.
dv = DataValidation()
is_valid = dv.run(df)

[2m2023-12-04 20:21:36[0m [[32m[1minfo     [0m] [1mValidacao iniciou[0m
[2m2023-12-04 20:21:36[0m [[32m[1minfo     [0m] [1mValidation columns passed...[0m
[2m2023-12-04 20:21:36[0m [[32m[1minfo     [0m] [1mValidacao com sucesso.[0m


# 3.0. Data transformation

In [5]:
# Wrap the loaded frame in DataTransformation, then unpack the four values its
# split returns into X_train, X_valid, y_train and y_valid.
dt = DataTransformation(df)
splits = dt.train_test_spliting()
X_train, X_valid, y_train, y_valid = splits

# 4.0. Experiments

In [6]:
import mlflow 
from mlflow.tracking import MlflowClient


* 'schema_extra' has been renamed to 'json_schema_extra'


In [7]:
# Point the MLflow client at the tracking server on the loopback address, then
# select the experiment. set_experiment is kept as the last expression so the
# cell still displays the Experiment it returns.
tracking_uri = 'http://127.0.0.1:5000'
experiment_name = 'prob_loan'
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1701729825978, experiment_id='1', last_update_time=1701729825978, lifecycle_stage='active', name='prob_loan', tags={}>

## 4.1. Select best model

In [8]:
# Look the experiment up by name and convert the result into a plain dict.
# NOTE(review): get_experiment_by_name presumably returns None for an unknown
# name, which would make dict() raise — verify; it should not happen here because
# set_experiment was called with the same name earlier in the notebook.
current_experiment = dict(mlflow.get_experiment_by_name('prob_loan'))

In [11]:
# Display the experiment metadata dict.
current_experiment

{'artifact_location': 'mlflow-artifacts:/1',
 'creation_time': 1701729825978,
 'experiment_id': '1',
 'last_update_time': 1701729825978,
 'lifecycle_stage': 'active',
 'name': 'prob_loan',
 'tags': {}}

In [10]:
# Keep the experiment id from the metadata dict under its own name.
experiment_id = current_experiment['experiment_id']

In [14]:
# Collect the runs whose validation ROC AUC is below 1 and rank them best-first.
# The experiment is passed explicitly (instead of relying on whichever experiment
# is currently active in the kernel) so the query returns the same runs even when
# cells are re-run out of order or another experiment has been activated since.
# NOTE(review): the `< 1` filter also drops runs that scored exactly 1 —
# presumably deliberate; confirm.
df_mlflow = mlflow.search_runs(
    experiment_ids=[experiment_id],
    filter_string='metrics.valid_roc_auc < 1',
).sort_values('metrics.valid_roc_auc', ascending=False)

In [19]:
# List the columns available in the runs table.
df_mlflow.columns

Index(['run_id', 'experiment_id', 'status', 'artifact_uri', 'start_time',
       'end_time', 'metrics.precision_score', 'metrics.roc_auc',
       'metrics.true_negatives', 'metrics.precision_recall_auc',
       'metrics.false_negatives', 'metrics.log_loss',
       'metrics.false_positives', 'metrics.example_count', 'metrics.score',
       'metrics.train_roc_auc', 'metrics.valid_roc_auc',
       'metrics.accuracy_score', 'metrics.f1_score', 'metrics.true_positives',
       'metrics.recall_score', 'params.class_weight', 'params.discretizer',
       'params.warm_start', 'params.imputer', 'params.solver', 'params.scaler',
       'params.max_iter', 'params.fit_intercept', 'params.tol',
       'params.multi_class', 'params.C', 'tags.mlflow.user',
       'tags.mlflow.source.type', 'tags.mlflow.datasets',
       'tags.mlflow.source.name', 'tags.mlflow.runName',
       'tags.mlflow.log-model.history', 'tags.model_name'],
      dtype='object')

In [18]:
# Locate the row with the highest validation ROC AUC, read its run id, and
# display it as the cell output.
best_row = df_mlflow['metrics.valid_roc_auc'].idxmax()
run_id = df_mlflow.loc[best_row, 'run_id']
run_id

'7b3f992cc3544ec595cc6a9d809b9c6b'