In [1]:
import pandas as pd
import numpy as np
import pycaret.classification as pc

import matplotlib.pyplot as plt

import mlflow
from mlflow.models.signature import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Configurando MLFlow

In [2]:
mlflow.set_tracking_uri("sqlite:///mlruns.db")

experiment_name = 'Kobe Bryant Shot Selection'
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    experiment = mlflow.get_experiment(experiment_id)
experiment_id = experiment.experiment_id

In [3]:
from sklearn.model_selection import train_test_split

columns=['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance', 'shot_made_flag']
train_perc = 0.8

with mlflow.start_run(experiment_id=experiment_id, run_name = 'PreparacaoDados'):
    df_dev = pd.read_parquet('../data/raw/dataset_kobe_dev.parquet')
    df_dev = df_dev.dropna()
    df_dev = df_dev[columns].copy()
    
    df_dev.to_parquet("../data/data_filtered.parquet")
    
    
    xtrain, xtest, ytrain, ytest = train_test_split(df_dev[['lat', 'lon', 'minutes_remaining', 'period', 'playoffs', 'shot_distance']],
                                                    df_dev['shot_made_flag'],                                                
                                                    train_size=train_perc,
                                                    stratify=df_dev['shot_made_flag'])    
    xtrain['shot_made_flag'] = ytrain
    xtest['shot_made_flag'] = ytest
    xtrain.to_parquet('../data/base_train.parquet')
    xtest.to_parquet('../data/base_test.parquet')

    mlflow.log_params({
        'perc-teste': 1-train_perc,
        'colunas-selecionadas': columns
    })
    mlflow.log_metrics({
        'qtd_linhas_treino': xtrain.shape[0],
        'qtd_linhas_teste': xtest.shape[0],
    })
# from sklearn.model_selection import train_test_split

# train_perc = 0.8
# data_cols = ["lat", "lon", "minutes_remaining", "period", "playoffs", "shot_distance", 'shot_made_flag']


# with mlflow.start_run(experiment_id=experiment_id, run_name = 'PreparacaoDados'):
#     df_dev = pd.read_parquet('../data/raw/dataset_kobe_dev.parquet')
#     df_dev = df_dev.dropna(subset=["lat", "lon", "minutes_remaining", "period", "playoffs", "shot_distance"], inplace=True)
#     mlflow.log_metric('dimensao_dataset_filtrado', len(df_dev))
    
    
#     le = LabelEncoder()
#     df_dev["period"] = le.fit_transform(df_dev["period"])
#     df_dev["playoffs"] = le.fit_transform(df_dev["playoffs"])
#     mlflow.log_metric("categorias_period", len(le.classes_))
#     mlflow.log_metric("categorias_playoffs", len(le.classes_))

#     scaler = StandardScaler()
#     scaler.fit(df_dev[["lat", "lon", "minutes_remaining", "shot_distance"]])
  
#     df_dev["lat_norm"] = scaler.transform(df_dev[["lat"]])[:, 0]
#     df_dev["lon_norm"] = scaler.transform(df_dev[["lon"]])[:, 0]
#     df_dev["minutes_remaining_norm"] = scaler.transform(df_dev[["minutes_remaining"]])[:, 0]
#     df_dev["shot_distance_norm"] = scaler.transform(df_dev[["shot_distance"]])[:, 0]


#     df_dev.drop(columns=["lat", "lon", "minutes_remaining", "shot_distance"], inplace=True)
#     df_dev = df_dev[["period", "playoffs", "lat_norm", "lon_norm", "minutes_remaining_norm", "shot_distance_norm", "shot_made_flag"]]
    
#     df_dev.to_parquet("../data/data_filtered.parquet")

#     mlflow.log_metric("dimensao_dataset_final", len(df_dev))

#     #Separação dos dados em treino e teste
#     X = df_dev[["period", "playoffs", "lat_norm", "lon_norm", "minutes_remaining_norm", "shot_distance_norm"]]
#     y = df_dev["shot_made_flag"]
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_perc, stratify=y)

#     # Saving data sets
#     X_train.to_parquet("/data/processed/base_train.parquet")
#     X_test.to_parquet("/data/processed/base_test.parquet")
#     y_train.to_parquet("/data/processed/base_train_target.parquet")
#     y_test.to_parquet("/data/processed/base_test_target.parquet")

#     # Log metrics
#     mlflow.log_param("porcentagem_teste", train_perc)
#     mlflow.log_metric("dimensao_treino", len(X_train))
#     mlflow.log_metric("dimensao_teste", len(X_test))

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



# Treinamento do Modelo

In [4]:
from sklearn.model_selection import validation_curve


def plot_parameter_validation_curve(X, Y, param_name, grid_search,
                                    model, model_name, scoring,
                                    logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores, test_scores = validation_curve(model,
                                                 X = X, 
                                                 y = Y, 
                                                 param_name=param_name, 
                                                 param_range= grid_search[param_name],
                                                 scoring=scoring,
                                                 cv=10,
                                                 n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Curva Validação Modelo " + model_name)
    plt.xlabel(param_name)
    plt.ylabel("Score ("+scoring+")")
    if logx:
        plt.semilogx(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.semilogx(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                     color="navy", lw=2)
    else:
        plt.plot(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.plot(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                 color="navy", lw=2)
    plt.fill_between(grid_search[param_name], train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=2)
    plt.fill_between(grid_search[param_name], test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=2)
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

# Validação Cruzada de Dados

In [5]:
from sklearn.model_selection import validation_curve


def plot_parameter_validation_curve(X, Y, param_name, grid_search,
                                    model, model_name, scoring,
                                    logx):
    print('Parameter:', param_name)
    print('GridSearch:', grid_search[param_name])
    print('Scoring:', scoring)
    plt.figure(figsize=(6,4))
    train_scores, test_scores = validation_curve(model,
                                                 X = X, 
                                                 y = Y, 
                                                 param_name=param_name, 
                                                 param_range= grid_search[param_name],
                                                 scoring=scoring,
                                                 cv=10,
                                                 n_jobs=-1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.title("Curva Validação Modelo " + model_name)
    plt.xlabel(param_name)
    plt.ylabel("Score ("+scoring+")")
    if logx:
        plt.semilogx(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.semilogx(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                     color="navy", lw=2)
    else:
        plt.plot(grid_search[param_name], train_scores_mean,'-o', label="Treino",
                     color="darkorange", lw=2)
        plt.plot(grid_search[param_name], test_scores_mean,'-o', label="Validação-Cruzada",
                 color="navy", lw=2)
    plt.fill_between(grid_search[param_name], train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=2)
    plt.fill_between(grid_search[param_name], test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=2)
    plt.legend(loc='best')
    plt.grid(True)
    plt.gcf()

In [7]:
import os
from sklearn.metrics import log_loss, f1_score

registered_model_name = 'model_kobe'
nexamples = 5
model_version = -1
models = ['lr','dt']


with mlflow.start_run(experiment_id=experiment_id, run_name = 'Treinamento'):
  
    exp = pc.setup(
        data=xtrain,
        target = 'shot_made_flag',
        fold_strategy = 'stratifiedkfold',
        test_data = xtest,
        normalize=True,
        log_experiment = False
    )
    list_models = pc.compare_models(['lr','dt'], n_select=2, sort='f1')

    for plot_type in ['lr','dt']:
        print('=> Aplicando ', plot_type)
        artifact = pc.plot_model(list_models[0], plot=plot_type, save=True)
        mlflow.log_artifact(artifact)


    # REGRESSAO
    exp.plot_model(list_models[0], plot='vc', save = True)
    yhat_test = exp.predict_model(list_models[0])
    plot_parameter_validation_curve(xtrain.drop('shot_made_flag', axis=1), ytrain, 'C', {'C': [0.001, 0.01, 0.1, 1, 10]},
                                        list_models[0], 'Regressão Logística', 'f1', logx=False)
    plt.savefig('lr_validation_curve.png')
    mlflow.log_artifact('lr_validation_curve.png')
    # os.remove('lr_validation_curve.png')
    
    mlflow.log_metrics({
        'lr_log_loss': log_loss(yhat_test.target, yhat_test.prediction_label),
        'lr_f1': f1_score(yhat_test.target, yhat_test.prediction_label),
    })
    
    # ARVORE
    yhat_test = exp.predict_model(list_models[1])
    plot_parameter_validation_curve(xtrain.drop('shot_made_flag', axis=1), ytrain, 'max_depth', {'max_depth': [2, 3, 4, 5, 6, 7, 8]},
                                    list_models[1], 'Árvore Decisão', 'f1', logx=False)
    plt.savefig('dt_validation_curve.png')
    mlflow.log_artifact('dt_validation_curve.png')
    # os.remove('dt_validation_curve.png')

    mlflow.log_metrics({
        'dt_log_loss': log_loss(yhat_test.target, yhat_test.prediction_label),
        'dt_f1': f1_score(yhat_test.target, yhat_test.prediction_label),
    })
    
     # FINALIZACAO MELHOR MODELO
    tune_model = exp.tune_model(list_models[0],
                                optimize = 'f1',
                                search_library = 'scikit-learn',
                                search_algorithm = 'random',
                                n_iter = 4)
    yhat_test = exp.predict_model(tune_model, raw_score=True)

    
    mlflow.log_metrics({
        'final_model_log_loss': log_loss(yhat_test.target, yhat_test.prediction_label),
        'final_model_f1': f1_score(yhat_test.target, yhat_test.prediction_label),
    })
    yhat_test.to_parquet('../data/processed/prediction_test.parquet')
    mlflow.log_artifact('../data/processed/prediction_test.parquet')
    
    final_model = exp.finalize_model(tune_model)

    artifact = pc.plot_model(final_model, plot=plot_type, save=True)
    mlflow.log_artifact(artifact)
    
    # EXPORTACAO PARA LOG E REGISTRO DO MODELO
    exp.save_model(final_model, f'./{registered_model_name}') 
    # Carrega novamente o pipeline + bestmodel
    model_pipe = exp.load_model(f'./{registered_model_name}')
    # Assinatura do Modelo Inferida pelo MLFlow
    model_features = list(xtrain.drop('shot_made_flag', axis=1).columns)
    inf_signature = infer_signature(xtrain[model_features], 
                                    model_pipe.predict_proba(xtrain.drop('target', axis=1)))
    # Exemplo de entrada para o MLmodel
    input_example = {x: xtrain[x].values[:nexamples] for x in model_features}
    # Log do pipeline de modelagem do sklearn e registrar como uma nova versao
    mlflow.sklearn.log_model(
        sk_model=model_pipe,
        artifact_path="sklearn-model",
        registered_model_name=registered_model_name,
        signature = inf_signature,
        input_example = input_example,
        pyfunc_predict_fn='predict_proba'
    )
    # Criacao do cliente do servico MLFlow e atualizacao versao modelo
    client = MlflowClient()
    if model_version == -1:
        model_version = client.get_latest_versions(registered_model_name)[-1].version
    # Registrar o modelo como staging
    client.set_registered_model_alias(
        name    = registered_model_name, 
        alias   = "staging", 
        version = model_version
    )


Unnamed: 0,Description,Value
0,Session id,2407
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(20285, 7)"
4,Transformed data shape,"(20285, 7)"
5,Transformed train set shape,"(16228, 7)"
6,Transformed test set shape,"(4057, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.5378,0.5221,0.5789,0.5141,0.5445,0.0788,0.0794,0.015
lr,Logistic Regression,0.5744,0.5956,0.4814,0.5636,0.5192,0.1417,0.1432,0.011


Processing:   0%|          | 0/14 [00:00<?, ?it/s]

=> Aplicando  lr


ValueError: Plot Not Available. Please see docstring for list of available Plots.