In [24]:
from pycaret.classification import predict_model, get_config 
from mlflow import MlflowClient
from mlflow.models import infer_signature
from sklearn.metrics import log_loss, f1_score, roc_curve, auc
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import mlflow
import matplotlib.pyplot as plt
import utils
import pandas as pd
import pycaret.classification as pc 



In [7]:
# --- MLflow tracking configuration ---
experiment_name = "PipelineTreinamento"
mlflow_url = "sqlite:///mlruns.db"

# --- Input data: directory plus file names (each name carries its leading "/") ---
output_dir = "../data/processed"
ds_train, ds_test = "/base_train.parquet", "/base_test.parquet"


In [5]:
# Read the train and test splits from parquet; each path is the configured
# directory followed by the file name (which already starts with "/").
df_train = pd.read_parquet(f"{output_dir}{ds_train}")
df_test = pd.read_parquet(f"{output_dir}{ds_test}")

In [8]:
# Point MLflow at the configured tracking store, then create the experiment
# only when a lookup by name finds nothing.
mlflow.set_tracking_uri(mlflow_url)
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name)

In [9]:
# Initialise the PyCaret classification experiment on the pre-split data.
setup = pc.setup(
    data=df_train,
    test_data=df_test,  # explicit hold-out set passed alongside the training frame
    target="shot_made_flag",
    preprocess=False,
    normalize=False,
    session_id=1,  # fixed session id, as reported in the setup summary table
)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,shot_made_flag
2,Target type,Binary
3,Original data shape,"(20285, 7)"
4,Transformed data shape,"(20285, 7)"
5,Transformed train set shape,"(16228, 7)"
6,Transformed test set shape,"(4057, 7)"
7,Numeric features,6


In [10]:
# Train the two candidate models in order: logistic regression ("lr") first,
# then decision tree ("dt"). Each call prints its own per-fold metrics table.
lr_model, dt_model = (pc.create_model(model_id) for model_id in ("lr", "dt"))

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5835,0.6099,0.5052,0.5716,0.5364,0.1609,0.1619
1,0.5829,0.6077,0.522,0.5682,0.5441,0.1609,0.1614
2,0.5786,0.6039,0.5032,0.566,0.5328,0.1514,0.1522
3,0.5786,0.5852,0.4968,0.567,0.5296,0.1509,0.152
4,0.5847,0.6075,0.4723,0.58,0.5206,0.161,0.1637
5,0.5705,0.5958,0.4865,0.5577,0.5196,0.1346,0.1356
6,0.5508,0.5699,0.4542,0.535,0.4913,0.094,0.095
7,0.6081,0.6166,0.5123,0.6061,0.5552,0.2094,0.2118
8,0.5771,0.6023,0.4897,0.5657,0.5249,0.1474,0.1486
9,0.5604,0.5927,0.4457,0.5485,0.4918,0.1117,0.1136


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5287,0.5099,0.5814,0.5051,0.5405,0.0615,0.0622
1,0.533,0.5087,0.6072,0.5087,0.5536,0.0719,0.0731
2,0.5416,0.5224,0.5871,0.5176,0.5502,0.0866,0.0873
3,0.5237,0.5039,0.6103,0.5011,0.5503,0.0544,0.0556
4,0.5638,0.5415,0.6065,0.5384,0.5704,0.1305,0.1315
5,0.5484,0.5405,0.5845,0.5243,0.5528,0.0994,0.1
6,0.525,0.5126,0.5948,0.5022,0.5446,0.0555,0.0564
7,0.52,0.499,0.5794,0.4978,0.5355,0.0448,0.0454
8,0.5197,0.4914,0.5736,0.4972,0.5327,0.0439,0.0443
9,0.5314,0.5216,0.5659,0.5081,0.5355,0.0656,0.066


In [12]:
# Score the test set with the logistic regression model.
lr_prediction = pc.predict_model(lr_model, data=df_test)

# PyCaret's `prediction_score` is the probability of the *predicted* label,
# while sklearn's `log_loss` interprets a 1-D `y_pred` as the probability of
# the positive class. Rebuild P(shot_made_flag == 1) before computing the loss:
# keep the score where the predicted label is 1, otherwise use its complement.
lr_proba_made = lr_prediction["prediction_score"].where(
    lr_prediction["prediction_label"].astype(int) == 1,
    1 - lr_prediction["prediction_score"],
)
lr_log_loss = log_loss(df_test["shot_made_flag"], lr_proba_made)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.5716,0.5945,0.4724,0.561,0.5129,0.1356,0.1372


In [14]:
# Score the test set with the decision tree model.
dt_prediction = pc.predict_model(dt_model, data=df_test)

# Predicted labels cast to int so they can be compared against the target.
dt_predicted_label = dt_prediction["prediction_label"].astype(int)

# PyCaret's `prediction_score` is the probability of the *predicted* label,
# while sklearn's `log_loss` interprets a 1-D `y_pred` as the probability of
# the positive class. Rebuild P(shot_made_flag == 1) before computing the loss:
# keep the score where the predicted label is 1, otherwise use its complement.
dt_proba_made = dt_prediction["prediction_score"].where(
    dt_predicted_label == 1,
    1 - dt_prediction["prediction_score"],
)
dt_log_loss = log_loss(df_test["shot_made_flag"], dt_proba_made)

# F1 of the positive class, computed from the hard label predictions.
dt_f1_score = f1_score(df_test['shot_made_flag'], dt_predicted_label, average='binary')


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.5418,0.5174,0.5901,0.5177,0.5515,0.0872,0.088


In [16]:
# Select the experiment by name and open a run. `start_run()` is the last
# expression of the cell, so the returned ActiveRun is echoed as the cell output.
# NOTE(review): no matching `mlflow.end_run()` is visible in this part of the
# notebook — confirm the run is closed later, or use `with mlflow.start_run():`.
mlflow.set_experiment(experiment_name)
mlflow.start_run()

<ActiveRun: >

In [17]:
# Log the three test-set metrics to MLflow, one `log_metric` call per entry,
# in the order: LR log loss, DT log loss, DT F1.
for metric_name, metric_value in (
    ("lr_log_loss", lr_log_loss),
    ("dt_log_loss", dt_log_loss),
    ("dt_f1_score", dt_f1_score),
):
    mlflow.log_metric(metric_name, metric_value)

In [25]:
# Open a 7x5 figure and call the project helper `utils.plot_validation_curve`
# with: the training features (df_train without the target column), the target
# column, the parameter name 'C' with a grid of values from 0.001 to 1000,
# the fitted `lr_model`, the label 'Regressão Logística', 'f1', and True.
# Presumably 'f1' is the scoring metric and True toggles a log-scaled axis —
# TODO confirm against the helper's signature.
# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'utils' has no attribute 'plot_validation_curve'".
# Confirm that the helper exists in the project's `utils` module and that the
# imported `utils` is that module (a stale kernel import or another module of
# the same name are possible causes). Because `plt.figure` runs before the
# failing call, an empty figure is left behind.
plt.figure(figsize=(7,5))
utils.plot_validation_curve(df_train.drop('shot_made_flag', axis=1), df_train['shot_made_flag'],'C', {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}, lr_model, 'Regressão Logística', 'f1',True)

AttributeError: module 'utils' has no attribute 'plot_validation_curve'

<Figure size 700x500 with 0 Axes>