In [1]:
import pandas as pd

import mlflow
import logging

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, RocCurveDisplay
from urllib.parse import urlparse
from xgboost import XGBClassifier

In [2]:
# Module-level logger for this notebook; the root logger is configured to emit
# only messages at WARNING severity or above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)

#### Dataset download

In [3]:
# Location of the UCI red-wine quality CSV. HTTPS is used so the file cannot be
# altered in transit before it is parsed.
csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

In [4]:
# Read the semicolon-delimited CSV directly from `csv_url` into a DataFrame.
# On failure the error is logged (with traceback) and then re-raised: swallowing
# it would leave `data` undefined and make every later cell fail with an
# unrelated-looking NameError.
try:
    data = pd.read_csv(csv_url, delimiter=';')
except Exception as e:
    logger.exception(
        f'Unable to download the dataset, check your internet connection. Error {e}'
    )
    raise

In [5]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


#### Train and test

In [6]:
# Hold out 20% of the rows for evaluation. Stratifying on `quality` keeps the
# class proportions similar in both partitions, and the fixed seed makes the
# split (and therefore the reported metrics) reproducible across re-runs.
train, test = train_test_split(
    data, test_size=0.2, stratify=data['quality'], random_state=42
)

# Features: every column except the target.
x_train = train.drop('quality', axis=1)
x_test = test.drop('quality', axis=1)

# Targets: encode the raw quality grades as consecutive class ids.
ore = OrdinalEncoder()
y_train = ore.fit_transform(train[['quality']])
# Reuse the mapping learned from the training labels. Re-fitting on the test
# labels could assign a different id to the same grade whenever the two
# partitions do not contain exactly the same set of grades.
y_test = ore.transform(test[['quality']])

In [7]:
def evalutation_metrics(real, predict):
    """Score predicted labels against the true labels.

    Args:
        real: Ground-truth labels.
        predict: Predicted labels, aligned with ``real``.

    Returns:
        A ``(accuracy, f1)`` tuple: the fraction of correctly classified
        samples and the macro-averaged F1 score.
    """
    return (
        accuracy_score(real, predict, normalize=True),
        f1_score(real, predict, average='macro'),
    )

In [14]:
# Show which input column(s) the label encoder was fitted on.
ore.feature_names_in_

array(['quality'], dtype=object)

#### MLflow runs

In [12]:
# Train one XGBoost configuration and record its hyper-parameters, evaluation
# metrics and the fitted model inside a single MLflow run.
with mlflow.start_run():

    learning_rate = 0.05
    # NOTE(review): 1000 is far deeper than commonly used tree depths — confirm
    # this value is intentional.
    max_depth = 1000
    n_estimators = 100
    objective = 'multi:softmax'
    # Take the class count from the fitted label encoder rather than
    # hard-coding it, so it always matches the labels actually present.
    num_class = len(ore.categories_[0])

    xgc = XGBClassifier(
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        objective=objective,
        num_class=num_class,
    )
    xgc.fit(x_train, y_train)

    predict = xgc.predict(x_test)

    accuracy, f1 = evalutation_metrics(y_test, predict)

    print(f'XGBoost learning_rate {learning_rate}, max_depth {max_depth}, n_estimators {n_estimators}')
    print(f'accuracy {accuracy}')
    print(f'f1 score {f1}')

    # Metric and parameter keys are unchanged so existing runs stay comparable.
    mlflow.log_metrics({'accuracy': accuracy, 'f1 score': f1})
    mlflow.log_params({
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'n_estimators': n_estimators,
        'objective': objective,
    })

    # Persist the fitted model as a run artifact.
    mlflow.xgboost.log_model(xgc, 'FirstWineModel')

XGBoost learning_rate 0.05, max_depth 1000, n_estimators 100
accuracy 0.684375
f1 score 0.43804239578887466
