In [1]:
import numpy as np
import pandas as pd
import time

from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import SGDClassifier

import prometheus_client as prom

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('winequality-red.zip', sep=';')
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
df['quality'].value_counts()

quality
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

In [5]:
warnings.filterwarnings('ignore')

# Подготовим данные к классификации. Условно разделим вино на хорошее и плохое.
# Хорошим вином будем называть то, параметр quality которого — 6 и более.
df['quality_class'] = df['quality'].apply(lambda x: 1 if x >= 6 else 0)
X = df.drop(['quality_class', 'quality'], axis=1)
y = df['quality_class']

# Разделим выборку на обучающую и тестовую в соотношении 70/30
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=42)

# Инициализация модели с поддержкой инкрементального обучения
model = SGDClassifier(random_state=42)

# Первая итерация обучения с использованием всей обучающей выборки
model.partial_fit(X_train, y_train, classes=[0, 1])

# Создание и регистрация метрики F1 для Prometheus
f1_score_metric = prom.Gauge('model_f1_score', 'F1 Score of the model')

# Функция для обновления метрики
def update_metrics():
    y_pred = model.predict(X_test)
    f1_score = metrics.f1_score(y_test, y_pred)
    print(f'F1_score (SGDClassifier): {f1_score}')
    f1_score_metric.set(f1_score)

# Экспортер для Prometheus
prom.start_http_server(8000)

# Начальная запись метрики
update_metrics()

# Цикл для симуляции дообучения на новых данных
for i in range(10):  # Эмулируем 10 шагов дообучения
    # Эмулируем получение новых данных (например, случайное подмножество обучающей выборки)
    X_new, _, y_new, _ = model_selection.train_test_split(X_train, y_train, train_size=0.1)

    # Дообучение модели на новых данных
    model.partial_fit(X_new, y_new)

    # Обновление метрики
    update_metrics()

    # Пауза перед следующей итерацией
    time.sleep(10)

F1_score (SGDClassifier): 0.714859437751004
F1_score (SGDClassifier): 0.1310344827586207
F1_score (SGDClassifier): 0.714859437751004
F1_score (SGDClassifier): 0.7068145800316957
F1_score (SGDClassifier): 0.7016491754122939
F1_score (SGDClassifier): 0.07194244604316546
F1_score (SGDClassifier): 0.714859437751004
F1_score (SGDClassifier): 0.4181360201511335
F1_score (SGDClassifier): 0.33613445378151263
F1_score (SGDClassifier): 0.3697916666666667
F1_score (SGDClassifier): 0.18808777429467086
