In [None]:
import json
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [11]:
def load_json_array(file_path):
    """Загрузка массива JSON объектов из файла"""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    
    if isinstance(data, list):
        return data
    else:
        return [data]

def prepare_training_data(json_data_list):
    """Подготовка данных для обучения из списка JSON"""
    
    features_list = []
    targets = []
    
    for data in json_data_list:
        metrics = data['diagnosis']['metrics']
        
        feature_row = {
            'db_time_total': metrics.get('db_time_total', 0),
            'db_time_committed': metrics.get('db_time_committed', 0),
            'cpu_time': metrics.get('cpu_time', 0),
            'io_time': metrics.get('io_time', 0),
            'lock_time': metrics.get('lock_time', 0),
            'cpu_percent': metrics.get('cpu_percent', 0),
            'io_percent': metrics.get('io_percent', 0),
            'lock_percent': metrics.get('lock_percent', 0),
            'tps': metrics.get('tps', 0),
            'qps': metrics.get('qps', 0),
            'avg_query_latency_ms': metrics.get('avg_query_latency_ms', 0),
            'rollback_rate': metrics.get('rollback_rate', 0),
            'total_commits': metrics.get('total_commits', 0),
            'total_rollbacks': metrics.get('total_rollbacks', 0),
            'total_calls': metrics.get('total_calls', 0),
            'load_scenario': data['ground_truth']['load_scenario']
        }
        
        features_list.append(feature_row)
        targets.append(data['ground_truth']['active_config'])
    
    df = pd.DataFrame(features_list)
    df['active_config'] = targets
    
    return df

json_data = load_json_array('status.json')
training_df = prepare_training_data(json_data)
print(f"Обработано записей: {len(training_df)}")
training_df.sample(3)

Обработано записей: 72


Unnamed: 0,db_time_total,db_time_committed,cpu_time,io_time,lock_time,cpu_percent,io_percent,lock_percent,tps,qps,avg_query_latency_ms,rollback_rate,total_commits,total_rollbacks,total_calls,load_scenario,active_config
64,22.0,11.0,11.0,8.8,2.2,50.0,40.0,10.0,800.0,2400.0,1.5,0.0,48000,0,144000,mixed,oltp
6,0.89,0.25,0.25,0.25,0.39,28.0,28.0,44.0,1.3,1.1,27.8,0.0,379150,1,1501300,init,cold
55,28.0,11.2,11.2,14.0,2.8,40.0,50.0,10.0,100.0,100.0,10.0,0.0,6000,0,6000,etl,mixed


In [None]:
def train_catboost_model(json_data_list):
    """Обучение CatBoost модели для классификации load_scenario"""
    
    df = prepare_training_data(json_data_list)
    
    X = df.drop('load_scenario', axis=1)
    y = df['load_scenario']
    
    categorical_features = ['active_config']
    for col in categorical_features:
        X[col] = X[col].astype('category')
    
    print(f"Dataset shape: {X.shape}")
    print(f"Target classes: {y.unique()}")
    print(f"Class distribution:\n{y.value_counts()}")
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        depth=8,
        loss_function='MultiClass',
        verbose=100,
        random_state=42,
        early_stopping_rounds=50,
        cat_features=categorical_features, 
        eval_metric='MultiClass'
    )
    
    model.fit(
        X_train, y_train,
        eval_set=(X_test, y_test),
        plot=True,
        verbose=False
    )
    
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    feature_importance = model.get_feature_importance()
    feature_names = X.columns
    
    print("\nFeature Importance:")
    for name, importance in sorted(zip(feature_names, feature_importance), 
                                  key=lambda x: x[1], reverse=True):
        print(f"{name}: {importance:.4f}")
    
    return model, X.columns.tolist(), categorical_features

In [None]:
model, X, categorical_features = train_catboost_model(json_data)

Dataset shape: (1440, 16)
Target classes: ['init' 'oltp' 'olap' 'iot' 'locks' 'reporting' 'etl' 'cold' 'mixed']
Class distribution:
load_scenario
init         160
oltp         160
olap         160
iot          160
locks        160
reporting    160
etl          160
cold         160
mixed        160
Name: count, dtype: int64


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

        cold       1.00      1.00      1.00        32
         etl       1.00      1.00      1.00        32
        init       1.00      1.00      1.00        32
         iot       1.00      1.00      1.00        32
       locks       1.00      1.00      1.00        32
       mixed       1.00      1.00      1.00        32
        olap       1.00      1.00      1.00        32
        oltp       1.00      1.00      1.00        32
   reporting       1.00      1.00      1.00        32

    accuracy                           1.00       288
   macro avg       1.00      1.00      1.00       288
weighted avg       1.00      1.00      1.00       288


Feature Importance:
tps: 15.2281
total_calls: 11.1445
io_percent: 10.6204
total_rollbacks: 8.6054
avg_query_latency_ms: 8.4270
total_commits: 8.1186
lock_time: 7.1063
db_time_committed: 6.5220
lock_percent: 6.4700
qps: 6.0204
cpu_percent: 3.4505
io_tim

In [None]:
model.save_model('catboost_model.cbm')

model_info = {
    'feature_columns': [
        'db_time_total', 'db_time_committed', 'cpu_time', 'io_time', 
        'lock_time', 'cpu_percent', 'io_percent', 'lock_percent', 
        'tps', 'qps', 'avg_query_latency_ms', 'rollback_rate', 
        'total_commits', 'total_rollbacks', 'total_calls', 'active_config'
    ],
    'categorical_features': ['active_config'],
    'class_names': model.classes_.tolist()
}

with open('model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print("Model saved successfully!")
print(f"Classes: {model.classes_}")

Model saved successfully!
Classes: ['cold' 'etl' 'init' 'iot' 'locks' 'mixed' 'olap' 'oltp' 'reporting']
