Допустим, у нас есть следующие данные для обучения модели:

Количество запросов по каждому эндпоинту
Время выполнения запроса для каждого эндпоинта
Количество обрабатываемых в настоящий момент запросов каждым эндпоинтом
Поля запроса

Предполагаем линейную зависимость: время выполнения запроса = константа + a * количество запросов за квант времени + c * количество обрабатываемых запросов




Для сбора трейсов создаём два микросервиса.
Первый отправляет заданное количество параллельных запросов второму, поведение которого нас интересует.

# 1. Подготовка данных

Выгружаем логи из Kibana и формируем датасет

In [1]:
import pandas as pd

In [2]:
# Load the span export into a data frame
data = pd.read_csv('dataset.csv', sep=',')

In [3]:
data.head()

Unnamed: 0,_id,_index,_score,_type,duration,flags,logs,operationName,process.serviceName,process.tags,references,spanID,startTime,startTimeMillis,tags,traceID
0,SBLFPYcB2l6FFtNvAOI_,jaeger-span-2023-04-01,,_doc,11134287,1,[],/generate_image,microservice1,[],"[{""refType"":""CHILD_OF"",""traceID"":""6a4f6c9ac042...",cebfe836833e2928,1680368519439038,"Apr 1, 2023 @ 17:01:59.439","[{""key"":""http.method"",""type"":""string"",""value"":...",6a4f6c9ac042bb15625ca73744a3e20c
1,SxLFPYcB2l6FFtNvAOI_,jaeger-span-2023-04-01,,_doc,14217736,1,[],/generate_image,microservice1,[],"[{""refType"":""CHILD_OF"",""traceID"":""5db9064592c5...",3b4ae2984cdde39a,1680368517689946,"Apr 1, 2023 @ 17:01:57.689","[{""key"":""http.method"",""type"":""string"",""value"":...",5db9064592c5c99b24567628d9c2c858
2,LBLEPYcB2l6FFtNv1uLW,jaeger-span-2023-04-01,,_doc,1919361,1,[],/generate_image,microservice1,[],"[{""refType"":""CHILD_OF"",""traceID"":""c276ea34f545...",c6d301cb75970abc,1680368517497959,"Apr 1, 2023 @ 17:01:57.497","[{""key"":""http.method"",""type"":""string"",""value"":...",c276ea34f5457d57ab93705a6d6a7440
3,RhLFPYcB2l6FFtNvAOI_,jaeger-span-2023-04-01,,_doc,18544720,1,[],/generate_image,microservice1,[],"[{""refType"":""CHILD_OF"",""traceID"":""7e86c07a6753...",09e2bc29d3d3f7af,1680368513084708,"Apr 1, 2023 @ 17:01:53.084","[{""key"":""http.method"",""type"":""string"",""value"":...",7e86c07a67535107b743a3fa3cf54daa
4,LhLEPYcB2l6FFtNv1uLW,jaeger-span-2023-04-01,,_doc,6652137,1,[],/generate_image,microservice1,[],"[{""refType"":""CHILD_OF"",""traceID"":""979f8fcddafd...",6676906f444be20c,1680368512983191,"Apr 1, 2023 @ 17:01:52.983","[{""key"":""http.method"",""type"":""string"",""value"":...",979f8fcddafd7971eeca4b3df3ec04b0


Теперь необходимо преобразовать данные в нужный формат.

Сначала необходимо извлечь из столбца ***tags*** данные о запросе — эндпоинт и параметры запроса.

In [4]:
import re
def get_endpoint_data(data):
    """Extract the ``http.target`` value from a serialized span tag list.

    Args:
        data: String holding the serialized ``tags`` field of a span, i.e. a
            sequence of ``{"key": ..., "type": ..., "value": ...}`` entries.

    Returns:
        The value of the ``http.target`` tag (for example
        ``/generate_image?x=244&y=82``), or ``None`` when the tag is absent.
        In the latter case ``'Value not found.'`` is printed.
    """
    # The dot is escaped so that only the literal key "http.target" matches.
    pattern = r'"key":"http\.target","type":"string","value":"([^"]+)"'
    match = re.search(pattern, data)
    if match is None:
        print('Value not found.')
        return None
    return match.group(1)
    

In [5]:
# Request target of every span, extracted from its serialized tags
data['endpoint'] = data['tags'].map(get_endpoint_data)

Удаляем ненужные столбцы

In [6]:
# Drop the span metadata columns that are not needed any more
unused_columns = [
    '_id', '_index', '_score', '_type', 'flags', 'logs', 'operationName',
    'process.tags', 'process.serviceName', 'spanID', 'tags', 'traceID',
    'references',
]
data.drop(columns=unused_columns, inplace=True)
data.head()

Unnamed: 0,duration,startTime,startTimeMillis,endpoint
0,11134287,1680368519439038,"Apr 1, 2023 @ 17:01:59.439",/generate_image?x=244&y=82
1,14217736,1680368517689946,"Apr 1, 2023 @ 17:01:57.689",/generate_image?x=307&y=120
2,1919361,1680368517497959,"Apr 1, 2023 @ 17:01:57.497",/generate_image?x=108&y=51
3,18544720,1680368513084708,"Apr 1, 2023 @ 17:01:53.084",/generate_image?x=207&y=178
4,6652137,1680368512983191,"Apr 1, 2023 @ 17:01:52.983",/generate_image?x=217&y=50


Теперь необходимо извлечь параметры из столбца ***endpoint***, добавить в датасет столбцы с названиями параметров и записать в них соответствующие значения. Если значение параметра — строка, применяем *one hot encoding*.

In [7]:
# Regular expression capturing the query string: everything after '?'
# up to (not including) a '#'
pattern = r'\?([^#]*)'

# Query string of every request
params = data['endpoint'].apply(lambda x: re.search(pattern, x).group(1))

# Distinct parameter names, in order of first appearance
param_names = []
for query in params:
    for pair in query.split('&'):
        name, _ = pair.split('=')
        if name in param_names:
            continue
        param_names.append(name)

In [8]:
# Извлеченные названия параметров
param_names

['x', 'y', 'additional']

In [9]:
# Build a data frame with one row per request. Parameters whose value parses
# as an integer keep that value under the parameter name; any other value is
# one-hot encoded as a '<name>_<value>' column set to 1.
params_df = pd.DataFrame(columns=param_names)  # kept from the original cell; unused in the cells shown
records = []
for param in params:
    param_values = {}
    for p in param.split('&'):
        name, value = p.split('=')
        try:
            param_values[name] = int(value)
        except ValueError:
            # Non-numeric value: one-hot encode it
            param_values[name + '_' + value] = 1
    records.append(param_values)

# Build the frame once instead of concatenating a one-row frame per request;
# columns missing from a record become NaN.
df = pd.DataFrame.from_records(records)

In [10]:
# Renumber the rows from zero and replace missing parameter values with 0
df = df.reset_index(drop=True).fillna(0)
df.head()

Unnamed: 0,x,y,additional_three,additional_two,additional_one
0,244,82,0.0,0.0,0.0
1,307,120,0.0,0.0,0.0
2,108,51,0.0,0.0,0.0
3,207,178,0.0,0.0,0.0
4,217,50,0.0,0.0,0.0


In [11]:
# Attach the parameter columns to the spans, matching rows by index
data = data.merge(df, left_index=True, right_index=True)
data.head()

Unnamed: 0,duration,startTime,startTimeMillis,endpoint,x,y,additional_three,additional_two,additional_one
0,11134287,1680368519439038,"Apr 1, 2023 @ 17:01:59.439",/generate_image?x=244&y=82,244,82,0.0,0.0,0.0
1,14217736,1680368517689946,"Apr 1, 2023 @ 17:01:57.689",/generate_image?x=307&y=120,307,120,0.0,0.0,0.0
2,1919361,1680368517497959,"Apr 1, 2023 @ 17:01:57.497",/generate_image?x=108&y=51,108,51,0.0,0.0,0.0
3,18544720,1680368513084708,"Apr 1, 2023 @ 17:01:53.084",/generate_image?x=207&y=178,207,178,0.0,0.0,0.0
4,6652137,1680368512983191,"Apr 1, 2023 @ 17:01:52.983",/generate_image?x=217&y=50,217,50,0.0,0.0,0.0


Теперь преобразуем значения столбцов со временем и удалим лишние столбцы

In [12]:
# Convert a value from the logs to a number of milliseconds
def transform_time(s):
    """Return ``s`` without its last three-digit group, as a digit string.

    The input is a string of digits, normally grouped by commas
    (``'11,134,287'``). Dropping the last group removes the millionths of a
    second, leaving milliseconds.
    NOTE(review): assumes the source values are microseconds and that comma
    groups are three digits wide - confirm against the export format.

    Args:
        s: Digit string, with or without comma grouping.

    Returns:
        The remaining digits joined without separators, e.g. ``'11134'``.
        A value without any comma is treated as ungrouped digits and
        integer-divided by 1000, so ``'287'`` gives ``'0'`` instead of an
        empty string that could not be cast to an integer later.

    Raises:
        ValueError: If ``s`` has no comma and is not an integer literal.
    """
    head, separator, _ = s.rpartition(',')
    if separator:
        return head.replace(',', '')
    return str(int(s) // 1000)

# Millisecond versions of the start time and of the duration
data['starttime_m'] = data['startTime'].map(transform_time)
data['duration_m'] = data['duration'].map(transform_time)

In [13]:
data.head()

Unnamed: 0,duration,startTime,startTimeMillis,endpoint,x,y,additional_three,additional_two,additional_one,starttime_m,duration_m
0,11134287,1680368519439038,"Apr 1, 2023 @ 17:01:59.439",/generate_image?x=244&y=82,244,82,0.0,0.0,0.0,1680368519439,11134
1,14217736,1680368517689946,"Apr 1, 2023 @ 17:01:57.689",/generate_image?x=307&y=120,307,120,0.0,0.0,0.0,1680368517689,14217
2,1919361,1680368517497959,"Apr 1, 2023 @ 17:01:57.497",/generate_image?x=108&y=51,108,51,0.0,0.0,0.0,1680368517497,1919
3,18544720,1680368513084708,"Apr 1, 2023 @ 17:01:53.084",/generate_image?x=207&y=178,207,178,0.0,0.0,0.0,1680368513084,18544
4,6652137,1680368512983191,"Apr 1, 2023 @ 17:01:52.983",/generate_image?x=217&y=50,217,50,0.0,0.0,0.0,1680368512983,6652


In [14]:
import numpy as np
# Cast both millisecond columns to 64-bit integers
data = data.astype({'duration_m': np.int64, 'starttime_m': np.int64})
print(data.dtypes)

duration             object
startTime            object
startTimeMillis      object
endpoint             object
x                     int64
y                     int64
additional_three    float64
additional_two      float64
additional_one      float64
starttime_m           int64
duration_m            int64
dtype: object


In [15]:
# Interpret starttime_m (milliseconds since the epoch) as timestamps
data['starttime_m'] = pd.to_datetime(data['starttime_m'], unit='ms')

# Order the requests chronologically
data = data.sort_values(by=['starttime_m'])

# Processing interval of every request
data['start'] = data['starttime_m']
data['end'] = data['start'] + pd.to_timedelta(data['duration_m'], unit='ms')

# For every request, count the rows that are being processed at the moment
# the request starts or at the moment it ends.
# NOTE(review): a request that starts and finishes strictly inside the
# interval of the current one is not counted - confirm this is intended.
parallel_processes = []
for start, end in zip(data['start'], data['end']):
    active_at_start = (data['start'] <= start) & (data['end'] > start)
    active_at_end = (data['start'] < end) & (data['end'] >= end)
    parallel_processes.append(int((active_at_start | active_at_end).sum()))
data['parallel_processes'] = parallel_processes

Удаляем лишние столбцы и получаем готовый датасет

In [16]:
# Remove the intermediate columns, then save the final dataset
intermediate_columns = [
    'duration', 'startTime', 'startTimeMillis', 'starttime_m',
    'start', 'end', 'endpoint',
]
data.drop(columns=intermediate_columns, inplace=True)
data.to_csv('dataset_final.csv', index=False)
data.head()


Unnamed: 0,x,y,additional_three,additional_two,additional_one,duration_m,parallel_processes
669,200,200,0.0,0.0,0.0,3373,1
668,150,300,0.0,0.0,0.0,3390,1
667,150,300,0.0,0.0,0.0,3972,1
666,152,350,0.0,0.0,0.0,4023,1
665,152,350,0.0,0.0,0.0,2947,1


# 2. Обучение модели

Используем линейную регрессию из библиотеки scikit-learn. Данные подготавливаются путем разбиения на тренировочный и тестовый наборы, а затем модель обучается на тренировочных данных. После этого мы можем использовать обученную модель для предсказания продолжительности на тестовых данных и оценки качества модели с помощью коэффициента детерминации (R^2 score).

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [18]:
# Feature matrix and target vector for the regression
feature_columns = [
    'x', 'y',
    'additional_three', 'additional_two', 'additional_one',
    'parallel_processes',
]
X = data[feature_columns]
y = data['duration_m']

In [19]:
# Hold out 20% of the rows for testing; the random state is fixed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Create the model and fit it on the training split
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [21]:
# Predict on the test data
y_pred = model.predict(X_test)

In [22]:
# Evaluate the model quality with the R^2 score on the test data
r2 = r2_score(y_test, y_pred)
print('R^2 score:', r2)

R^2 score: 0.8229834139934271
