In [1]:
!pip install plotly==5.6.0



In [2]:
import pandas as pd
pd.set_option('display.max_rows', None)

import os
from io import StringIO
import requests as r
from requests.auth import HTTPBasicAuth
from datetime import datetime

In [3]:
###########
# Constants
###########

JENKINS_URL = 'http://jenkins:8080'
JENKINS_USER = os.getenv('JENKINS_USER')
JENKINS_PASS = os.getenv('JENKINS_PASS')
JENKINS_JOB_NAME = 'kubedepend'

In [4]:
custom_date_parser = lambda x: datetime.strptime(x, "%m-%d-%Y_%H-%M-%S.%f")

In [5]:
#########
# Get results from Jenkins and organize them into one DataFrame
#########
session = r.Session()
session.auth = (JENKINS_USER, JENKINS_PASS)

res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/api/json')
data = res.json()
builds = data['builds']

results_raw = pd.DataFrame()
build_parameters = pd.DataFrame()

for build_summary in builds:
    build_number = build_summary['number']
    res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/api/json')
    build = res.json()
    artifact_list = build['artifacts']
    if build['result'] == 'SUCCESS' and artifact_list:
        for artifact in artifact_list:
            # Get builds, where there is a 'results.csv' artifact
            if artifact['fileName'] == 'results.csv':
                # get results
                res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/artifact/{artifact["relativePath"]}')
                meas_results_df = pd.read_csv(StringIO(res.text), delimiter=',', parse_dates=['measurement_seq_start_time', 'measurement_start_time', 'measurement_end_time'], date_parser=custom_date_parser)
                results_raw = results_raw.append(meas_results_df, ignore_index=True)
                
                # meas_results_df['id'] is a Series
                meas_seq_id = meas_results_df['id'][0]
            
                # extract parameter info
                parameterItems = list(filter(lambda a: a and a['_class'] == 'hudson.model.ParametersAction' ,build['actions']))[0]['parameters']
                parameters = list(filter(lambda p: p['_class'] in ['hudson.model.BooleanParameterValue', 'hudson.model.StringParameterValue'], parameterItems))
                reshaped = dict()
                for parameter in parameters:
                    reshaped[parameter['name']] = parameter['value']
                # add build number
                reshaped['build_number'] = build_number
                reshaped['id'] = meas_seq_id
                build_parameters = build_parameters.append(reshaped, ignore_index=True)
                

In [20]:
# 'results_raw' always stores all the data from Jenkins
# if an error occurs, the 'results' can be reset without querying Jenkins again
results = results_raw

In [21]:
# Clean build_parameters
build_parameters = build_parameters.fillna(False)
build_parameters['CleanEnhancements'] = build_parameters['CleanEnhancements'].astype(bool)
build_parameters['DeleteCFStack'] = build_parameters['DeleteCFStack'].astype(bool)
build_parameters['CleanWorkSpace'] = build_parameters['CleanWorkSpace'].astype(bool)
build_parameters['DeployTools'] = build_parameters['DeployTools'].astype(bool)
build_parameters['UseKafka'] = build_parameters['UseKafka'].astype(bool)
build_parameters['UseHeartbeats'] = build_parameters['UseHeartbeats'].astype(bool)
build_parameters['build_number'] = build_parameters['build_number'].astype(int)

build_parameters.insert(0, 'build_number', build_parameters.pop('build_number'))
build_parameters.insert(0, 'id', build_parameters.pop('id'))


build_parameters.head()
# build_parameters.info()

KeyError: 'CleanEnhancements'

In [22]:
# Select relevant columns
build_parameters = build_parameters.loc[:,['id', 'build_number', 'UseKafka', 'UseHeartbeats']].reset_index(drop=True)
build_parameters = build_parameters.rename(columns={'UseKafka': 'use_kafka', 'UseHeartbeats': 'use_heartbeats'})
build_parameters.head()

KeyError: "['UseKafka', 'UseHeartbeats'] not in index"

In [23]:
# for development, get only base measurements
results = results[results['comment'] == 'final']
# results.head()
results

Unnamed: 0,id,measurement_seq_start_time,availability,mut,mdt,mtbf,measurement_start_time,measurement_end_time,submitted_jobs,finished_jobs,fault_profile,cluster_type,measurement_count,load_duration,locust_user_count,locust_spawn_rate,prev_stack_git_commit_short,comment
0,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,1.0,1.0,0.0,1.0,2022-04-03 13:50:09.423966,2022-04-03 13:55:16.756650,57.0,56.0,pod-kill,eks,1,300,1,1,5520ca7,final
1,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,0.95,0.475,0.05,0.525,2022-04-03 13:56:35.626724,2022-04-03 14:01:42.088234,56.0,53.0,pod-kill,eks,2,300,1,1,5520ca7,final
2,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,0.95,0.475,0.05,0.525,2022-04-03 14:03:32.935403,2022-04-03 14:08:39.450780,,,pod-kill,eks,3,300,1,1,5520ca7,final
3,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,0.95,0.475,0.05,0.525,2022-04-03 14:09:56.907627,2022-04-03 14:15:03.852545,118.0,118.0,pod-kill,eks,4,300,1,1,5520ca7,final
4,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,1.0,1.0,0.0,1.0,2022-04-03 14:16:22.758068,2022-04-03 14:21:30.049286,,,pod-kill,eks,5,300,1,1,5520ca7,final
5,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.95,0.475,0.05,0.525,2022-04-03 13:15:32.257735,2022-04-03 13:20:39.297826,60.0,56.0,pod-kill,eks,1,300,1,1,5520ca7,final
6,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.95,0.475,0.05,0.525,2022-04-03 13:21:58.183238,2022-04-03 13:27:04.459549,,,pod-kill,eks,2,300,1,1,5520ca7,final
7,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.947368,0.475,0.05,0.525,2022-04-03 13:28:21.757347,2022-04-03 13:33:28.754786,117.0,111.0,pod-kill,eks,3,300,1,1,5520ca7,final
8,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.95,0.475,0.05,0.525,2022-04-03 13:34:47.757275,2022-04-03 13:39:53.999896,,,pod-kill,eks,4,300,1,1,5520ca7,final
9,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,1.0,0.95,0.05,1.0,2022-04-03 13:41:11.312518,2022-04-03 13:46:18.369452,111.0,110.0,pod-kill,eks,5,300,1,1,5520ca7,final


In [24]:
# Set meas count to the maximum of the measuremenet count in the same measurement sequence

# select columns
results_meas_count = results.loc[:, ['id', 'measurement_count']]
# group by max value
results_meas_max_count = results_meas_count.groupby(by=['id']).max()
# rename
results_meas_max_count = results_meas_max_count.rename(columns={'measurement_count': 'measurement_count_max'})
# merge with results
results = results.merge(results_meas_max_count, on='id').drop(columns=['measurement_count']).rename(columns={'measurement_count_max': 'measurement_count'})
results

Unnamed: 0,id,measurement_seq_start_time,availability,mut,mdt,mtbf,measurement_start_time,measurement_end_time,submitted_jobs,finished_jobs,fault_profile,cluster_type,load_duration,locust_user_count,locust_spawn_rate,prev_stack_git_commit_short,comment,measurement_count
0,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,1.0,1.0,0.0,1.0,2022-04-03 13:50:09.423966,2022-04-03 13:55:16.756650,57.0,56.0,pod-kill,eks,300,1,1,5520ca7,final,5
1,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,0.95,0.475,0.05,0.525,2022-04-03 13:56:35.626724,2022-04-03 14:01:42.088234,56.0,53.0,pod-kill,eks,300,1,1,5520ca7,final,5
2,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,0.95,0.475,0.05,0.525,2022-04-03 14:03:32.935403,2022-04-03 14:08:39.450780,,,pod-kill,eks,300,1,1,5520ca7,final,5
3,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,0.95,0.475,0.05,0.525,2022-04-03 14:09:56.907627,2022-04-03 14:15:03.852545,118.0,118.0,pod-kill,eks,300,1,1,5520ca7,final,5
4,449d367f-18a9-4183-8167-93a654b219e0,2022-04-03 13:50:00.617719,1.0,1.0,0.0,1.0,2022-04-03 14:16:22.758068,2022-04-03 14:21:30.049286,,,pod-kill,eks,300,1,1,5520ca7,final,5
5,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.95,0.475,0.05,0.525,2022-04-03 13:15:32.257735,2022-04-03 13:20:39.297826,60.0,56.0,pod-kill,eks,300,1,1,5520ca7,final,5
6,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.95,0.475,0.05,0.525,2022-04-03 13:21:58.183238,2022-04-03 13:27:04.459549,,,pod-kill,eks,300,1,1,5520ca7,final,5
7,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.947368,0.475,0.05,0.525,2022-04-03 13:28:21.757347,2022-04-03 13:33:28.754786,117.0,111.0,pod-kill,eks,300,1,1,5520ca7,final,5
8,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,0.95,0.475,0.05,0.525,2022-04-03 13:34:47.757275,2022-04-03 13:39:53.999896,,,pod-kill,eks,300,1,1,5520ca7,final,5
9,44e0909a-7766-485e-aad6-381a73027237,2022-04-03 13:15:23.869493,1.0,0.95,0.05,1.0,2022-04-03 13:41:11.312518,2022-04-03 13:46:18.369452,111.0,110.0,pod-kill,eks,300,1,1,5520ca7,final,5


In [25]:
#  group by measurement ID and get mean of numerical properties
results_avg_by_id = results.groupby(by=['id']).mean()
# results_avg_by_id.head()
results_avg_by_id

Unnamed: 0_level_0,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,load_duration,locust_user_count,locust_spawn_rate,measurement_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
03631a77-d080-44bf-a582-718e4c39084d,0.325439,0.22963,,,44.777778,44.444444,300.0,1.0,1.0,9.0
0f52ec58-1ae2-482d-9012-6c642d16bade,0.92,0.531667,0.05,0.581667,72.5,65.5,300.0,1.0,1.0,5.0
28c950b7-e744-46aa-80fa-fc429e62d912,0.524912,0.283333,0.284444,0.567778,45.833333,32.5,300.0,1.0,1.0,15.0
2912fcdb-b8a0-4c23-87a9-632fb1fa7762,0.92,0.531667,0.05,0.581667,58.25,55.0,300.0,1.0,1.0,5.0
2b2484d5-302d-4448-ad74-f4413b11a1ba,0.95,0.575,0.04,0.615,70.5,66.25,300.0,1.0,1.0,5.0
387bd9f2-511a-4516-9d78-c58d533ac364,0.736316,0.395556,0.151111,0.546667,55.2,54.6,300.0,1.0,1.0,15.0
42e912b3-5cc4-47bc-a518-672fd0c0d01f,0.98,0.98,0.02,1.0,60.0,60.0,300.0,1.0,1.0,5.0
449d367f-18a9-4183-8167-93a654b219e0,0.97,0.685,0.03,0.715,77.0,75.666667,300.0,1.0,1.0,5.0
44e0909a-7766-485e-aad6-381a73027237,0.959474,0.57,0.05,0.62,96.0,92.333333,300.0,1.0,1.0,5.0
4991a6f2-16d1-4219-8ca5-a3fee9cb2631,0.445175,0.252257,,,48.416667,48.041667,300.0,1.0,1.0,24.0


In [26]:
# get non-numerical data for each measurement sequence
results_non_numerical = results.loc[:, ['id', 'fault_profile', 'measurement_seq_start_time', 'cluster_type', 'comment']] \
    .drop_duplicates() \
    .reset_index(drop=True)
# results_non_numerical.head()
results_non_numerical

Unnamed: 0,id,fault_profile,measurement_seq_start_time,cluster_type,comment
0,449d367f-18a9-4183-8167-93a654b219e0,pod-kill,2022-04-03 13:50:00.617719,eks,final
1,44e0909a-7766-485e-aad6-381a73027237,pod-kill,2022-04-03 13:15:23.869493,eks,final
2,e4b79268-bbe4-44a7-8c1d-e640b9d4e634,pod-kill,2022-04-03 12:32:18.120779,eks,final
3,a325098c-965d-4dbf-8c73-33296ac77d03,pod-failure,2022-04-03 10:51:11.781131,eks,final
4,e63d8cbb-0f7b-487e-8339-4fc0a1839ef5,pod-kill,2022-04-03 09:53:51.786775,eks,final
5,851328f9-dc75-4d2c-a4fe-3ebd54e8e9ac,pod-kill,2022-04-03 09:08:01.368185,eks,final
6,28c950b7-e744-46aa-80fa-fc429e62d912,pod-failure,2022-04-02 18:34:17.993822,eks,final
7,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,stress-mem,2022-03-31 12:05:19.992145,eks,final
8,903f2465-335e-4991-8ab6-8d0bd8446406,stress-cpu,2022-03-31 11:03:54.694877,eks,final
9,dee627a1-14bf-4360-92da-c07ac9f7dc19,pod-kill,2022-03-31 08:41:14.994202,eks,final


In [27]:
# merge the grouped by avg results with non-numerical data
results = pd.merge(results_avg_by_id, results_non_numerical, on='id')

# merge the results with build parameters
results = pd.merge(results, build_parameters, on='id')

# filter out measurement sequence with less than MIN_MEASUREMENT_COUNT measurements
# results = results[results['measurement_count'] >= MIN_MEASUREMENT_COUNT]

# results.head()
results

Unnamed: 0,id,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,load_duration,locust_user_count,locust_spawn_rate,measurement_count,fault_profile,measurement_seq_start_time,cluster_type,comment,build_number,use_kafka,use_heartbeats
0,0f52ec58-1ae2-482d-9012-6c642d16bade,0.92,0.531667,0.05,0.581667,72.5,65.5,300.0,1.0,1.0,5.0,pod-kill,2022-03-31 07:55:35.485558,eks,final,408,False,False
1,28c950b7-e744-46aa-80fa-fc429e62d912,0.524912,0.283333,0.284444,0.567778,45.833333,32.5,300.0,1.0,1.0,15.0,pod-failure,2022-04-02 18:34:17.993822,eks,final,422,True,False
2,2912fcdb-b8a0-4c23-87a9-632fb1fa7762,0.92,0.531667,0.05,0.581667,58.25,55.0,300.0,1.0,1.0,5.0,pod-kill,2022-03-29 18:42:57.866807,eks,final,392,False,False
3,2b2484d5-302d-4448-ad74-f4413b11a1ba,0.95,0.575,0.04,0.615,70.5,66.25,300.0,1.0,1.0,5.0,pod-kill,2022-03-30 08:47:38.627685,eks,final,396,False,False
4,387bd9f2-511a-4516-9d78-c58d533ac364,0.736316,0.395556,0.151111,0.546667,55.2,54.6,300.0,1.0,1.0,15.0,network-delay,2022-03-30 16:12:44.120109,eks,final,403,False,False
5,42e912b3-5cc4-47bc-a518-672fd0c0d01f,0.98,0.98,0.02,1.0,60.0,60.0,300.0,1.0,1.0,5.0,none,2022-03-29 12:24:31.624449,eks,final,387,False,False
6,449d367f-18a9-4183-8167-93a654b219e0,0.97,0.685,0.03,0.715,77.0,75.666667,300.0,1.0,1.0,5.0,pod-kill,2022-04-03 13:50:00.617719,eks,final,434,False,True
7,44e0909a-7766-485e-aad6-381a73027237,0.959474,0.57,0.05,0.62,96.0,92.333333,300.0,1.0,1.0,5.0,pod-kill,2022-04-03 13:15:23.869493,eks,final,433,False,True
8,49a35ce4-b2bb-40b9-815e-41b85fc8696d,0.94,0.54,0.04,0.58,115.333333,108.666667,300.0,1.0,1.0,5.0,pod-kill,2022-03-29 19:59:54.353811,eks,final,393,False,False
9,67e0b885-8e9f-46b6-bab3-28289e78551b,0.86,0.495,0.095,0.59,54.8,53.4,300.0,1.0,1.0,5.0,network-partition,2022-03-28 10:34:52.759123,eks,final,384,False,False


In [28]:
# drop rows where there are NaN data
results = results.dropna()

# drop rows where mtbf metrics have > 1 value
results = results[results['mtbf'] <= 1]

# drop rows where comment is 'test'
resutls = results[results['comment'] != 'test']

# group by fault profile and only keep the newest measurement
time_column = 'measurement_seq_start_time'
latest_idx = results.groupby(['fault_profile', 'use_kafka', 'use_heartbeats'])[time_column].transform('max') == results[time_column]

results = results[latest_idx]
results

Unnamed: 0,id,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,load_duration,locust_user_count,locust_spawn_rate,measurement_count,fault_profile,measurement_seq_start_time,cluster_type,comment,build_number,use_kafka,use_heartbeats
1,28c950b7-e744-46aa-80fa-fc429e62d912,0.524912,0.283333,0.284444,0.567778,45.833333,32.5,300.0,1.0,1.0,15.0,pod-failure,2022-04-02 18:34:17.993822,eks,final,422,True,False
4,387bd9f2-511a-4516-9d78-c58d533ac364,0.736316,0.395556,0.151111,0.546667,55.2,54.6,300.0,1.0,1.0,15.0,network-delay,2022-03-30 16:12:44.120109,eks,final,403,False,False
6,449d367f-18a9-4183-8167-93a654b219e0,0.97,0.685,0.03,0.715,77.0,75.666667,300.0,1.0,1.0,5.0,pod-kill,2022-04-03 13:50:00.617719,eks,final,434,False,True
12,8ab0e4f9-6224-4a65-97b1-196b787b0dc4,1.0,1.0,0.0,1.0,60.0,60.0,300.0,1.0,1.0,5.0,none,2022-03-29 15:05:56.856168,eks,final,390,False,False
13,903f2465-335e-4991-8ab6-8d0bd8446406,1.0,1.0,0.0,1.0,59.4,59.4,300.0,1.0,1.0,5.0,stress-cpu,2022-03-31 11:03:54.694877,eks,final,410,False,False
15,a325098c-965d-4dbf-8c73-33296ac77d03,0.598947,0.335,0.279444,0.614444,36.8,33.533333,300.0,1.0,1.0,15.0,pod-failure,2022-04-03 10:51:11.781131,eks,final,431,False,True
18,cd3a38d1-4acd-4594-9492-303f32e60b29,0.544912,0.208889,0.201111,0.41,37.666667,37.666667,300.0,1.0,1.0,5.0,io,2022-03-23 20:12:07.239328,eks,final,357,False,False
19,d42b44db-f8fc-49cc-86a8-fc6827ca7b0c,0.634912,0.303889,0.210556,0.514444,40.933333,32.466667,300.0,1.0,1.0,15.0,pod-failure,2022-03-30 20:32:12.322345,eks,final,406,False,False
20,dae3bd05-8b22-4170-ac83-c1640bd05f2e,1.0,1.0,0.0,1.0,60.0,59.0,300.0,1.0,1.0,1.0,network-partition,2022-03-30 18:18:08.086367,eks,final,405,False,False
22,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,0.97,0.97,0.03,1.0,74.5,74.25,300.0,1.0,1.0,5.0,stress-mem,2022-03-31 12:05:19.992145,eks,final,411,False,False


In [29]:
def enhancement_labeling(row):
    if (row['use_kafka'] and row['use_heartbeats']):
        return 'Kafka_And_Heartbeats'
    if (row['use_kafka']):
        return 'Kafka'
    if (row['use_heartbeats']):
        return 'Heartbeats'
    return 'None'

In [31]:
results['enhancement'] = results.apply(enhancement_labeling, axis=1)
results = results.drop(columns=['use_kafka', 'use_heartbeats'])
results

Unnamed: 0,id,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,load_duration,locust_user_count,locust_spawn_rate,measurement_count,fault_profile,measurement_seq_start_time,cluster_type,comment,build_number,enhancement
1,28c950b7-e744-46aa-80fa-fc429e62d912,0.524912,0.283333,0.284444,0.567778,45.833333,32.5,300.0,1.0,1.0,15.0,pod-failure,2022-04-02 18:34:17.993822,eks,final,422,Kafka
4,387bd9f2-511a-4516-9d78-c58d533ac364,0.736316,0.395556,0.151111,0.546667,55.2,54.6,300.0,1.0,1.0,15.0,network-delay,2022-03-30 16:12:44.120109,eks,final,403,
6,449d367f-18a9-4183-8167-93a654b219e0,0.97,0.685,0.03,0.715,77.0,75.666667,300.0,1.0,1.0,5.0,pod-kill,2022-04-03 13:50:00.617719,eks,final,434,Heartbeats
12,8ab0e4f9-6224-4a65-97b1-196b787b0dc4,1.0,1.0,0.0,1.0,60.0,60.0,300.0,1.0,1.0,5.0,none,2022-03-29 15:05:56.856168,eks,final,390,
13,903f2465-335e-4991-8ab6-8d0bd8446406,1.0,1.0,0.0,1.0,59.4,59.4,300.0,1.0,1.0,5.0,stress-cpu,2022-03-31 11:03:54.694877,eks,final,410,
15,a325098c-965d-4dbf-8c73-33296ac77d03,0.598947,0.335,0.279444,0.614444,36.8,33.533333,300.0,1.0,1.0,15.0,pod-failure,2022-04-03 10:51:11.781131,eks,final,431,Heartbeats
18,cd3a38d1-4acd-4594-9492-303f32e60b29,0.544912,0.208889,0.201111,0.41,37.666667,37.666667,300.0,1.0,1.0,5.0,io,2022-03-23 20:12:07.239328,eks,final,357,
19,d42b44db-f8fc-49cc-86a8-fc6827ca7b0c,0.634912,0.303889,0.210556,0.514444,40.933333,32.466667,300.0,1.0,1.0,15.0,pod-failure,2022-03-30 20:32:12.322345,eks,final,406,
20,dae3bd05-8b22-4170-ac83-c1640bd05f2e,1.0,1.0,0.0,1.0,60.0,59.0,300.0,1.0,1.0,1.0,network-partition,2022-03-30 18:18:08.086367,eks,final,405,
22,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,0.97,0.97,0.03,1.0,74.5,74.25,300.0,1.0,1.0,5.0,stress-mem,2022-03-31 12:05:19.992145,eks,final,411,


In [32]:
# Save results
results.to_csv('data/results_cleaned.csv')