In [1]:
!pip install plotly==5.6.0

Collecting plotly==5.6.0
  Downloading plotly-5.6.0-py2.py3-none-any.whl (27.7 MB)
[K     |████████████████████████████████| 27.7 MB 2.5 MB/s eta 0:00:01
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.6.0 tenacity-8.0.1


In [2]:
import pandas as pd
pd.set_option('display.max_rows', None)

import os
from io import StringIO
import requests as r
from requests.auth import HTTPBasicAuth
from datetime import datetime

In [3]:
###########
# Constants
###########

# Jenkins server base URL and the job whose builds are analysed.
JENKINS_URL = 'http://jenkins:8080'
JENKINS_JOB_NAME = 'kubedepend'

# Credentials come from the environment; each is None when its variable is unset.
JENKINS_USER = os.environ.get('JENKINS_USER')
JENKINS_PASS = os.environ.get('JENKINS_PASS')

In [4]:
def custom_date_parser(x):
    """Parse a timestamp string formatted as ``MM-DD-YYYY_HH-MM-SS.ffffff``.

    Args:
        x: Timestamp text, e.g. ``"03-31-2022_12-05-19.992145"``.

    Returns:
        The corresponding naive ``datetime`` object.

    Raises:
        ValueError: If ``x`` does not match the expected format.
    """
    return datetime.strptime(x, "%m-%d-%Y_%H-%M-%S.%f")

In [30]:
#########
# Get results from Jenkins and organize them into one DataFrame
#########
session = r.Session()
session.auth = (JENKINS_USER, JENKINS_PASS)

res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/api/json')
# Fail fast on HTTP errors instead of trying to parse an error page as JSON.
res.raise_for_status()
data = res.json()
builds = data['builds']

# Per-build pieces are collected in plain lists and combined once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copies the whole frame on every call.
result_frames = []
parameter_records = []

# Jenkins parameter classes whose name/value pairs are kept as build parameters.
parameter_value_classes = ['hudson.model.BooleanParameterValue', 'hudson.model.StringParameterValue']
timestamp_columns = ['measurement_seq_start_time', 'measurement_start_time', 'measurement_end_time']

for build_summary in builds:
    build_number = build_summary['number']
    res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/api/json')
    res.raise_for_status()
    build = res.json()
    artifact_list = build['artifacts']

    # Only successful builds that archived artifacts are considered.
    if build['result'] != 'SUCCESS' or not artifact_list:
        continue

    for artifact in artifact_list:
        # Get builds, where there is a 'results.csv' artifact
        if artifact['fileName'] != 'results.csv':
            continue

        # get results
        res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/artifact/{artifact["relativePath"]}')
        res.raise_for_status()
        # NOTE: `date_parser` is deprecated since pandas 2.0 in favour of `date_format`;
        # it is kept because it is accepted by both pandas 1.x and 2.x.
        meas_results_df = pd.read_csv(
            StringIO(res.text),
            delimiter=',',
            parse_dates=timestamp_columns,
            date_parser=custom_date_parser,
        )
        # A CSV with a header but no rows has no sequence id to attach parameters to.
        if meas_results_df.empty:
            continue
        result_frames.append(meas_results_df)

        # every row of one results.csv is read as belonging to the same sequence,
        # so the id of the first row is used for the whole file
        meas_seq_id = meas_results_df['id'].iloc[0]

        # extract parameter info; builds without a ParametersAction yield no parameters
        parameter_items = next(
            (
                action.get('parameters', [])
                for action in build['actions']
                if action and action.get('_class') == 'hudson.model.ParametersAction'
            ),
            [],
        )
        reshaped = {
            parameter['name']: parameter['value']
            for parameter in parameter_items
            if parameter['_class'] in parameter_value_classes
        }
        # add build number and the measurement sequence id used to join with the results
        reshaped['build_number'] = build_number
        reshaped['id'] = meas_seq_id
        parameter_records.append(reshaped)

# pd.concat raises on an empty list, so fall back to an empty frame when nothing was found.
results_raw = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
build_parameters = pd.DataFrame(parameter_records)
                

In [31]:
# 'results_raw' always stores all the data from Jenkins
# if an error occurs, the 'results' can be reset without querying Jenkins again
# NOTE: this binds 'results' to the same DataFrame object as 'results_raw' (no copy is made),
# so re-running this cell is what resets 'results' to the full data set.
results = results_raw

In [32]:
# Clean build_parameters
# Missing values (in every column) become False, then the flag columns are cast
# to bool and the build number to int.
boolean_columns = [
    'CleanEnhancements',
    'DeleteCFStack',
    'CleanWorkSpace',
    'DeployTools',
    'UseKafka',
    'UseHeartbeats',
]
build_parameters = build_parameters.fillna(False)
for column in boolean_columns:
    build_parameters[column] = build_parameters[column].astype(bool)
build_parameters['build_number'] = build_parameters['build_number'].astype(int)

# Put the identifying columns first: 'id', then 'build_number', then everything else
# in its existing order.
leading_columns = ['id', 'build_number']
remaining_columns = [name for name in build_parameters.columns if name not in leading_columns]
build_parameters = build_parameters[leading_columns + remaining_columns]


build_parameters.head()
# build_parameters.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       31 non-null     object
 1   build_number             31 non-null     int64 
 2   CleanEnhancements        31 non-null     bool  
 3   DeleteCFStack            31 non-null     bool  
 4   CleanWorkSpace           31 non-null     bool  
 5   CFStackName              31 non-null     object
 6   EKSClusterName           31 non-null     object
 7   AWSRegion                31 non-null     object
 8   S3Bucket                 31 non-null     object
 9   DeployTools              31 non-null     bool  
 10  UseKafka                 31 non-null     bool  
 11  UseHeartbeats            31 non-null     bool  
 12  DeployApp                31 non-null     object
 13  RunMeasurement           31 non-null     object
 14  MEASFaultProfile         31 non-null     obj

In [33]:
# Select relevant columns
# The rename runs before the selection: DataFrame.rename ignores labels that are
# not present, so re-running this cell (when the columns are already renamed)
# no longer fails with a KeyError on 'UseKafka'.
build_parameters = (
    build_parameters
    .rename(columns={'UseKafka': 'use_kafka', 'UseHeartbeats': 'use_heartbeats'})
    .loc[:, ['id', 'build_number', 'use_kafka', 'use_heartbeats']]
    .reset_index(drop=True)
)
build_parameters.head()

Unnamed: 0,id,build_number,use_kafka,use_heartbeats
0,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,411,False,False
1,903f2465-335e-4991-8ab6-8d0bd8446406,410,False,False
2,dee627a1-14bf-4360-92da-c07ac9f7dc19,409,False,False
3,0f52ec58-1ae2-482d-9012-6c642d16bade,408,False,False
4,d42b44db-f8fc-49cc-86a8-fc6827ca7b0c,406,False,False


In [34]:
# for development, get only base measurements
# (keeps only the rows whose 'comment' column equals 'final')
is_final = results['comment'] == 'final'
results = results.loc[is_final]
# results.head()
results

Unnamed: 0,id,measurement_seq_start_time,availability,mut,mdt,mtbf,measurement_start_time,measurement_end_time,submitted_jobs,finished_jobs,fault_profile,cluster_type,measurement_count,load_duration,locust_user_count,locust_spawn_rate,prev_stack_git_commit_short,comment
0,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:05:28.462341,2022-03-31 12:10:35.194232,60.0,60.0,stress-mem,eks,1,300,1,1,6ce90f9,final
1,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:12:27.717935,2022-03-31 12:17:34.381246,60.0,60.0,stress-mem,eks,2,300,1,1,6ce90f9,final
2,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:19:26.881030,2022-03-31 12:24:33.832174,,,stress-mem,eks,3,300,1,1,6ce90f9,final
3,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:27:12.161697,2022-03-31 12:32:19.059183,118.0,117.0,stress-mem,eks,4,300,1,1,6ce90f9,final
4,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,0.85,0.85,0.15,1.0,2022-03-31 12:34:44.141708,2022-03-31 12:39:50.893827,60.0,60.0,stress-mem,eks,5,300,1,1,6ce90f9,final
5,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:04:03.215071,2022-03-31 11:09:09.696136,60.0,60.0,stress-cpu,eks,1,300,1,1,6ce90f9,final
6,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:11:01.991690,2022-03-31 11:16:08.085424,60.0,60.0,stress-cpu,eks,2,300,1,1,6ce90f9,final
7,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:18:01.501576,2022-03-31 11:23:08.966063,59.0,59.0,stress-cpu,eks,3,300,1,1,6ce90f9,final
8,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:25:33.971871,2022-03-31 11:30:40.635801,60.0,60.0,stress-cpu,eks,4,300,1,1,6ce90f9,final
9,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:33:40.002975,2022-03-31 11:38:47.268555,58.0,58.0,stress-cpu,eks,5,300,1,1,6ce90f9,final


In [35]:
# Set meas count to the maximum of the measurement count in the same measurement sequence

# per-sequence maximum of 'measurement_count', indexed by 'id'
results_meas_count = results[['id', 'measurement_count']]
results_meas_max_count = (
    results_meas_count
    .groupby('id')
    .max()
    .rename(columns={'measurement_count': 'measurement_count_max'})
)

# replace the per-row counter with the per-sequence maximum
# (the new 'measurement_count' column ends up as the last column)
results = (
    results
    .merge(results_meas_max_count, on='id')
    .drop(columns=['measurement_count'])
    .rename(columns={'measurement_count_max': 'measurement_count'})
)
results

Unnamed: 0,id,measurement_seq_start_time,availability,mut,mdt,mtbf,measurement_start_time,measurement_end_time,submitted_jobs,finished_jobs,fault_profile,cluster_type,load_duration,locust_user_count,locust_spawn_rate,prev_stack_git_commit_short,comment,measurement_count
0,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:05:28.462341,2022-03-31 12:10:35.194232,60.0,60.0,stress-mem,eks,300,1,1,6ce90f9,final,5
1,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:12:27.717935,2022-03-31 12:17:34.381246,60.0,60.0,stress-mem,eks,300,1,1,6ce90f9,final,5
2,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:19:26.881030,2022-03-31 12:24:33.832174,,,stress-mem,eks,300,1,1,6ce90f9,final,5
3,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,1.0,1.0,0.0,1.0,2022-03-31 12:27:12.161697,2022-03-31 12:32:19.059183,118.0,117.0,stress-mem,eks,300,1,1,6ce90f9,final,5
4,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,2022-03-31 12:05:19.992145,0.85,0.85,0.15,1.0,2022-03-31 12:34:44.141708,2022-03-31 12:39:50.893827,60.0,60.0,stress-mem,eks,300,1,1,6ce90f9,final,5
5,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:04:03.215071,2022-03-31 11:09:09.696136,60.0,60.0,stress-cpu,eks,300,1,1,6ce90f9,final,5
6,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:11:01.991690,2022-03-31 11:16:08.085424,60.0,60.0,stress-cpu,eks,300,1,1,6ce90f9,final,5
7,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:18:01.501576,2022-03-31 11:23:08.966063,59.0,59.0,stress-cpu,eks,300,1,1,6ce90f9,final,5
8,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:25:33.971871,2022-03-31 11:30:40.635801,60.0,60.0,stress-cpu,eks,300,1,1,6ce90f9,final,5
9,903f2465-335e-4991-8ab6-8d0bd8446406,2022-03-31 11:03:54.694877,1.0,1.0,0.0,1.0,2022-03-31 11:33:40.002975,2022-03-31 11:38:47.268555,58.0,58.0,stress-cpu,eks,300,1,1,6ce90f9,final,5


In [36]:
#  group by measurement ID and get mean of numerical properties
# numeric_only=True is passed explicitly: since pandas 2.0 the default no longer
# drops non-numeric columns silently, and mean() raises a TypeError on string
# columns such as 'fault_profile' or 'comment'.
results_avg_by_id = results.groupby(by=['id']).mean(numeric_only=True)
# results_avg_by_id.head()
results_avg_by_id

Unnamed: 0_level_0,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,load_duration,locust_user_count,locust_spawn_rate,measurement_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
03631a77-d080-44bf-a582-718e4c39084d,0.325439,0.22963,,,44.777778,44.444444,300.0,1.0,1.0,9.0
0f52ec58-1ae2-482d-9012-6c642d16bade,0.92,0.531667,0.05,0.581667,72.5,65.5,300.0,1.0,1.0,5.0
2912fcdb-b8a0-4c23-87a9-632fb1fa7762,0.92,0.531667,0.05,0.581667,58.25,55.0,300.0,1.0,1.0,5.0
2b2484d5-302d-4448-ad74-f4413b11a1ba,0.95,0.575,0.04,0.615,70.5,66.25,300.0,1.0,1.0,5.0
387bd9f2-511a-4516-9d78-c58d533ac364,0.736316,0.395556,0.151111,0.546667,55.2,54.6,300.0,1.0,1.0,15.0
42e912b3-5cc4-47bc-a518-672fd0c0d01f,0.98,0.98,0.02,1.0,60.0,60.0,300.0,1.0,1.0,5.0
4991a6f2-16d1-4219-8ca5-a3fee9cb2631,0.445175,0.252257,,,48.416667,48.041667,300.0,1.0,1.0,24.0
49a35ce4-b2bb-40b9-815e-41b85fc8696d,0.94,0.54,0.04,0.58,115.333333,108.666667,300.0,1.0,1.0,5.0
67e0b885-8e9f-46b6-bab3-28289e78551b,0.86,0.495,0.095,0.59,54.8,53.4,300.0,1.0,1.0,5.0
7c3071b8-cbde-4d2c-ae91-39ea3a11e52b,0.86,0.359167,0.07,0.429167,53.8,51.8,300.0,1.0,1.0,5.0


In [37]:
# get non-numerical data for each measurement sequence
# (one row per distinct combination of the selected columns, renumbered from 0)
non_numerical_columns = ['id', 'fault_profile', 'measurement_seq_start_time', 'cluster_type', 'comment']
results_non_numerical = results[non_numerical_columns].drop_duplicates()
results_non_numerical = results_non_numerical.reset_index(drop=True)
# results_non_numerical.head()
results_non_numerical

Unnamed: 0,id,fault_profile,measurement_seq_start_time,cluster_type,comment
0,e2c28cb0-801c-41d8-9b2d-4d3f47ad1fab,stress-mem,2022-03-31 12:05:19.992145,eks,final
1,903f2465-335e-4991-8ab6-8d0bd8446406,stress-cpu,2022-03-31 11:03:54.694877,eks,final
2,dee627a1-14bf-4360-92da-c07ac9f7dc19,pod-kill,2022-03-31 08:41:14.994202,eks,final
3,0f52ec58-1ae2-482d-9012-6c642d16bade,pod-kill,2022-03-31 07:55:35.485558,eks,final
4,d42b44db-f8fc-49cc-86a8-fc6827ca7b0c,pod-failure,2022-03-30 20:32:12.322345,eks,final
5,dae3bd05-8b22-4170-ac83-c1640bd05f2e,network-partition,2022-03-30 18:18:08.086367,eks,final
6,7c3071b8-cbde-4d2c-ae91-39ea3a11e52b,network-partition,2022-03-30 18:31:11.125831,eks,final
7,387bd9f2-511a-4516-9d78-c58d533ac364,network-delay,2022-03-30 16:12:44.120109,eks,final
8,bcfdcd44-a057-49ec-b96c-07eeb7a261bc,stress-cpu,2022-03-30 09:30:24.944296,eks,final
9,2b2484d5-302d-4448-ad74-f4413b11a1ba,pod-kill,2022-03-30 08:47:38.627685,eks,final


In [38]:
# Join, on 'id':
#   1. the per-sequence averages with the non-numerical data
#   2. the outcome of step 1 with the build parameters
results = (
    results_avg_by_id
    .merge(results_non_numerical, on='id')
    .merge(build_parameters, on='id')
)

# filter out measurement sequence with less than MIN_MEASUREMENT_COUNT measurements
# results = results[results['measurement_count'] >= MIN_MEASUREMENT_COUNT]

# results.head()
results

Unnamed: 0,id,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,load_duration,locust_user_count,locust_spawn_rate,measurement_count,fault_profile,measurement_seq_start_time,cluster_type,comment,build_number,use_kafka,use_heartbeats
0,0f52ec58-1ae2-482d-9012-6c642d16bade,0.92,0.531667,0.05,0.581667,72.5,65.5,300.0,1.0,1.0,5.0,pod-kill,2022-03-31 07:55:35.485558,eks,final,408,False,False
1,2912fcdb-b8a0-4c23-87a9-632fb1fa7762,0.92,0.531667,0.05,0.581667,58.25,55.0,300.0,1.0,1.0,5.0,pod-kill,2022-03-29 18:42:57.866807,eks,final,392,False,False
2,2b2484d5-302d-4448-ad74-f4413b11a1ba,0.95,0.575,0.04,0.615,70.5,66.25,300.0,1.0,1.0,5.0,pod-kill,2022-03-30 08:47:38.627685,eks,final,396,False,False
3,387bd9f2-511a-4516-9d78-c58d533ac364,0.736316,0.395556,0.151111,0.546667,55.2,54.6,300.0,1.0,1.0,15.0,network-delay,2022-03-30 16:12:44.120109,eks,final,403,False,False
4,42e912b3-5cc4-47bc-a518-672fd0c0d01f,0.98,0.98,0.02,1.0,60.0,60.0,300.0,1.0,1.0,5.0,none,2022-03-29 12:24:31.624449,eks,final,387,False,False
5,49a35ce4-b2bb-40b9-815e-41b85fc8696d,0.94,0.54,0.04,0.58,115.333333,108.666667,300.0,1.0,1.0,5.0,pod-kill,2022-03-29 19:59:54.353811,eks,final,393,False,False
6,67e0b885-8e9f-46b6-bab3-28289e78551b,0.86,0.495,0.095,0.59,54.8,53.4,300.0,1.0,1.0,5.0,network-partition,2022-03-28 10:34:52.759123,eks,final,384,False,False
7,8795a947-0c52-4a4f-aa0a-cb342446e2f1,0.87,0.376667,0.07,0.446667,72.75,68.25,300.0,1.0,1.0,5.0,pod-kill,2022-03-29 17:57:22.975413,eks,final,391,False,False
8,8ab0e4f9-6224-4a65-97b1-196b787b0dc4,1.0,1.0,0.0,1.0,60.0,60.0,300.0,1.0,1.0,5.0,none,2022-03-29 15:05:56.856168,eks,final,390,False,False
9,903f2465-335e-4991-8ab6-8d0bd8446406,1.0,1.0,0.0,1.0,59.4,59.4,300.0,1.0,1.0,5.0,stress-cpu,2022-03-31 11:03:54.694877,eks,final,410,False,False


In [21]:
# drop rows where there are NaN data
results = results.dropna()

# drop rows where mtbf metrics have > 1 value
results = results[results['mtbf'] <= 1]

# drop rows where comment is 'test'
# (the result was previously assigned to a misspelled name, 'resutls',
# so this filter never took effect on 'results')
results = results[results['comment'] != 'test']

# group by fault profile, use_kafka and use_heartbeats, and only keep the newest measurement
time_column = 'measurement_seq_start_time'
group_columns = ['fault_profile', 'use_kafka', 'use_heartbeats']
# True for rows whose start time equals the latest start time within their group
latest_idx = results.groupby(group_columns)[time_column].transform('max') == results[time_column]

results = results[latest_idx]
results

KeyError: 'use_heartbeats'

In [14]:
# Save results
# to_csv does not create missing parent directories, so make sure 'data/' exists first.
os.makedirs('data', exist_ok=True)
results.to_csv('data/results_cleaned.csv')