In [97]:
!pip install plotly==5.6.0



In [120]:
import plotly.express as px
import pandas as pd
pd.options.plotting.backend = "plotly"

import os
from io import StringIO
import requests as r
from requests.auth import HTTPBasicAuth
from datetime import datetime

In [99]:
###########
# Constants
###########

JENKINS_URL = 'http://jenkins:8080'
JENKINS_USER = os.getenv('JENKINS_USER')
JENKINS_PASS = os.getenv('JENKINS_PASS')
JENKINS_JOB_NAME = 'kubedepend'

MIN_MEASUREMENT_COUNT = 5

In [123]:
custom_date_parser = lambda x: datetime.strptime(x, "%m-%d-%Y_%H-%M-%S.%f")

In [160]:
#########
# Get results from Jenkins and organize them into one DataFrame
#########
session = r.Session()
session.auth = (JENKINS_USER, JENKINS_PASS)

res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/api/json')
data = res.json()
builds = data['builds']

results = pd.DataFrame()
build_parameters = pd.DataFrame()

for build_summary in builds:
    build_number = build_summary['number']
    res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/api/json')
    build = res.json()
    artifact_list = build['artifacts']
    if artifact_list:
        for artifact in artifact_list:
            # Get builds, where there is a 'results.csv' artifact
            if artifact['fileName'] == 'results.csv':
                # get results
                res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/artifact/{artifact["relativePath"]}')
                meas_results_df = pd.read_csv(StringIO(res.text), delimiter=',', parse_dates=['measurement_seq_start_time', 'measurement_start_time', 'measurement_end_time'], date_parser=custom_date_parser)
                results = results.append(meas_results_df, ignore_index=True)
                
                meas_seq_id = meas_results_df['id']
#                 print(meas_seq_id)
                
                # get parameters
                print(build['actions'])
    
                action = list(filter(lambda a: a['_class'] == 'hudson.model.ParametersAction', build['actions']))[0]
                print(action)

#                 parameters = action[0]
#                 print(parameters)
                

[{'_class': 'hudson.model.ParametersAction', 'parameters': [{'_class': 'jenkins.plugins.parameter_separator.ParameterSeparatorValue', 'name': 'CLEAN'}, {'_class': 'hudson.model.BooleanParameterValue', 'name': 'CleanEnhancements', 'value': False}, {'_class': 'hudson.model.BooleanParameterValue', 'name': 'DeleteCFStack', 'value': False}, {'_class': 'hudson.model.BooleanParameterValue', 'name': 'CleanWorkSpace', 'value': True}, {'_class': 'jenkins.plugins.parameter_separator.ParameterSeparatorValue', 'name': 'INFRASTRUCTURE'}, {'_class': 'hudson.model.StringParameterValue', 'name': 'CFStackName', 'value': 'morova-eks-cluster'}, {'_class': 'hudson.model.StringParameterValue', 'name': 'EKSClusterName', 'value': 'morova-eks-cluster'}, {'_class': 'hudson.model.StringParameterValue', 'name': 'AWSRegion', 'value': 'us-east-2'}, {'_class': 'hudson.model.StringParameterValue', 'name': 'S3Bucket', 'value': 'morova-bucket'}, {'_class': 'hudson.model.BooleanParameterValue', 'name': 'DeployTools', 'v

KeyError: '_class'

In [125]:
# for development, get only base measurements
results = results[results['comment'] == 'base']
results.head()

Unnamed: 0,id,measurement_seq_start_time,availability,mut,mdt,mtbf,measurement_start_time,measurement_end_time,submitted_jobs,finished_jobs,fault_profile,cluster_type,measurement_count,load_duration,locust_user_count,locust_spawn_rate,prev_stack_git_commit_short,comment
0,9152f84b-7245-4347-972f-2cb96892e8c3,2022-03-06 11:26:28.082508,1.0,1.0,0.0,1.0,2022-03-06 11:26:37.990010,2022-03-06 11:31:45.722866,60.0,60.0,none,eks,10,300,1,1,5ee510d,base
1,9152f84b-7245-4347-972f-2cb96892e8c3,2022-03-06 11:26:28.082508,1.0,1.0,0.0,1.0,2022-03-06 11:32:39.159891,2022-03-06 11:37:47.290404,60.0,60.0,none,eks,10,300,1,1,5ee510d,base
2,9152f84b-7245-4347-972f-2cb96892e8c3,2022-03-06 11:26:28.082508,1.0,1.0,0.0,1.0,2022-03-06 11:39:12.893594,2022-03-06 11:44:20.317779,60.0,60.0,none,eks,10,300,1,1,5ee510d,base
3,9152f84b-7245-4347-972f-2cb96892e8c3,2022-03-06 11:26:28.082508,1.0,1.0,0.0,1.0,2022-03-06 11:45:47.656683,2022-03-06 11:50:54.600987,60.0,60.0,none,eks,10,300,1,1,5ee510d,base
4,9152f84b-7245-4347-972f-2cb96892e8c3,2022-03-06 11:26:28.082508,1.0,1.0,0.0,1.0,2022-03-06 11:52:22.103019,2022-03-06 11:57:29.635244,60.0,60.0,none,eks,10,300,1,1,5ee510d,base


In [126]:
# TODO clean data
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 0 to 34
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   id                           35 non-null     object        
 1   measurement_seq_start_time   35 non-null     datetime64[ns]
 2   availability                 35 non-null     float64       
 3   mut                          35 non-null     float64       
 4   mdt                          31 non-null     float64       
 5   mtbf                         24 non-null     float64       
 6   measurement_start_time       35 non-null     datetime64[ns]
 7   measurement_end_time         35 non-null     datetime64[ns]
 8   submitted_jobs               35 non-null     float64       
 9   finished_jobs                35 non-null     float64       
 10  fault_profile                35 non-null     object        
 11  cluster_type                 35 non-null     ob

In [127]:
#  group by measurement ID and get mean of numerical properties
results_avg_by_id = results.groupby(by=['id']).mean()
results_avg_by_id.head()

Unnamed: 0_level_0,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,measurement_count,load_duration,locust_user_count,locust_spawn_rate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
520f89e5-da6d-41c5-811a-7be604cd1bd5,0.95,0.95,0.05,1.0,36.0,35.8,5.0,180.0,1.0,1.0
9152f84b-7245-4347-972f-2cb96892e8c3,1.0,1.0,0.0,1.0,60.0,60.0,10.0,300.0,1.0,1.0
953e2115-b3c3-4f9f-a0b6-1068e79d86f1,0.975,175.5,4.5,180.0,36.0,36.0,10.0,180.0,1.0,1.0
b4a08b44-235f-4957-9ff8-db8bee87eb65,0.95,171.0,12.5,,35.9,35.9,10.0,180.0,1.0,1.0


In [128]:
# get non-numerical data for each measurement sequence
results_non_numerical = results.loc[:, ['id', 'fault_profile', 'measurement_seq_start_time', 'cluster_type', 'comment']] \
    .drop_duplicates() \
    .reset_index(drop=True)
results_non_numerical.head()

Unnamed: 0,id,fault_profile,measurement_seq_start_time,cluster_type,comment
0,9152f84b-7245-4347-972f-2cb96892e8c3,none,2022-03-06 11:26:28.082508,eks,base
1,520f89e5-da6d-41c5-811a-7be604cd1bd5,none,2022-03-06 10:53:31.260856,eks,base
2,953e2115-b3c3-4f9f-a0b6-1068e79d86f1,none,2022-03-05 20:32:20.446613,eks,base
3,b4a08b44-235f-4957-9ff8-db8bee87eb65,none,2022-03-05 17:41:37.884827,eks,base


In [129]:
# merge the grouped by avg results with non-numerical data
results = pd.merge(results_avg_by_id, results_non_numerical, on='id')

# filter out measurement sequence with less than MIN_MEASUREMENT_COUNT measurements
results = results[results['measurement_count'] >= MIN_MEASUREMENT_COUNT]

results.head()

Unnamed: 0,id,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,measurement_count,load_duration,locust_user_count,locust_spawn_rate,fault_profile,measurement_seq_start_time,cluster_type,comment
0,520f89e5-da6d-41c5-811a-7be604cd1bd5,0.95,0.95,0.05,1.0,36.0,35.8,5.0,180.0,1.0,1.0,none,2022-03-06 10:53:31.260856,eks,base
1,9152f84b-7245-4347-972f-2cb96892e8c3,1.0,1.0,0.0,1.0,60.0,60.0,10.0,300.0,1.0,1.0,none,2022-03-06 11:26:28.082508,eks,base
2,953e2115-b3c3-4f9f-a0b6-1068e79d86f1,0.975,175.5,4.5,180.0,36.0,36.0,10.0,180.0,1.0,1.0,none,2022-03-05 20:32:20.446613,eks,base
3,b4a08b44-235f-4957-9ff8-db8bee87eb65,0.95,171.0,12.5,,35.9,35.9,10.0,180.0,1.0,1.0,none,2022-03-05 17:41:37.884827,eks,base


In [140]:
# drop rows where there are NaN data
results = results.dropna()

# drop rows where mtbf metrics have > 1 value
results = results[results['mtbf'] <= 1]

# drop rows where comment is 'test'
resutls = results[results['comment'] != 'test']

# group by fault profile and only keep the newest measurement
time_column = 'measurement_seq_start_time'
latest_idx = results.groupby(['fault_profile'])[time_column].transform('max') == results[time_column]

results = results[latest_idx]
results

Unnamed: 0,id,availability,mut,mdt,mtbf,submitted_jobs,finished_jobs,measurement_count,load_duration,locust_user_count,locust_spawn_rate,fault_profile,measurement_seq_start_time,cluster_type,comment
1,9152f84b-7245-4347-972f-2cb96892e8c3,1.0,1.0,0.0,1.0,60.0,60.0,10.0,300.0,1.0,1.0,none,2022-03-06 11:26:28.082508,eks,base


In [147]:
metrics = results[['fault_profile', 'availability', 'mut', 'mdt', 'mtbf']]
metrics.head()

Unnamed: 0,fault_profile,availability,mut,mdt,mtbf
1,none,1.0,1.0,0.0,1.0


In [148]:
# TEST
cols = ['fault_profile', 'availability', 'mut', 'mdt', 'mtbf']
metrics = pd.concat([metrics, pd.DataFrame([['test1', 0.8, 0.6, 0.2, 0.8], ['test2', 0.4, 0.3, 0.5, 0.8]], columns=cols)], ignore_index=True)
metrics.head()

Unnamed: 0,fault_profile,availability,mut,mdt,mtbf
0,none,1.0,1.0,0.0,1.0
1,test1,0.8,0.6,0.2,0.8
2,test2,0.4,0.3,0.5,0.8


In [149]:
# create long data from wide
metrics_long = metrics.melt(id_vars=['fault_profile'], var_name='metric')
metrics_long.head()

Unnamed: 0,fault_profile,metric,value
0,none,availability,1.0
1,test1,availability,0.8
2,test2,availability,0.4
3,none,mut,1.0
4,test1,mut,0.6


In [150]:


fig = px.line_polar(metrics_long, r="value", theta="metric", color="fault_profile", line_close=True)
fig.show()

In [112]:
# Availability on average per fault profiles
ax = results_avg_fault_profile_idx.loc[:, 'availability'].plot.bar(figsize=(16, 13), fontsize=18)
for p in ax.patches:
    ax.annotate('{:.3f}'.format(p.get_height()), (p.get_x() - 0.06, p.get_height() + 0.01))

TypeError: bar() got an unexpected keyword argument 'figsize'

In [None]:
# MUT, MDT, MTBF on average per fault profiles
#  .legend(prop={'size': 50}
ax = results_avg_fault_profile_idx.loc[:, ['mut', 'mdt', 'mtbf']] \
    .plot.bar(figsize=(20, 15), fontsize=18, ylabel="seconds")
for p in ax.patches:
    ax.annotate('{:.3f}'.format(p.get_height()), (p.get_x() - 0.06, p.get_height() + 5))