In [None]:
!pip install plotly==5.6.0

In [None]:
import pandas as pd
pd.set_option('display.max_rows', None)

import os
from io import StringIO
import requests as r
from requests.auth import HTTPBasicAuth
from datetime import datetime

In [None]:
###########
# Constants
###########

# Base URL of the Jenkins server and the name of the job to query
JENKINS_JOB_NAME = 'kubedepend'
JENKINS_URL = 'http://jenkins:8080'

# Credentials are read from the environment; each one is None when its variable is unset
JENKINS_USER, JENKINS_PASS = (
    os.environ.get(variable) for variable in ('JENKINS_USER', 'JENKINS_PASS')
)

In [None]:
def custom_date_parser(x):
    """Parse a timestamp string such as ``01-31-2022_13-45-10.123456``.

    The expected layout is month-day-year, an underscore, hour-minute-second
    and a fractional-seconds part separated by a dot.

    Returns the parsed ``datetime``; ``datetime.strptime`` raises ``ValueError``
    when the string does not match the layout.
    """
    return datetime.strptime(x, "%m-%d-%Y_%H-%M-%S.%f")

In [None]:
#########
# Get results from Jenkins and organize them into one DataFrame
#########
# Outputs of this cell:
#   results_raw      - the rows of every 'results.csv' artifact of a successful build
#   build_parameters - one row per such artifact: the build's boolean/string parameters
#                      plus 'build_number' and the measurement sequence 'id'

# Timestamp layout of the results.csv time columns (same layout custom_date_parser expects)
RESULT_TIME_FORMAT = "%m-%d-%Y_%H-%M-%S.%f"
RESULT_TIME_COLUMNS = ['measurement_seq_start_time', 'measurement_start_time', 'measurement_end_time']
PARAMETER_VALUE_CLASSES = ('hudson.model.BooleanParameterValue', 'hudson.model.StringParameterValue')

session = r.Session()
session.auth = (JENKINS_USER, JENKINS_PASS)

res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/api/json')
# fail with the HTTP status instead of a confusing JSON decoding error
res.raise_for_status()
data = res.json()
builds = data['builds']

# Pieces are collected in lists and combined once after the loop:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.
result_frames = []
parameter_records = []

for build_summary in builds:
    build_number = build_summary['number']
    res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/api/json')
    res.raise_for_status()
    build = res.json()
    artifact_list = build['artifacts']
    if build['result'] != 'SUCCESS' or not artifact_list:
        continue

    for artifact in artifact_list:
        # Get builds, where there is a 'results.csv' artifact
        if artifact['fileName'] != 'results.csv':
            continue

        # get results
        res = session.get(f'{JENKINS_URL}/job/{JENKINS_JOB_NAME}/{build_number}/artifact/{artifact["relativePath"]}')
        res.raise_for_status()
        meas_results_df = pd.read_csv(StringIO(res.text), delimiter=',')
        if meas_results_df.empty:
            # header-only file: there is no measurement sequence id to read
            continue
        # parse the timestamps after reading; the read_csv 'date_parser' keyword is deprecated
        for time_column_name in RESULT_TIME_COLUMNS:
            meas_results_df[time_column_name] = pd.to_datetime(
                meas_results_df[time_column_name], format=RESULT_TIME_FORMAT
            )
        result_frames.append(meas_results_df)

        # the id found in the first row identifies the measurement sequence of this artifact
        meas_seq_id = meas_results_df['id'].iloc[0]

        # extract parameter info; 'actions' may contain empty entries, and a build
        # may have no ParametersAction at all
        parameters_action = next(
            (a for a in build['actions'] if a and a.get('_class') == 'hudson.model.ParametersAction'),
            None,
        )
        parameter_items = parameters_action['parameters'] if parameters_action else []
        reshaped = {
            parameter['name']: parameter['value']
            for parameter in parameter_items
            if parameter.get('_class') in PARAMETER_VALUE_CLASSES
        }
        # add build number
        reshaped['build_number'] = build_number
        reshaped['id'] = meas_seq_id
        parameter_records.append(reshaped)

# pd.concat raises on an empty list, so fall back to an empty frame
results_raw = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
build_parameters = pd.DataFrame(parameter_records)
                

In [None]:
# 'results_raw' always stores all the data from Jenkins
# if an error occurs, the 'results' can be reset without querying Jenkins again
# A copy (not an alias) is taken so that in-place changes to 'results' can never alter 'results_raw'
results = results_raw.copy()

In [None]:
# Clean build_parameters: missing values become False, the flag columns become bool,
# 'build_number' becomes int, and 'id' / 'build_number' are moved to the front.
flag_columns = ['CleanEnhancements', 'DeleteCFStack', 'CleanWorkSpace', 'DeployTools', 'UseKafka', 'UseHeartbeats']
column_types = {flag: bool for flag in flag_columns}
column_types['build_number'] = int

build_parameters = build_parameters.fillna(False).astype(column_types)

leading_columns = ['id', 'build_number']
trailing_columns = [name for name in build_parameters.columns if name not in leading_columns]
build_parameters = build_parameters[leading_columns + trailing_columns]

build_parameters.head()

In [None]:
# Select relevant columns and give them snake_case names.
# The rename comes first: rename() ignores labels that are absent, so running this cell
# again after the columns were already renamed no longer raises a KeyError.
build_parameters = (
    build_parameters
    .rename(columns={'UseKafka': 'use_kafka', 'UseHeartbeats': 'use_heartbeats'})
    .loc[:, ['id', 'build_number', 'use_kafka', 'use_heartbeats']]
    .reset_index(drop=True)
)
build_parameters.head()

In [None]:
# Keep only the measurements whose 'comment' is 'final'
is_final = results['comment'] == 'final'
results = results[is_final]
results

In [None]:
# Replace every row's 'measurement_count' with the largest measurement count
# found in the same measurement sequence (same 'id').

# largest count per id, as a frame indexed by 'id'
results_meas_max_count = results.groupby('id').agg(
    measurement_count_max=('measurement_count', 'max')
)

# attach it to every row, then let it take over the original column's name
results = (
    results
    .merge(results_meas_max_count, on='id')
    .drop(columns=['measurement_count'])
    .rename(columns={'measurement_count_max': 'measurement_count'})
)
results

In [None]:
# group by measurement ID and get the mean of the numerical properties
# numeric_only=True: the text and timestamp columns cannot be averaged; pandas >= 2.0 raises
# on them instead of silently dropping them. They are merged back per id further down.
results_avg_by_id = results.groupby(by=['id']).mean(numeric_only=True)
results_avg_by_id

In [None]:
# Collect the non-numerical columns, keeping one row per distinct combination of values
sequence_columns = ['id', 'fault_profile', 'measurement_seq_start_time', 'cluster_type', 'comment']
results_non_numerical = (
    results[sequence_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
results_non_numerical

In [None]:
# Join, by 'id': the per-id averages, then the non-numerical columns, then the build parameters
results = (
    results_avg_by_id
    .merge(results_non_numerical, on='id')
    .merge(build_parameters, on='id')
)

# filter out measurement sequence with less than MIN_MEASUREMENT_COUNT measurements
# results = results[results['measurement_count'] >= MIN_MEASUREMENT_COUNT]

results

In [None]:
# drop rows where there are NaN data
results = results.dropna()

# keep only the rows whose 'mtbf' value is at most 1
results = results[results['mtbf'] <= 1]

# drop rows where comment is 'test'
# (the result was previously bound to a misspelled name, so the filter was discarded)
results = results[results['comment'] != 'test']

# per (fault_profile, use_kafka, use_heartbeats) combination only keep the newest measurement
time_column = 'measurement_seq_start_time'
latest_idx = results.groupby(['fault_profile', 'use_kafka', 'use_heartbeats'])[time_column].transform('max') == results[time_column]

results = results[latest_idx]
results

In [None]:
def enhancement_labeling(row):
    """Return the enhancement label for a row.

    ``row`` must support lookup by the keys ``'use_kafka'`` and
    ``'use_heartbeats'``; only the truthiness of the two values matters.

    Returns one of ``'Kafka_And_Heartbeats'``, ``'Kafka'``, ``'Heartbeats'``
    or ``'None'`` (the string, not the ``None`` object).
    """
    labels = {
        (True, True): 'Kafka_And_Heartbeats',
        (True, False): 'Kafka',
        (False, True): 'Heartbeats',
        (False, False): 'None',
    }
    kafka_enabled = bool(row['use_kafka'])
    heartbeats_enabled = bool(row['use_heartbeats'])
    return labels[(kafka_enabled, heartbeats_enabled)]

In [None]:
# Add the 'enhancement' label and drop the two flag columns it is derived from.
# assign() builds a new frame; setting a column directly on the boolean-filtered
# 'results' triggers pandas' SettingWithCopyWarning.
results = (
    results
    .assign(enhancement=results.apply(enhancement_labeling, axis=1))
    .drop(columns=['use_kafka', 'use_heartbeats'])
)
results

In [None]:
# Save results
# Create the output directory first so to_csv does not fail when 'data/' does not exist yet
os.makedirs('data', exist_ok=True)
results.to_csv('data/results_cleaned.csv')