# Collect data for github action workflow runs

In this notebook, we collect historical test data like the test duration values from running workflows on Github using the GitHub API

## Collect data for selected workflow runs of a repository

From historical test workflow runs, want to extract
- time durations
- workflow run status & conclusion

We can get workflow IDs of the test that we are interested in from https://api.github.com/repos/{ORG}/{REPO}/actions/workflows 

In [1]:
from dotenv import find_dotenv, load_dotenv
import os
import json
import subprocess
import datetime
from subprocess import PIPE
import pandas as pd
pd.options.mode.chained_assignment = None


from ipynb.fs.defs.osp_helper_functions import (
    CephCommunication,
    fit_distribution,
    standardize,
    filter_test_type,
    fetch_all_tests,
    best_distribution,
    optimal_stopping_point,
)

import warnings

warnings.filterwarnings("ignore")

In [2]:
load_dotenv(find_dotenv(), override=True)
TOKEN = os.getenv("GITHUB_ACCESS_TOKEN")

For example, lets collect data for the test ID 28698040 for the workflow runs in the repository `oss-aspen/8Knot` https://api.github.com/repos/oss-aspen/8Knot/actions/workflows

In [3]:
def get_page_numbers(test_id):
    """
    Get the total count of tests.
    Find the pages on github-actions.
    """
    command = """curl \
      -H "Accept: application/vnd.github+json" \
      -H "Authorization: Bearer {}"\
      -H "X-GitHub-Api-Version: 2022-11-28" \
      https://api.github.com/repos/oss-aspen/8Knot/actions/workflows/{}/runs?""".format(TOKEN,test_id)
    args = []
    args.append(command)
    output = subprocess.run(args, shell=True, check=True, stdout=PIPE, stderr=PIPE)
    output = json.loads(output.stdout)
    total_count = output['total_count']
    page_numbers = int(total_count/30) # by default number of tests on one page is 30
    return page_numbers

In [4]:
def get_runs(test_id, page_numbers):
    """
    This function takes test_id and number of pages of workflow runs as input.
    Interacts with github api and collects the data for the tests with the specified id.
    Outputs the data frame with test data.
    """
    for p in range(1,page_numbers+1):
        command = """curl \
      -H "Accept: application/vnd.github+json" \
      -H "Authorization: Bearer {}"\
      -H "X-GitHub-Api-Version: 2022-11-28" \
      https://api.github.com/repos/oss-aspen/8Knot/actions/workflows/{}/runs?page={}""".format(TOKEN, test_id, p)
        args = []
        args.append(command)

        output = subprocess.run(args, shell=True, check=True, stdout=PIPE, stderr=PIPE)
        output = json.loads(output.stdout)

        if p==1:
            df = pd.json_normalize(output['workflow_runs'])
        else:
            df2 = pd.json_normalize(output['workflow_runs'])
            df = pd.concat([df, df2], axis=0)
    return df

In [5]:
test_id = "28698040" # Pre-commit test
page_numbers = get_page_numbers(test_id)
page_numbers

9

In [6]:
df = get_runs(test_id, page_numbers)

In [7]:
df

Unnamed: 0,id,name,node_id,head_branch,head_sha,path,display_title,run_number,event,status,...,head_repository.merges_url,head_repository.archive_url,head_repository.downloads_url,head_repository.issues_url,head_repository.pulls_url,head_repository.milestones_url,head_repository.notifications_url,head_repository.labels_url,head_repository.releases_url,head_repository.deployments_url
0,4316905413,pre-commit,WFR_kwLOGwuno88AAAABAU6_xQ,dev,280dafba7c1dc87090f484d765648cf17e562ca6,.github/workflows/pre-commit.yml,Update README.md,302,push,completed,...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/{...,https://api.github.com/repos/oss-aspen/8Knot/d...,https://api.github.com/repos/oss-aspen/8Knot/i...,https://api.github.com/repos/oss-aspen/8Knot/p...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/n...,https://api.github.com/repos/oss-aspen/8Knot/l...,https://api.github.com/repos/oss-aspen/8Knot/r...,https://api.github.com/repos/oss-aspen/8Knot/d...
1,4298089675,pre-commit,WFR_kwLOGwuno88AAAABAC-kyw,update-readme-where-login-is,dd46964f2c98d1e8be720f8cbc99c8e24169f568,.github/workflows/pre-commit.yml,Update README.md,301,pull_request,completed,...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/{...,https://api.github.com/repos/oss-aspen/8Knot/d...,https://api.github.com/repos/oss-aspen/8Knot/i...,https://api.github.com/repos/oss-aspen/8Knot/p...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/n...,https://api.github.com/repos/oss-aspen/8Knot/l...,https://api.github.com/repos/oss-aspen/8Knot/r...,https://api.github.com/repos/oss-aspen/8Knot/d...
2,4298071211,pre-commit,WFR_kwLOGwuno88AAAABAC9cqw,login-documentation-update,c7032ff26955cbc09baec89bed48cee83b8934f6,.github/workflows/pre-commit.yml,Update and rename AUGUR_INT.md to AUGUR_LOGIN.md,300,pull_request,completed,...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/{...,https://api.github.com/repos/oss-aspen/8Knot/d...,https://api.github.com/repos/oss-aspen/8Knot/i...,https://api.github.com/repos/oss-aspen/8Knot/p...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/n...,https://api.github.com/repos/oss-aspen/8Knot/l...,https://api.github.com/repos/oss-aspen/8Knot/r...,https://api.github.com/repos/oss-aspen/8Knot/d...
3,4297918764,pre-commit,WFR_kwLOGwuno88AAAABAC0JLA,dev,677fa9b3e3995c0907acf9786cc6258a8e34e117,.github/workflows/pre-commit.yml,Add license,299,push,completed,...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/{...,https://api.github.com/repos/oss-aspen/8Knot/d...,https://api.github.com/repos/oss-aspen/8Knot/i...,https://api.github.com/repos/oss-aspen/8Knot/p...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/n...,https://api.github.com/repos/oss-aspen/8Knot/l...,https://api.github.com/repos/oss-aspen/8Knot/r...,https://api.github.com/repos/oss-aspen/8Knot/d...
4,4247826657,pre-commit,WFR_kwLOGwuno879MLDh,metric-patch,28babd49bd997723559eca60e142373d281f8565,.github/workflows/pre-commit.yml,patch home page metric failure,298,pull_request,completed,...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,2616419264,pre-commit,WFR_kwLOGwuno86b82PA,dev,3627f5dd692616bddc4b52f27e6f32d37058efc6,.github/workflows/pre-commit.yml,Merge pull request #109 from JamesKunstle/logging,24,push,completed,...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/{...,https://api.github.com/repos/oss-aspen/8Knot/d...,https://api.github.com/repos/oss-aspen/8Knot/i...,https://api.github.com/repos/oss-aspen/8Knot/p...,https://api.github.com/repos/oss-aspen/8Knot/m...,https://api.github.com/repos/oss-aspen/8Knot/n...,https://api.github.com/repos/oss-aspen/8Knot/l...,https://api.github.com/repos/oss-aspen/8Knot/r...,https://api.github.com/repos/oss-aspen/8Knot/d...
26,2591416939,pre-commit,WFR_kwLOGwuno86adeJr,logging,feddee53b81539efb38196132557f5324dc18b4a,.github/workflows/pre-commit.yml,Switched app to use logging instead of print,23,pull_request,completed,...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...
27,2585823572,pre-commit,WFR_kwLOGwuno86aIIlU,feature,a45cbdd82b60bb0ebd402179bb7c1760f09bf663,.github/workflows/pre-commit.yml,check for config.json and production environment,22,pull_request,completed,...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...,https://api.github.com/repos/JamesKunstle/8Kno...
28,2584522093,pre-commit,WFR_kwLOGwuno86aDK1t,dev,b0150565cc68a858966be2989874ced408e3b54a,.github/workflows/pre-commit.yml,multiple minor fixes,21,pull_request,completed,...,https://api.github.com/repos/cdolfi/8Knot/merges,https://api.github.com/repos/cdolfi/8Knot/{arc...,https://api.github.com/repos/cdolfi/8Knot/down...,https://api.github.com/repos/cdolfi/8Knot/issu...,https://api.github.com/repos/cdolfi/8Knot/pull...,https://api.github.com/repos/cdolfi/8Knot/mile...,https://api.github.com/repos/cdolfi/8Knot/noti...,https://api.github.com/repos/cdolfi/8Knot/labe...,https://api.github.com/repos/cdolfi/8Knot/rele...,https://api.github.com/repos/cdolfi/8Knot/depl...


In [8]:
test_df = df[['created_at','updated_at', 'id', 'status', 'conclusion']]
test_df.value_counts() #verify all entries are collected. 

created_at            updated_at            id          status     conclusion
2022-06-29T16:02:03Z  2022-06-29T16:02:43Z  2584302525  completed  failure       1
2022-12-05T22:48:15Z  2022-12-05T22:48:56Z  3624665064  completed  failure       1
2022-11-21T16:23:09Z  2022-11-21T16:23:31Z  3516199905  completed  failure       1
2022-11-21T16:24:12Z  2022-11-21T16:24:38Z  3516208366  completed  failure       1
2022-11-21T18:07:16Z  2022-11-21T18:07:44Z  3516954588  completed  failure       1
                                                                                ..
2022-10-10T15:25:24Z  2022-10-10T15:26:26Z  3220487298  completed  failure       1
2022-10-10T15:26:12Z  2022-10-10T15:27:12Z  3220492742  completed  failure       1
2022-10-10T15:40:43Z  2022-10-10T15:41:50Z  3220588709  completed  failure       1
2022-10-10T15:46:40Z  2022-10-10T15:47:46Z  3220626519  completed  failure       1
2023-03-02T18:04:52Z  2023-03-02T18:05:21Z  4316905413  completed  success       1
Length: 2

In [9]:
test_df['run_duration'] = test_df.apply(lambda x: (datetime.datetime.strptime(x['updated_at'],"%Y-%m-%dT%H:%M:%SZ") - \
                                           datetime.datetime.strptime(x['created_at'],"%Y-%m-%dT%H:%M:%SZ")).total_seconds(), axis = 1)
test_df['test'] = test_id

In [10]:
test_df.head()

Unnamed: 0,created_at,updated_at,id,status,conclusion,run_duration,test
0,2023-03-02T18:04:52Z,2023-03-02T18:05:21Z,4316905413,completed,success,29.0,28698040
1,2023-02-28T22:46:09Z,2023-02-28T22:46:39Z,4298089675,completed,success,30.0,28698040
2,2023-02-28T22:42:47Z,2023-02-28T22:43:10Z,4298071211,completed,failure,23.0,28698040
3,2023-02-28T22:21:25Z,2023-02-28T22:21:46Z,4297918764,completed,success,21.0,28698040
4,2023-02-22T23:15:44Z,2023-02-22T23:16:07Z,4247826657,completed,success,23.0,28698040


In [11]:
test_df.dtypes

created_at       object
updated_at       object
id                int64
status           object
conclusion       object
run_duration    float64
test             object
dtype: object

In [12]:
test_df['run_duration'].isnull().sum()

0

In [13]:
# generating passing and failing dfs which are neccesary for computing fit distributions
passing_test = test_df[test_df['conclusion'] == 'success'] 
failures_test = test_df[test_df['conclusion'] == 'failure'] 

In [14]:
passing_test.shape

(121, 7)

In [15]:
failures_test.shape

(149, 7)

In [16]:
failures_test['run_duration'].value_counts()

48.0       7
30.0       6
64.0       6
34.0       6
26.0       5
23.0       5
49.0       5
32.0       5
28.0       4
47.0       4
42.0       4
55.0       4
43.0       4
36.0       4
54.0       4
38.0       4
27.0       4
21.0       4
50.0       4
45.0       4
39.0       3
52.0       3
58.0       3
31.0       3
29.0       3
60.0       3
22.0       3
61.0       2
33.0       2
25.0       2
44.0       2
65.0       2
63.0       2
53.0       2
46.0       2
51.0       2
41.0       2
57.0       2
37.0       1
81.0       1
68.0       1
21751.0    1
4818.0     1
56.0       1
87.0       1
67.0       1
24.0       1
62.0       1
59.0       1
40.0       1
66.0       1
Name: run_duration, dtype: int64

## Conclusion

In this notebook, we interact with the github api to collect the data for all workflow runs. In future work, we will look into using this data to perform statistical tests using OSP model.

## Fit Distribution
After extracting the data for one test, we would want to find the best distribution to perform optimal stopping point calculation. We find chi square and p-values to find the best distribution.

In [41]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import scipy.stats

In [39]:
def standardize(df, column, pct, pct_lower):
    """
    Function to standardize the features by removing the mean
    and scaling to unit variance using StandardScaler library.

    Returns standandardized feature, length of the feature
    and the original feature.
    """
    sc = StandardScaler()
    y = df[column][df[column].notnull()].to_list()
    y.sort()
    len_y = len(y)
    y = y[int(pct_lower * len_y) : int(len_y * pct)]
    len_y = len(y)
    yy = [[x] for x in y]
    sc.fit(yy)
    y_std = sc.transform(yy)
    y_std = y_std.flatten()
    return y_std, len_y, y

In [33]:
y_std, size, y_org = standardize(failures_test, "run_duration", 0.99, 0.01)

In [36]:
dist_names = [
    "weibull_min",
    "norm",
    "weibull_max",
    "beta",
    "invgauss",
    "uniform",
    "gamma",
    "expon",
    "lognorm",
    "pearson3",
    "triang",
]

chi_square_statistics = []

# 50 bins
percentile_bins = np.linspace(0, 100, 50)
percentile_cutoffs = np.percentile(y_std, percentile_bins)
observed_frequency, bins = np.histogram(y_std, bins=percentile_cutoffs)
cum_observed_frequency = np.cumsum(observed_frequency)
# Data frame to store results
dist_param = pd.DataFrame()
dist_param["Distribution Names"] = dist_names
param_list = []

In [38]:
# Loop through candidate distributions
for distribution in dist_names:
    # Set up distribution and get fitted distribution parameters
    dist = getattr(scipy.stats, distribution)
    param = dist.fit(y_std)
    param_list.append(param)

    # Get expected counts in percentile bins
    # cdf of fitted distribution across bins
    cdf_fitted = dist.cdf(percentile_cutoffs, *param)
    expected_frequency = []
    for bin in range(len(percentile_bins) - 1):
        expected_cdf_area = cdf_fitted[bin + 1] - cdf_fitted[bin]
        expected_frequency.append(expected_cdf_area)

    # Chi-square Statistics
    expected_frequency = np.array(expected_frequency) * size
    print(expected_frequency)
    cum_expected_frequency = np.cumsum(expected_frequency)
    print(cum_expected_frequency)
    ss = scipy.stats.chisquare(
        f_obs=cum_observed_frequency, f_exp=cum_expected_frequency
    )
    chi_square_statistics.append(ss)

[ 3.93563875 20.85187517  8.19001345  6.61240709  8.63079018  0.
  3.5965909   3.18739083  0.54856518  2.87094963  2.05770016  0.60509684
  2.39527887  1.61172021  2.0617548   1.92788013  0.          3.51687643
  2.21641993  0.92743902  1.45480941  2.70987278  1.26616644  1.21324748
  0.59992637  1.68338515  0.59836399  1.51588049  0.59652275  0.40414783
  0.          0.96620234  0.63234575  0.30148606  1.53095577  0.87229706
  1.04416295  0.62056346  0.17699254  1.52705757  0.73171696  1.40476438
  0.67482571  1.2985479   0.62517256  0.          1.18140932  7.42315506
 40.20162808]
[  3.93563875  24.78751392  32.97752737  39.58993446  48.22072463
  48.22072463  51.81731554  55.00470637  55.55327155  58.42422118
  60.48192134  61.08701819  63.48229706  65.09401727  67.15577207
  69.0836522   69.0836522   72.60052863  74.81694857  75.74438759
  77.199197    79.90906978  81.17523622  82.3884837   82.98841007
  84.67179522  85.2701592   86.7860397   87.38256244  87.78671028
  87.78671028 

ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
0.039142166864563226

In [None]:
# Append results to data frame
dist_param["Parameters"] = param_list
dist_param.set_index("Distribution Names")
# Sort by minimum ch-square statistics
results = pd.DataFrame()
results["Distribution"] = dist_names
results["chi_square and p-value"] = chi_square_statistics
results.sort_values(["chi_square and p-value"], inplace=True)

print("\nDistributions listed by Betterment of fit:")
print("............................................")
print(results)

In [73]:
failure_dist, failures_r = fit_distribution(failures_test, "run_duration", 0.99, 0.01)

ValueError: For each axis slice, the sum of the observed frequencies must agree with the sum of the expected frequencies to a relative tolerance of 1e-08, but the percent differences are:
0.205264694966213