# Repo Level Metrics

In [8]:
import os
import json
import boto3
import datetime as dt
import pandas as pd

from github import Github
from pprint import pprint

from srcopsmetrics.metrics import Metrics
from srcopsmetrics.entities.issue import Issue
from srcopsmetrics.entities.pull_request import PullRequest

from dotenv import find_dotenv, load_dotenv

In [2]:
# Find the .env file, then load its entries into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [3]:
# Ceph bucket configuration, read from environment variables.
# Create a local .env file holding the correct values before running this cell.
s3_endpoint_url = os.environ.get("S3_ENDPOINT")
s3_access_key = os.environ.get("S3_ACCESS_KEY")
s3_secret_key = os.environ.get("S3_SECRET_KEY")
s3_bucket = os.environ.get("S3_BUCKET")
# Key prefix inside the bucket; the second argument is used when the variable is unset.
s3_path = os.environ.get("S3_PROJECT_KEY", "open-services-group/metrics/github")
# Repository identifier interpolated into the object keys read further down.
repo_slug = os.environ.get("REPO_SLUG")
s3_input_data_path = "raw_data"
REMOTE = os.environ.get("REMOTE")

In [4]:
# Resource handle used for every object read from the Ceph endpoint.
s3 = boto3.resource(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
)

In [9]:
def read_data_from_ceph(s3_file_path):
    """Read a line-delimited JSON dataset from Ceph into a DataFrame.

    The object body must be a JSON-encoded string; once decoded, that string
    is split into lines and every non-blank line is parsed as one JSON record.
    Blank or whitespace-only lines are skipped instead of raising a
    ``JSONDecodeError``.

    Uses the notebook-level ``s3`` resource and ``s3_bucket`` name.

    Args:
        s3_file_path: Key of the object to read inside ``s3_bucket``.

    Returns:
        pandas.DataFrame with one row per parsed record. Rows are labelled by
        the record's position as a string ("0", "1", ...). Columns are the
        record keys (assuming each line holds a JSON object).
    """
    print("Getting dataset from Ceph")
    content = s3.Object(s3_bucket, s3_file_path)
    file = content.get()["Body"].read().decode("utf-8")
    # First decode: the stored document is a JSON string wrapping the JSON lines.
    data = json.loads(file)
    # Second decode: one record per line; ignore empty lines so they cannot
    # abort the whole load.
    records = [json.loads(line) for line in data.splitlines() if line.strip()]
    data_dict = {str(position): record for position, record in enumerate(records)}
    # Records are the dict's columns at this point; transpose so each becomes a row.
    df = pd.DataFrame(data_dict).T
    return df

In [10]:
# Object key of the issue dataset for the configured repository.
issues_s3_key = f"{s3_path}/srcopsmetrics/bot_knowledge/{repo_slug}/Issue.json"
issues_df = read_data_from_ceph(issues_s3_key)
issues_df.head()

Getting dataset from Ceph


Unnamed: 0,title,body,created_by,created_at,closed_by,closed_at,labels,interactions,id
0,Prepare GPU image for training teacher network...,**Is your feature request related to a problem...,pacospace,1644527768,,,"{'enhancement': {'color': 'a2eeef', 'labeled_a...",{'pacospace': 8},128
1,Value Error related to S3 in demo2 notebook,Value Error related to S3 at Import in when ru...,andraNew,1641590227,andraNew,1642091274.0,"{'bug': {'color': 'd73a4a', 'labeled_at': 1641...","{'andraNew': 123, 'erikerlandson': 90, 'chauha...",127
2,Create Jupyterbook,Add _toc.yaml and _config.yml for the repo and...,oindrillac,1639764581,oindrillac,1640024593.0,{},{},125
3,Should we state which base image tag is used i...,See: https://github.com/os-climate/aicoe-osc-d...,pacospace,1639086094,MichaelClifford,1639189607.0,{},"{'chauhankaranraj': 71, 'pacospace': 39}",122
4,Use overlays for building images,**Is your feature request related to a problem...,chauhankaranraj,1639077221,,,"{'enhancement': {'color': 'a2eeef', 'labeled_a...","{'pacospace': 19, 'chauhankaranraj': 40}",121


In [11]:
# Object key of the pull-request dataset for the configured repository.
prs_s3_key = f"{s3_path}/srcopsmetrics/bot_knowledge/{repo_slug}/PullRequest.json"
prs_df = read_data_from_ceph(prs_s3_key)
prs_df.head()

Getting dataset from Ceph


Unnamed: 0,title,body,size,created_by,created_at,closed_at,closed_by,merged_at,merged_by,commits_number,changed_files_number,interactions,reviews,labels,commits,changed_files,first_review_at,first_approve_at,id
0,Add manifests for GPU image build,Signed-off-by: Francesco Murdaca <fmurdaca@red...,XL,pacospace,1644855027,,,,,1,9,{'pacospace': 22},"{'881638114': {'author': 'erikerlandson', 'wor...",[],[d2af49daf938560856f315e573cd9e54e92aa570],"[manifests/.sops.yaml, manifests/README.md, ma...",1644863765.0,1644863765.0,129
1,Updated documentation,closes #125 \r\ncloses #110 \r\n\r\nJupyterBoo...,M,oindrillac,1639777200,1640024594.0,oindrillac,1640024593.0,oindrillac,1,5,"{'oindrillac': 13, 'chauhankaranraj': 18}","{'835457435': {'author': 'aakankshaduggal', 'w...",[],[523e26606333956764986a296149ca56edf56b40],"[README.md, _config.yml, _toc.yml, notebooks/d...",1639778743.0,1639793939.0,126
2,Update README,This PR \r\n- updates the README to mention th...,XS,chauhankaranraj,1639532708,1639591055.0,MichaelClifford,1639591055.0,MichaelClifford,1,1,{'MichaelClifford': 1},{},[],[ad9668f096e1e5ebc5b123d1c6df4ea5ed98af5a],[notebooks/demo2/README.md],,,124
3,Use specific versions instead of latest for im...,Closes #122,S,chauhankaranraj,1639094750,1639189607.0,MichaelClifford,1639189607.0,MichaelClifford,1,3,{'MichaelClifford': 1},"{'828089083': {'author': 'oindrillac', 'words_...",[],[c0666de738d99cf9a40e2867fc8f4e51a305d086],"[Dockerfile, notebooks/demo2/inference.pipelin...",1639094897.0,1639094897.0,123
4,Finalize Demo 2 readme,This PR adds Superset dashboard link to the Re...,XS,Shreyanand,1638982645,1639079020.0,MichaelClifford,1639079020.0,MichaelClifford,1,1,{'MichaelClifford': 1},"{'827649205': {'author': 'chauhankaranraj', 'w...",[],[ce265997ec1e5bf368d804f571e660edb261a77a],[notebooks/demo2/README.md],1639072873.0,1639072873.0,119


## Count-type Metrics

In [28]:
# An issue without a close timestamp is counted as open.
issue_is_open = issues_df["closed_at"].isna()
num_open_issues = issue_is_open.sum()
num_open_issues

18

### TODO: Let the Thoth team know we need assignee information for issues

In [None]:
# No assignee filter is applied here yet, so this selects every open issue
# (rows whose close timestamp is missing).
open_issue_mask = issues_df["closed_at"].isna()
num_open_issues_wout_assignees = issues_df[open_issue_mask]
num_open_issues_wout_assignees

In [30]:
# A pull request without a close timestamp is counted as open.
pr_is_open = prs_df["closed_at"].isna()
num_open_prs = pr_is_open.sum()
num_open_prs

3

In [40]:
# Ratio of PRs closed to PRs created during the trailing 90 days.
# The cutoff is computed once so both counts use exactly the same window.
window_start_ts = (dt.datetime.now() - dt.timedelta(days=90)).timestamp()

# "open" here means created inside the window, whether or not the PR has since closed.
num_open_prs_90d = len(prs_df[prs_df['created_at'] > window_start_ts])
num_closed_prs_90d = len(prs_df[prs_df['closed_at'] > window_start_ts])
print(num_open_prs_90d, num_closed_prs_90d)

# With no PRs created in the window the ratio is undefined; report NaN instead
# of raising ZeroDivisionError.
ratio = num_closed_prs_90d / num_open_prs_90d if num_open_prs_90d else float("nan")
ratio

6 7


1.1666666666666667

In [3]:
# GitHub API token, taken from the environment; None when the variable is unset.
_GITHUB_ACCESS_TOKEN = os.environ.get("GITHUB_ACCESS_TOKEN")

In [9]:
# Query the GitHub API for the open issues of the metrics repository and
# pretty-print the first page of results.
g = Github(login_or_token=_GITHUB_ACCESS_TOKEN)
repo = g.get_repo("open-services-group/metrics")
issues = repo.get_issues(state="open")
first_issue_page = issues.get_page(0)
pprint(first_issue_page)

[Issue(title="KPI metric calculation notebooks", number=19),
 Issue(title="Automate metric collection and processing workflow", number=18),
 Issue(title="[WIP] Add GitHub data analysis notebook", number=17),
 Issue(title="Notebook to explore the issue/pr data for OSG repos", number=16),
 Issue(title="Automatic update of dependencies by Kebechet for the rhel:8 environment", number=15),
 Issue(title="Spike: Define OKR completion", number=10),
 Issue(title="[EPIC] Visualization", number=7),
 Issue(title="[EPIC] EDA of metrics", number=6),
 Issue(title="[EPIC] Metric collection", number=5),
 Issue(title="Spike: Define initial set of metrics", number=3)]


In [4]:
repository = "os-climate/aicoe-osc-demo"

# Build the GitHub client first, then resolve the repository object that the
# srcopsmetrics entities are constructed from.
gh_client = Github(login_or_token=_GITHUB_ACCESS_TOKEN, timeout=50)
gh_repo = gh_client.get_repo(repository)
# is_local=False is passed so the previous knowledge is not read from local storage.
prs = PullRequest(gh_repo).load_previous_knowledge(is_local=False)
# issues = Issue(gh_repo)#.load_previous_knowledge(is_local=False)

AttributeError: 'NoneType' object has no attribute 'endswith'

In [6]:
# Reuse the `repository` slug assigned earlier in the notebook rather than
# repeating the literal, so the two cannot drift apart.
metrics = Metrics(repository=repository)

In [7]:
# Compute issue-level metrics through the srcopsmetrics `Metrics` object.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements"
# -- presumably no issue data was available to process; confirm before relying on it.
issue_metrics = metrics.process_issues()
issue_metrics

ValueError: Length mismatch: Expected axis has 0 elements, new values have 3 elements

In [8]:
# Compute pull-request-level metrics through the srcopsmetrics `Metrics` object.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: Length mismatch: Expected axis has 0 elements, new values have 9 elements"
# -- presumably no pull-request data was available to process; confirm before relying on it.
pr_metrics = metrics.process_pull_requests()
pr_metrics

ValueError: Length mismatch: Expected axis has 0 elements, new values have 9 elements

In [None]:
# Repositories for which metrics should be calculated; add one slug per line.
REPOS = [
    "os-climate/aicoe-osc-demo",
]

In [None]:
# Ceph bucket configuration, read from environment variables.
# Create a local .env file holding the correct values before running this cell.
s3_endpoint_url = os.environ.get("S3_ENDPOINT")
s3_access_key = os.environ.get("S3_ACCESS_KEY")
s3_secret_key = os.environ.get("S3_SECRET_KEY")
s3_bucket = os.environ.get("S3_BUCKET")
# Key prefix inside the bucket; the second argument is used when the variable is unset.
s3_path = os.environ.get("S3_PROJECT_KEY", "open-services-group/metrics/github")
s3_input_data_path = "raw_data"
REMOTE = os.environ.get("REMOTE")