
# Load Organization Repos (`processed/ORG/repos.json`)

In [None]:
from gitea_repo_extractor import GiteaRepoExtractor

# Ask the OpenDev Gitea API for the repositories of one organization.
# NOTE(review): per this section's heading the result presumably ends up in
# processed/ORG/repos.json -- confirm against GiteaRepoExtractor.
url = "https://opendev.org/api/v1"
ORG = "openstack"

repo_extractor = GiteaRepoExtractor(api_url=url)
repo_extractor.extract_repos_from_org(ORG)


# Extract raw commits (`raw/ORG/REPO.commits.json`)


In [None]:
import json
from gitea_repo_extractor import GiteaRepoExtractor

url = "https://opendev.org/api/v1"
ORG = "openstack"
repo_extractor = GiteaRepoExtractor(api_url=url)

# Read the organization's repository listing up front so the file is closed
# again before the per-repository extraction starts.
with open(f"../data/processed/{ORG}/repos.json", "r") as repos_file:
    repos = json.load(repos_file)

# Extract the raw commits of every listed repository, printing each name
# first so progress is visible in the cell output.
for entry in repos:
    name = entry["name"]
    print(name)
    repo_extractor.extract_raw_commits_from_repo(
        org=ORG,
        owner=entry["owner"],
        repo_fullname=name,
    )


# Extract commit messages and dates (`processed/ORG/commits-messages-dates/REPO.pickle`)

In [None]:
import os
from gitea_repo_extractor import GiteaRepoExtractor

url = "https://opendev.org/api/v1"
repo_extractor = GiteaRepoExtractor(api_url=url)
ORG = 'openstack'

# Raw commit dumps are named "<REPO>.commits.json" (see the raw-commit
# section of this notebook). Repository names can contain dots themselves
# (OpenStack has repos such as "oslo.config"), so the name is recovered by
# stripping that known suffix. Cutting at the first dot would turn every
# "oslo.*" repo into "oslo" and make their outputs collide.
RAW_COMMITS_SUFFIX = ".commits.json"

# Walk every raw dump of the organization and hand it, together with the
# organization and repository name, to the extractor.
for subdir, dirs, files in os.walk(f"../data/raw/{ORG}"):
    for file in files:
        if file.endswith(RAW_COMMITS_SUFFIX):
            repo = file[:-len(RAW_COMMITS_SUFFIX)]
        else:
            # Unexpected file name: fall back to the text before the first
            # dot so such files are still processed as they were before.
            repo = file.split(".")[0]
        print("Extracting commits from: ", repo)
        path = os.path.join(subdir, file)
        repo_extractor.save_commits_messages_dates(path, ORG, repo)


# Extract repos with at least 2 commits per month (`processed/ORG/months-valid.json`)


In [None]:
from repovalidator import RepoValidator
import os
import json

ORG = "openstack"

# One record per repository file: the repository name and whether it passes
# RepoValidator's "at least 2 commits per month" check.
repos_bool = []
for subdir, dirs, files in os.walk(f"../data/processed/{ORG}/commits-messages-dates"):
    for file in files:
        # Files are named "<REPO>.pickle"; drop only the final extension so
        # repository names that contain dots are kept whole.
        filename = os.path.splitext(file)[0]
        complete_path = os.path.join(subdir, file)
        is_valid = RepoValidator(complete_path).has_at_least_2_commits_per_month()
        repos_bool.append({"name": filename, "is_valid": is_valid})

# Keep only the names of the repositories that passed the check and persist
# them as a JSON list.
valids = [entry["name"] for entry in repos_bool if entry["is_valid"]]
with open(f"../data/processed/{ORG}/months-valid.json", "w") as f:
    json.dump(valids, f)


# Extract commits with their files (`processed/ORG/commits-files/REPO.commits.pickle`)


In [None]:
from gitea_repo_extractor import GiteaRepoExtractor
import json
import os

url = "https://opendev.org/api/v1"
ORG = 'openstack'
repo_extractor = GiteaRepoExtractor(api_url=url)

# Load the names of the repositories that passed the monthly-activity filter.
with open(f"../data/processed/{ORG}/months-valid.json", "r") as valid_file:
    active_repos = json.load(valid_file)

# Run the commit extraction for each of those repositories in turn.
for repo_name in active_repos:
    repo_extractor.extract_commits_to_df(repo_name, ORG)


# Extract repos with at least 11% of IaC files (`processed/ORG/valid-repos.json`)


In [None]:
from repovalidator import RepoValidator
import os
import json

ORG = 'openstack'

# Files in commits-files are named "<REPO>.commits.pickle" (see this
# section's heading). Strip that known suffix rather than cutting at the
# first dot, so repository names containing dots are not truncated.
COMMITS_FILES_SUFFIX = ".commits.pickle"

# Collect the repositories for which RepoValidator.has_11_percent_of_iac()
# holds.
valids = []
for subdir, dirs, files in os.walk(f"../data/processed/{ORG}/commits-files"):
    for file in files:
        if file.endswith(COMMITS_FILES_SUFFIX):
            repo = file[:-len(COMMITS_FILES_SUFFIX)]
        else:
            # Unexpected file name: fall back to the text before the first dot.
            repo = file.split(".")[0]
        path = os.path.join(subdir, file)
        # A single validator instance per file is enough; building an extra,
        # unused one only repeats the constructor's work.
        if RepoValidator(path).has_11_percent_of_iac():
            valids.append(repo)

# Persist the accepted repository names as a JSON list.
with open(f'../data/processed/{ORG}/valid-repos.json', "w") as f:
    json.dump(valids, f)

# Extract XCMs (`processed/ORG/REPO.xcms.json`)

In [None]:
import json
from gitea_commit_message_processor import GiteaCommitMessageProcessor

url = "https://opendev.org/api/v1"
ORG = 'openstack'
message_processor = GiteaCommitMessageProcessor(api_url=url)

# Load the names of the repositories that passed the IaC filter.
with open(f"../data/processed/{ORG}/valid-repos.json", "r") as valid_repos_file:
    valid_repos = json.load(valid_repos_file)

# Extract the XCMs of each accepted repository.
for repo_name in valid_repos:
    message_processor.extract_xcm(ORG, repo_name)

print("done")