# Get the list of repositories (`processed/ORG/repos.json`)

In [None]:
from repoextractor import RepoExtractor

# GitHub organization whose repositories are listed.
ORG = "wikimedia"

# Ask the extractor for the organization's repository names (per the section
# heading the result is expected under `processed/ORG/repos.json`).
repo_extractor = RepoExtractor()
repo_extractor.extract_repo_names(ORG)


# Extract raw commits (`raw/ORG/REPO.commits.json`)


In [2]:
from repoextractor import RepoExtractor
import json
import datetime

ORG = "wikimedia"

# Repositories up to and including this one are skipped -- presumably they
# were already extracted by an earlier, interrupted run (confirm before
# re-running from scratch).
RESUME_AFTER = "mediawiki-extensions-Collection"

# Only commits from the last two years (730 days) are requested.
today = datetime.datetime.today()
deadline = datetime.timedelta(days=365*2)
since = today - deadline

# Read the repository list up front so the file is not held open during the
# (potentially long) extraction loop.
with open(f"../data/processed/{ORG}/repos.json", "r") as file:
    data = json.load(file)

# Position of the resume marker, or None when it is not in the list.
marker_index = next(
    (i for i, repo in enumerate(data) if repo["name"] == RESUME_AFTER),
    None,
)

# If the marker is missing, process every repository. The previous code fell
# back to index 0 and then sliced from `0 + 1`, silently dropping the first
# repository in that case.
pending = data if marker_index is None else data[marker_index + 1:]

for repo in pending:
    print(repo["name"])
    RepoExtractor().extract_raw_commits_from_repo(repo["name"], since)
    
    

wikimedia/wikipedia-ios



# Extract commit messages and dates (`processed/ORG/commits-messages-dates/REPO.pickle`)


In [3]:
from repoextractor import RepoExtractor
import os

# Walk every raw commit file of the organization and let the extractor save
# the commit messages and dates for the corresponding repository.
raw_commits_dir = f"../data/raw/{ORG}"
for root, _subdirs, filenames in os.walk(raw_commits_dir):
    for filename in filenames:
        # The repository name is the part of the file name before the first dot.
        repo_name = filename.split(".")[0]
        raw_path = os.path.join(root, filename)
        RepoExtractor().save_commits_messages_dates(raw_path, ORG, repo_name)
        



# Extract repos with at least 2 commits per month (`processed/ORG/months-valid.json`)


In [None]:
from repovalidator import RepoValidator
import json
import os

# For every file under `commits-messages-dates`, record the repository name
# (file name up to the first dot) together with the validator's verdict.
commits_dir = f"../data/processed/{ORG}/commits-messages-dates"
repos_bool = []
for root, _subdirs, filenames in os.walk(commits_dir):
    for commits_file in filenames:
        validator = RepoValidator(os.path.join(root, commits_file))
        repos_bool.append({
            "name": commits_file.split(".")[0],
            "is_valid": validator.has_at_least_2_commits_per_month(),
        })

# Persist only the names of the repositories that passed the check.
with open(f"../data/processed/{ORG}/months-valid.json", "w") as f:
    valids = [entry["name"] for entry in repos_bool if entry["is_valid"]]
    json.dump(valids, f)


# Extract commits with their files (`processed/ORG/commits-files/REPO.commits.pickle`)


In [None]:
from repoextractor import RepoExtractor
import json
import datetime

ORG = "wikimedia"

# Only commits from the last two years (730 days) are extracted.
today = datetime.datetime.today()
since = today - datetime.timedelta(days=365*2)

# Run the extraction for every repository listed in months-valid.json,
# reusing a single extractor instance for all of them.
months_valid_path = f"../data/processed/{ORG}/months-valid.json"
with open(months_valid_path, "r") as months_valid_file:
    extractor = RepoExtractor()
    repo_names = json.load(months_valid_file)

    for repo_name in repo_names:
        extractor.extract_repo(
            extractor.get_repo(f"{ORG}/{repo_name}"),
            ORG,
            since=since,
        )
        
        



# Extract repos with at least 11% of IaC files (`processed/ORG/valid-repos.json`)


In [None]:
from repovalidator import RepoValidator
import json
import os

ORG = "wikimedia"

# Collect the names of the repositories whose commits-files data passes the
# validator's 11% IaC check.
commits_files_dir = f"../data/processed/{ORG}/commits-files"
valids = []
for root, _subdirs, filenames in os.walk(commits_files_dir):
    for filename in filenames:
        # The repository name is the part of the file name before the first dot.
        repo = filename.split(".")[0]
        print(f"    Validating repo {repo}")
        validator = RepoValidator(os.path.join(root, filename))
        if validator.has_11_percent_of_iac():
            valids.append(repo)
# Trailing expression so the notebook displays the resulting list.
valids

In [2]:
# Persist the validated repository names collected in `valids`.
valid_repos_path = f'../data/processed/{ORG}/valid-repos.json'
with open(valid_repos_path, "w") as valid_repos_file:
    json.dump(valids, valid_repos_file)

# Extract XCMs (`processed/ORG/xcms/REPO.json`)

In [4]:
import json
from commit_message_processor import CommitMessageProcessor

ORG = "wikimedia"

# Extract the XCMs of every repository listed in valid-repos.json, using one
# shared processor instance.
valid_repos_path = f"../data/processed/{ORG}/valid-repos.json"
with open(valid_repos_path, "r") as valid_repos_file:
    processor = CommitMessageProcessor()
    for repo_name in json.load(valid_repos_file):
        processor.extract_xcm(ORG, repo_name)

print("done")

    Extracting XCM from repo 'integration-config'
   1597 commits total
        Saving xcms to ../data/processed/wikimedia/xcms/integration-config.json
done
